# Task 4
'''
This task asks you to complete a simple crawler project, covering page fetching, information extraction, and data saving.
When working on it, think the problem through carefully and apply your own reasoning.
Note: your score is computed from both the submission time and the correctness of your work.
Since every student receives a different set of questions, do not copy from others; any detected plagiarism scores 0 for this task.'''
import json

import requests
from bs4 import BeautifulSoup
'''
Question 1: Use crawling techniques to fetch the pages at the following 5 URLs and extract the key information.
From each fetched page source, extract the following 4 kinds of information:
1. Article title
2. Body text (note: extract only the article's own text, not any unrelated text elsewhere on the page)
3. Image links (if any)
4. Time/date (if any)'''
# The URLs assigned to you: url = ['http://fashion.cosmopolitan.com.cn/2019/1020/287733.shtml','http://dress.pclady.com.cn/style/liuxing/1003/520703.html','http://www.smartshe.com/trends/20191009/56414.html','https://dress.yxlady.com/202004/1560779.shtml','http://www.yoka.com/fashion/roadshow/2019/0513/52923401100538.shtml']
url1 = 'http://fashion.cosmopolitan.com.cn/2019/1020/287733.shtml'
url2 = 'http://dress.pclady.com.cn/style/liuxing/1003/520703.html'
url3 = 'http://www.smartshe.com/trends/20191009/56414.html'
url4 = 'https://dress.yxlady.com/202004/1560779.shtml'
url5 = 'http://www.yoka.com/fashion/roadshow/2019/0513/52923401100538.shtml'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
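# The five get_urlN functions below share the same fetch-and-parse pattern and differ
# only in page encoding and CSS selectors. A minimal sketch, assuming the per-site
# selectors and encodings used by those functions, of one parametrized helper that
# could replace them:
def extract_article(page_url, encoding, body_selector, time_selector):
    resp = requests.get(page_url, headers=headers)
    resp.encoding = encoding  # decode with the site's charset
    soup = BeautifulSoup(resp.text, 'lxml')
    return {
        'title': soup.title.string if soup.title else None,
        'body': [el.get_text(strip=True) for el in soup.select(body_selector)],
        'images': [img.get('src') for img in soup.find_all('img')],
        'time': [el.get_text(strip=True) for el in soup.select(time_selector)],
    }
# Example: extract_article(url1, 'gbk', '.p2', '.time') mirrors get_url1 below.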
def get_url1(url):
    resp = requests.get(url, headers=headers)
    resp.encoding = 'gbk'  # the page is encoded as gbk
    soup = BeautifulSoup(resp.text, 'lxml')
    title = soup.title.string
    print('1: Article title:', title)
    body_texts = [p.get_text(strip=True) for p in soup.find_all(class_='p2')]
    image_links = [img.get('src') for img in soup.find_all('img')]
    times = [t.get_text(strip=True) for t in soup.find_all(class_='time')]
    for text in body_texts:
        print('Body:', text)
    for link in image_links:
        print('Image:', link)
    for t in times:
        print('Time:', t)
    print('---' * 50)
    return {'title': title, 'body': body_texts, 'images': image_links, 'time': times}
def get_url2(url):
    resp = requests.get(url, headers=headers)
    resp.encoding = 'gbk'  # the page is encoded as gbk
    soup = BeautifulSoup(resp.text, 'lxml')
    title = soup.title.string
    print('2: Article title:', title)
    body_texts = [p.get_text(strip=True) for p in soup.find_all(class_='artText')]
    image_links = [img.get('src') for img in soup.find_all('img')]
    times = [t.get_text(strip=True) for t in soup.find_all(class_='time')]
    for text in body_texts:
        print('Body:', text)
    for link in image_links:
        print('Image:', link)
    for t in times:
        print('Time:', t)
    print('---' * 50)
    return {'title': title, 'body': body_texts, 'images': image_links, 'time': times}
def get_url3(url):
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'  # the page is encoded as utf-8
    soup = BeautifulSoup(resp.text, 'lxml')
    title = soup.title.string
    print('3: Article title:', title)
    body_texts = [p.get_text(strip=True) for p in soup.find_all(class_='art-body')]
    image_links = [img.get('src') for img in soup.find_all('img')]
    times = [t.get_text(strip=True) for t in soup.select('.art-auther > span:nth-child(1)')]
    for text in body_texts:
        print('Body:', text)
    for link in image_links:
        print('Image:', link)
    for t in times:
        print('Time:', t)
    print('---' * 50)
    return {'title': title, 'body': body_texts, 'images': image_links, 'time': times}
def get_url4(url):
    resp = requests.get(url, headers=headers)
    resp.encoding = 'gbk'  # the page is encoded as gbk
    soup = BeautifulSoup(resp.text, 'lxml')
    title = soup.title.string
    print('4: Article title:', title)
    body_texts = [p.get_text(strip=True) for p in soup.select('.left1 > div.ArtCon > p')]
    image_links = [img.get('src') for img in soup.find_all('img')]
    times = [t.get_text(strip=True) for t in soup.select('#acxc > span')]
    for text in body_texts:
        print('Body:', text)
    for link in image_links:
        print('Image:', link)
    for t in times:
        print('Time:', t)
    print('---' * 50)
    return {'title': title, 'body': body_texts, 'images': image_links, 'time': times}
def get_url5(url):
    resp = requests.get(url, headers=headers)
    resp.encoding = 'gbk'  # the page is encoded as gbk
    soup = BeautifulSoup(resp.text, 'lxml')
    title = soup.title.string
    print('5: Article title:', title)
    body_texts = [p.get_text(strip=True) for p in soup.find_all(class_='textCon')]
    image_links = [img.get('src') for img in soup.find_all('img')]
    times = [t.get_text(strip=True) for t in soup.find_all(class_='time')]
    for text in body_texts:
        print('Body:', text)
    for link in image_links:
        print('Image:', link)
    for t in times:
        print('Time:', t)
    print('---' * 50)
    return {'title': title, 'body': body_texts, 'images': image_links, 'time': times}
with open("record.json",'w', encoding='utf-8') as f:
f.write(str(data))
print("加載入文件完成...")
'''Write all 4 kinds of information from the 5 pages into a JSON file or a plain-text document.
Note: when writing, use a format such as: 1.1. Article title: Honor's new phone camera score is second only to its "big brother": sales rebounded sharply after the epidemic eased.
Alternatively, write the data as a dictionary into a JSON document. Dumping all the information into a text file without distinguishing titles and numbering earns no credit for this question!'''
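# A minimal sketch of the numbered plain-text format described above, reusing the
# `data` dict built earlier. The "N.2"/"N.3"/"N.4" continuation of the numbering is
# an assumption inferred from the "1.1. Article title" example in the brief:
with open("record.txt", 'w', encoding='utf-8') as f:
    for i, info in data.items():
        f.write(f"{i}.1. Article title: {info['title']}\n")
        f.write(f"{i}.2. Body: {' '.join(info['body'])}\n")
        f.write(f"{i}.3. Image links: {', '.join(link for link in info['images'] if link)}\n")
        f.write(f"{i}.4. Time/date: {', '.join(info['time'])}\n")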