import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
def _fetch_director(href, timeout):
    """Download a movie detail page and return its director text.

    Args:
        href: URL of the movie detail page, as found in the listing table.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first ``dl.dltext dd`` element with newlines removed.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
        IndexError: If the page contains no ``dl.dltext dd`` element.
    """
    response = requests.get(href, timeout=timeout)
    # BeautifulSoup cannot parse the Response object itself, so hand it the
    # raw body bytes.
    detail_soup = BeautifulSoup(response.content, 'lxml')
    # The director text contains line breaks; strip them out.
    return detail_soup.select('dl.dltext dd')[0].get_text().replace("\n", "")


def spider(url, headers, timeout=10):
    """Scrape one box-office listing page and pass the result to ``download``.

    Every data row of the ``table#tbContent`` element yields the movie name,
    detail-page URL, type, box office, average fare and release date. The
    director is not on the listing page, so it is read from each movie's
    detail page. If a detail page cannot be fetched or parsed, the director
    is recorded as "獲取失敗" and the crawl continues.

    Args:
        url: URL of the listing page to scrape.
        headers: HTTP headers sent with the listing-page request.
        timeout: Seconds to wait for each HTTP response. Defaults to 10, so
            a stalled server can no longer hang the crawl indefinitely.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
        IndexError: If the page has no ``table#tbContent`` element or a row
            has fewer cells than expected.
        ValueError, TypeError: If a box-office cell is not a plain integer.
    """
    print("正在抓取url: " + url)
    page_html = requests.get(url=url, headers=headers, timeout=timeout).text
    soup = BeautifulSoup(page_html, 'lxml')

    # find_all returns a list; the listing lives in the first (only) table
    # whose id is "tbContent".
    movies_table = soup.find_all('table', {'id': 'tbContent'})[0]

    # Skip the first <tr> (header row) and collect each row's cells once
    # instead of re-querying them for every column.
    rows = [tr.find_all('td') for tr in movies_table.find_all('tr')[1:]]

    # Movie name and detail-page URL both come from the link in the first cell.
    names = [cells[0].a.get('title') for cells in rows]
    hrefs = [cells[0].a.get('href') for cells in rows]
    # Movie type.
    types = [cells[1].string for cells in rows]
    # Box office.
    box_offices = [int(cells[2].string) for cells in rows]
    # Average ticket fare.
    average_fares = [cells[3].string for cells in rows]
    # Release date.
    show_times = [cells[6].string for cells in rows]

    directors = []
    for href in hrefs:
        try:
            director = _fetch_director(href, timeout)
        except Exception:
            # Best effort: one broken detail page must not abort the page.
            # Unlike a bare ``except``, this still lets KeyboardInterrupt and
            # SystemExit propagate so the crawl can be stopped.
            directors.append("獲取失敗")
        else:
            directors.append(director)
            # Throttle after a successful fetch. Keeping the sleep outside
            # the ``try`` guarantees exactly one entry per movie, so all
            # DataFrame columns stay the same length.
            time.sleep(0.5)

    # Assemble the columns into a DataFrame so they can be written as CSV.
    df = pd.DataFrame({
        'name': names,
        'href': hrefs,
        'type': types,
        'box_office': box_offices,
        'Average_fare': average_fares,
        'show_time': show_times,
        'directors': directors,
    })
    download(df)
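'''
Problem: results could not be stored cumulatively because every write
re-created the CSV file. Fix: open the file in append mode (mode='a').
'''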
def download(df, path='D://box_office.csv'):
    """Append the rows of ``df`` to a CSV file.

    The file is opened in append mode so that successive calls accumulate
    rows instead of re-creating the file. Neither a header row nor the index
    column is written.

    Args:
        df: pandas DataFrame whose rows should be stored.
        path: Destination CSV file. Defaults to the location that was
            previously hard-coded, so existing callers are unaffected.
    """
    df.to_csv(path, mode='a', index=False, header=False)
    print("done")
if __name__ == "__main__":
    # Script entry point: crawl the yearly listing pages for 2008 through
    # 2019 and print the total run time in seconds.
    # NOTE(review): the original indentation was lost; this assumes the
    # 2-second pause runs once per year and the elapsed time is printed once
    # after the loop -- confirm against the original file.
    started_at = time.time()

    base_url = "http://www.cbooo.cn/year?year="
    # Browser-like request headers sent with every listing-page request.
    headers = {
        'Cookie': 'Hm_lvt_daabace29afa1e8193c0e3000d391562=1570691612; Hm_lpvt_daabace29afa1e8193c0e3000d391562=1570691612',
        'Host': 'www.cbooo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
    }

    for year in range(2008, 2020):
        year_url = "{}{}".format(base_url, year)
        spider(year_url, headers)
        # Pause between yearly pages.
        time.sleep(2)

    elapsed_seconds = time.time() - started_at
    print(round(elapsed_seconds, 3))