1-2 用Python爬取貓眼票房網上的電影票房信息

本文為轉載（查看原文） 2016-05-16 18:04 2092 爬蟲/爬蟲項目

 1 piaofang.py
 2 #-*- coding:utf-8 -*-
 3 '''
 4 該腳本可以抓取貓眼票房網站上的電影票房數據
 5 使用的數據為豆瓣上爬取的電影，見文件：doubanMovies_IMDBScore.csv
 6 '''
 7 import requests
 8 import lxml.html
 9 import time
10 from pandas import DataFrame
11 import pandas as pd
12 
# Browser-like User-Agent sent with every request made by getDoc.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/45.0.2454.101 Safari/537.36'
    ),
}


def getDoc(url, timeout=10):
    """Download ``url`` and return the page parsed as an lxml HTML element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        The root element produced by ``lxml.html.fromstring`` for the
        response body.

    Raises:
        requests.exceptions.RequestException: on connection errors, timeouts,
            or a 4xx/5xx response. An HTTP error page is reported as an error
            instead of being parsed as if it were a real result page.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Pause briefly after every request so the crawl is not frequent enough
    # to get the IP banned.
    time.sleep(0.1)
    resp.raise_for_status()
    return lxml.html.fromstring(resp.text)
20 
def getPiaofang(title):
    """Look up the box office that pf.maoyan.com lists for the movie ``title``.

    The site's search page is fetched with ``getDoc`` and the first result
    whose name is exactly ``title`` (ignoring surrounding whitespace) is used;
    similarly named movies returned by the search are skipped.

    Args:
        title: Movie name, either text or UTF-8 encoded bytes.

    Returns:
        * the ``int`` shown in front of the "萬票房" suffix of the matching
          result, when that result carries box-office data;
        * ``'withoutData'`` when results exist but none matches ``title``
          exactly, or the matching result has no box-office number;
        * ``'notfound'`` when the search returns no movie names at all.

    Raises:
        ValueError: if the scraped number cannot be converted with ``int``.
        Any exception raised by ``getDoc`` propagates unchanged.
    """
    # Function-scope import keeps the block self-contained on Python 2 and 3.
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2

    # Work on text internally so comparisons with the scraped (unicode)
    # strings are reliable; callers may still pass UTF-8 bytes.
    if isinstance(title, bytes):
        title = title.decode('utf-8')

    # Percent-encode the title so characters such as '&', '#' or spaces
    # cannot corrupt the query string.
    url = ('http://pf.maoyan.com/search?_v_=yes&key='
           + quote(title.encode('utf-8'), safe=''))
    doc = getDoc(url)

    # NOTE(review): the site presumably renders simplified characters, while
    # the literal in this listing is traditional; both are accepted -- confirm.
    boxOfficeSuffixes = (u'萬票房', u'万票房')

    sawAnyName = False
    # Query each <article> on its own so a name, its suffix and its number
    # always come from the same search result.
    for article in doc.xpath('//*[@id="search-list"]/article'):
        names = [text.strip() for text in article.xpath('div/text()')]
        names = [name for name in names if name]
        if names:
            sawAnyName = True
        if title not in names:
            continue  # a similarly named movie, keep looking

        # The suffix may be e.g. "萬票房", "人想看" or "暫無票房數據"; only
        # the first one is followed by a box-office number.
        suffixes = [text.strip() for text in article.xpath('em/text()')]
        if not any(suffix in boxOfficeSuffixes for suffix in suffixes):
            return 'withoutData'
        numbers = article.xpath('em/span/text()')
        if not numbers:
            return 'withoutData'
        value = int(numbers[0])
        print(value)
        return value

    return 'withoutData' if sawAnyName else 'notfound'
61 
# Driver: look up the box office of every movie in the Douban export and
# write the results to test.csv, re-saving after each row so progress
# survives an interrupted run.
df = pd.read_csv('doubanMovies_IMDBScore.csv')  # movie list scraped from Douban
piaofangList = []  # one result per processed row, in row order
errorNum = 0  # number of rows whose lookup raised an exception
for i in range(len(df)):
    try:
        # .iloc replaces DataFrame.ix, which was removed in pandas 1.0.
        temp = df['title'].iloc[i]
        if isinstance(temp, bytes):
            # Byte-string titles (Python 2) are GBK encoded; re-encode them
            # as UTF-8 before the lookup.
            temp = temp.decode('gbk').encode('utf-8')
        piaofangList.append(getPiaofang(temp))
    except Exception:
        # Catch Exception rather than everything so Ctrl-C (KeyboardInterrupt)
        # still stops the crawl instead of being recorded as a failed row.
        errorNum += 1
        piaofangList.append('error')
        print('error No. %d' % errorNum)
    # Checkpoint after the row is recorded, so the title slice and the
    # result list always have the same length (i + 1 entries).
    df1 = DataFrame({'title': df['title'].iloc[:i + 1], 'piaofang': piaofangList})
    df1.to_csv('test.csv', index=False)
    print(i + 1)  # progress marker

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找：python網絡爬蟲（11）近期電影票房或熱度信息爬取；python爬取近十年中國電影票房數據與分析；kaggle——TMDB 電影票房收入預測；python實現的電影票房數據可視化；python實現的、帶GUI界面電影票房數據可視化程序；Python3爬取起貓眼電影實時票房信息，解決文字反爬~~~附源代碼；【python數據分析實戰】電影票房數據分析(二)數據可視化；【python數據分析實戰】電影票房數據分析(一)數據采集；【數據可視化】一、分析歷年電影票房（數據讀取、過濾、分類、繪圖）；讓電影票房飛一會兒，五一換個姿勢重溫經典