- 首先找到這個網頁https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=20&page_start=0
- 然后F12在network下找到這個內容頁,打開后發現參數可調,所以電影數量和ID可以爬取
設計代碼:
def askUrl(url):
    """Request *url* with a browser User-Agent and return the UTF-8 body.

    Returns an empty string when the request fails; the HTTP status code
    and/or failure reason are printed for diagnostics.
    """
    head = {
        # A browser User-Agent alone is enough to pass Douban's basic bot filter.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)  # bug fix: original printed e.reasen (typo -> AttributeError)
    return html


# Fetch a page and parse it with BeautifulSoup
def get_info(baseurl):
    """Download *baseurl* and return the parsed BeautifulSoup tree."""
    html = askUrl(baseurl)
    return BeautifulSoup(html, "html.parser")


# Collect elements with the given CSS class, plus their str() form
def transport(bs, info):
    """Return (elements, str(elements)) for nodes whose class matches *info*."""
    ex_info = bs.find_all(class_=info)
    return ex_info, str(ex_info)


'''
TODO: fetch Douban movie IDs
'''
if __name__ == '__main__':
    # Fetch movie IDs from the "hot" explore JSON endpoint
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=10&page_start=0'
    res = get_info(url)
    response_data = json.loads(res.text)
    # Buckets for the three comment sentiments (filled later in the project)
    highComment = []
    middleComment = []
    lowComment = []
    for k in response_data['subjects']:
        movie_id = k['id']  # renamed: `id` shadowed the builtin
        highUrl = "https://movie.douban.com/subject/%s/comments?percent_type=h&start=20&limit=20&status=P&sort=new_score" % (movie_id)
        print(highUrl)
-
大量獲得豆瓣電影ID 用來制作其影評的鏈接
- 首先找到這個網頁https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=20&page_start=0
- 然后F12在network下找到這個內容頁,打開后發現參數可調,所以電影數量和ID可以爬取-
-
設計代碼,爬取2000個電影的ID信息。
-
代碼如下
-
def askUrl(url):
    """Request *url* with a browser User-Agent and return the UTF-8 body.

    Returns an empty string when the request fails; the HTTP status code
    and/or failure reason are printed for diagnostics instead of raising.
    """
    head = {
        # The commented-out Host/Cookie/... headers proved unnecessary;
        # a browser User-Agent alone passes Douban's basic bot filter.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)  # bug fix: original printed e.reasen (typo -> AttributeError)
    return html
# Fetch a page and hand it to BeautifulSoup
def get_info(baseurl):
    """Download *baseurl* and return it parsed with html.parser."""
    page = askUrl(baseurl)
    return BeautifulSoup(page, "html.parser")
# Collect every element carrying the given CSS class; also return a str() dump
def transport(bs, info):
    """Return (elements, str(elements)) for nodes whose class matches *info*."""
    matches = bs.find_all(class_=info)
    return matches, str(matches)
'''
TODO: fetch Douban movie IDs
'''
if __name__ == '__main__':
    # Pull a batch of movie IDs from the "hot" explore feed (JSON endpoint)
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=10&page_start=0'
    res = get_info(url)
    response_data = json.loads(res.text)
    # Buckets for per-sentiment comment links
    highComment = []
    middleComment = []
    lowComment = []
    for subject in response_data['subjects']:
        id = subject['id']
        highUrl = "https://movie.douban.com/subject/%s/comments?percent_type=h&start=20&limit=20&status=P&sort=new_score" % (id)
        print(highUrl)
-
-
當我們獲取電影對應的評論鏈接后,我們似乎被豆瓣發現了,如果不登錄不讓繼續瀏覽了,所以我們要用python登錄豆瓣
-
首先我們找到登錄頁面
-
-
然后我們輸入錯誤的用戶名和密碼
-
-
然后我們找到登錄的接口
-
-
然后我們往下面拉,查看一下登錄需要的參數
-
-
這樣,我們就可以開始寫我們的登錄代碼了!
-
用python模擬登錄的方法有很多,例如下面這一種
-
# A Session keeps cookies between requests, so the login state persists.
s = requests.Session()

if __name__ == '__main__':
    # Douban mobile login endpoint (form-encoded POST)
    login_url = 'https://accounts.douban.com/j/mobile/login/basic'
    headers = {
        # A browser User-Agent alone is enough; the other headers are not required.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    # SECURITY: hard-coded account credentials — move them to environment
    # variables or a local config file before sharing this script.
    data = {
        'name': '15081644800',
        'password': 'Qwer1234!',
        'remember': 'false',
        'tc_app_id': '2044348370',
        'randstr': '',
        'ticket': '',
        'ck': ''
    }
    try:
        # GET first so the session picks up cookies, then POST the credentials.
        r1 = s.get(url=login_url, headers=headers)
        r = s.post(url=login_url, headers=headers, data=data)
        r.raise_for_status()
    except requests.RequestException:
        # bug fix: the original bare `except:` swallowed *everything*,
        # including KeyboardInterrupt; catch only request-related failures.
        print('爬取失敗')
-
-
但是這時候我們又發現了一個問題,那就是登錄的時候往往需要一個圖形驗證碼,使用代碼可以解決這個問題,但是我們不妨轉換思維:
我們的目的是爬取影評,而不是寫代碼登錄豆瓣。豆瓣判斷用戶是否登錄,是看請求頭中有沒有攜帶用戶信息的token,所以我們
不妨自己在豆瓣網頁上登錄一下,然后復制cookie中的token信息,放在我們的請求頭之中,這樣我們就可以直接爬取我們需要的信息了
-
首先登錄好后來到下面這個頁面找到我們的cookie
-
-
然后放到我們的代碼之中就可以了
-
-
4.接下來我們就可以正式爬蟲了,我們的思路如下:
-
獲取json格式的電影信息,
-
循環遍歷每個電影並制作其好,中,差三個評價的鏈接同時爬取影評。
-
放入文件
-
這里里面還有一個需要注意的問題,當我們的快速的訪問過多網頁的時候,會被豆瓣懷疑是機器人,然后讓你輸入驗證碼自證清白,
為了防止爬蟲因為這個終止,我們在每次訪問鏈接之前先停頓兩秒鍾,這會讓我們表現得更像人。
-
完整代碼如下:
# -*-coding:utf-8-*-
# @Time :2021/11/20 13:58
# @Author:shuaichao
# @File :.py
# @Software: PyCharm
import urllib.request
from bs4 import BeautifulSoup # 網頁解析,獲悉數據.231
import urllib.request, urllib.error # 制定URL,獲取網頁數據
import time
import os
import requests
from lxml import etree
import json
from urllib.request import Request
from urllib.request import urlopen
def askUrl(url):
    """Fetch *url* as a logged-in user and return the UTF-8 response body.

    The hard-coded Cookie carries a logged-in Douban session token, which is
    needed because Douban blocks anonymous comment browsing after a few pages.
    Returns "" on failure and prints the HTTP code / reason instead of raising.
    """
    headers = {
        # NOTE(review): this session cookie expires — refresh it from the
        # browser's dev tools when requests start failing.
        "Cookie": 'bid=ySWyT3eWKHI; ll="118088"; __utma=30149280.292149151.1637469049.1637469049.1637469049.1; __utmc=30149280; __utmz=30149280.1637469049.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1637469049; ap_v=0,6.0; __utma=223695111.1326316524.1637469080.1637469080.1637469080.1; __utmb=223695111.0.10.1637469080; __utmc=223695111; __utmz=223695111.1637469080.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1637469080%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _vwo_uuid_v2=D84C2319507104E7EA8DA14C2D366B708|08f1b95ebe80ed5b6c33ac030c3151e7; dbcl2="250389712:+jECS9wlK5g"; ck=ieh6; _pk_id.100001.4cf6=13045fc7b4b26386.1637469080.1.1637469126.1637469080.; push_noty_num=0; push_doumail_num=0',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)  # bug fix: original printed e.reasen (typo -> AttributeError)
    return html
# Download a page and return its parse tree
def get_info(baseurl):
    """Return the BeautifulSoup tree for *baseurl*."""
    raw = askUrl(baseurl)
    soup = BeautifulSoup(raw, "html.parser")
    return soup
# Pick out every element with the given class attribute, plus a string dump
def transport(bs, info):
    """Return (tag_list, str(tag_list)) for nodes whose class equals *info*."""
    found = bs.find_all(class_=info)
    dump = str(found)
    return found, dump
def getImg(url, imgName):
    """Download the image at *url* and save it as ./imgs/<imgName>.jpg.

    Uses the logged-in cookie so image URLs behind the login wall resolve.
    Download failures are reported but never raised to the caller.
    """
    headers = {
        "Cookie": 'bid=ySWyT3eWKHI; ll="118088"; __utma=30149280.292149151.1637469049.1637469049.1637469049.1; __utmc=30149280; __utmz=30149280.1637469049.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1637469049; ap_v=0,6.0; __utma=223695111.1326316524.1637469080.1637469080.1637469080.1; __utmb=223695111.0.10.1637469080; __utmc=223695111; __utmz=223695111.1637469080.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1637469080%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _vwo_uuid_v2=D84C2319507104E7EA8DA14C2D366B708|08f1b95ebe80ed5b6c33ac030c3151e7; dbcl2="250389712:+jECS9wlK5g"; ck=ieh6; _pk_id.100001.4cf6=13045fc7b4b26386.1637469080.1.1637469126.1637469080.; push_noty_num=0; push_doumail_num=0',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    try:
        req_timeout = 5
        req = Request(url=url, headers=headers)
        f = urlopen(req, None, req_timeout)
        pic = f.read()
        # Robustness: make sure the target folder exists before writing.
        os.makedirs('./imgs', exist_ok=True)
        imgPath = './imgs/%s.jpg' % (imgName)
        # `with` guarantees the file handle is closed even if the write fails.
        with open(imgPath, 'wb') as fp:
            fp.write(pic)
    except (urllib.error.URLError, OSError):
        # bug fix: the original caught Request.exceptions.ConnectionError,
        # which does not exist (urllib's Request has no .exceptions attribute)
        # and would itself raise AttributeError the moment a download failed.
        print(u'鏈接失敗')
'''
TODO: fetch Douban movie IDs, then scrape the high / middle / low rated
short comments for each movie and append them to per-sentiment text files.
'''


def _scrape_comments(list_url, headers, label, pages=10):
    """Collect short comments from a paginated Douban comment listing.

    list_url: comment-listing URL without the &start= paging parameter.
    label:    progress marker printed once per page (e.g. "開始好評").
    pages:    number of 20-comment pages to fetch.
    Returns the list of comment text strings.
    """
    collected = []
    for page in range(pages):
        # Pause before every request so we look less like a bot and avoid
        # Douban's captcha wall.
        time.sleep(2)
        page_url = list_url + "&start=" + str(page * 20)
        body = requests.get(url=page_url, headers=headers).text
        # etree.HTML builds an XPath-queryable tree and repairs broken markup.
        tree = etree.HTML(body)
        print(label)
        for node in tree.xpath('//div[@class="comment"]'):
            texts = node.xpath('.//span[@class="short"]/text()')
            # Guard against comment divs without a short-text span; the
            # original indexed texts[0] unconditionally and could IndexError.
            if texts:
                print(texts[0])
                collected.append(texts[0])
        print(len(collected))
    return collected


def _append_comments(path, comments, tag):
    """Append each comment to *path* as one "<comment> <tag>" line."""
    with open(path, 'a+', encoding='utf-8') as f:
        for text in comments:
            f.write('%s %s\n' % (text, tag))


def _flush_all(comment_high, comment_middle, comment_low):
    """Write the three sentiment buckets to their respective files."""
    _append_comments('./douban/comments_high.txt', comment_high, 'high')
    _append_comments('./douban/comments_middle.txt', comment_middle, 'middle')
    _append_comments('./douban/comments_low.txt', comment_low, 'low')


if __name__ == '__main__':
    print("開始")
    headers = {
        # Logged-in session cookie; refresh from the browser when it expires.
        "Cookie": 'bid=ySWyT3eWKHI; ll="118088"; __utma=30149280.292149151.1637469049.1637469049.1637469049.1; __utmc=30149280; __utmz=30149280.1637469049.1.1.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.1.10.1637469049; ap_v=0,6.0; __utma=223695111.1326316524.1637469080.1637469080.1637469080.1; __utmb=223695111.0.10.1637469080; __utmc=223695111; __utmz=223695111.1637469080.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1637469080%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _vwo_uuid_v2=D84C2319507104E7EA8DA14C2D366B708|08f1b95ebe80ed5b6c33ac030c3151e7; dbcl2="250389712:+jECS9wlK5g"; ck=ieh6; _pk_id.100001.4cf6=13045fc7b4b26386.1637469080.1.1637469126.1637469080.; push_noty_num=0; push_doumail_num=0',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    # Candidate ID feeds (the tag parameter is URL-encoded Chinese):
    # hot movies
    url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=1300&page_start=0'
    # domestic dramas
    url_guochan = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%9B%BD%E4%BA%A7%E5%89%A7&page_limit=150&page_start=0'
    # top-rated on Douban
    url_douban = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&page_limit=300&page_start=0'
    # US shows
    url_meiju = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%BE%8E%E5%89%A7&page_limit=300&page_start=0'
    res = get_info(url_meiju)
    response_data = json.loads(res.text)

    # Create the output folder once, up front (the original re-checked it
    # inside the per-movie loop).
    os.makedirs('./douban', exist_ok=True)

    comment_high = []
    comment_middle = []
    comment_low = []
    try:
        for index, subject in enumerate(response_data['subjects']):
            # Reset the buckets so files are written one movie at a time.
            comment_high = []
            comment_middle = []
            comment_low = []
            print(index)
            if index % 2 == 0:
                time.sleep(5)  # extra pause every second movie
            movie_id = subject['id']  # renamed: `id` shadowed the builtin
            url_tmpl = ("https://movie.douban.com/subject/%s/comments?"
                        "percent_type=%s&limit=20&status=P&sort=new_score")
            highUrl = url_tmpl % (movie_id, 'h')
            middleUrl = url_tmpl % (movie_id, 'm')
            lowUrl = url_tmpl % (movie_id, 'l')
            print(highUrl)
            # The three sentiment scrapes were three near-identical copies of
            # the same loop; decomposed into one helper.
            comment_high = _scrape_comments(highUrl, headers, "開始好評")
            comment_middle = _scrape_comments(middleUrl, headers, "開始中評")
            comment_low = _scrape_comments(lowUrl, headers, "開始差評")
            print("開始寫入文件")
            _flush_all(comment_high, comment_middle, comment_low)
    except Exception:
        # Bug fix: the original bare `except:` swallowed everything, even
        # KeyboardInterrupt, and duplicated all the file-writing code.
        # Flush whatever was collected for the current movie so a mid-run
        # captcha/ban does not lose data.
        _flush_all(comment_high, comment_middle, comment_low)
經過長時間的爬取,我們獲得了類似這種格式的信息: