小練習:爬取豆瓣電影熱度排行榜
原因:
從火狐瀏覽器中複製 header 內容時,複製出來的內容可能帶有省略號(內容被截斷),這一點需要注意(這類問題似乎都是 url 或者 header 中存在類似的錯誤所導致的)。
還會遇到這樣的警告:ssl-warnings InsecureRequestWarning。這是請求時關閉了證書驗證(verify=False)所觸發的警告,它似乎並不影響最終的結果,只是速度似乎會慢一點。
網上有許多解決方案,比如加入如下代碼:
# Silence the InsecureRequestWarning mentioned above; it must be two separate
# statements (import first, then the call), one per line.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
練習代碼:
#coding=utf-8
"""Small exercise: fetch Douban's "hot movies" list from its JSON search endpoint."""
import json
import math

import requests
import urllib3
from easygui import msgbox

# Number of entries requested per page (the page_limit query parameter).
PAGE_SIZE = 20
# Seconds to wait on a request before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10
# Search endpoint; only the paging parameters vary between requests.
SEARCH_URL = ('https://movie.douban.com/j/search_subjects'
              '?type=movie&tag=熱門&sort=recommend'
              '&page_limit={limit}&page_start={start}')


def getUrls(total):
    """Return the page URLs needed to cover ``total`` entries.

    Each URL asks for PAGE_SIZE entries, so ceil(total / PAGE_SIZE) URLs are
    produced. A zero or negative ``total`` yields an empty list.
    """
    # math.ceil rounds towards +inf; the previous int() truncation rounded
    # small negative totals up to one page.
    page_count = max(0, math.ceil(total / PAGE_SIZE))
    return [SEARCH_URL.format(limit=PAGE_SIZE, start=page * PAGE_SIZE)
            for page in range(page_count)]


def getContent(total):
    """Download and return at most ``total`` entries from the search endpoint.

    Args:
        total: Number of entries wanted; must be an int.

    Returns:
        A list of the dicts found under the ``subjects`` key of each response,
        truncated to ``total`` items (fewer if the server returned fewer).
        An empty list when ``total`` is zero or negative. ``None`` when
        ``total`` is not an int, after showing a warning dialog.
    """
    if not isinstance(total, int):
        msgbox(msg="查詢條目數應該為整數!!!", title="警告:")
        return None
    if total <= 0:
        return []

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0'}
    # The requests below skip certificate verification, which makes urllib3
    # emit InsecureRequestWarning on every call; silence it.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    content_list = []
    for url in getUrls(total):
        # headers must be passed by keyword: the second positional argument of
        # requests.get is ``params``, so the User-Agent was previously sent as
        # a query-string parameter instead of as a header.
        # NOTE(review): verify=False disables TLS certificate checking; it is
        # kept to preserve existing behavior, but consider removing it.
        response = requests.get(url, headers=headers, verify=False,
                                timeout=REQUEST_TIMEOUT)
        # The response body is a JSON string.
        content = response.content.decode()
        content_list.extend(json.loads(content)['subjects'])

    # Slicing already copes with a list shorter than ``total``.
    return content_list[:total]


if __name__ == "__main__":
    total = int(input("請輸入要查看的條目數:"))
    for i, item in enumerate(getContent(total), start=1):
        print(i, "、", item['title'], item['rate'], item['url'])