爬取電影天堂最新電影,地址https://www.dytt8.net/html/gndy/dyzz/list_23_1.html
import requests
from lxml import etree

BASE_DOMAIN = 'https://www.dytt8.net'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
# Seconds to wait for a response; without a timeout requests.get() can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Every labelled line of the detail text starts with this marker.
LABEL_MARKER = '◎'
# Label that opens a detail line -> key under which its value is stored.
# NOTE(review): labels are matched exactly as written here; confirm they match
# the characters (and spacing) the site actually serves.
FIELD_LABELS = {
    '◎年 代': 'year',
    '◎產 地': 'country',
    '◎類 別': 'category',
    '◎豆瓣評分': 'douban_rating',
    '◎片 長': 'duration',
    '◎導 演': 'director',
}
# The cast is special: it continues on the unlabelled lines that follow.
ACTORS_LABEL = '◎主 演'


def _fetch_html(url):
    """Download *url* and return it parsed as an lxml HTML tree."""
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # The body is decoded as GBK; bytes that cannot be decoded are dropped
    # instead of raising.
    text = response.content.decode(encoding='gbk', errors='ignore')
    return etree.HTML(text)


def _first(items, default=None):
    """Return the first element of *items*, or *default* when it is empty."""
    return items[0] if items else default


def _parse_infos(infos):
    """Turn the text lines of a detail page into a dict of movie fields.

    Only labels listed in FIELD_LABELS (plus the cast) are extracted; a field
    whose label never appears is simply absent from the result.
    """
    fields = {}
    actors = None  # the list being filled while inside the cast section
    for info in infos:
        if info.startswith(ACTORS_LABEL):
            actors = [info.replace(ACTORS_LABEL, "").strip()]
            # Stored immediately so the cast is kept even when no further
            # labelled line follows it.
            fields['actors'] = actors
            continue
        if info.startswith(LABEL_MARKER):
            # Any new label ends the cast section, recognised or not.
            actors = None
            for label, key in FIELD_LABELS.items():
                if info.startswith(label):
                    fields[key] = info.replace(label, "").strip()
                    break
            continue
        if actors is not None:
            name = info.strip()
            if name:  # skip whitespace-only text nodes
                actors.append(name)
    return fields


def get_detail_urls(url):
    """Return the absolute detail-page URLs linked from the list page *url*."""
    html = _fetch_html(url)
    hrefs = html.xpath('//table[@class="tbspan"]//a/@href')
    # A list (rather than a lazy map object) can be iterated more than once.
    return [BASE_DOMAIN + href for href in hrefs]


def get_detail_info(url):
    """Scrape one detail page and return its data as a dict.

    'title', 'cover', 'screenshot' and 'magnet' are None when the page lacks
    the corresponding element, so one odd page does not abort the crawl.
    """
    html = _fetch_html(url)
    movie = {}
    movie['title'] = _first(html.xpath('//div[@class="title_all"]//font/text()'))
    img = html.xpath("//div[@id='Zoom']//img/@src")
    movie['cover'] = img[0] if img else None
    movie['screenshot'] = img[1] if len(img) > 1 else None
    movie.update(_parse_infos(html.xpath('//div[@id="Zoom"]//p/text()')))
    movie['download'] = html.xpath("//div[@id='Zoom']//tbody//a/text()")
    movie['magnet'] = _first(html.xpath("//div[@id='Zoom']//a/@href"))
    return movie


def spider(first_page=1, last_page=1):
    """Crawl list pages first_page..last_page (inclusive), print and return the movies.

    The defaults crawl page 1 only.
    """
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    movies = []
    for page in range(first_page, last_page + 1):
        for detail_url in get_detail_urls(base_url.format(page)):
            # Extract the information from each detail page.
            movies.append(get_detail_info(detail_url))
    print(movies)
    return movies


if __name__ == '__main__':
    spider()
學習的視頻中代碼有幾處跟我的有不同,可以學習
一、
其中提取主演的代碼不同,如下
# Variant from the video: enumerate() yields each line together with its
# position, so the cast can be collected by looking ahead from that position.
for index,info in enumerate(infos):
    if info.startswith("◎年 代"):
        # parse_info is defined in the video's code and not shown here;
        # presumably it strips the label from the line — TODO confirm.
        info = parse_info(info,"◎年 代")
        movie['year'] = info
    # ... (the other fields are omitted here)
    elif info.startswith("◎主 演"):
        info = parse_info(info,"◎主 演")
        actors = [info]
        # Scan the lines after the label line; each one is another actor
        # until a line starting with "◎" is reached.
        for x in range(index+1,len(infos)):
            actor = infos[x].strip()
            if actor.startswith("◎"):
                break
            actors.append(actor)
        movie['actors'] = actors
採用的是 index 的方式:遇到「◎主 演」這一行後,從 index+1 開始逐行往後讀,每一行都是一位演員,直到遇到下一個以「◎」開頭的行為止.
enumerate() 函數用於將一個可遍歷的數據對象(如列表、元組或字符串)組合為一個索引序列,同時列出數據和數據下標,一般用在 for 循環當中。
二、
還有
# Prepend BASE_DOMAIN to every href in detail_urls. In Python 3, map() returns
# a lazy iterator rather than a list, so the result can be iterated only once.
detail_urls = map(lambda url:BASE_DOMAIN+url,detail_urls)
這段代碼沒怎麼用過,記錄下.
三、
# The {} placeholder is filled with the page number by str.format().
base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
# range(1, 10) yields the page numbers 1 through 9.
for i in range(1, 10):
    url = base_url.format(i)
以前寫的時候沒這麼寫過,都是直接弄成
url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_'+str(i)+'.html'這樣(i 是整數,必須先用 str() 轉成字符串才能拼接).
四、
在自己寫代碼時有個錯誤,也需要記錄下,在movie的字典賦值的時候,
如下
# Each local variable below is bound only if a line with the matching label
# appears in infos; a label missing from the page leaves its variable unbound.
for info in infos:
    # print(info)
    if info.startswith('◎年 代'):
        year = info.replace("◎年 代", "").strip()
    elif info.startswith('◎產 地'):
        country = info.replace("◎產 地", "").strip()
    elif info.startswith('◎類 別'):
        category = info.replace("◎類 別", "").strip()
    elif info.startswith('◎豆瓣評分'):
        douban_rating = info.replace("◎豆瓣評分", "").strip()
賦值的時候使用
# Reads all four locals unconditionally: if any of them was never assigned
# (its label was absent from the page), this raises UnboundLocalError.
movie = {
    'year': year,
    'country': country,
    'category': category,
    'douban_rating': douban_rating
}
會報錯,因為有些電影的詳情頁沒有「豆瓣評分」這一行,對應的分支不會執行,douban_rating 也就從未被賦值,所以構造 movie 字典時會出錯:
UnboundLocalError: local variable 'douban_rating' referenced before assignment
當然,個人覺得用正則可以更容易解決。
再記錄下用 xpath 爬取豆瓣正在上映電影的代碼
import requests
from lxml import etree

headers = {
    'Referer': 'https://movie.douban.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
url = "https://movie.douban.com/cinema/nowplaying/changsha/"

# Key in the resulting movie dict -> XPath (relative to an <li>) that yields it.
# Order matters: the lookups run, and the keys are stored, in this order.
FIELD_XPATHS = {
    'title': "@data-title",
    'score': "@data-score",
    'duration': "@data-duration",
    'director': "@data-director",
    'actor': "@data-actors",
    'img': ".//img/@src",
}

response = requests.get(url, headers=headers)
html = etree.HTML(response.text)

# One dict per <li> of the now-playing list; the first match of every lookup
# is taken, so a missing attribute raises IndexError.
movies = [
    {key: li.xpath(path)[0] for key, path in FIELD_XPATHS.items()}
    for li in html.xpath("//div[@id='nowplaying']//ul[@class='lists']/li")
]
print(movies)