主要用到 lxml 的 etree 解析網頁代碼,並用 xpath 獲取 HTML 標籤。
代碼如下:

#!/usr/bin/env python
# author:Simple-Sir
# time:2019/7/17 22:08
# Fetch the hottest comment for each movie on Douban's "now playing" page.
import requests
from lxml import etree

# Headers that make the request look like it comes from a browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Referer': 'https://movie.douban.com/',
}
# Listing page that is fetched and parsed first.
url = 'https://movie.douban.com/cinema/nowplaying/chengdu/'


def getUrlText(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    The module-level ``headers`` are sent with the request. A timeout is set
    so that a stalled connection raises instead of blocking forever.
    """
    respons = requests.get(url, headers=headers, timeout=10)
    return etree.HTML(respons.text)


def getWallUrl(url):
    """Return the movies listed on the page at *url*.

    Returns:
        A list of single-entry dicts ``{title: detail_href}``, one per ``<li>``
        directly under the first ``<ul class="lists">``. The list is empty when
        that ``<ul>`` is not present; items lacking an ``href`` or a
        ``data-title`` attribute are skipped.
    """
    page = getUrlText(url)
    movieLists = page.xpath('//ul[@class="lists"]')
    if not movieLists:
        return []

    liHrefs = []
    for li in movieLists[0].xpath('./li'):
        hrefs = li.xpath('.//@href')
        names = li.xpath('@data-title')
        if not hrefs or not names:
            # Not a usable movie entry; skip it rather than raise IndexError.
            continue
        liHrefs.append({names[0]: hrefs[0]})
    return liHrefs


def _pickHotComment(fullTexts, shortTexts):
    """Return the first full comment text, else the first short one, else None."""
    if fullTexts:
        return fullTexts[0]
    if shortTexts:
        return shortTexts[0]
    return None


def downPL(url):
    """Append the hottest comment of every listed movie to '豆瓣評論.txt'.

    For each movie returned by ``getWallUrl(url)`` the detail page is fetched
    and the text inside ``div#hot-comments`` is inspected: the expanded
    ("hide-item full") text is preferred, the "short" text is the fallback,
    and a "no comment" line is written when neither exists. Progress is
    printed per movie and a summary is printed at the end.
    """
    moveUrl = getWallUrl(url)
    n = 0
    for n, murl in enumerate(moveUrl, start=1):
        for d, detailUrl in murl.items():
            plHtml = getUrlText(detailUrl)
            plTextFull = plHtml.xpath('//div[@id="hot-comments"]//span[@class="hide-item full"]//text()')
            plTextShort = plHtml.xpath('//div[@id="hot-comments"]//span[@class="short"]//text()')
            # The original wrote plTextShort[0] even when only the full text
            # existed; choose the text that is actually available.
            comment = _pickHotComment(plTextFull, plTextShort)

            print('正在寫入《{}》的評論。'.format(d))
            with open('豆瓣評論.txt', 'a+', encoding='utf-8') as fp:
                if comment is None:
                    fp.write('{}、《{}》暫無評論!\n\n'.format(n, d))
                else:
                    fp.write('{}、《{}》的最熱評論是:\n{}\n\n'.format(n, d, comment))
    print('{}部電影的所有評論已全部寫入“豆瓣評論.txt”,請查看。'.format(n))


if __name__ == '__main__':
    downPL(url)
執行效果:
文件詳情: