工具
Python 3.5
requests、BeautifulSoup(bs4)
步驟:
1、根據url抓取豆瓣電影html,並解析
2、BeautifulSoup截取節點,寫入字典
3、將字典列表保存到txt文件
# -*- coding: utf-8 -*-
# Scrape the movies currently playing in Shenzhen from Douban and save the
# extracted records to a text file.
import json  # kept from the original script; currently unused
import os

import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Send a GET request to *url* and return the response body as text.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, or a non-success HTTP status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "no page available".
        return ""
    else:
        # Decode with the encoding detected from the body content.
        r.encoding = r.apparent_encoding
        return r.text


def getMovieInfo(mlist, html):
    """Parse *html* and append one dict per now-playing movie to *mlist*.

    Every ``<li class="list-item">`` whose ``data-category`` attribute equals
    ``'nowplaying'`` yields a dict holding the title, score, duration and
    actors read from the element's ``data-*`` attributes. A missing
    attribute is stored as an empty string instead of raising ``KeyError``.

    Args:
        mlist: List that is extended in place with the movie dicts.
        html: HTML text of the listing page; may be empty.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('li', attrs={'class': 'list-item'}):
        attrs = item.attrs
        # .get() so an item without data-category is skipped, not a crash.
        if attrs.get('data-category') != 'nowplaying':
            continue
        mlist.append({
            '電影名': attrs.get('data-title', ''),
            '評分': attrs.get('data-score', ''),
            '時長': attrs.get('data-duration', ''),
            '主演': attrs.get('data-actors', ''),
        })


def saveMovieInfo(mlist, path):
    """Write ``str(mlist)`` to the text file *path*, encoded as UTF-8.

    The parent directory of *path* is created when it does not exist, so a
    fresh machine does not fail with ``FileNotFoundError``.
    """
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    # The with-statement closes the file; no explicit close() is needed.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(str(mlist))


def main():
    """Fetch the listing page, extract the now-playing movies and save them."""
    mlist = []
    url = 'https://movie.douban.com/cinema/nowplaying/shenzhen/'
    path = 'D://pachong//movie.txt'
    html = getHTMLText(url)
    print(len(html))
    getMovieInfo(mlist, html)
    print()
    saveMovieInfo(mlist, path)


if __name__ == '__main__':
    main()