# 工具:
#   Python 3.5
#   BeautifulSoup
# 步驟:
#   1、根據 url 抓取豆瓣電影 html,並解析
#   2、BeautifulSoup 截取節點,寫入字典
#   3、保存字典信息
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json
def getHTMLText(url):
    """Send a GET request to ``url`` and return the response body as text.

    The request uses a 30-second timeout. Before decoding, the response
    encoding is overridden with the encoding detected from the content
    (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or an empty string when the request
        fails (connection problem, timeout, invalid URL, or an HTTP error
        status reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are treated as "no content";
        # KeyboardInterrupt and programming errors still propagate.
        return ""
def getMovieInfo(mlist, html):
    """Parse ``html`` and append one dict per now-playing movie to ``mlist``.

    Every ``<li>`` element with class ``list-item`` whose ``data-category``
    attribute equals ``'nowplaying'`` contributes a dict holding the values
    of its ``data-title``, ``data-score``, ``data-duration`` and
    ``data-actors`` attributes.

    Args:
        mlist: List that is extended in place with the extracted dicts.
        html: HTML markup to parse; an empty string yields no entries.

    Returns:
        None. Results are delivered through ``mlist``.
    """
    soup = BeautifulSoup(html, 'html.parser')  # parse the markup as HTML
    for item in soup.find_all('li', attrs={'class': 'list-item'}):
        attrs = item.attrs
        # .get() keeps a list item without the attribute from raising
        # KeyError; such items are simply not "now playing".
        if attrs.get('data-category') != 'nowplaying':
            continue
        # Key order is kept as-is because the saved output depends on it.
        # A missing attribute is recorded as an empty string.
        mlist.append({
            '電影名': attrs.get('data-title', ''),
            '評分': attrs.get('data-score', ''),
            '時長': attrs.get('data-duration', ''),
            '主演': attrs.get('data-actors', ''),
        })
def saveMovieInfo(mlist, path):
    """Write the string form of ``mlist`` to a UTF-8 text file.

    The file at ``path`` is created or truncated, and ``str(mlist)`` is
    written as its entire content.

    Args:
        mlist: Object to save (here, the list of movie dicts).
        path: Destination file path.

    Raises:
        OSError: If the file cannot be opened for writing (for example,
            when the parent directory does not exist).
    """
    # The with-statement closes the file on exit, so no explicit close()
    # call is needed.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(str(mlist))
def main():
    """Fetch the now-playing page, extract the movies and save them.

    Downloads the page at the hard-coded URL, prints the length of the
    returned HTML, collects the movie entries, prints an empty line, and
    writes the collected list to the hard-coded output path.
    """
    page_url = 'https://movie.douban.com/cinema/nowplaying/shenzhen/'
    output_path = 'D://pachong//movie.txt'
    movies = []

    page = getHTMLText(page_url)
    print(len(page))  # size of the downloaded markup; 0 means the fetch failed

    getMovieInfo(movies, page)
    print()

    saveMovieInfo(movies, output_path)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
