爬蟲小試之一(抓取豆瓣電影)


工具

  python3.5

  requests、BeautifulSoup(bs4)

步驟:

  1、根據url抓取豆瓣電影html,並解析

  2、BeautifulSoup截取節點,寫入字典

  3、保存字典信息

 

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json

def getHTMLText(url):
	"""Send a GET request to ``url`` and return the response body as text.

	The request uses a 30-second timeout. The body is decoded with the
	encoding that requests detects from the content (``apparent_encoding``)
	rather than the one announced in the HTTP headers.

	Args:
		url: Address of the page to download.

	Returns:
		The decoded page text, or an empty string when the request fails or
		the server answers with an HTTP error status.
	"""
	try:
		r = requests.get(url, timeout=30)
		r.raise_for_status()
	except requests.RequestException:
		# Best-effort fetch: network and HTTP errors yield an empty page.
		# Only requests' own errors are caught, so Ctrl-C and programming
		# mistakes are no longer silently swallowed.
		return ""
	r.encoding = r.apparent_encoding
	return r.text


def getMovieInfo(mlist, html):
	"""Collect the now-playing movies found in ``html`` into ``mlist``.

	Every ``<li class="list-item">`` element whose ``data-category``
	attribute equals ``'nowplaying'`` yields one dict holding the title,
	score, duration and actors read from the element's ``data-*`` attributes.

	Args:
		mlist: List that receives one dict per movie; it is extended in place.
		html: HTML source of the listing page.

	Returns:
		None. Results are delivered through ``mlist``.
	"""
	# Output key -> attribute on the <li> element that carries the value.
	fields = (
		('電影名', 'data-title'),
		('評分', 'data-score'),
		('時長', 'data-duration'),
		('主演', 'data-actors'),
	)
	soup = BeautifulSoup(html, 'html.parser')
	for item in soup.find_all('li', attrs={'class': 'list-item'}):
		# Tag.get() instead of indexing: an element that lacks an attribute
		# must not abort the whole run with a KeyError.
		if item.get('data-category') != 'nowplaying':
			continue
		# A missing attribute is recorded as an empty string.
		mlist.append({key: item.get(attr, '') for key, attr in fields})

def saveMovieInfo(mlist, path):
	"""Write the collected movie records to a UTF-8 text file.

	The file receives ``str(mlist)``, i.e. the Python literal representation
	of the list (not JSON). An existing file at ``path`` is overwritten.

	Args:
		mlist: List of movie dicts to store.
		path: Destination file path. Its parent directory is created when
			it does not exist yet.

	Raises:
		OSError: If the directory or the file cannot be created or written.
	"""
	import os

	# open() does not create missing directories, so make sure the target
	# folder exists. A bare file name has no parent directory to create.
	parent = os.path.dirname(path)
	if parent:
		os.makedirs(parent, exist_ok=True)
	# The with-statement closes the file; no explicit close() is needed.
	with open(path, 'w', encoding='utf-8') as f:
		f.write(str(mlist))


def main():
	"""Run the scrape: download the listing page, extract movies, save them.

	Prints the length of the downloaded HTML, then an empty line, and writes
	the extracted movie list to the hard-coded output path.
	"""
	source_url = 'https://movie.douban.com/cinema/nowplaying/shenzhen/'
	output_path = 'D://pachong//movie.txt'

	page = getHTMLText(source_url)
	print(len(page))

	# getMovieInfo fills the list in place.
	movies = []
	getMovieInfo(movies, page)
	print()
	saveMovieInfo(movies, output_path)


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
	main()

  


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM