"""
@author  : Eric-chen
@contact : 809512722@qq.com
@time    : 2018/1/3 17:55
@desc    : Scrape http://movie.douban.com/top250/ to collect the Douban
           Top 250 movie titles and write them to the file movies.txt.
"""
import codecs

import requests
from bs4 import BeautifulSoup

try:
    from urllib.parse import urljoin
except ImportError:  # Python 2 fallback; the script otherwise runs on both
    from urlparse import urljoin

DOWNLOAD_URL = 'http://movie.douban.com/top250/'

# NOTE(review): Douban has been reported to reject the default
# python-requests User-Agent (HTTP 418), so a browser-like value is sent.
# Confirm against the live site if requests start failing.
REQUEST_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'
    ),
}


def download_page(url, timeout=10):
    """Fetch *url* and return the raw response body as bytes.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx status code.
    """
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    return response.content


def parse_html(html):
    """Extract the movie titles and the next-page URL from one listing page.

    Args:
        html: Markup of a listing page, as returned by ``download_page``.

    Returns:
        A ``(titles, next_url)`` tuple. ``titles`` is the list of title
        strings found on the page (empty when the expected list is
        missing). ``next_url`` is the absolute URL of the following page,
        or ``None`` when there is no "next" link.
    """
    soup = BeautifulSoup(html, "lxml")

    movie_name_list = []
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    if movie_list_soup is not None:
        for movie_li in movie_list_soup.find_all('li'):
            detail = movie_li.find('div', attrs={'class': 'hd'})
            if detail is None:
                continue  # not a movie entry
            title = detail.find('span', attrs={'class': 'title'})
            if title is None:
                continue
            movie_name_list.append(title.get_text())

    # ``find`` returns None when nothing matches, so check each step before
    # dereferencing it.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_link = next_span.find('a') if next_span is not None else None
    if next_link is not None and next_link.get('href'):
        # urljoin copes with query-only, relative and absolute hrefs alike.
        return movie_name_list, urljoin(DOWNLOAD_URL, next_link['href'])
    return movie_name_list, None


def main():
    """Walk every listing page and write one title per line to movies.txt."""
    url = DOWNLOAD_URL

    with codecs.open('movies.txt', 'wb', encoding='utf-8') as fp:
        while url:
            html = download_page(url)
            movies, url = parse_html(html)
            if movies:  # avoid emitting a stray blank line for an empty page
                fp.write(u'{movies}\n'.format(movies=u'\n'.join(movies)))


if __name__ == '__main__':
    main()
本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱 yoyou2525@163.com 刪除。