Demo for fetching a response:
import aiohttp
import asyncio
import async_timeout

async def fetch(session, url):
    # set a 10-second timeout on the request
    with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()

async def main():
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, 'http://python.org')
        print(html)

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
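For comparison, aiohttp 3.x ships a built-in ClientTimeout, so the async_timeout dependency is optional; a minimal sketch of the same fetch, assuming Python 3.7+ for asyncio.run:

import asyncio
import aiohttp

async def main():
    # total=10 caps the whole request at 10 seconds (aiohttp's built-in timeout)
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get('http://python.org') as response:
            print(await response.text())

asyncio.run(main())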
Demo for scraping douban:
import aiohttp
import asyncio
import async_timeout
from bs4 import BeautifulSoup

url = 'https://movie.douban.com/top250'
datalist = {}

# fetch the response
async def fetch(session, url):
    # set a 10-second timeout on the request
    with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()

# parse the page
def parser(html):
    global datalist
    title = []
    rating_num = []
    range_num = []
    data = {}
    # parse the HTML with bs4
    soup = BeautifulSoup(html, "html.parser")
    for li in soup.find("ol", attrs={'class': 'grid_view'}).find_all("li"):
        title.append(li.find("span", class_="title").text)
        rating_num.append(li.find("div", class_='star').find("span", class_='rating_num').text)
        range_num.append(li.find("div", class_='pic').find("em").text)
    data['title'] = title
    data['rating_num'] = rating_num
    data['range_num'] = range_num
    datalist = data

# downloader
async def download(url):
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url)
        parser(html)

loop = asyncio.get_event_loop()
loop.run_until_complete(download(url))
# print(datalist)
# print(len(datalist['rating_num']))
for index in range(len(datalist['title'])):
    print("{0}\t\t{1}\t\t{2}\t\t\n".format(datalist['title'][index],
                                           datalist['rating_num'][index],
                                           datalist['range_num'][index]))
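As a side note, the global datalist works, but having parser return the dict makes the data flow easier to follow; a sketch of the same logic without the global (the restructuring is mine, not from the original post):

def parser(html):
    # same bs4 extraction, but return the dict instead of mutating a global
    soup = BeautifulSoup(html, "html.parser")
    data = {'title': [], 'rating_num': [], 'range_num': []}
    for li in soup.find("ol", attrs={'class': 'grid_view'}).find_all("li"):
        data['title'].append(li.find("span", class_="title").text)
        data['rating_num'].append(li.find("div", class_='star').find("span", class_='rating_num').text)
        data['range_num'].append(li.find("div", class_='pic').find("em").text)
    return data

async def download(url):
    async with aiohttp.ClientSession() as session:
        return parser(await fetch(session, url))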
Sample output:
A short aiohttp recap:
1. Didn't actually learn much new here.
2. Still don't know how to use BeautifulSoup well.
Trying to crawl multiple similar pages:
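The code for this step isn't shown above, so here is a plausible sketch: douban's Top250 paginates 25 entries per page via the start query parameter, and asyncio.gather lets all ten requests run concurrently (BASE and crawl are my names, not the post's):

import asyncio
import aiohttp
import async_timeout

BASE = 'https://movie.douban.com/top250?start={}'

async def fetch(session, url):
    # same fetch as above, with a 10-second timeout
    with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()

async def crawl():
    async with aiohttp.ClientSession() as session:
        # ten pages of 25 entries: start=0, 25, ..., 225
        tasks = [fetch(session, BASE.format(page * 25)) for page in range(10)]
        return await asyncio.gather(*tasks)

pages = asyncio.get_event_loop().run_until_complete(crawl())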
Let's see how long it takes:
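One simple way to measure this is to wrap the event-loop call with time.time(), reusing the crawl coroutine sketched above; a sketch, not the post's exact measurement code:

import time

start = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(crawl())
print('elapsed: {:.2f}s'.format(time.time() - start))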
Crawling all 250 records with aiohttp took 2.3 seconds; I wonder how long it would take with Scrapy.