获取responseDemo:
import asyncio

import aiohttp
import async_timeout


async def fetch(session, url):
    """Fetch *url* with the given aiohttp session and return the body text.

    A 10-second timeout guards against a hung connection.
    """
    # async_timeout >= 4.0 only supports the ``async with`` form; the
    # original synchronous ``with`` raises a RuntimeError there.
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()


async def main():
    """Open a client session, fetch python.org, and print the HTML."""
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, 'http://python.org')
        print(html)


# asyncio.run() (Python 3.7+) replaces the deprecated
# get_event_loop()/run_until_complete() driver pattern.
asyncio.run(main())
抓取豆瓣Demo:
import asyncio

import aiohttp
import async_timeout
from bs4 import BeautifulSoup

url = 'https://movie.douban.com/top250'


async def fetch(session, url):
    """Download *url* via *session* and return the response body as text.

    A 10-second timeout guards against a hung connection.
    """
    # async_timeout >= 4.0 only supports the ``async with`` form; the
    # original synchronous ``with`` raises a RuntimeError there.
    async with async_timeout.timeout(10):
        async with session.get(url) as response:
            return await response.text()


def parser(html):
    """Parse the Douban Top-250 list page.

    Returns a dict of three parallel lists:
    ``{'title': [...], 'rating_num': [...], 'range_num': [...]}``.
    Returning the result (instead of assigning a ``global datalist``, as
    before) keeps the function side-effect free and avoids a NameError
    downstream when parsing fails.
    """
    title = []
    rating_num = []
    range_num = []
    soup = BeautifulSoup(html, "html.parser")
    for li in soup.find("ol", attrs={'class': 'grid_view'}).find_all("li"):
        title.append(li.find("span", class_="title").text)
        rating_num.append(li.find("div", class_='star').find(
            "span", class_='rating_num').text)
        range_num.append(li.find("div", class_='pic').find("em").text)
    return {'title': title, 'rating_num': rating_num, 'range_num': range_num}


async def download(url):
    """Fetch *url* and return the parsed movie data dict."""
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url)
        return parser(html)


if __name__ == '__main__':
    # asyncio.run() (Python 3.7+) replaces the deprecated
    # get_event_loop()/run_until_complete() driver pattern.
    datalist = asyncio.run(download(url))
    # zip over the parallel lists instead of indexing with range(len(...)).
    for movie_title, rating, rank in zip(
            datalist['title'], datalist['rating_num'], datalist['range_num']):
        print("{0}\t\t{1}\t\t{2}\t\t\n".format(movie_title, rating, rank))
运行效果:
aiohttp小总结:
1.并没有学到多少新知识。
2.不会使用beautifulsoup
尝试爬取多个相同网页:
看看耗时多少:
爬取250条记录使用aiohttp耗时2.3秒,不知道使用scrapy会耗时多少