aiohttp You Can Actually Understand (Simple Version)


asyncio enables concurrent I/O on a single thread. Used only on the client side, it does not show much of its power. But use asyncio on the server side, e.g. in a web server, and since every HTTP connection is an I/O operation, a single thread plus coroutines can support high concurrency for many users.
asyncio implements TCP, UDP, SSL and other protocols; aiohttp is an HTTP framework built on top of asyncio.
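
As a quick taste of that server side (separate from the benchmarks below), here is a minimal aiohttp web-server sketch; the route, handler name, and port are made up for illustration:

from aiohttp import web

async def handle(request):
    # While one handler awaits I/O, the single-threaded event loop
    # is free to serve other connections concurrently.
    return web.Response(text="hello")

app = web.Application()
app.add_routes([web.get('/', handle)])

if __name__ == '__main__':
    web.run_app(app, port=8080)  # runs the event loop until interrupted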

Coroutine-based asynchronous requests are known for their low time cost and high hardware utilization. Browsing various forums, I found that for crawling and other high-frequency network requests, coroutines beat both multi-process and multi-threaded approaches on elapsed time. This article compares plain requests against aiohttp + asyncio to see how big the gap between using coroutines and not using them really is.

The target used for the tests in this article is the comment section of Liao Xuefeng's Python 3 tutorial, which currently has 2318 pages.

1. Fetching a single URL with requests

import requests, time

start = time.time()
r = requests.get(url="https://www.liaoxuefeng.com/discuss/001409195742008d822b26cf3de46aea14f2b7378a1ba91000?page=1")
print(len(r.text))          # Result: 40007
print(time.time() - start)  # Elapsed: ~0.3s

2. Fetching a single URL with aiohttp + asyncio

import aiohttp, asyncio

async def fn():
    async with aiohttp.request(
            'get',
            url='https://www.liaoxuefeng.com/discuss/001409195742008d822b26cf3de46aea14f2b7378a1ba91000?page=1') as resp:
        text = await resp.text()
        print(len(text))

loop = asyncio.get_event_loop()
loop.run_until_complete(fn())  # Result: 40007, elapsed ~0.3s

It seems that for a single request, blocking vs. non-blocking makes no practical difference: either way the final result has to wait on the network, so as far as the overall task is concerned, both are effectively blocking models. And the async version takes noticeably more code.
3. Fetching 50 URLs with requests

import requests

a = [len(requests.get(url="https://www.liaoxuefeng.com/discuss/001409195742008d822b26cf3de46aea14f2b7378a1ba91000?page={}".format(i)).text) for i in range(1, 51)]
print(a)
# Result: [40007, 40444, 40367, 40820, 40534, 40505, 40735, 40454, 40768, 40636, 40600, 40888, 41277, 41390, 41222, 40899, 40853, 40616, 40654, 40870, 41249, 40840, 40782, 41326, 41136, 40511, 40504, 40609, 41038, 41054, 40486, 40556, 41083, 40975, 40861, 40877, 40166, 40899, 40598, 40920, 40902, 40994, 40735, 40714, 41064, 40719, 40991, 40748, 40652, 40799]
# Elapsed: 5.3s

4. Fetching 50 URLs with aiohttp + asyncio

import aiohttp, asyncio

async def fn(num):
    async with aiohttp.request(
            'get',
            url='https://www.liaoxuefeng.com/discuss/001409195742008d822b26cf3de46aea14f2b7378a1ba91000?page={}'.format(num)) as resp:
        text = await resp.text()
        result.append(len(text))

result = []
loop = asyncio.get_event_loop()
tasks = [asyncio.ensure_future(fn(i)) for i in range(1, 51)]
loop.run_until_complete(asyncio.wait(tasks))
print(result)
# Result: [40636, 41326, 40853, 41277, 41249, 40735, 40616, 40454, 40888, 40899, 40007, 40768, 40486, 40870, 40820, 40444, 40367, 41136, 40609, 40975, 40504, 40166, 40920, 40598, 40556, 40652, 41083, 40735, 40799, 40899, 40902, 40748, 41064, 40505, 40654, 40511, 40600, 40534, 41390, 40782, 41222, 40840, 41038, 40714, 41054, 40877, 40719, 40991, 40861, 40994]
# Elapsed: 3.4s


Interestingly, the list the coroutine version returns is in a completely different order from the requests version, and if you run it a few more times you'll see the order differs on every run: the requests are asynchronous, so there is no telling which one finishes first. requests, by contrast, returns the list in the same order every time. Also, the slower the server responds, the more pronounced the coroutines' advantage becomes: they save most of the network-wait time, while requests just blocks for longer.
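
If you do need the results back in request order, one option (a minimal sketch of mine, not from the original posts) is to have each coroutine return its value and collect everything with asyncio.gather, which returns results in the order the coroutines were passed in, no matter which finishes first:

import aiohttp, asyncio

async def fetch_len(num):
    url = 'https://www.liaoxuefeng.com/discuss/001409195742008d822b26cf3de46aea14f2b7378a1ba91000?page={}'.format(num)
    async with aiohttp.request('get', url) as resp:
        text = await resp.text()
        return len(text)

loop = asyncio.get_event_loop()
# gather preserves submission order in its result list
lengths = loop.run_until_complete(asyncio.gather(*(fetch_len(i) for i in range(1, 51))))
print(lengths)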
5. Sending 2318 requests

Since there are 2318 pages in total, we might as well crawl them all. The old way, with requests:

import requests

a = [len(requests.get(url="https://www.liaoxuefeng.com/discuss/001409195742008d822b26cf3de46aea14f2b7378a1ba91000?page={}".format(i)).text) for i in range(1, 2319)]
print(a)
# Elapsed: 497s

aiohttp + asyncio:
Firing all 2318 requests at the network card at once would be too heavy a load (for reference, a shared aiohttp ClientSession also caps concurrent connections at 100 by default). So, given what the hardware can take, we maintain a "coroutine pool" instead: asyncio.Semaphore bounds how many requests are in flight at any moment, and whenever one coroutine finishes, another takes its slot:

import aiohttp, asyncio

async def fn(num, sem):
    async with sem:
        async with aiohttp.request(
                'get',
                url="https://www.liaoxuefeng.com/discuss/001409195742008d822b26cf3de46aea14f2b7378a1ba91000?page={}".format(num)) as resp:
            text = await resp.text()
            result.append(len(text))

loop = asyncio.get_event_loop()
result = []
sem = asyncio.Semaphore(100)  # keep at most 100 requests in flight
tasks = [asyncio.ensure_future(fn(i, sem)) for i in range(1, 2319)]
loop.run_until_complete(asyncio.wait(tasks))
print(result)
# Elapsed: ~200s (estimated)


During the async tests, the concurrent requests apparently put too much pressure on the server: Liao Xuefeng's site started returning 504 Gateway Timeout errors, and when I tested again the next day, the server went down again halfway through. It can't have been tuned much, to collapse under this little concurrent traffic.

 

References:
https://blog.csdn.net/getcomputerstyle/article/details/78438246
https://blog.csdn.net/getcomputerstyle/article/details/71515331

 

Everything above was pasted together from the reference posts, and the code in them had a problem: they used aiohttp.get(...), which no longer exists in aiohttp. The fix, applied throughout this post, is aiohttp.request('get', ...):

async def fn(num):
    async with aiohttp.request(
            'get',
            url='http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/sjzbjggg/index_%s.html' % num) as resp:
        page = await resp.text()
        ...  # handle the page as before

Below are two complete working versions of my own crawler.

 

First approach

import aiohttp, asyncio
from lxml import etree
import requests
import time
import os
import hashlib
from apscheduler.schedulers.blocking import BlockingScheduler  # scheduled jobs

headers = {"User-Agent": "Mozilla/5.0"}
pat = os.path.dirname(os.path.abspath(__file__))
ll = ['技術研發']  # keyword filter: only save announcements whose title matches

async def main(pool):  # entry point
    sem = asyncio.Semaphore(pool)
    async with aiohttp.ClientSession() as session:  # share one session across all requests
        tasks = []
        for i in range(1, 70):
            task = asyncio.ensure_future(control_sem(
                sem, 'http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/sjzbjggg/index_%s.html' % i, session))
            tasks.append(task)
        await asyncio.wait(tasks)

async def control_sem(sem, url, session):  # bound concurrency with the semaphore
    async with sem:
        await fetch(url, session)

async def fetch(url, session):  # issue the async request
    async with session.get(url) as resp:
        page = await resp.text()
        tree = etree.HTML(page)
        ul_list = tree.xpath('//ul[@class="xinxi_ul"]/li')
        for ul in ul_list:
            name = ul.xpath('./a/text()')[0].strip()
            print(name)
            for ii in ll:
                if ii in name:
                    new_url = 'http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/sjzbjggg/' + \
                              ul.xpath('./a/@href')[0].split('/')[-1]
                    datatime = ul.xpath('./span/text()')[0].strip()
                    pag = requests.get(url=new_url, headers=headers).text
                    source = name + datatime
                    source_id = hashlib.md5(source.encode()).hexdigest()  # unique id, used for dedup
                    filename = os.path.join(pat, 'a', '%s.html') % source_id
                    with open(filename, 'w') as f:
                        f.write(pag)
                    print(name, datatime, new_url, source)

def start():
    s = time.time()
    # loop = asyncio.get_event_loop()  # swap in the two lines below when running under the scheduler on Windows
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(main(pool=5))
    e = time.time() - s
    print('>>>>>>>>>>>>>>>>>>', e)

if __name__ == '__main__':
    print('Waiting......')
    scheduler = BlockingScheduler()
    scheduler.add_job(start, 'cron', hour=16, minute=43)  # or: 'interval', seconds=40
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        pass
    except Exception as e:
        print(e)
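
A side note on the md5 "dedup" above: as written, a repeated item simply overwrites the same file, and its detail page is downloaded again on every run. If re-runs should actually skip items already on disk, a small existence check before the download is enough. A minimal sketch with a hypothetical helper name (save_once), reusing the same naming scheme:

import os, hashlib, requests

def save_once(base_dir, name, datatime, url, headers):
    # Unique id from title + date, same scheme as the crawler above.
    source_id = hashlib.md5((name + datatime).encode()).hexdigest()
    filename = os.path.join(base_dir, 'a', '%s.html' % source_id)
    if os.path.exists(filename):
        return False  # already saved on an earlier run; skip the request
    page = requests.get(url, headers=headers).text
    with open(filename, 'w') as f:
        f.write(page)
    return True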




Second approach

import aiohttp, asyncio
from conf.setting import *  # expected to provide headers, ll and os, as in the first version
from lxml import etree
import requests
import hashlib
import time

async def fn(num, sem):
    async with sem:
        async with aiohttp.request(
                'get',
                url='http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/sjzbjggg/index_%s.html' % num) as resp:
            page = await resp.text()
            tree = etree.HTML(page)
            ul_list = tree.xpath('//ul[@class="xinxi_ul"]/li')
            for ul in ul_list:
                name = ul.xpath('./a/text()')[0].strip()
                print(name)
                for ii in ll:
                    if ii in name:
                        new_url = 'http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/sjzbjggg/' + \
                                  ul.xpath('./a/@href')[0].split('/')[-1]
                        datatime = ul.xpath('./span/text()')[0].strip()
                        pag = requests.get(url=new_url, headers=headers).text
                        source = name + datatime
                        source_id = hashlib.md5(source.encode()).hexdigest()  # unique id, used for dedup
                        pat = os.path.dirname(os.path.abspath(__file__))
                        filename = os.path.join(pat, 'a', '%s.html') % source_id
                        with open(filename, 'w') as f:
                            f.write(pag)
                        print(name, datatime, new_url, source)

loop = asyncio.get_event_loop()
s = time.time()
sem = asyncio.Semaphore(100)  # keep at most 100 requests in flight
tasks = [asyncio.ensure_future(fn(i, sem)) for i in range(1, 143)]
loop.run_until_complete(asyncio.wait(tasks))
e = time.time() - s

print('>>>>>>>>>>>>>>>>>>>>>', e)
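
One last note: both versions use the pre-Python-3.7 loop boilerplate (get_event_loop / new_event_loop plus run_until_complete). On Python 3.7+ the same run can be written with asyncio.run, which creates, runs, and closes the loop for you. A minimal sketch reusing fn from the second approach; the Semaphore is created inside the running coroutine, which newer Python versions require:

import asyncio

async def main():
    sem = asyncio.Semaphore(100)  # the same 100-slot "coroutine pool"
    tasks = [asyncio.ensure_future(fn(i, sem)) for i in range(1, 143)]
    await asyncio.wait(tasks)

asyncio.run(main())  # no manual loop management needed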

