When a Python multiprocessing program finishes its work, it can randomly fail to exit on its own. The more worker processes there are and the more data is processed, the more likely this exit hang becomes. The script below works around the problem.
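For context, the conventional shutdown pattern that shows the hang looks roughly like the minimal sketch below (illustrative only, not part of the original script; the worker function is hypothetical): all tasks are submitted, the pool is closed, and the parent blocks on join(), which is the call that occasionally never returns.

# Minimal sketch of the pattern that can hang (hypothetical worker function).
from multiprocessing import Pool

def work(x):
    return x * x

if __name__ == '__main__':
    p = Pool(16)
    results = [p.apply_async(work, args=(i,)) for i in range(100000)]
    p.close()
    p.join()  # this blocking call is where the intermittent hang shows up

The full workaround script follows.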
import argparse
import requests
import getpass
from multiprocessing import Pool
import datetime
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
def get_parameter():
    parser = argparse.ArgumentParser(description='Batch-request URLs with the requests library (v3)')
    parser.add_argument('-f', dest='inputFile', type=str, default='', help='file containing the URLs to request')
    parser.add_argument('-o', dest='outputFile', type=str, default='result.txt', help='file to save the results to')
    parser.add_argument('-p', dest='proxyServer', type=str, default='', help='proxy server; no proxy is used by default')
    parser.add_argument('-n', dest='processNum', type=int, default=1, help='number of worker processes, default 1')
    args = parser.parse_args()
    inputFile = args.inputFile
    outputFile = args.outputFile
    proxyServer = args.proxyServer
    processNum = args.processNum
    return inputFile, outputFile, proxyServer, processNum
def set_proxy(urlProxy):
    # Build the proxies dict for requests; prompt for credentials only when a proxy is given.
    if not urlProxy:
        proxies = {}
    else:
        username = input('username:')
        password = getpass.getpass('password:')
        http_proxy = 'http://' + str(username) + ':' + str(password) + '@' + str(urlProxy)
        https_proxy = 'https://' + str(username) + ':' + str(password) + '@' + str(urlProxy)
        proxies = {
            'http': http_proxy,
            'https': https_proxy
        }
    return proxies
def get_url(urlFile):
    with open(urlFile, 'r') as f:
        allUrl = f.readlines()
    return allUrl
def http_request(url, proxy=None):
    headers = {
        'User-Agent': 'curl/3.03',
        'Connection': 'close'  # no keep-alive: tear the connection down after each request
    }
    try:
        r = requests.get(url, headers=headers, proxies=proxy, timeout=15, verify=False)
        urlresult = url + '\t' + str(r.status_code)
    except Exception as e:
        urlresult = url + '\t' + str(e)
    print(urlresult)
    return urlresult
def main():
    start_time = datetime.datetime.now()
    inputFile, outputFile, proxyServer, processNum = get_parameter()
    allUrl = get_url(inputFile)
    proxies = set_proxy(proxyServer)
    p = Pool(processNum)
    print('Total number of URLs: {}'.format(len(allUrl)))
    def writer_log(urlresult):
        # Callback run in the parent process: append each finished result to the output file.
        with open(outputFile, 'a+') as wf:
            wf.write(urlresult + '\n')

    for i in allUrl:
        url = i.split()[-1]
        result = p.apply_async(http_request, args=(url, proxies), callback=writer_log)
    p.close()
    # Instead of blocking on p.join() (the call that occasionally hangs), poll the last
    # submitted task; once it has been ready for several rounds, assume the whole batch
    # is done and force the pool to shut down.
    count = 0
    while True:
        try:
            time.sleep(60)
            if result.ready():
                count += 1
                time.sleep(180)
                result.get()
            if count > 4:
                break
        except Exception as e:
            print('Worker exception: {}'.format(str(e)))
    p.terminate()
    p.join()

    end_time = datetime.datetime.now()
    print('Start time: {}'.format(start_time))
    print('End time: {}'.format(end_time))
    print('Total elapsed: {}'.format(end_time - start_time))
    print('Results saved to: {}'.format(outputFile))

if __name__ == '__main__':
    main()
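A typical invocation, assuming the script above is saved as batch_request.py (the filename is hypothetical) and urls.txt lists one target per line, with the URL as the last field:

python batch_request.py -f urls.txt -o result.txt -n 10

Forcing terminate() after the last submitted task has been ready for several polling rounds trades a clean shutdown for a guaranteed one: any worker process that is still wedged at that point is killed rather than waited on, which is what lets the script exit reliably.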
