When running a Python script that uses multiprocessing, the program will sometimes finish all of its work and then fail to exit on its own. The more worker processes you start and the more data you push through them, the more likely this hang on exit becomes. The script below is a workaround for that exit problem.
import argparse
import requests
import getpass
from multiprocessing import Pool
import datetime
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


def get_parameter():
    parser = argparse.ArgumentParser(description='Batch-request URLs with the requests library (v3)')
    parser.add_argument('-f', dest='inputFile', type=str, default='', help='file containing the URLs')
    parser.add_argument('-o', dest='outputFile', type=str, default='result.txt', help='file the results are written to')
    parser.add_argument('-p', dest='proxyServer', type=str, default='', help='proxy server; no proxy by default')
    parser.add_argument('-n', dest='processNum', type=int, default=1, help='number of worker processes, default 1')
    args = parser.parse_args()
    inputFile = args.inputFile
    outputFile = args.outputFile
    proxyServer = args.proxyServer
    processNum = args.processNum
    return inputFile, outputFile, proxyServer, processNum


def set_proxy(urlProxy):
    # Build a requests-style proxies dict; prompt for credentials when a proxy is given.
    if not urlProxy:
        proxies = {}
    else:
        username = input('username:')
        password = getpass.getpass('password:')
        http_proxy = 'http://' + str(username) + ':' + str(password) + '@' + str(urlProxy)
        https_proxy = 'https://' + str(username) + ':' + str(password) + '@' + str(urlProxy)
        proxies = {
            'http': http_proxy,
            'https': https_proxy
        }
    return proxies


def get_url(urlFile):
    # Read the URL list, one URL per line.
    with open(urlFile, 'r') as f:
        allUrl = f.readlines()
    return allUrl


def http_request(url, proxy=None):
    headers = {
        'User-Agent': 'curl/3.03',
        'Connection': 'close'  # do not keep the connection alive
    }
    try:
        r = requests.get(url, headers=headers, proxies=proxy, timeout=15, verify=False)
        urlresult = url + '\t' + str(r.status_code)
    except Exception as e:
        # Record the error instead of letting the worker die.
        urlresult = url + '\t' + str(e)
    print(urlresult)
    return urlresult


def main():
    start_time = datetime.datetime.now()
    inputFile, outputFile, proxyServer, processNum = get_parameter()
    allUrl = get_url(inputFile)
    proxies = set_proxy(proxyServer)
    p = Pool(processNum)
    print('Total URLs: {}'.format(len(allUrl)))

    def writer_log(urlresult):
        # Callback runs in the main process: append each finished result to the output file.
        with open(outputFile, 'a+') as wf:
            wf.write(urlresult + '\n')

    for i in allUrl:
        url = i.split()[-1]
        # result ends up holding the AsyncResult of the last submitted task.
        result = p.apply_async(http_request, args=(url, proxies), callback=writer_log)
    p.close()

    # Watchdog loop: poll the last task, and once it has been ready for several
    # rounds, stop waiting and force the pool to shut down instead of relying
    # on the pool to exit cleanly by itself.
    count = 0
    while True:
        try:
            time.sleep(60)
            if result.ready():
                count += 1
                time.sleep(180)
                result.get()
                if count > 4:
                    break
        except Exception as e:
            print('Worker exception: {}'.format(str(e)))
    p.terminate()
    p.join()

    end_time = datetime.datetime.now()
    print('Start time: {}'.format(start_time))
    print('End time: {}'.format(end_time))
    print('Elapsed: {}'.format(end_time - start_time))
    print('Results saved to: {}'.format(outputFile))


if __name__ == '__main__':
    main()
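For reference, an invocation might look like the line below; urls.txt, result.txt, and the script name batch_request.py are placeholders, and 10 is just an example process count:

python batch_request.py -f urls.txt -o result.txt -n 10

The forced p.terminate() after the watchdog loop is what actually cures the hang: once the last submitted task has reported ready for several polling rounds, the main process treats the work as done and kills any worker that is still stuck, then joins the pool and exits normally.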