Two ways to hide (modify) headers
1. Below is a piece of code (the first method) that hides the headers when calling Youdao Translate from a Python program.
2. Why hide anything at all? If one IP visits a site too frequently within a short period, the site's anti-scraping measures will block it and the crawler cannot continue its work, so we give the crawler a disguise and slip past undetected.
import urllib.request
import urllib.parse   # handles URL encoding of the POST data
import json

content = input("Enter the text to translate: ")
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=dict2.index'

head = {}   # build the header dict
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
# The value comes from the browser: press F12, open Network, find the POST request and copy its User-Agent.

data = {}
data['type'] = 'AUTO'
data['i'] = content          # 'i' carries the text to translate instead of a hard-coded value
data['doctype'] = 'json'
data['xmlVersion'] = '1.8'
data['keyfrom'] = 'fanyi.web'
data['ue'] = 'UTF-8'
data['action'] = 'FY_BY_CLICKBUTTON'
data['typoResult'] = 'true'
data = urllib.parse.urlencode(data).encode('utf-8')

req = urllib.request.Request(url, data, head)
response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')

#print(html)
target = json.loads(html)
print("Translation result: %s" % (target['translateResult'][0][0]['tgt']))
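If it is not obvious why the result is read from target['translateResult'][0][0]['tgt'], a quick way to see the nesting is to pretty-print the decoded JSON before indexing into it. A minimal sketch, assuming html already holds the decoded response text as in the code above (the exact structure depends on what Youdao returns for your request):

import json

# Sketch: pretty-print the response to inspect its structure before indexing into it.
parsed = json.loads(html)
print(json.dumps(parsed, ensure_ascii=False, indent=2))   # reveals nesting such as translateResult[0][0]['tgt']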
Run result, plus a check that the headers were set correctly:
>>>
Enter the text to translate: I
Translation result: 我
>>> req.headers   # check the request headers
{'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'}
The second way to hide the headers:
import urllib.request
import urllib.parse
import json

content = input("Enter the text to translate: ")
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=dict2.index'

'''
1. Replace the head dict below with the single add_header() call further down:
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
'''

data = {}
data['type'] = 'AUTO'
data['i'] = content
data['doctype'] = 'json'
data['xmlVersion'] = '1.8'
data['keyfrom'] = 'fanyi.web'
data['ue'] = 'UTF-8'
data['action'] = 'FY_BY_CLICKBUTTON'
data['typoResult'] = 'true'
data = urllib.parse.urlencode(data).encode('utf-8')

#req = urllib.request.Request(url, data, head)   # 2. replaced by the next line; head is no longer passed in
req = urllib.request.Request(url, data)
# 3. the add_header() call that replaces the head dict
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36')

response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')

#print(html)
target = json.loads(html)
print("Translation result: %s" % (target['translateResult'][0][0]['tgt']))
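As a side note, urllib stores header names in capitalized form, which is why the check in method 1 printed the key as 'User-agent' rather than 'User-Agent'. A small sketch to confirm the header really got attached, assuming req was built with add_header() as above:

# Sketch: check the header on the request object before it is sent.
print(req.has_header('User-agent'))   # True once add_header() has run
print(req.get_header('User-agent'))   # the Chrome User-Agent string set above
print(req.headers)                    # the same dict the method-1 check printed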
The third method: add a pause between requests:
import urllib.request
import urllib.parse
import json
import time

while True:   # added: loop so several translations can be done in one run
    content = input('Enter the text to translate (type "q!" to quit): ')   # added
    if content == 'q!':   # added
        break             # added
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=dict2.index'

    '''
    1. Replace the head dict below with the single add_header() call further down:
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
    '''

    data = {}
    data['type'] = 'AUTO'
    data['i'] = content
    data['doctype'] = 'json'
    data['xmlVersion'] = '1.8'
    data['keyfrom'] = 'fanyi.web'
    data['ue'] = 'UTF-8'
    data['action'] = 'FY_BY_CLICKBUTTON'
    data['typoResult'] = 'true'
    data = urllib.parse.urlencode(data).encode('utf-8')

    req = urllib.request.Request(url, data)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36')

    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')

    #print(html)
    target = json.loads(html)
    #print("Translation result: %s" % (target['translateResult'][0][0]['tgt']))
    target = target['translateResult'][0][0]['tgt']   # added
    print(target)                                     # added
    time.sleep(5)   # added: wait before the next request so the site is less likely to flag us as a crawler
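A fixed 5-second pause is itself a recognizable pattern; one common variation is to sleep for a random length of time so the request intervals look less mechanical. A minimal sketch (the 3-8 second range is only an example, not something the code above uses):

import random
import time

# Sketch: sleep for a random interval instead of a fixed 5 seconds,
# so consecutive requests are not spaced perfectly regularly.
time.sleep(random.uniform(3, 8))   # example range; tune it to how gentle you need to be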
The fourth method: go through a proxy. The proxy fetches the content and passes what it sees back to you, so you get the same result while the target site sees the proxy's IP rather than yours.
import urllib.request
import random   # needed for random.choice below

url = 'http://www.whatismyip.com.tw'
# Fill iplist with working proxies in 'ip:port' form; the entry below is only a placeholder.
iplist = ['196.168.0.100:808']

#proxy_support = urllib.request.ProxyHandler({'http': '196.168.0.100:808'})   # single fixed proxy
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})  # pick a proxy at random

opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36')]
urllib.request.install_opener(opener)

response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')

print(html)
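Free proxies are frequently dead or slow, so in practice it helps to try the entries in iplist one after another and fall back to the next one when a request fails. A rough sketch of that idea, assuming the addresses in iplist are placeholders you replace with real proxies:

import random
import urllib.request

url = 'http://www.whatismyip.com.tw'
iplist = ['196.168.0.100:808', '196.168.0.101:808']   # placeholders; replace with working proxies

random.shuffle(iplist)
for proxy in iplist:
    try:
        opener = urllib.request.build_opener(urllib.request.ProxyHandler({'http': proxy}))
        opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36')]
        html = opener.open(url, timeout=10).read().decode('utf-8')
        print(html)   # the page should report the proxy's IP, not ours
        break         # stop at the first proxy that works
    except OSError as e:   # URLError, timeouts, connection resets, ...
        print('proxy %s failed: %s' % (proxy, e))   # try the next proxy in the list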
