The data I scrape back is not the source of the article page I want; it is the source of a CAPTCHA page that the request gets redirected to. From what I found online, this happens when requests come in too frequently. I have already added time.sleep and changed the request headers, but I still end up on the CAPTCHA page. Could someone advise: apart from recognizing the CAPTCHA, what else can I do?
import re
import time
import urllib.error
import urllib.request

# Request header: a browser User-Agent, stored as a (name, value) tuple
headers = ('User-Agent',
           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0')

# List that stores the article URLs collected from each result page
listurl = []

# Fetch a URL through the given proxy IP, sending the User-Agent header
def use_proxy(proxy_addr, url):
    try:
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # Attach the User-Agent to the opener that is actually installed
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(url).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print("exception: " + str(e))
        time.sleep(1)

# Collect the article links from every result page of the keyword search
def getlisturl(key, pagestart, pageend, proxy):
    try:
        # URL-encode the keyword
        keycode = urllib.request.quote(key)
        # Crawl the article links on each result page
        for page in range(pagestart, pageend + 1):
            # Build the URL of the current result page
            url = ('http://weixin.sogou.com/weixin?&type=2&ie=utf8&query='
                   + keycode + "&&page=" + str(page))
            # Fetch the page through the proxy
            data1 = use_proxy(proxy, url)
            time.sleep(1)
            # Regular expression that extracts the article URLs
            pattern1 = '<div class="txt-box">.*?(http://.*?)"'
            listurl.append(re.compile(pattern1, re.S).findall(data1))
        print("Retrieved " + str(len(listurl)) + " pages")  # for debugging
        return listurl
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # On URLError, wait 10 s before continuing
        time.sleep(10)
    except Exception as e:
        print("exception: " + str(e))
        # On any other exception, wait 1 s before continuing
        time.sleep(1)

# Fetch the content behind each collected link and write it to a local HTML file
def getcontent(listurl, proxy):
    # Opening HTML written to the local file
    html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>WeChat articles</title>
</head>
<body>'''
    with open("D:/WEB/1.html", "wb") as f:
        f.write(html1.encode("utf-8"))
    # Reopen the file in append mode to write each article
    with open("D:/WEB/1.html", "ab") as ff:
        # listurl is a two-dimensional list: the first index is the result page,
        # the second index is the position of the article link on that page
        for i in range(len(listurl)):
            for j in range(len(listurl[i])):
                try:
                    url = listurl[i][j]
                    # The collected URL contains an extra "amp;"; strip it to get the real URL
                    url = url.replace("amp;", "")
                    # Fetch the article page through the proxy
                    data = use_proxy(proxy, url)
                    # Regular expression for the article title
                    titlepat = "<title>(.*?)</title>"
                    # Regular expression for the article body
                    contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                    # Extract the title and the body into lists
                    title = re.compile(titlepat).findall(data)
                    content = re.compile(contentpat, re.S).findall(data)
                    # Fallback values used when nothing is matched
                    thistitle = "Not retrieved this time"
                    thiscontent = "Not retrieved this time"
                    # If the lists are not empty, take the first match
                    if title != []:
                        thistitle = title[0]
                    if content != []:
                        thiscontent = content[0]
                    # Combine title and body and append them to the file
                    dataall = '<p>Title: ' + thistitle + '</p><p>Content: ' + thiscontent + '</p><br>'
                    ff.write(dataall.encode("utf-8"))
                    print("Processed article " + str(j) + " of page " + str(i))  # for debugging
                except urllib.error.URLError as e:
                    if hasattr(e, "code"):
                        print(e.code)
                    if hasattr(e, "reason"):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print("exception: " + str(e))
                    time.sleep(1)
    # Closing HTML written to the local file
    html2 = '''</body>
</html>
'''
    with open("D:/WEB/1.html", "ab") as f:
        f.write(html2.encode("utf-8"))

# Search keyword
key = "物聯網"
# Proxy server
proxy = "122.72.32.73:80"
# A different proxy could be set for getlisturl() and getcontent(); not enabled here
proxy2 = ""
# First page to crawl
pagestart = 1
# Last page to crawl
pageend = 2
listurl = getlisturl(key, pagestart, pageend, proxy)
getcontent(listurl, proxy)
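One thing I was thinking of trying is rotating between several proxy IPs and using a randomized delay instead of the fixed time.sleep(1), roughly like the sketch below. The second proxy address is just a placeholder (I would need real working proxies), and I haven't verified that this actually gets past the block. Would this help, or is there a better approach?

import random
import time
import urllib.request

# Placeholder pool of proxy IPs; only the first one is from my script above,
# the rest would have to be real working proxies
proxy_pool = ["122.72.32.73:80", "111.111.111.111:80"]

def fetch_with_rotation(url):
    # Pick a different proxy for each request
    proxy_addr = random.choice(proxy_pool)
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    # Send the same browser User-Agent with every request
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                          'Gecko/20100101 Firefox/23.0')]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    # Wait a random amount of time between requests instead of a fixed 1 s
    time.sleep(random.uniform(3, 8))
    return data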
