The data I scrape back is not the source of the article page I want; it is the source of a CAPTCHA page that the request gets redirected to. From what I found online, this happens when requests come in too frequently. I have already added time.sleep and changed the request headers, but I still end up on the CAPTCHA page. Could someone advise: apart from recognizing the CAPTCHA, what else can I do?
import re
import time
import urllib.error
import urllib.request

# Request header: a browser User-Agent, stored as a (name, value) tuple
headers = ('User-Agent',
           'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0')

# List that stores the article URLs collected from each result page
listurl = []

# Fetch a URL through the given proxy IP, sending the User-Agent header
def use_proxy(proxy_addr, url):
    try:
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # Attach the User-Agent to the opener that is actually installed
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(url).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)
    except Exception as e:
        print("exception: " + str(e))
        time.sleep(1)

# Collect the article links from every result page of the keyword search
def getlisturl(key, pagestart, pageend, proxy):
    try:
        # URL-encode the keyword
        keycode = urllib.request.quote(key)
        # Crawl the article links on each result page
        for page in range(pagestart, pageend + 1):
            # Build the URL of the current result page
            url = ('http://weixin.sogou.com/weixin?&type=2&ie=utf8&query='
                   + keycode + "&&page=" + str(page))
            # Fetch the page through the proxy
            data1 = use_proxy(proxy, url)
            time.sleep(1)
            # Regular expression that extracts the article URLs
            pattern1 = '<div class="txt-box">.*?(http://.*?)"'
            listurl.append(re.compile(pattern1, re.S).findall(data1))
        print("Retrieved " + str(len(listurl)) + " pages")  # for debugging
        return listurl
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # On URLError, wait 10 s before continuing
        time.sleep(10)
    except Exception as e:
        print("exception: " + str(e))
        # On any other exception, wait 1 s before continuing
        time.sleep(1)

# Fetch the content behind each collected link and write it to a local HTML file
def getcontent(listurl, proxy):
    # Opening HTML written to the local file
    html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>WeChat articles</title>
</head>
<body>'''
    with open("D:/WEB/1.html", "wb") as f:
        f.write(html1.encode("utf-8"))
    # Reopen the file in append mode to write each article
    with open("D:/WEB/1.html", "ab") as ff:
        # listurl is a two-dimensional list: the first index is the result page,
        # the second index is the position of the article link on that page
        for i in range(len(listurl)):
            for j in range(len(listurl[i])):
                try:
                    url = listurl[i][j]
                    # The collected URL contains an extra "amp;"; strip it to get the real URL
                    url = url.replace("amp;", "")
                    # Fetch the article page through the proxy
                    data = use_proxy(proxy, url)
                    # Regular expression for the article title
                    titlepat = "<title>(.*?)</title>"
                    # Regular expression for the article body
                    contentpat = 'id="js_content">(.*?)id="js_sg_bar"'
                    # Extract the title and the body into lists
                    title = re.compile(titlepat).findall(data)
                    content = re.compile(contentpat, re.S).findall(data)
                    # Fallback values used when nothing is matched
                    thistitle = "Not retrieved this time"
                    thiscontent = "Not retrieved this time"
                    # If the lists are not empty, take the first match
                    if title != []:
                        thistitle = title[0]
                    if content != []:
                        thiscontent = content[0]
                    # Combine title and body and append them to the file
                    dataall = '<p>Title: ' + thistitle + '</p><p>Content: ' + thiscontent + '</p><br>'
                    ff.write(dataall.encode("utf-8"))
                    print("Processed article " + str(j) + " of page " + str(i))  # for debugging
                except urllib.error.URLError as e:
                    if hasattr(e, "code"):
                        print(e.code)
                    if hasattr(e, "reason"):
                        print(e.reason)
                    time.sleep(10)
                except Exception as e:
                    print("exception: " + str(e))
                    time.sleep(1)
    # Closing HTML written to the local file
    html2 = '''</body>
</html>
'''
    with open("D:/WEB/1.html", "ab") as f:
        f.write(html2.encode("utf-8"))

# Search keyword
key = "物聯網"
# Proxy server
proxy = "122.72.32.73:80"
# A different proxy could be set for getlisturl() and getcontent(); not enabled here
proxy2 = ""
# First page to crawl
pagestart = 1
# Last page to crawl
pageend = 2
listurl = getlisturl(key, pagestart, pageend, proxy)
getcontent(listurl, proxy)
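One thing I was thinking of trying is rotating between several proxy IPs and using a randomized delay instead of the fixed time.sleep(1), roughly like the sketch below. The second proxy address is just a placeholder (I would need real working proxies), and I haven't verified that this actually gets past the block. Would this help, or is there a better approach?

import random
import time
import urllib.request

# Placeholder pool of proxy IPs; only the first one is from my script above,
# the rest would have to be real working proxies
proxy_pool = ["122.72.32.73:80", "111.111.111.111:80"]

def fetch_with_rotation(url):
    # Pick a different proxy for each request
    proxy_addr = random.choice(proxy_pool)
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    # Send the same browser User-Agent with every request
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                          'Gecko/20100101 Firefox/23.0')]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode('utf-8')
    # Wait a random amount of time between requests instead of a fixed 1 s
    time.sleep(random.uniform(3, 8))
    return data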
