requests利用selenium,代理Ip,雲打碼，驗證碼摳圖操作爬取搜狗微信公眾號內容

本文轉載自查看原文 2018-10-29 22:25 949 爬蟲

爬取思路：
爬取搜狗微信公眾號內容。爬取第一層 url 時，若請求太快會出現驗證碼；我這裡使用蘑菇雲代理，並在程序中判斷何時需要加上代理，
做到合理運用代理 ip。
爬取第二層 url 時驗證碼出現得更頻繁（和第一層的驗證碼完全不一樣）。一開始同樣嘗試更換代理，但感覺解決不了問題，後來改用 selenium 自動打開該頁面，並把
驗證碼圖片抓取下來，然後發現抓下來的並不是頁面中顯示的真實驗證碼，估計是網站加密的原因。
最後改為用 selenium 直接把整張頁面截屏保存，再用 python 的 PIL 包把驗證碼圖片截取出來，發送到雲打碼識別，
結果成功解決了問題。


import json
import random
import re
import time

import pymysql
import requests
from lxml import etree
from PIL import Image  # image handling (used to crop the captcha)
from requests.exceptions import ConnectionError
from selenium import webdriver

from dama import yundama

# ============================= proxy IP helper =======================
def get_ip():
    """Fetch one proxy from the proxy API and return it as a ``proxies`` dict.

    Sleeps a random 1-3 seconds, requests the API, takes the first entry of
    the ``msg`` list in the JSON reply and builds
    ``{'http': 'http://<ip>:<port>'}`` from that entry's ``ip`` and ``port``
    fields.
    """
    api_url = 'http://piping.mogumiao.com/proxy/api/'  # proxy API endpoint
    time.sleep(random.uniform(1, 3))
    payload = requests.get(url=api_url).json()
    first = payload['msg'][0]
    return {'http': 'http://' + first['ip'] + ':' + first['port']}


# ===================================================================
# Rotate the request header at random
# Pool of User-Agent strings; one is chosen with random.choice per request.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python, which previously merged two entries.
user_agent = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Opera/8.0 (Windows NT 5.1; U; en)',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
]
# ===================================================================
# Connect to the database
# Module-level MySQL connection and cursor used by the crawler.
db = pymysql.Connect(
    host='localhost',
    port=3306,
    user='root',
    password='mysql',
    db='test',
    charset='utf8',
)
cursor = db.cursor()

proxi = None  # global: proxies dict currently in use (None means no proxy)
count = 0  # global: running counter
def _store_articles(list_page_text, headers):
    """Insert the articles listed on an account page into the ``weixin`` table.

    Extracts the ``var msgList = {...}`` JSON embedded in *list_page_text*,
    downloads every article in its ``list`` entry with *headers*, and inserts
    the title plus the text content (with ``\\W`` characters stripped) unless
    a row with the same title already exists. Commits after every article
    and increments the module-level ``count`` for each inserted row.
    """
    global count
    msg_list_re = re.compile(r'var msgList = (.*?)};', re.S)
    # The regex consumes the closing brace, so it is re-appended before parsing.
    articles = json.loads(msg_list_re.findall(list_page_text)[0] + '}')['list']
    for article in articles:
        info = article['app_msg_ext_info']
        title_name = info['title']
        # content_url is HTML-escaped in the page; drop the 'amp;' remnants.
        content_url = 'https://mp.weixin.qq.com' + info['content_url'].replace('amp;', '')
        content_response = requests.get(url=content_url, headers=headers)
        # Consider a random 1-3 s sleep here to lower the request rate.
        content_page = etree.HTML(content_response.text)
        contents = ''.join(content_page.xpath('//*[@id="js_content"]//text()')).strip()

        # Parameterized lookup: the title is scraped (untrusted) text, so it
        # must never be formatted into the SQL string itself.
        already_stored = cursor.execute(
            'select * from weixin where title=%s', [title_name]
        )
        if already_stored == 0:
            cursor.execute(
                'insert into weixin VALUES (0,%s,%s)',
                [title_name, re.sub(r'\W', '', contents)],
            )
            count += 1
            print(count, title_name)
        else:
            print('{}----------已存在'.format(title_name))
        db.commit()


def _solve_captcha(captcha_url):
    """Open *captcha_url* in Chrome, crop the captcha from a screenshot and submit the answer.

    The element with id ``verify_img`` is located on the page, its bounding
    box is cut out of a full-window screenshot and saved to
    ``D:\\cropped.png``; the value returned by ``yundama()`` is typed into
    the ``input`` element and the ``bt`` element is clicked. The browser is
    always shut down, even when a step fails.
    """
    # NOTE(review): find_element_by_id is the legacy Selenium API; confirm the
    # installed Selenium version still provides it.
    web = webdriver.Chrome()
    try:
        web.maximize_window()
        web.get(captcha_url)
        web.save_screenshot("D:\\quan.png")
        captcha = web.find_element_by_id('verify_img')
        place = captcha.location
        size = captcha.size
        # (left, upper, right, lower) box of the captcha inside the screenshot.
        box = (
            int(place['x']),
            int(place['y']),
            int(place['x'] + size['width']),
            int(place['y'] + size['height']),
        )
        with Image.open("D:\\quan.png") as screenshot:
            screenshot.crop(box).save('D:\\cropped.png')
        # presumably yundama() reads D:\cropped.png - verify against the dama module
        web.find_element_by_id('input').send_keys(yundama())
        time.sleep(1)
        web.find_element_by_id('bt').click()
        time.sleep(2)
    finally:
        web.quit()


def sougou(page):
    """Crawl one Sogou WeChat search result page and store the articles found.

    Requests result page *page* for the hard-coded query ``python`` (through
    the global ``proxi`` proxies dict when one is set). A response of 5500
    characters or fewer is treated as a captcha page: a new proxy is fetched
    with ``get_ip()`` and the page is retried. Otherwise every account link
    on the page is fetched; an account page longer than 6500 characters is
    parsed and stored, a shorter one is treated as a captcha page and solved
    in a browser (the account's articles are not re-fetched in this pass).

    A ``ConnectionError`` anywhere in the crawl switches to a new proxy and
    retries the whole page.

    NOTE(review): retries are recursive and unbounded; the account and
    article requests do not use the proxy.
    """
    global proxi
    # Change the ``query`` value to search for a different keyword.
    url = 'http://weixin.sogou.com/weixin?query=python&_sug_type_=&s_from=input&_sug_=n&type=1&ie=utf8&page='+str(page)
    headers = {
        'Referer': 'http://weixin.sogou.com/weixin?type=1&query=python&ie=utf8&s_from=input&_sug_=n&_sug_type_=1&w=01015002&oq=&ri=5&sourceid=sugg&sut=0&sst0=1540733222633&lkt=0%2C0%2C0&p=40040108',
        'User-Agent': random.choice(user_agent),
        # Mind the cookie lifetime: this value expires and must be refreshed.
        'Cookie': 'CXID=04C14DAB703E117FA82047F41148A82D; SUID=82F4FB723665860A5AB30BA8000211FB; SUV=1526707509991840; UM_distinctid=16376dbdee02b1-0820fe9948d64e-c343567-100200-16376dbdee4fb8; IPLOC=CN1100; usid=ue2M7rhDvZ5zfSvQ; pgv_pvi=1717965824; dt_ssuid=4873588560; ssuid=9294144357; pex=C864C03270DED3DD8A06887A372DA219231FFAC25A9D64AE09E82AED12E416AC; weixinIndexVisited=1; ld=Vkllllllll2bQuDilllllVs2PuGlllllNYkuOkllll9lllllVklll5@@@@@@@@@@; ad=vkllllllll2b8Y4nlllllVsyx@tlllllNYkuskllll9lllllpVxlw@@@@@@@@@@@; ABTEST=8|1540692132|v1; GOTO=Af71175-1502; SUIR=1AC841BDC9CDB1889FD40AC7C92328E1; SNUID=DD51FF0D787D0E234D0D8342788E1DC9; sct=38; JSESSIONID=aaaFhG5t_2zIAdtqom-Aw; Hm_lvt_dde6ba2851f3db0ddc415ce0f895822e=1540807114,1540807224,1540808537,1540816279; Hm_lpvt_dde6ba2851f3db0ddc415ce0f895822e=1540816279',
    }

    try:  # mainly guards against dead proxy IPs
        # proxies=None means "no proxy", so a single call covers both cases.
        response = requests.get(url=url, headers=headers, proxies=proxi)

        if len(response.text) <= 5500:
            # Short body: captcha page. Switch proxy and retry this page.
            proxi = get_ip()
            print('第一個頁面{}正在使用代理ip'.format(proxi))
            sougou(page)
            return

        account_links = etree.HTML(response.text).xpath("//p[@class='tit']/a")
        for link in account_links:
            account_url = link.xpath('.//@href')[0]
            list_response = requests.get(url=account_url, headers=headers)
            if len(list_response.text) > 6500:  # real article list page
                _store_articles(list_response.text, headers)
            else:  # captcha page: solve it in a real browser
                _solve_captcha(list_response.url)

    except ConnectionError:  # the proxy IP is unusable
        proxi = get_ip()
        print('{}請求出錯，更換代理ip'.format(proxi))
        sougou(page)

 if __name__ == '__main__': 　　for page in range(1,7):#循環頁碼
    　　sougou(page)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 php:微信公眾號token驗證失敗原因、驗證碼顯示不出來的問題 selenium自動爬取網易易盾的驗證碼微信公眾號圖文回復驗證碼並寫入數據庫的基本思路模仿網易雲盾滑動驗證碼生成--摳圖，大圖部分背景透明 curl 模擬登錄微信公眾平台帶驗證碼 C#微信公眾號接口開發，靈活利用網頁授權、帶參數二維碼、模板消息，提升用戶體驗之完成用戶綁定個人微信及驗證碼獲取爬取微信公眾號解決微信端公眾號網頁獲取短信驗證碼ajax重復調用兩次的問題爬取微信公眾號文章 python爬取微信公眾號

requests利用selenium,代理Ip,雲打碼，驗證碼摳圖操作 爬取搜狗微信公眾號內容

免責聲明！

requests利用selenium,代理Ip,雲打碼，驗證碼摳圖操作爬取搜狗微信公眾號內容