Python爬蟲(5) 借助搜狗搜索爬取微信文章

本文轉載自查看原文 2019-04-08 17:42 515

借助搜狗搜索爬取微信文章

from urllib import request as r

import re as e

from urllib import error as o

import time as t

# Pretend to be a desktop browser by sending a browser User-Agent.
# Each entry of ``opener.addheaders`` must be a (name, value) tuple. The
# original used a set literal ``{name, value}``, whose iteration order is
# arbitrary, so the header name and value could end up swapped.
headers = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2767.400",
)

opener = r.build_opener()
opener.addheaders = [headers]

# Install the opener globally so that plain r.urlopen() calls use it.
r.install_opener(opener)

# List for the article URLs; it is rebound to the return value of
# getlisturl() at the bottom of the script.
listurl = []

# Browser-like User-Agent attached to every request made by use_proxy().
_PROXY_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2767.400"


def use_proxy(proxy_addr, url, timeout=30):
    """Fetch *url* through an HTTP proxy and return the body as text.

    Args:
        proxy_addr: Proxy address as ``"host:port"``. An empty value means
            "no proxy": an empty proxy mapping is passed to ``ProxyHandler``.
        url: Address to download.
        timeout: Seconds to wait on the connection before giving up, so a
            dead proxy cannot block the crawl forever.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the request
        failed. Failures are printed, not raised; the function sleeps
        10 seconds after a ``URLError`` and 1 second after any other error.
    """
    try:
        proxies = {'http': proxy_addr} if proxy_addr else {}
        opener = r.build_opener(r.ProxyHandler(proxies), r.HTTPHandler)
        # A freshly built opener only sends urllib's default User-Agent, so
        # the browser header has to be attached to this opener as well.
        opener.addheaders = [("User-Agent", _PROXY_USER_AGENT)]
        # Kept from the original: make this opener the global default.
        r.install_opener(opener)
        # The context manager closes the connection even if read/decode fails.
        with opener.open(url, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except o.URLError as u:
        if hasattr(u, 'code'):
            print(u.code)
        if hasattr(u, 'reason'):
            print(u.reason)
        # Back off for 10 seconds after a URLError.
        t.sleep(10)
    except Exception as x:
        print('Exception:'+str(x))
        # Back off for 1 second after any other error.
        t.sleep(1)
    # Explicit failure value so callers can test for it.
    return None

def getlisturl(key, pagestart, pageend, proxy):
    """Collect the links found on each Sogou WeChat search result page.

    Args:
        key: Search keyword; it is percent-encoded before being put in the URL.
        pagestart: First result page to fetch (inclusive).
        pageend: Last result page to fetch (inclusive).
        proxy: Proxy address handed to ``use_proxy`` for every page.

    Returns:
        A list with one entry per successfully fetched page; each entry is
        the list of ``http://`` link targets matched on that page. Pages that
        could not be fetched are skipped, so the result is always a list
        (possibly empty) and never ``None``.
    """
    # Local import keeps this block self-contained; ``quote`` is documented
    # in urllib.parse, not urllib.request.
    from urllib.parse import quote

    keycode = quote(key)
    # Compiled once, outside the page loop. Matches every href that starts
    # with http://.
    link_re = e.compile('href="(http://.*?)"', e.S)

    # Results are collected locally, so repeated calls do not pile up in
    # shared module-level state.
    pages = []
    for page in range(pagestart, pageend + 1):
        url = "http://weixin.sogou.com/weixin?type=2&query="+keycode+"&page="+str(page)
        try:
            # Fetch through the proxy to work around IP blocking.
            data1 = use_proxy(proxy, url)
            if data1 is None:
                # use_proxy returns None on failure; skip only this page
                # instead of losing all pages collected so far.
                continue
            pages.append(link_re.findall(data1))
            # Progress output.
            print("共獲得到"+str(len(pages))+"頁")
        except Exception as x:
            # Best effort: report, pause briefly, and go on to the next page.
            print('Exception:'+str(x))
            t.sleep(1)
    return pages

def getcontent(listurl, proxy, outpath="G:\\Pcode\\1.html"):
    """Download every article in *listurl* and save them into one HTML file.

    Args:
        listurl: Two-dimensional list, ``listurl[page][n]`` being the n-th
            link found on that result page. ``None`` is treated as empty.
        proxy: Proxy address handed to ``use_proxy`` for every article.
        outpath: File to write. Any existing file is overwritten. The default
            is the path that was hard-coded in the original script.

    Returns:
        None. For each article fetched, the title and the text between the
        ``js_content`` and ``js_sg_bar`` markers are written between an HTML
        header and footer. Articles that fail are reported and skipped.
    """
    # The header now opens <html> (the footer always closed it) and declares
    # the charset, because the file is written as UTF-8 bytes.
    html1 = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>微信文章頁面</title>
</head>
<body>'''
    html2 = '''</body>
</html>'''

    # Compiled once instead of once per article.
    title_re = e.compile("<title>(.*?)</title>")
    content_re = e.compile('id="js_content">(.*?)id="js_sg_bar"', e.S)

    # One handle for header, articles and footer; ``with`` guarantees that
    # it is closed even when something unexpected is raised.
    with open(outpath, "wb") as fh:
        fh.write(html1.encode("utf-8"))
        for i, page_links in enumerate(listurl or []):
            for j, raw_url in enumerate(page_links):
                try:
                    # Links scraped from HTML carry "&amp;" where the real
                    # URL has "&".
                    url = raw_url.replace("&amp;", "&")
                    data = use_proxy(proxy, url)
                    if data is None:
                        # use_proxy returns None on failure; skip this one.
                        continue
                    title = title_re.findall(data)
                    content = content_re.findall(data)
                    # Fall back to a placeholder when nothing matched.
                    thistitle = title[0] if title else "此次沒有獲取到"
                    thiscontent = content[0] if content else "此次沒有獲取到"
                    dataall = "標題為："+thistitle+"內容為："+thiscontent+" "
                    fh.write(dataall.encode('utf-8'))
                    # Progress output for debugging.
                    print(" 第 "+str(i+1)+"個網頁第"+str(j+1)+" 次處理 ")
                except Exception as x:
                    # Best effort: one bad article must not stop the rest.
                    print('Exception:' + str(x))
                    t.sleep(1)
        fh.write(html2.encode("utf-8"))

# Search keyword.
key = "物聯網"

# Proxy server as "host:port"; this proxy may no longer be working.
proxy = '119.101.113.217:9999'

# getlisturl() and getcontent() could be given different proxies; that
# option is not enabled here, so proxy2 is unused.
proxy2 = ''

# First result page to crawl.
pagestart = 1

# Last result page to crawl.
pageend = 2

# Run the crawl only when executed as a script, so importing this module
# does not trigger network requests and file writes.
if __name__ == "__main__":
    # Fall back to an empty list in case getlisturl() reports failure with None.
    listurl = getlisturl(key, pagestart, pageend, proxy) or []
    getcontent(listurl, proxy)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 爬取微信文章 6 爬取微信搜索平台的微信文章保存為本地網頁 python爬蟲實戰（三）--------搜狗微信文章（IP代理池和用戶代理池設定----scrapy） Python爬蟲實踐——爬取網站文章微信公眾號文章采集爬取微信文章采集公眾號的閱讀數和點贊數？ Python 爬蟲實例（9）—— 搜索爬取淘寶 [Python爬蟲] 之十五：Selenium +phantomjs根據微信公眾號抓取微信文章爬取微博文章內容，關鍵字搜索爬取 Python爬蟲練習（一）爬取筆趣閣小說（搜索+爬取）爬取微信公眾號文章