微信公眾號爬蟲--歷史文章-首頁


在上次的爬蟲中,我們只是爬取了歷史文章中加載更多的數據(https://www.cnblogs.com/jueshilaozhongyi/p/11656435.html),這次是歷史文章中首頁的數據

歷史文章首頁的數據是返回在html中的,再具體點在JavaScript中

本次代碼的缺點:

1. 還是不夠智能,需要通過抓包工具獲取首頁的鏈接

2. 有些公眾號沒有歷史文章,這種公眾號不能使用

3. 有些公眾號的歷史文章使用的是分類,這種也不能使用(下次分享這種情況怎麼處理)

好了,我們先來看看首頁的鏈接吧:

 

 

 +action=getmsg

對比兩個地址可以看到,訪問的路徑完全一樣,只有action參數的值不同:加載更多時是action=getmsg,這次首頁的值是action=home,後面的其他參數都一樣

下面開始放代碼吧:

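# Previously the account name was typed in by hand; on the home page it can be extracted with a regular expression, so a step that reads the account name has been added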

import requests
import re, os
import time
# 在之前的鏈接里我們封裝的數據庫操作,可以直接拿來用
from conn.connect_mysql import insert_wechat_content,select_wechat_content

# Current working directory at import time; it is echoed to stdout and used
# as the directory of the cache file that holds the downloaded page.
path = os.getcwd()
print(path)
file_path = '{}//content_file'.format(path)
def get_content_text(url):
    """
    Request the given page and return the response body as text.

    The request is sent with a hard-coded WeChat-client user-agent and a
    hard-coded session cookie.
    NOTE(review): the cookie values presumably expire and must be refreshed
    from a new capture -- confirm.

    :param url: full link of the history home page
    :return: response body decoded as text (str)
    :raises requests.exceptions.RequestException: on timeout, connection
        failure, or an HTTP error status
    """
    wechat_home_url = url
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) MicroMessenger/2.3.27(0x12031b13) MacWechat Chrome/39.0.2171.95 Safari/537.36 NetType/WIFI WindowsWechat",
        "cookie": "devicetype=android-28; lang=zh_CN; pass_ticket=sZNf5AG/C0AvageD87nRhK3W3AuVgYP3dYTvz3i57WFq718hIiDmMmA/ICUWA3W; version=2700073a; wap_sid2=CILAnPMFElxnMzRMNjdKbGpLdXYxZ0xzN2JfeldZX25JaGQ1a0EyLTNGUmE5SHZxNGRqTERPX1kybnd6a0Nwd2pONkJiLUxRbW84OU9kdkxjcHJjMHVZRXRxQUVDd2dFQUFBfjCMwsvtBTgNQJVO; wxuin=1583816706"
    }

    # verify=False turns off TLS certificate verification.
    # NOTE(review): this leaves the request open to interception; enable
    # verification if no capture proxy sits in the path.
    # The timeout keeps a stalled connection from blocking the crawl forever.
    result = requests.get(
        url=wechat_home_url,
        headers=headers,
        verify=False,
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    result.raise_for_status()
    return result.text

def write_content_file(url):
    """
    Download the page at ``url`` and write its text to the cache file.

    The download happens before the file is opened, so a failed request
    leaves any previously cached content untouched.

    :param url: link handed to get_content_text
    :return: None
    """
    data = get_content_text(url)
    # The context manager closes the handle even if the write raises.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(data)

def read_content_file():
    """
    Read the whole cache file back as text.

    :return: file content decoded as UTF-8 (str)
    """
    # The context manager closes the handle even if the read raises.
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def find_msg():
    """
    Extract the msgList payload from the cached page with a regular expression.

    :return: text captured between the single quotes of the first
        ``msgList = '...'`` match (str)
    """
    page_source = str(read_content_file())
    candidates = re.findall(r"msgList = \'(.*)\'", page_source)
    # Only the first match is used; an IndexError is raised when there is none.
    first_payload = candidates[0]
    return str(first_payload)

def msg_replace():
    """
    Replace the escaped double quotes of the msgList payload with single quotes.

    :return: payload text with every ``&quot;`` entity replaced by ``'`` (str)
    """
    msg = find_msg()
    # The payload is embedded in HTML, so its double quotes presumably arrive
    # as the entity &quot; -- TODO confirm against a saved page.
    # NOTE(review): a value that itself contains an apostrophe would clash
    # with the single-quote delimiters produced here.
    return msg.replace("&quot;", "'")

def msg_json():
    """
    Decode the quote-normalised msgList text and return its article list.

    :return: value stored under the 'list' key of the decoded payload
    """
    # Imported here rather than at module level, so demjson is only needed
    # once this function is actually called.
    import demjson
    decoded = demjson.decode(msg_replace())
    return decoded['list']

def get_wechat_name():
    """
    Extract the official account name from the cached page.

    The page is searched for ``nickname = "..."`` and the text between the
    double quotes of the first occurrence is returned.

    :return: account name (str)
    :raises IndexError: when the page contains no nickname assignment
    """
    r = read_content_file()
    # Non-greedy capture stops at the first closing quote. There is no
    # alternation branch, so unrelated text can never add empty strings to
    # the result list.
    wechat_name = re.findall(r'nickname = "(.*?)"', str(r))
    return wechat_name[0]

def format_data():
    """
    Save the articles found on the cached home page.

    For every entry of the message list the title, article link, cover image,
    source link, author and publish time are read, printed, and inserted
    through insert_wechat_content unless select_wechat_content reports the
    title as already stored.

    :return: None
    """
    # Every value is wrapped in single quotes before it reaches the database
    # helpers.
    # NOTE(review): presumably the helpers splice these values into SQL text;
    # confirm in conn.connect_mysql and prefer parameterized queries there.
    quoted = "'{}'".format

    msg = msg_json()
    wechat_name = quoted(get_wechat_name())

    for item in msg:
        article = item.get('app_msg_ext_info')
        if not article:
            # Entries without article info (presumably non-article pushes
            # such as plain text or images -- TODO confirm) have nothing to
            # store; skipping them avoids a KeyError that would abort the run.
            continue

        # Title
        title = quoted(article['title'])
        # Article link
        content_url = quoted(article['content_url'])
        # Cover image
        cover = quoted(article['cover'])
        # Source (repost) link
        source_url = quoted(article['source_url'])
        # Source account, taken from the 'author' field
        source_name = quoted(article['author'])

        # Publish time: the timestamp is rendered in the machine's local time
        published = item['comm_msg_info']['datetime']
        published = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(published))
        published = quoted(published)

        print(title, content_url, cover, source_url, source_name, published)
        if select_wechat_content(title) == 1:
            print("數據已經存在")
        else:
            insert_wechat_content(wechat_name, title, content_url, cover, source_url, source_name, published)

def run(url):
    """
    Crawl one history home page: cache it on disk, then parse and store it.

    :param url: link of the history home page
    :return: None
    """
    write_content_file(url)
    format_data()

if __name__ == "__main__":
    # Home-page link (action=home) of the account to crawl.
    url = "https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzI5NDY1MjQzNA==&uin=MTU4MzgxNjcwNg%3D%3D&key=b1719993cc296ec41a4aad024aa262db236a1b7242d12dd98e5d02bf751cb5e705f8ef8ef6cda9e235519a360bab4c42b4ab301a460e39a67ca76f0945e49ddf2cbaaf03553a73e079426924bbbe17ce&devicetype=iMac+MacBookPro15%2C1+OSX+OSX+10.14.5+build(18F203)&version=12031b13&lang=zh_CN&nettype=WIFI&a8scene=0&fontScale=100&pass_ticket=CSP6SWxOUwP4xAvrB01DuLNCJIO%2FR65vUpx4MFOWrJCce3JldcoyR1VZK4%2BQfXzn"
    run(url)


大功告成,Over!

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM