Sogou WeChat Crawler Project
I. Requirements Analysis
1. Overview
1.1 Project Introduction
- A crawler API for WeChat official accounts, built on Sogou WeChat Search
2. Requirements
- Retrieve official account information
  - Through the API, given a specific account name, look up its details
  - Fields returned:

    ```
    {
        'public_name': str,     # official account name
        'wechat_id': str,       # WeChat ID
        'public_qrcode': url,   # QR code
        'public_image': url,    # account avatar image
        'authentication': str,  # verification info
        'introduction': str     # account introduction
    }
    ```
  - Example screenshot:

- Search all of an account's articles by the account name
  - Example screenshot:

  - Fields returned:

    ```
    {
        'article': {
            'title': '',       # article title
            'url': '',         # article link
            'imgs': '',        # list of article images
            'abstract': '',    # article abstract
            'time': int        # push time, 10-digit timestamp
        },
        'gzh': {
            'public_name': str,     # official account name
            'wechat_name': str,     # WeChat name
            'wechat_id': str,       # WeChat ID
            'public_qrcode': url,   # QR code
            'public_image': url,    # account avatar image
            'authentication': str,  # verification info
            'introduction': str     # account introduction
        }
    }
    ```
II. Data Source Analysis
1. Overview
1.1 Purpose
- Locate the entry point of the data source and work out the related request dependencies
2. Data Source Analysis
2.1 Home Page
- Example screenshot:
- URL analysis
  - URL:
    https://weixin.sogou.com/weixin?type=1&s_from=input&query=南航青年志願者&ie=utf8&_sug_=n&_sug_type_=
  - Parameter breakdown:
    - type selects the search type:
      - type=1 searches official accounts
      - type=2 searches articles
    - query is the search term; it must be percent-encoded with urllib.parse.quote()
  - Example URLs for each type (a sketch of the URL construction follows this list):
    - When type=1:
      https://weixin.sogou.com/weixin?type=1&s_from=input&query=南航青年志願者&ie=utf8&_sug_=n&_sug_type_=
    - When type=2 (note the extra page parameter used for pagination):
      https://weixin.sogou.com/weixin?query=%E6%9C%AA%E9%97%BBcode&_sug_type_=&s_from=input&_sug_=n&type=2&page=1&ie=utf8
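A minimal sketch of how these URLs can be assembled in Python; `build_search_url` is a hypothetical helper for illustration, not part of the project code:

```python
from urllib import parse

def build_search_url(query: str, search_type: int, page: int = 1) -> str:
    """Build a Sogou WeChat search URL; urlencode percent-encodes the query."""
    params = {
        "type": search_type,  # 1 = official accounts, 2 = articles
        "query": query,
        "ie": "utf8",
        "page": page,         # only meaningful for type=2 result pages
    }
    return "https://weixin.sogou.com/weixin?" + parse.urlencode(params)

print(build_search_url("未聞code", 2))
```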
III. Code Implementation
1. Retrieving Official Account Information
- Example screenshot:

- Code:
```python
import requests
from urllib import parse
from lxml import etree
from pprint import pprint


def process_list_content(content: list):
    """Join a list of text nodes into one string; None if the list is empty."""
    if content:
        return "".join(content)
    return None


def process_html_str(html_str):
    """Parse the account search-result page into a list of info dicts."""
    html = etree.HTML(html_str)
    li_list = html.xpath('//ul[contains(@class, "news-list2")]/li')
    public_info = list()
    for li in li_list:
        item = dict()
        public_name = li.xpath('.//p[contains(@class, "tit")]/a//text()')
        item["public_name"] = process_list_content(public_name)
        wechat_id = li.xpath('.//p[contains(@class, "info")]/label/text()')
        item["wechat_id"] = wechat_id[0] if wechat_id else None
        publish_qrcode = li.xpath('.//div[contains(@class,"ew-pop")]//span[@class="pop"]/img[1]/@src')
        item["public_qrcode"] = publish_qrcode[0] if publish_qrcode else None
        publish_image = li.xpath('.//div[contains(@class,"ew-pop")]//span[@class="pop"]/img[2]/@src')
        item["public_image"] = ("https:" + publish_image[0]) if publish_image else None
        authentication = li.xpath('.//i[@class="identify"]/../text()')
        # The verification text is the second text node under the parent element.
        item["authentication"] = authentication[1] if len(authentication) > 1 else None
        introduction = li.xpath('.//dl[1]/dd//text()')
        item["introduction"] = process_list_content(introduction)
        public_info.append(item)
    return public_info


def public_search(public_name: str):
    """Request the type=1 (account) search page for the given name."""
    public_name = parse.quote(public_name)
    base_url = ("https://weixin.sogou.com/weixin?type=1&s_from=input"
                "&query={}&ie=utf8&_sug_=n&_sug_type_=")
    url = base_url.format(public_name)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/84.0.4147.89 Safari/537.36",
    }
    response = requests.get(url=url, headers=headers)
    if response.ok:
        return process_html_str(response.text)
    return []  # failed request: return an empty list instead of None


def public_search_api(public_name):
    """List the matching accounts and let the user pick one."""
    public_info = public_search(public_name)
    for index, info in enumerate(public_info):
        pprint("No{}: {}".format(index, info["public_name"]))
    num = int(input("Select the account to query: "))
    return public_info[num]


def run():
    public_name = input("Enter the account name to search for: ")
    public_info = public_search_api(public_name)
    pprint(public_info)


if __name__ == "__main__":
    run()
```
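To sanity-check the parser without touching the network, `process_html_str` can be fed a small fragment shaped after the XPath expressions above (run in the same module as the code block; the fragment is illustrative, the label value is made up, and it is not Sogou's exact markup):

```python
# Illustrative fragment only -- modeled on the XPaths above, not real Sogou markup.
sample = """
<ul class="news-list2">
  <li>
    <p class="tit"><a>南航青年志願者</a></p>
    <p class="info"><label>gh_made_up_id</label></p>
  </li>
</ul>
"""
pprint(process_html_str(sample))
# -> [{'public_name': '南航青年志願者', 'wechat_id': 'gh_made_up_id', ...}]
# Fields absent from the fragment come back as None.
```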
2. Retrieving an Account's Articles
- Example screenshot:

- Code:
```python
from gevent import monkey
monkey.patch_all()  # must run before other imports so gevent can patch sockets

from gevent.pool import Pool
import requests
import time
from urllib import parse
from lxml import etree
from pprint import pprint


def process_list_content(content: list):
    """Join a list of text nodes into one string; None if the list is empty."""
    if content:
        return "".join(content)
    return None


def process_timestamp(content: int):
    """Convert a 10-digit timestamp into a readable local-time string."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(content))


def process_html_str(html_str: str):
    """Parse one result page into a list of article dicts."""
    html = etree.HTML(html_str)
    li_list = html.xpath('//ul[contains(@class,"news-list")]/li')
    article_list = list()
    for li in li_list:
        article = dict()
        title = li.xpath('.//div[contains(@class,"txt-box")]/h3/a//text()')
        article["title"] = title[0] if title else None
        url = li.xpath('.//div[contains(@class,"txt-box")]/h3/a/@href')
        article["url"] = ("https://weixin.sogou.com" + url[0]) if url else None
        images = li.xpath('.//div[contains(@class,"img-box")]//img/@src')
        article["images"] = ["https:" + i for i in images] if images else None
        abstract = li.xpath('.//p[contains(@class,"txt-info")]/text()')
        article["abstract"] = process_list_content(abstract)
        timestamp = li.xpath('.//div[@class="s-p"]/@t')
        article["publish_date"] = process_timestamp(int(timestamp[0])) if timestamp else None
        article_list.append(article)
    return article_list


def process_prepare_work(public_name: str):
    """Build the type=2 search URLs for the first 10 result pages."""
    public_name = parse.quote(public_name)
    base_url = ("https://weixin.sogou.com/weixin?type=2&s_from=input"
                "&query={}&ie=utf8&_sug_=n&_sug_type_=&page={}")
    return [base_url.format(public_name, i) for i in range(1, 11)]


def process_request(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/84.0.4147.89 Safari/537.36",
    }
    try:
        response = requests.get(url=url, headers=headers)
        if response.ok:
            return process_html_str(response.text)
    except Exception:
        pass  # a failed page yields None and is skipped below


def public_article(public_name: str):
    """Fetch the first 10 result pages concurrently and flatten the results."""
    url_list = process_prepare_work(public_name)
    pool = Pool(3)  # at most 3 concurrent requests
    page_results = pool.map(process_request, url_list)
    articles = list()
    for page in page_results:
        if page:  # skip pages whose request failed
            articles.extend(page)
    return articles


if __name__ == "__main__":
    public_name = "未聞code"
    article_list = public_article(public_name)
    pprint(article_list)
```
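As an aside, `gevent.pool.Pool.map` returns one result per input, in input order, which is why `page_results` above is a list of per-page article lists (or `None` for failed pages). A minimal standalone illustration:

```python
from gevent.pool import Pool

def square(x):
    return x * x

pool = Pool(3)                         # at most 3 greenlets at once
print(pool.map(square, [1, 2, 3, 4]))  # -> [1, 4, 9, 16], in input order
```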
3. Fetching an Account's Info and Its First 100 Articles by Name
- Example screenshots:

- Code:
```python
from pprint import pprint
import json

from sougoweixin.test.public_article_test import public_article
from sougoweixin.test.public_search_test import public_search_api


def add_info_article(article_list, public_info):
    """Attach the account info to every article record."""
    combined = list()
    for article in article_list:
        combined.append({"article": article, "gzh": public_info})
    return combined


def save_info_to_file(article_info, author_info):
    """Write the combined records to <account name>_articles.json."""
    file_name = author_info["public_name"] + "_articles"
    # Note: "a+" appends, so repeated runs concatenate JSON documents.
    with open("{}.json".format(file_name), "a+", encoding="utf8") as f:
        f.write(json.dumps(article_info, ensure_ascii=False, indent=4))
    return file_name


def process_console():
    public_name = input("Enter the account name to search for: ")
    public_info = public_search_api(public_name)
    print("Account info:")
    pprint(public_info)
    num = input("Fetch this account's articles? 1>yes 2>no: ")
    if num == "1":
        article_list = public_article(public_info["public_name"])
        article_list = add_info_article(article_list, public_info)
        file_name = save_info_to_file(article_list, public_info)
        print("Written to the current directory: {}.json".format(file_name))
    else:
        print("Thanks for using; see you next time")


if __name__ == "__main__":
    process_console()
```
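Assuming a single run (the `a+` append mode means repeated runs would concatenate JSON documents into one file), the saved file can be read back as below; the file name is from a hypothetical run for the account 未聞code:

```python
import json

# File name produced by save_info_to_file for a hypothetical "未聞code" run.
with open("未聞code_articles.json", encoding="utf8") as f:
    records = json.load(f)

print(len(records))                      # number of saved articles
print(records[0]["article"]["title"])    # first article's title
print(records[0]["gzh"]["public_name"])  # attached account info
```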
IV. Summary
1. Summary
- Pulling the relevant data through Sogou WeChat's third-party interface is not difficult
- Only a 10-digit timestamp is available for an article's publish time; converting it with time.strftime("fmt", time.localtime(timestamp)) yields a readable date (see the worked example after this list)
- This project was good practice with gevent: when a coroutine pool is used, pool.map returns a list made up of the mapped function's return values
- Implemented interactive selection among the matching accounts when searching by account name
- Possibly because no login is involved, no anti-crawling measures were encountered (a browser User-Agent header was sent)
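A worked example of the timestamp conversion (the timestamp value is arbitrary and the exact output depends on the local timezone):

```python
import time

timestamp = 1596250000  # arbitrary 10-digit example value
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp)))
# e.g. "2020-08-01 10:46:40" in UTC+8; other timezones print a shifted time
```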
2. Improvements
- At present only an account's first 100 articles can be retrieved; logging in would be required to fetch more
