Sogou WeChat Crawler Project

I. Requirements Analysis

1. Overview

1.1 Project Introduction

  • A crawler API for WeChat official accounts, built on Sogou WeChat Search

2. Requirements

  • Fetch official account information

    • Given a specific account name, the API looks up that account's details

    • Returned fields

      {
          'public_name': str,     # official account name
          'wechat_id': str,       # WeChat ID
          'public_qrcode': url,   # QR code
          'public_image': url,    # account avatar image
          'authentication': str,  # verification info
          'introduction': str     # introduction
      }

    • Example screenshot (image omitted)

  • Search for all of an account's articles by account name

    • Example screenshot (image omitted)

    • Returned fields

      {
          'article': {
              'title': '',       # article title
              'url': '',         # article link
              'imgs': [],        # list of article image URLs
              'abstract': '',    # article summary
              'time': int        # publish time, 10-digit unix timestamp
          },
          'gzh': {
              'public_name': str,     # official account name
              'wechat_name': str,     # WeChat name
              'wechat_id': str,       # WeChat ID
              'public_qrcode': url,   # QR code
              'public_image': url,    # account avatar image
              'authentication': str,  # verification info
              'introduction': str     # introduction
          }
      }
      

II. Data Source Analysis

1. Overview

1.1 Goal

  • Locate the data source entry point and identify what the requests depend on

2. Data Source Analysis

2.1 Home Page

  • Example screenshot (image omitted)

  • URL analysis

    • URL: https://weixin.sogou.com/weixin?type=1&s_from=input&query=南航青年志願者&ie=utf8&_sug_=n&_sug_type_=

    • Parameter breakdown

      • type selects the search type
        • type=1 searches official accounts
        • type=2 searches articles
      • query is the search term; it must be percent-encoded with urllib.parse.quote() (see the sketch after this list)
    • When type=1

      • URL: https://weixin.sogou.com/weixin?type=1&s_from=input&query=南航青年志願者&ie=utf8&_sug_=n&_sug_type_=
    • When type=2

      • URL: https://weixin.sogou.com/weixin?query=未聞code&_sug_type_=&s_from=input&_sug_=n&type=2&page=1&ie=utf8
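
  • Example: building the two request URLs. A minimal sketch, using only the Python standard library, showing how the query is percent-encoded with urllib.parse.quote() and how type switches between the two search modes:

    from urllib import parse

    # quote() percent-encodes the Chinese query so it is safe inside a URL
    query = parse.quote("南航青年志願者")

    # type=1 searches official accounts; type=2 searches articles
    account_url = ("https://weixin.sogou.com/weixin?type=1&s_from=input"
                   "&query={}&ie=utf8&_sug_=n&_sug_type_=".format(query))
    article_url = ("https://weixin.sogou.com/weixin?type=2&s_from=input"
                   "&query={}&ie=utf8&_sug_=n&_sug_type_=&page=1".format(query))
    print(account_url)
    print(article_url)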

III. Code

1. Fetching Official Account Information

  • Example output (screenshot omitted)

  • Code

    import requests
    from urllib import parse
    from lxml import etree
    from pprint import pprint
    
    
    def process_list_content(content: list):
        # Join a list of text fragments into a single string; None if empty
        if content:
            return "".join(content)
        return None
    
    def process_html_str(html_str):
        # Each search result is an <li> inside the news-list2 <ul>
        html = etree.HTML(html_str)
        li_list = html.xpath('//ul[contains(@class, "news-list2")]/li')
        public_info = list()
        for li in li_list:
            item = dict()
            public_name = li.xpath('.//p[contains(@class, "tit")]/a//text()')
            item["public_name"] = process_list_content(public_name)
            wechat_id = li.xpath('.//p[contains(@class, "info")]/label/text()')
            item["wechat_id"] = wechat_id[0] if wechat_id else None
            public_qrcode = li.xpath('.//div[contains(@class,"ew-pop")]//span[@class="pop"]/img[1]/@src')
            item["public_qrcode"] = public_qrcode[0] if public_qrcode else None
            public_image = li.xpath('.//div[contains(@class,"ew-pop")]//span[@class="pop"]/img[2]/@src')
            item["public_image"] = "https:" + public_image[0] if public_image else None
            # The verification text sits next to the "identify" icon
            authentication = li.xpath('.//i[@class="identify"]/../text()')
            item['authentication'] = authentication[1] if len(authentication) > 1 else None
            introduction = li.xpath('.//dl[1]/dd//text()')
            item["introduction"] = process_list_content(introduction)
            public_info.append(item)
        return public_info
    
    def public_search(public_name: str):
        public_name = parse.quote(public_name)
        base_url = "https://weixin.sogou.com/weixin?type=1&s_from=input&query={}&ie=utf8&_sug_=n&_sug_type_="
        url = base_url.format(public_name)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
        }
        response = requests.get(url=url, headers=headers)
        if response.ok:
            return process_html_str(response.text)
    
    
    def public_search_api(public_name):
        public_info = public_search(public_name)
        if not public_info:
            print("No matching accounts found")
            return None
        for index, info in enumerate(public_info):
            print("No{}: {}".format(index, info["public_name"]))
        num = int(input("Select the account to query: "))
        return public_info[num]
    
    def run():
        public_name = input("Enter the official account to search for: ")
        public_info = public_search_api(public_name)
        pprint(public_info)
    
    if __name__ == "__main__":
        run()
    
    

2. Fetching an Account's Articles

  • Example output (screenshot omitted)

  • Code

    from gevent import monkey

    # Patch the standard library so blocking IO cooperates with gevent
    monkey.patch_all()
    from gevent.pool import Pool
    import requests
    import time
    from urllib import parse
    from lxml import etree
    from pprint import pprint
    
    
    def process_list_content(content: list):
        # Join a list of text fragments into a single string; None if empty
        if content:
            return "".join(content)
        return None
    
    
    def process_timestamp(content: int):
        # Convert a 10-digit unix timestamp into a readable date string
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(content))
    
    
    def process_html_str(html_str: str):
        # Each article on a result page is an <li> inside the news-list <ul>
        html = etree.HTML(html_str)
        li_list = html.xpath('//ul[contains(@class,"news-list")]/li')
        article_list = list()
        for li in li_list:
            article = dict()
            title = li.xpath('.//div[contains(@class,"txt-box")]/h3/a//text()')
            article['title'] = title[0] if title else None
            url = li.xpath('.//div[contains(@class,"txt-box")]/h3/a/@href')
            article['url'] = "https://weixin.sogou.com" + url[0] if url else None
            images = li.xpath('.//div[contains(@class,"img-box")]//img/@src')
            article['images'] = ['https:' + i for i in images] if images else None
            abstract = li.xpath('.//p[contains(@class,"txt-info")]/text()')
            article['abstract'] = process_list_content(abstract)
            # The publish time is a 10-digit timestamp stored in the "t" attribute
            timestamp = li.xpath('.//div[@class="s-p"]/@t')
            article['publish_date'] = process_timestamp(int(timestamp[0])) if timestamp else None
            article_list.append(article)
        return article_list
    
    
    def process_prepare_work(public_name: str):
        public_name = parse.quote(public_name)
        base_url = "https://weixin.sogou.com/weixin?type=2&s_from=input&query={}&ie=utf8&_sug_=n&_sug_type_=&page={}"
        # Without logging in, Sogou exposes only 10 pages (about 100 articles)
        url_list = [base_url.format(public_name, i) for i in range(1, 11)]
        return url_list
    
    
    def process_request(url):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
        }
        try:
            response = requests.get(url=url, headers=headers)
            if response.ok:
                return process_html_str(response.text)
        except Exception:
            # A failed request yields None; the caller filters these out
            return None
    
    
    def public_article(public_name: str):
        url_list = process_prepare_work(public_name)
        pool = Pool(3)
        # map() returns one result per URL, in order; failed pages are None
        page_results = pool.map(process_request, url_list)
        article_list = list()
        for page in page_results:
            if not page:  # skip pages that failed or returned nothing
                continue
            article_list.extend(page)
        return article_list
    
    
    
    if __name__ == "__main__":
        public_name = "未聞code"
        article_list = public_article(public_name)
        pprint(article_list)
    
    

3. Fetching an Account's Info and Its First 100 Articles by Name

  • Example output (screenshot omitted)

  • Code

    from pprint import pprint
    import json
    # public_article and public_search_api are the functions from sections 1 and 2
    from sougoweixin.test.public_article_test import public_article
    from sougoweixin.test.public_search_test import public_search_api
    
    
    def add_info_article(public_article_list, public_info):
        # Attach the account info to every article, matching the target schema
        public_article_info_list = list()
        for article in public_article_list:
            item = dict()
            item["article"] = article
            item['gzh'] = public_info
            public_article_info_list.append(item)
        return public_article_info_list

    def save_info_to_file(article_info, author_info):
        # One JSON file per account, named after the account
        file_name = "{}_articles.json".format(author_info['public_name'])
        with open(file_name, 'a+', encoding='utf8') as f:
            f.write(json.dumps(article_info, ensure_ascii=False, indent=4))
        return file_name
    
    def process_console():
        public_name = input("Enter the official account to search for: ")
        public_info = public_search_api(public_name)
        if not public_info:
            return
        print("Account info:")
        pprint(public_info)
        num = input("Fetch this account's articles? 1>yes 2>no: ")
        if num == "1":
            public_article_list = public_article(public_info['public_name'])
            public_article_list = add_info_article(public_article_list, public_info)
            file_name = save_info_to_file(public_article_list, public_info)
            print("Written to the current directory: {}".format(file_name))
        else:
            print("Thanks for using!")
    
    
    if __name__ == "__main__":
        process_console()
    
    

IV. Summary

1. Summary

  • Pulling the data through Sogou WeChat's third-party interface is not difficult
  • Article publish times are only available as 10-digit timestamps; time.strftime("fmt", time.localtime(timestamp)) converts them into readable dates (see the sketch after this list)
  • The project is good practice with gevent: when a coroutine pool maps a function over a list of inputs, the result is a list of that function's return values
  • When searching by account name, the user can choose the intended account from the list of matches
  • No anti-crawling measures were encountered (a browser User-Agent was sent), possibly because no login was involved
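
  • A minimal sketch of the two points above (timestamp conversion and the pool's map() return value), assuming nothing beyond the standard library and gevent; the timestamp value is hypothetical:

    import time
    from gevent.pool import Pool

    # A 10-digit unix timestamp (hypothetical value, for illustration only)
    timestamp = 1595913600
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp)))

    # Pool.map returns one result per input, in input order, so a mapped
    # scraper function yields a list of per-page results
    pool = Pool(3)
    results = pool.map(lambda n: n * n, [1, 2, 3, 4])
    print(results)  # [1, 4, 9, 16]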

2. Improvements

  • Currently only an account's first 100 articles are reachable; logging in is required to fetch more (see the sketch below)
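
  • A minimal sketch of how login cookies might be attached with requests.Session, assuming the values are copied from a logged-in browser; the cookie names and page number are placeholders, not a verified Sogou login scheme:

    import requests

    session = requests.Session()
    session.headers["User-Agent"] = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                                     "AppleWebKit/537.36 (KHTML, like Gecko) "
                                     "Chrome/84.0.4147.89 Safari/537.36")
    # Hypothetical cookies copied from the browser's developer tools
    session.cookies.update({"SUID": "<value-from-browser>", "SNUID": "<value-from-browser>"})
    # page=11 is beyond the 10 pages available without logging in
    response = session.get("https://weixin.sogou.com/weixin?type=2&query=%E6%9C%AA%E9%97%BBcode&page=11")
    print(response.ok)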

