Crawler series: recursively crawl all articles, videos and weitoutiao (micro posts) published by a specified Toutiao (今日頭條) user within the last month


This was a question from a crawler-position interview while I was job hunting recently. The anti-crawling measures it touches are fairly comprehensive. The company's bar was even higher, though: they also wanted the video-link expiry problem solved, so no offer for me.

Straight to the code.

import requests
import time
from datetime import datetime
import json
import execjs
import hashlib
import re
import csv
from zlib import crc32
from base64 import b64decode
import random
import urllib3
import os
import threading
from queue import Queue
from lxml import etree

# Check which JS runtime execjs will use
# print(execjs.get().name)
# Suppress SSL verification warnings
urllib3.disable_warnings()

"""
需要nodejs環境,需要修改subprocess.py文件內的class Popen(object)類中的__init__(..encode='utf-8)否則調用js文件時會報錯
請求列表頁時.py文件中的ua頭要與js文件中一致,不然很難請求到數據,請求詳情頁時要用ua池否則會封瀏覽器/ip
會有一些空白表格,是因為該賬號七天內為發表內容,或者該賬號被封禁
輸出結果在此文件所在根目錄下/toutiao/
右鍵運行此py文件,newsign.js文件,toutiao.csv文件需在同一文件夾內
爬取的視頻有時效性
"""


# Define the UA (User-Agent) pool
def headers():
    # Assorted desktop browsers
    user_agent_list = [
        # Opera
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        # Firefox
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        # Safari
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        # chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        # 360
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # Taobao Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        # Liebao (Cheetah) Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        # QQ Browser
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        # Sogou Browser
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        # Maxthon Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        # UC Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]
    UserAgent = random.choice(user_agent_list)
    headers = {'User-Agent': UserAgent}
    return headers


headers_a = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
}
# Proxy IP (note: defined here but not actually passed to any request below)
proxy = {
    'http': '183.57.44.62:808'
}
# Cookie values
cookies = {'s_v_web_id': 'b68312370162a4754efb0510a0f6d394'}
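# Note (not in the original post): the proxy dict above is never handed to requests in this
# script. If Toutiao starts rejecting the IP, one way it could be wired in is, for example:
#   requests.get(first_url, headers=headers_a, cookies=cookies, proxies=proxy, timeout=10)
# 'proxies=' and 'timeout=' are standard requests keyword arguments; the proxy address itself is
# just the sample value defined above and would need replacing with a working one.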


# Get the _signature parameter
def get_signature(user_id, max_behot_time):
    with open('newsign.js', 'r', encoding='utf-8') as f:
        jsData = f.read()
    execjs.get()
    ctx = execjs.compile(jsData).call('tac', str(user_id) + str(
        max_behot_time))  # reproduces TAC.sign(userInfo.id + "" + i.param.max_behot_time) from the site's JS
    return ctx


# Get the as and cp parameters
def get_as_cp():  # derives the as/cp parameters; logic follows Toutiao's obfuscated JS file home_4abea46.js
    zz = {}
    now = round(time.time())
    # print(now)  # current machine time
    e = hex(int(now)).upper()[2:]  # hex() gives the timestamp as an upper-case hexadecimal string
    # print('e:', e)
    a = hashlib.md5()  # md5 hash object; hexdigest() below returns the hex digest of the timestamp
    # print('a:', a)
    a.update(str(int(now)).encode('utf-8'))
    i = a.hexdigest().upper()
    # print('i:', i)
    if len(e) != 8:
        zz = {'as': '479BB4B7254C150',
              'cp': '7E0AC8874BB0985'}
        return zz
    n = i[:5]
    a = i[-5:]
    r = ''
    s = ''
    for i in range(5):
        s = s + n[i] + e[i]
    for j in range(5):
        r = r + e[j + 3] + a[j]
    zz = {
        'as': 'A1' + s + e[-3:],
        'cp': e[0:3] + r + 'E1'
    }
    # print('zz:', zz)
    return zz
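

# Illustrative note (not in the original post): with e = the 8-character upper-case hex timestamp
# and i = MD5(timestamp).upper(), the function above takes n = i[:5], a = i[-5:] and interleaves
# them as
#   as = 'A1' + n[0]e[0] n[1]e[1] n[2]e[2] n[3]e[3] n[4]e[4] + e[-3:]
#   cp = e[0:3] + e[3]a[0] e[4]a[1] e[5]a[2] e[6]a[3] e[7]a[4] + 'E1'
# i.e. both are 15-character strings built purely from the current timestamp, so they only need to
# be fresh, not secret.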


# Get as, cp and _signature via juejin.js (deprecated)
def get_js():
    f = open(r"juejin.js", 'r', encoding='UTF-8')  ##打開JS文件
    line = f.readline()
    htmlstr = ''
    while line:
        htmlstr = htmlstr + line
        line = f.readline()
    ctx = execjs.compile(htmlstr)
    return ctx.call('get_as_cp_signature')


# print(json.loads(get_js())['as'])


# Article data
break_flag = []


def wenzhang(url=None, max_behot_time=0, n=0, csv_name=0):
    max_qingqiu = 50
    headers1 = ['發表時間', '標題', '來源', '所有圖片', '文章內容']
    first_url = 'https://www.toutiao.com/c/user/article/?page_type=1&user_id=%s&max_behot_time=%s&count=20&as=%s&cp=%s&_signature=%s' % (
        url.split('/')[-2], max_behot_time, get_as_cp()['as'], get_as_cp()['cp'],
        get_signature(url.split('/')[-2], max_behot_time))
    while n < max_qingqiu and not break_flag:
        try:
            # print(url)
            r = requests.get(first_url, headers=headers_a, cookies=cookies)
            data = json.loads(r.text)
            # print(data)
            max_behot_time = data['next']['max_behot_time']
            if max_behot_time:
                article_list = data['data']
                for i in article_list:
                    try:
                        if i['article_genre'] == 'article':
                            res = requests.get('https://www.toutiao.com/i' + i['group_id'], headers=headers(),
                                               cookies=cookies)
                            # time.sleep(1)
                            article_title = re.findall("title: '(.*?)'", res.text)
                            article_content = re.findall("content: '(.*?)'", res.text, re.S)[0]
                            # pattern = re.compile(r"[(a-zA-Z~\-_!@#$%\^\+\*&\\\/\?\|:\.<>{}()';=)*|\d]")
                            # article_content = re.sub(pattern, '', article_content[0])
                            article_content = article_content.replace('&quot;', '').replace('u003C', '<').replace(
                                'u003E',
                                '>').replace(
                                '&#x3D;',
                                '=').replace(
                                'u002F', '/').replace('\\', '')
                            article_images = etree.HTML(article_content)
                            article_image = article_images.xpath('//img/@src')
                            article_time = re.findall("time: '(.*?)'", res.text)
                            article_source = re.findall("source: '(.*?)'", res.text, re.S)
                            result_time = str(article_time[0]).split(' ')[0].split('-')
                            # print(result_time)
                            cha = (datetime.now() - datetime(int(result_time[0]), int(result_time[1]),
                                                             int(result_time[2]))).days
                            # print(cha)
                            if 30 < cha <= 32:
                                # print('完成')
                                # break_flag.append(1)
                                # break
                                continue
                            if cha > 32:
                                print('完成')
                                break_flag.append(1)
                                break
                            row = {'發表時間': article_time[0], '標題': article_title[0].strip('&quot;'),
                                   '來源': article_source[0],'所有圖片':article_image,
                                   '文章內容': article_content.strip()}
                            with open('/toutiao/' + str(csv_name) + '文章.csv', 'a', newline='', encoding='gb18030')as f:
                                f_csv = csv.DictWriter(f, headers1)
                                # f_csv.writeheader()
                                f_csv.writerow(row)
                            print('正在爬取文章:', article_title[0].strip('&quot;'), article_time[0],
                                  'https://www.toutiao.com/i' + i['group_id'])
                            time.sleep(1)
                        else:
                            pass
                    except Exception as e:
                        print(e, 'https://www.toutiao.com/i' + i['group_id'])
                wenzhang(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n)
            else:
                pass
        except KeyError:
            n += 1
            print('第' + str(n) + '次請求', first_url)
            time.sleep(1)
            if n == max_qingqiu:
                print('請求超過最大次數')
                break_flag.append(1)
            else:
                pass
        except Exception as e:
            print(e)
    else:
        pass

        # print(max_behot_time)
        # print(data)


# Article detail-page data (already merged into the article function above)
def get_wenzhang_detail(url, csv_name=0):
    headers1 = ['發表時間', '標題', '來源', '文章內容']
    res = requests.get(url, headers=headers_a, cookies=cookies)
    # time.sleep(1)
    article_title = re.findall("title: '(.*?)'", res.text)
    article_content = re.findall("content: '(.*?)'", res.text, re.S)
    pattern = re.compile(r"[(a-zA-Z~\-_!@#$%\^\+\*&\\\/\?\|:\.<>{}()';=)*|\d]")
    article_content = re.sub(pattern, '', article_content[0])
    article_time = re.findall("time: '(.*?)'", res.text)
    article_source = re.findall("source: '(.*?)'", res.text, re.S)
    result_time = []
    [result_time.append(i) for i in str(article_time[0]).split(' ')[0].replace('-', ',').split(',')]
    # print(result_time)
    cha = (datetime.now() - datetime(int(result_time[0]), int(result_time[1]), int(result_time[2]))).days
    # print(cha)
    if cha > 8:
        return None

    row = {'發表時間': article_time[0], '標題': article_title[0].strip('&quot;'), '來源': article_source[0],
           '文章內容': article_content.strip()}
    with open('/toutiao/' + str(csv_name) + '文章.csv', 'a', newline='')as f:
        f_csv = csv.DictWriter(f, headers1)
        # f_csv.writeheader()
        f_csv.writerow(row)
    print('正在爬取文章:', article_title[0].strip('&quot;'), article_time[0], url)
    time.sleep(0.5)
    return 'ok'


# Video data
break_flag_video = []


def shipin(url, max_behot_time=0, csv_name=0, n=0):
    max_qingqiu = 20
    headers2 = ['視頻發表時間', '標題', '來源', '視頻鏈接']
    first_url = 'https://www.toutiao.com/c/user/article/?page_type=0&user_id=%s&max_behot_time=%s&count=20&as=%s&cp=%s&_signature=%s' % (
        url.split('/')[-2], max_behot_time, get_as_cp()['as'], get_as_cp()['cp'],
        get_signature(url.split('/')[-2], max_behot_time))
    while n < max_qingqiu and not break_flag_video:
        try:
            res = requests.get(first_url, headers=headers_a, cookies=cookies)
            data = json.loads(res.text)
            # print(data)
            max_behot_time = data['next']['max_behot_time']
            if max_behot_time:
                video_list = data['data']
                for i in video_list:
                    try:
                        start_time = i['behot_time']
                        video_title = i['title']
                        video_source = i['source']
                        detail_url = 'https://www.ixigua.com/i' + i['item_id']

                        resp = requests.get(detail_url, headers=headers())
                        r = str(random.random())[2:]
                        url_part = "/video/urls/v/1/toutiao/mp4/{}?r={}".format(
                            re.findall('"video_id":"(.*?)"', resp.text)[0], r)
                        s = crc32(url_part.encode())
                        api_url = "https://ib.365yg.com{}&s={}".format(url_part, s)
                        resp = requests.get(api_url, headers=headers())
                        j_resp = resp.json()
                        video_url = j_resp['data']['video_list']['video_1']['main_url']
                        video_url = b64decode(video_url.encode()).decode()
                        # print((int(str(time.time()).split('.')[0])-start_time)/86400)
                        if 30 < (int(str(time.time()).split('.')[0]) - start_time) / 86400 <= 32:
                            # print('完成')
                            # break_flag_video.append(1)
                            continue
                        if (int(str(time.time()).split('.')[0]) - start_time) / 86400 > 32:
                            print('完成')
                            break_flag_video.append(1)
                            break
                        row = {'視頻發表時間': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)),
                               '標題': video_title, '來源': video_source,
                               '視頻鏈接': video_url}
                        with open('/toutiao/' + str(csv_name) + '視頻.csv', 'a', newline='', encoding='gb18030')as f:
                            f_csv = csv.DictWriter(f, headers2)
                            # f_csv.writeheader()
                            f_csv.writerow(row)
                        print('正在爬取視頻:', video_title, detail_url, video_url)
                        time.sleep(3)
                    except Exception as e:
                        print(e, 'https://www.ixigua.com/i' + i['item_id'])
                shipin(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n)
        except KeyError:
            n += 1
            print('第' + str(n) + '次請求', first_url)
            time.sleep(3)
            if n == max_qingqiu:
                print('請求超過最大次數')
                break_flag_video.append(1)
        except Exception as e:
            print(e)
    else:
        pass


# Weitoutiao (micro posts)
break_flag_weitoutiao = []


def weitoutiao(url, max_behot_time=0, n=0, csv_name=0):
    max_qingqiu = 20
    headers3 = ['微頭條發表時間', '來源', '標題', '文章內圖片', '微頭條內容']
    while n < max_qingqiu and not break_flag_weitoutiao:
        try:

            first_url = 'https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=%s&max_behot_time=%s' % (
                url.split('/')[-2], max_behot_time)
            # print(first_url)
            res = requests.get(first_url, headers=headers_a, cookies=cookies)
            data = json.loads(res.text)
            # print(data)
            max_behot_time = data['next']['max_behot_time']
            weitoutiao_list = data['data']
            for i in weitoutiao_list:
                try:
                    detail_url = 'https://www.toutiao.com/a' + str(i['concern_talk_cell']['id'])
                    # print(detail_url)
                    resp = requests.get(detail_url, headers=headers(), cookies=cookies)
                    start_time = re.findall("time: '(.*?)'", resp.text, re.S)
                    weitoutiao_name = re.findall("name: '(.*?)'", resp.text, re.S)
                    weitoutiao_title = re.findall("title: '(.*?)'", resp.text, re.S)
                    weitoutiao_images = re.findall('images: \["(.*?)"\]',resp.text,re.S)
                    # print(weitoutiao_images)
                    if weitoutiao_images:
                        weitoutiao_image = 'http:' + weitoutiao_images[0].replace('u002F','/').replace('\\','')
                        # print(weitoutiao_image)
                    else:
                        weitoutiao_image = '此頭條內無附件圖片'
                    weitoutiao_content = re.findall("content: '(.*?)'", resp.text, re.S)
                    result_time = str(start_time[0]).split(' ')[0].split('-')
                    # print(result_time)
                    cha = (
                        datetime.now() - datetime(int(result_time[0]), int(result_time[1]), int(result_time[2]))).days
                    # print(cha)
                    if cha > 30:
                        break_flag_weitoutiao.append(1)
                        print('完成')
                        break
                    row = {'微頭條發表時間': start_time[0], '來源': weitoutiao_name[0],
                           '標題': weitoutiao_title[0].strip('&quot;'),'文章內圖片': weitoutiao_image,
                           '微頭條內容': weitoutiao_content[0].strip('&quot;')}
                    with open('/toutiao/' + str(csv_name) + '微頭條.csv', 'a', newline='', encoding='gb18030')as f:
                        f_csv = csv.DictWriter(f, headers3)
                        # f_csv.writeheader()
                        f_csv.writerow(row)
                    time.sleep(1)
                    print('正在爬取微頭條', weitoutiao_name[0], start_time[0], detail_url)
                except Exception as e:
                    print(e, 'https://www.toutiao.com/a' + str(i['concern_talk_cell']['id']))
            weitoutiao(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n)
        except KeyError:
            n += 1
            print('第' + str(n) + '次請求')
            time.sleep(2)
            if n == max_qingqiu:
                print('請求超過最大次數')
                break_flag_weitoutiao.append(1)
            else:
                pass
        except Exception as e:
            print(e)
    else:
        pass


# Read the list of accounts to crawl from the CSV file
def csv_read(path):
    data = []
    with open(path, 'r', encoding='gb18030') as f:
        reader = csv.reader(f, dialect='excel')
        for row in reader:
            data.append(row)
    return data
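

# Assumed layout of toutiao-suoyou.csv (inferred from how main()/get_all() index each row below;
# the original post only shows it as a screenshot): column 0 is a name used in the output file
# names, column 2 is the user's profile URL (something like https://www.toutiao.com/c/user/123456789/,
# whose numeric id is what url.split('/')[-2] extracts), and column 3 lists which content types to
# grab, e.g. a cell containing 文章 視頻 微頭條. Column 1 is not used by the code.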


# Entry function (single-threaded)
def main():
    for j, i in enumerate(csv_read('toutiao-suoyou.csv')):
        # data_url = data.get_nowait()
        if '文章' in i[3]:
            # Start the article crawler
            print('當前正在抓取文章第', j, i[2])
            headers1 = ['發表時間', '標題', '來源', '所有圖片', '文章內容']
            with open('/toutiao/' + i[0] + '文章.csv', 'a', newline='', encoding='gb18030') as f:
                f_csv = csv.DictWriter(f, headers1)
                f_csv.writeheader()
            break_flag.clear()
            wenzhang(url=i[2], csv_name=i[0])

        if '視頻' in i[3]:
            # Start the video crawler
            print('當前正在抓取視頻第', j, i[2])
            headers2 = ['視頻發表時間', '標題', '來源', '視頻鏈接']
            with open('/toutiao/' + i[0] + '視頻.csv', 'a', newline='', encoding='gb18030') as f:
                f_csv = csv.DictWriter(f, headers2)
                f_csv.writeheader()
            break_flag_video.clear()
            shipin(url=i[2], csv_name=i[0])

        if '微頭條' in i[3]:
            # Start the weitoutiao crawler
            headers3 = ['微頭條發表時間', '來源', '標題', '文章內圖片', '微頭條內容']
            print('當前正在抓取微頭條第', j, i[2])
            with open('/toutiao/' + i[0] + '微頭條.csv', 'a', newline='', encoding='gb18030') as f:
                f_csv = csv.DictWriter(f, headers3)
                f_csv.writeheader()
            break_flag_weitoutiao.clear()
            weitoutiao(url=i[2], csv_name=i[0])


# Multithreaded entry point
def get_all(urlQueue):
    while True:
        try:
            # Read from the queue without blocking
            data_url = urlQueue.get_nowait()
            # i = urlQueue.qsize()
        except Exception as e:
            break
        # print(data_url)
        # if '文章' in data_url[3]:
        #     # Start the article crawler
        #     print('當前正在抓取文章', data_url[2])
        #     headers1 = ['發表時間', '標題', '來源', '所有圖片', '文章內容']
        #     with open('/toutiao/' + data_url[0] + '文章.csv', 'a', newline='')as f:
        #         f_csv = csv.DictWriter(f, headers1)
        #         f_csv.writeheader()
        #     break_flag.clear()
        #     wenzhang(url=data_url[2], csv_name=data_url[0])

        if '視頻' in data_url[3]:
            # Start the video crawler
            print('當前正在抓取視頻', data_url[2])
            headers2 = ['視頻發表時間', '標題', '來源', '視頻鏈接']
            with open('/toutiao/' + data_url[0] + '視頻.csv', 'a', newline='', encoding='gb18030') as f:
                f_csv = csv.DictWriter(f, headers2)
                f_csv.writeheader()
            break_flag_video.clear()
            shipin(url=data_url[2], csv_name=data_url[0])
            #
        # if '微頭條' in data_url[3]:
        #     # Start the weitoutiao crawler
        #     headers3 = ['微頭條發表時間', '來源', '標題','文章內圖片', '微頭條內容']
        #     print('當前正在抓取微頭條', data_url[2])
        #     with open('/toutiao/' + data_url[0] + '微頭條.csv', 'a', newline='')as f:
        #         f_csv = csv.DictWriter(f, headers3)
        #         f_csv.writeheader()
        #     break_flag_weitoutiao.clear()
        #     weitoutiao(url=data_url[2], csv_name=data_url[0])


if __name__ == '__main__':
    # Create the output directory
    path = '/toutiao/'
    if not os.path.exists(path):
        os.mkdir(path)

    """單一腳本使用main函數,開啟多線程按照下面方法控制線程數,開啟多線程會請求過於頻繁,導致頭條反爬封ip等,需要設置代理ip"""
    # main()


    urlQueue = Queue()
    for j, i in enumerate(csv_read('toutiao-suoyou.csv')):
        urlQueue.put(i)
    # print(urlQueue.get_nowait())
    # print(urlQueue.qsize())
    threads = []
    # Adjust the thread count here to control crawl speed
    threadNum = 4
    for i in range(0, threadNum):
        t = threading.Thread(target=get_all, args=(urlQueue,))
        threads.append(t)

    for t in threads:
        # Make it a daemon thread; when the daemon thread exits, the child threads it started exit with it
        # t.setDaemon(True)
        t.start()
    for t in threads:
        # join() each thread in turn so the main thread exits last, without the threads blocking one another
        t.join()

        # pass

[Screenshot: the user info read from the CSV file]

[Screenshot: the crawl results]

This content is for reference and learning only; if you have concerns, contact the author to have it removed.

Still looking for a crawler job.