【Python爬蟲】“曹芬~~嘿嘿”是什么梗?批量下載快手平台視頻數據


知識點

  • requests
  • json
  • re
  • pprint

開發環境:

  • 版 本:anaconda5.2.0(python3.6.5)
  • 編輯器:pycharm

案例實現步驟:

一. 數據來源分析 (只有當你找到數據來源的時候, 才能通過代碼實現)

  1. 確定需求 (要爬取的內容是什么?)
    爬取某個關鍵詞對應的視頻 保存mp4
  2. 通過開發者工具進行抓包分析 分析數據從哪里來的(找出真正的數據來源)?
    靜態加載頁面
    (以筆趣閣為例)
    動態加載頁面
    開發者工具抓數據包

 

二. 代碼實現過程

  1. 找到目標網址
  2. 發送請求
    get post
  3. 解析數據 (獲取視頻地址 視頻標題)
  4. 發送請求 請求每個視頻地址
  5. 保存視頻

今天的目標

三. 單個視頻

導入所需模塊

import json
import requests
import re

 

發送請求

# GraphQL request payload for Kuaishou's visionSearchPhoto endpoint.
# 'query' is the GraphQL document; 'variables' supplies the search keyword
# and the paging cursor.
# BUG FIX: the outer dict literal was missing its closing brace, which made
# this fragment a syntax error.
data = {
    'operationName': "visionSearchPhoto",
    'query': "query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n  visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      type\n      author {\n        id\n        name\n        following\n        headerUrl\n        headerUrls {\n          cdn\n          url\n          __typename\n        }\n        __typename\n      }\n      tags {\n        type\n        name\n        __typename\n      }\n      photo {\n        id\n        duration\n        caption\n        likeCount\n        realLikeCount\n        coverUrl\n        photoUrl\n        liked\n        timestamp\n        expTag\n        coverUrls {\n          cdn\n          url\n          __typename\n        }\n        photoUrls {\n          cdn\n          url\n          __typename\n        }\n        animatedCoverUrl\n        stereoType\n        videoRatio\n        __typename\n      }\n      canAddComment\n      currentPcursor\n      llsid\n      status\n      __typename\n    }\n    searchSessionId\n    pcursor\n    aladdinBanner {\n      imgUrl\n      link\n      __typename\n    }\n    __typename\n  }\n}\n",
    'variables': {
        'keyword': '張三',
        'pcursor': ' ',
        'page': "search",
        'searchSessionId': "MTRfMjcwOTMyMTQ2XzE2Mjk5ODcyODQ2NTJf5oWi5pGHXzQzMQ"
    }
}

response = requests.post('https://www.kuaishou.com/graphql', data=data)

 

加請求頭

# Request headers for the GraphQL endpoint.
headers = {
    # Content-Type tells the server how the request body ('data') is encoded.
    # Common values: text/xml (XML payload), multipart/form-data (file upload),
    # application/x-www-form-urlencoded (form fields), application/json (JSON).
    # The GraphQL endpoint expects a JSON body, hence application/json.
    'content-type': 'application/json',
    # Session cookie identifying this client to Kuaishou.
    'Cookie': 'kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_721a784b472981d650bcb8bbc5e9c9c2',
    # Browser fingerprint — masquerade as a regular desktop Chrome so the
    # request is not rejected as an obvious bot.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

 

json序列化操作

# JSON is a data-interchange format; before JSON became common, XML was the
# usual way to pass data around. Because every major language supports JSON
# and it covers the usual data types, it is widely used for HTTP exchange
# and storage.
# Serialize the Python dict into a JSON string so the request body matches
# the 'content-type: application/json' header declared above.
data = json.dumps(data)
json_data = requests.post('https://www.kuaishou.com/graphql', headers=headers, data=data).json()

 

字典取值

# Pull the list of video entries out of the GraphQL response and extract,
# for each one, the caption (used as the title) and the direct video URL.
feeds = json_data['data']['visionSearchPhoto']['feeds']
for feed in feeds:
    caption = feed['photo']['caption']
    photoUrl = feed['photo']['photoUrl']
    # BUG FIX: the original pattern r'[/\:*?<>/\n] ' carried a trailing
    # space, so it only matched an illegal character FOLLOWED by a space —
    # illegal filename characters were almost never replaced. Strip all
    # Windows-invalid filename characters instead.
    new_title = re.sub(r'[/\\:*?<>|"\n]', '-', caption)

 

再次發送請求

# Download the raw video bytes from the direct video URL.
resp = requests.get(photoUrl).content

 

保存數據

# Write the downloaded bytes to video\<title>.mp4.
# BUG FIX: the original referenced the undefined name `title`; the
# sanitized filename produced by the extraction loop is `new_title`.
with open('video\\' + new_title + '.mp4', mode='wb') as f:
    f.write(resp)
print(new_title, '爬取成功!!!')

 

四. 翻頁爬取

導入模塊

import concurrent.futures
import time

 

發送請求

def get_json(url, data):
    """POST the JSON-encoded GraphQL payload to *url* and return the decoded response."""
    return requests.post(url, headers=headers, data=data).json()

 

修改標題

def change_title(title):
    """Sanitize a video caption into a Windows-safe file name.

    Windows forbids the characters /\\|:?<>"* and newlines in file names
    and caps path length, so illegal characters are replaced with '_' and
    over-long titles are shortened.
    """
    cleaned = re.sub(r'[/\\|:?<>"*\n]', '_', title)
    return cleaned[:10] if len(cleaned) > 50 else cleaned

 

數據提取

def parse(json_data):
    """Extract [sanitized title, video url] pairs from a search response."""
    feeds = json_data['data']['visionSearchPhoto']['feeds']
    return [
        [change_title(item['photo']['caption']), item['photo']['photoUrl']]
        for item in feeds
    ]

 

保存數據

def save(title, url_1):
    """Download the video at *url_1* and write it to video\\<title>.mp4."""
    video_bytes = requests.get(url_1).content
    path = 'video\\' + title + '.mp4'
    with open(path, mode='wb') as f:
        f.write(video_bytes)
    print(title, '爬取成功!!!')

 

主函數 調動所有的函數

def run(url, data):
    """Main pipeline: fetch one search page, parse it, and save every video."""
    for title, url_1 in parse(get_json(url, data)):
        save(title, url_1)

if __name__ == '__main__':
    start_time = time.time()
    # Downloads are I/O-bound, so a thread pool overlaps the network waits.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Pages 1-4 of the search results; 'pcursor' carries the page number
        # here (presumably the server accepts it as a cursor — TODO confirm
        # against an actual paged response).
        for page in range(1, 5):
            url = 'https://www.kuaishou.com/graphql'
            data = {
                'operationName': "visionSearchPhoto",
                'query': "query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n  visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      type\n      author {\n        id\n        name\n        following\n        headerUrl\n        headerUrls {\n          cdn\n          url\n          __typename\n        }\n        __typename\n      }\n      tags {\n        type\n        name\n        __typename\n      }\n      photo {\n        id\n        duration\n        caption\n        likeCount\n        realLikeCount\n        coverUrl\n        photoUrl\n        liked\n        timestamp\n        expTag\n        coverUrls {\n          cdn\n          url\n          __typename\n        }\n        photoUrls {\n          cdn\n          url\n          __typename\n        }\n        animatedCoverUrl\n        stereoType\n        videoRatio\n        __typename\n      }\n      canAddComment\n      currentPcursor\n      llsid\n      status\n      __typename\n    }\n    searchSessionId\n    pcursor\n    aladdinBanner {\n      imgUrl\n      link\n      __typename\n    }\n    __typename\n  }\n}\n",
                'variables': {
                    'keyword': '曹芬',
                    # 'keyword': keyword,
                    'pcursor': str(page),
                    'page': "search",
                    'searchSessionId': "MTRfMjcwOTMyMTQ2XzE2Mjk5ODcyODQ2NTJf5oWi5pGHXzQzMQ"
                }
            }
            # Serialize to a JSON string to match the application/json header.
            data = json.dumps(data)
            executor.submit(run, url, data, )
    # Total elapsed wall-clock time for all pages.
    print('一共花費了:', time.time()-start_time)

 

耗時為57.7秒


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM