Preface
Today's case study: using Python to scrape HD, watermark-free videos from the Kuaishou short-video platform.
- requests
- json
- re
- pprint
- Version: Anaconda 5.2.0 (Python 3.6.5)
- Editor: PyCharm
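Of the modules listed above, only requests is third-party and needs installing (`pip install requests`); json, re, and pprint ship with the Python standard library.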
- Find the target URL: https://www.kuaishou.com/graphql
- Send the request (GET/POST; this GraphQL endpoint expects a POST with a JSON body)
- Parse the data: video URL and video title (a small parsing sketch follows this list)
- Send a request to each individual video URL
- Save the videos
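To make the "parse the data" step concrete, here is a minimal, self-contained sketch. `sample_feed` is a hypothetical, heavily trimmed version of one entry from `json_data['data']['visionSearchPhoto']['feeds']` (the values are made up); it only shows where the title and video URL live and how the title is cleaned before being used as a file name.

```python
import re
import pprint

# Hypothetical, heavily trimmed example of one entry from
# json_data['data']['visionSearchPhoto']['feeds'] (structure only, values are made up).
sample_feed = {
    'photo': {
        'caption': 'Example video: A/B test? | part 1',
        'photoUrl': 'https://example.com/fake-video.mp4',
    }
}

pprint.pprint(sample_feed)                # inspect the structure

title = sample_feed['photo']['caption']   # video title
url_1 = sample_feed['photo']['photoUrl']  # direct video URL
# Replace characters that are not allowed in file names.
new_title = re.sub(r'[\\/:*?"<>|\n]', '_', title)
print(new_title)                          # -> Example video_ A_B test_ _ part 1
```

The full script below does exactly this, but on the real response returned by the GraphQL endpoint.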
```python
import requests
import pprint
import json
import re
import os

headers = {
    # content-type: how the request body is encoded. This GraphQL endpoint expects
    # application/json (the browser default would be application/x-www-form-urlencoded).
    'content-type': 'application/json',
    # Cookie: user identity / login state; replace with your own browser cookie if this one has expired.
    'Cookie': 'did=web_53827e0b098c608bc6f42524b1f3211a; didv=1617281516668; kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3',
    # User-Agent: browser information (used to make the request look like it comes from a browser).
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
}

keyword = input('Enter the search keyword: ')  # keyword to search for on Kuaishou
os.makedirs('video', exist_ok=True)            # make sure the output folder exists

for page in range(0, 11):
    print(f'----------------------- scraping page {page + 1} -----------------------')
    data = {
        'operationName': "visionSearchPhoto",
        'query': "query visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n type\n author {\n id\n name\n following\n headerUrl\n headerUrls {\n cdn\n url\n __typename\n }\n __typename\n }\n tags {\n type\n name\n __typename\n }\n photo {\n id\n duration\n caption\n likeCount\n realLikeCount\n coverUrl\n photoUrl\n liked\n timestamp\n expTag\n coverUrls {\n cdn\n url\n __typename\n }\n photoUrls {\n cdn\n url\n __typename\n }\n animatedCoverUrl\n stereoType\n videoRatio\n __typename\n }\n canAddComment\n currentPcursor\n llsid\n status\n __typename\n }\n searchSessionId\n pcursor\n aladdinBanner {\n imgUrl\n link\n __typename\n }\n __typename\n }\n}\n",
        'variables': {
            'keyword': keyword,
            'pcursor': str(page),
            'page': "search",
        },
    }
    # Send the request. The payload must be serialised to JSON to match the
    # application/json content type (posting the raw dict would send form data).
    response = requests.post('https://www.kuaishou.com/graphql', headers=headers, data=json.dumps(data))
    json_data = response.json()
    data_list = json_data['data']['visionSearchPhoto']['feeds']
    for item in data_list:
        title = item['photo']['caption']   # video title
        url_1 = item['photo']['photoUrl']  # direct video URL
        # Replace characters that are not allowed in file names.
        new_title = re.sub(r'[\\/:*?"<>|\n]', '_', title)
        # .content is the raw binary response body:
        # use .text for text, .content for images / video / audio.
        content = requests.get(url_1).content
        with open('./video/' + new_title + '.mp4', mode='wb') as f:
            f.write(content)
        print(new_title, 'downloaded successfully!!!')
```
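The script above loads each whole video into memory before writing it to disk. For long videos, a streamed download is gentler on memory. This is an optional variation, not part of the original tutorial; the helper name `download_video` is my own.

```python
import requests

def download_video(url: str, path: str, chunk_size: int = 1024 * 1024) -> None:
    """Stream a video to disk in chunks instead of holding it all in memory."""
    with requests.get(url, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        with open(path, mode='wb') as f:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)

# Usage inside the loop above, replacing requests.get(url_1).content:
# download_video(url_1, './video/' + new_title + '.mp4')
```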
