數據抓取
上一節中,我們分析了網站的url,可以抓取視頻的數據以及熱詞數據(搜索框提示數據)
URL分析
分析一下視頻數據的url
url = 'https://haokan.baidu.com/videoui/api/videorec?tab=yingshi&act=pcFeed&pd=pc&num=20&shuaxin_id=1608125768624'
其中的tab是我們在首頁看到的視頻分類標籤的拼音(例如yingshi即「影視」),後面的shuaxin_id看起來是一個時間戳,用來充當隨機數,num表示一次獲取幾條數據
JSON數據格式分析
上一節中,我們從URL獲取到的數據裡,有幾個公共的字段:
{
"errno": 0,
"error": "成功",
"data": {
"requestParam": [],
"response": {
"videos": [
{
"id": "5935263900090481104",
"title": "霸總怎么都想不到,他隨手救下的小孩,居然是他的親兒子!",
"poster": "https://tukuimg.bdstatic.com/processed/8e8086eb7cf3f54c90da2d590f652484.jpg@s_2,w_454,h_256,q_100",
"poster_small": "https://tukuimg.bdstatic.com/processed/8e8086eb7cf3f54c90da2d590f652484.jpg@s_2,w_454,h_256,q_100",
"poster_big": "https://tukuimg.bdstatic.com/processed/8e8086eb7cf3f54c90da2d590f652484.jpg@s_2,w_681,h_381,q_100",
"poster_pc": "https://tukuimg.bdstatic.com/processed/8e8086eb7cf3f54c90da2d590f652484.jpg@s_2,w_681,h_381,q_100,f_webp",
"source_name": "好劇渲染",
"play_url": "http://vd3.bdstatic.com/mda-kj6edbgpk3cs0qz4/cae_h264_nowatermark/1606875218/mda-kj6edbgpk3cs0qz4.mp4",
"playcnt": 549866,
"mthid": "1634935029156178",
"mthpic": "https://pic.rmb.bdstatic.com/bjh/user/94863b3c176d3223a379e0e206876aa0.jpeg?x-bce-process=image/resize,m_lfit,w_100,h_100",
"threadId": "1059000036007127",
"site_name": null,
"duration": "10:00",
"url": "https://haokan.baidu.com/v?pd=pc&vid=5935263900090481104",
"cmd": "baiduboxapp://v1/easybrowse/open?upgrade=1&type=video&url=https%3A%2F%2Fhaokan.baidu.com%2F%2Fv%3Fcontext%3D%257B%2522nid%2522%253A%25225935263900090481104%2522%257D%26backflow%3D1%26pd%3Dpc&style=%7B%22toolbaricons%22%3A%7B%22toolids%22%3A%5B%221%22%2C%222%22%2C%223%22%5D%7D%2C%22menumode%22%3A2%7D&newbrowser=1&slog=%257B%2522from%2522%253A%2522feed%2522%252C%2522page%2522%253A%2522sv%2522%257D",
"loc_id": "http://www.internal.video.baidu.com/5149be5226f83954df8b41ac83a9b546.html",
"commentInfo": {
"source": "baidumedia",
"key": "1679857347109984154"
},
"comment_id": "1679857347109984154",
"show_tag": 0,
"publish_time": "2020年10月07日",
"new_cate_v2": "影視",
"appid": "",
"path": "",
"channel_name": "",
"channel_total_number": "",
"channel_poster": "",
"like": 7628,
"fmlike": "7628",
"comment": "0",
"fmcomment": "0次播放",
"fmplaycnt": "55萬次播放",
"fmplaycnt_2": "55萬",
"outstand_tag": ""
}
]
}
}
}
其中的errno表示錯誤碼,0表示沒有出錯;error表示錯誤信息;data是一個json對象,裡面存儲我們請求的數據和參數信息。我們用到的數據在data對象的response中,所以我們會對數據做一層基礎封裝。
響應數據的封裝
基礎響應數據
class BaseData:
    """Wrapper around the ``data`` object of an API response.

    Attributes:
        requestParam: Value stored under the ``requestParam`` key, or ``None``
            when the key (or the whole payload) is absent.
        response: Value stored under the ``response`` key, or ``None`` when
            the key (or the whole payload) is absent.
    """

    def __init__(self, data):
        # parse_bean passes ``data.get("data")`` straight through, which is
        # None when the key is missing. Anything without a ``get`` method is
        # treated as an empty payload instead of raising AttributeError.
        if not hasattr(data, "get"):
            data = {}
        self.requestParam = data.get("requestParam")
        self.response = data.get("response")

    def __repr__(self):
        return "<Data>[%s,%s]" % (self.requestParam, self.response)
class BaseResponse:
    """Top-level envelope shared by every API response.

    Attributes:
        errno: Error code taken from the response.
        error: Error message taken from the response.
        data: Payload attached to the response.
    """

    def __init__(self, errno, error, data):
        self.errno = errno
        self.error = error
        self.data = data

    def __repr__(self):
        # ``%s`` instead of ``%d``: errno may be None when the response lacks
        # the key, and ``%d`` would raise TypeError exactly when this object
        # is printed to report a failure. The tag was also misspelled "Bese".
        return "<Base>[%s,%s,%s]" % (self.errno, self.error, self.data)
熱詞響應數據
class Hotword:
    """A single hot search term.

    Attributes:
        title: Text of the hot word.
        hotNum: Popularity figure supplied alongside the title.
    """

    def __init__(self, title, hotNum):
        self.title, self.hotNum = title, hotNum

    def __repr__(self):
        # ``!s`` forces str() conversion, matching printf-style ``%s``.
        return "<Hotword>[{!s},{!s}]".format(self.title, self.hotNum)
視頻響應數據
class CommentInfo:
    """Comment metadata attached to a video entry.

    Built from a mapping that must provide the ``source`` and ``key``
    entries; a missing entry raises ``KeyError``.
    """

    def __init__(self, data):
        source, key = data["source"], data["key"]
        self.source = source
        self.key = key

    def __repr__(self):
        # ``!s`` forces str() conversion, matching printf-style ``%s``.
        return "<CommentInfo>[{!s},{!s}]".format(self.source, self.key)
class VideoBean:
    """One video entry from the ``videos`` list of a feed response.

    Each attribute mirrors the JSON key of the same name. Every key read in
    ``__init__`` is required; a missing key raises ``KeyError``.
    ``commentInfo`` is wrapped in a ``CommentInfo`` object.
    """

    def __init__(self, data):
        # Identity and display
        self.id = data["id"]
        self.title = data["title"]
        self.poster = data["poster"]
        self.poster_small = data["poster_small"]
        self.poster_big = data["poster_big"]
        self.source_name = data["source_name"]
        self.poster_pc = data["poster_pc"]
        # Playback and author
        self.play_url = data["play_url"]
        self.mthid = data["mthid"]
        self.playcnt = data["playcnt"]
        self.mthpic = data["mthpic"]
        self.threadId = data["threadId"]
        self.site_name = data["site_name"]
        self.duration = data["duration"]
        self.url = data["url"]
        self.cmd = data["cmd"]
        self.loc_id = data["loc_id"]
        self.comment_id = data["comment_id"]
        self.show_tag = data["show_tag"]
        self.publish_time = data["publish_time"]
        self.new_cate_v2 = data["new_cate_v2"]
        self.appid = data["appid"]
        # Channel information
        self.channel_name = data["channel_name"]
        self.channel_total_number = data["channel_total_number"]
        self.channel_poster = data["channel_poster"]
        # Pre-formatted counters
        self.fmlike = data["fmlike"]
        self.comment = data["comment"]
        self.fmcomment = data["fmcomment"]
        self.fmplaycnt = data["fmplaycnt"]
        self.fmplaycnt_2 = data["fmplaycnt_2"]
        self.outstand_tag = data["outstand_tag"]
        self.commentInfo = CommentInfo(data["commentInfo"])

    def __repr__(self):
        # Added for consistency with the other bean classes: without it,
        # printing a list of videos shows only object addresses.
        return "<VideoBean>[%s,%s,%s]" % (self.id, self.title, self.play_url)
獲取數據
import requests
import time
from bean.Bean import BaseResponse
from bean.Bean import Hotword
from bean.Bean import VideoBean
from bean.Bean import BaseData
def do_net(url, headers=None, timeout=10):
    """
    Fetch ``url`` with an HTTP GET and wrap the decoded JSON body.

    :param url: address to request
    :param headers: optional request headers; when omitted or empty, a
        desktop Chrome ``User-Agent`` header is sent instead
    :param timeout: seconds to wait for the server before ``requests``
        raises a timeout error; without it the call can block indefinitely
    :return: the object built by ``parse_bean`` from the decoded JSON
    """
    if not headers:
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
    resp = requests.get(url=url, headers=headers, timeout=timeout).json()
    return parse_bean(resp)
def parse_bean(data):
    """
    Convert a decoded JSON response into a ``BaseResponse``.

    :param data: mapping decoded from the response body
    :return: ``BaseResponse`` whose ``data`` attribute is a ``BaseData``
    """
    # Keep the fallback values (-1 / "") when a key is missing; the original
    # overwrote them with None, which made a malformed response harder to
    # report as a failure.
    errno = data.get("errno", -1)
    error = data.get("error", "")
    # A missing or empty "data" entry becomes an empty payload so that
    # BaseData can still be constructed.
    payload = data.get("data") or {}
    return BaseResponse(errno, error, BaseData(payload))
def get_hot_words():
    """
    Request the hot-word list and print it.

    Prints the failing response object when ``errno`` is non-zero;
    otherwise prints the list of ``Hotword`` objects built from the
    ``hotwords`` entry of the response.

    :return: None
    """
    resp_bean = do_net(url='https://haokan.baidu.com/videoui/api/hotwords?sfrom=pc')
    if resp_bean.errno == 0:
        entries = resp_bean.data.response.get("hotwords")
        hot_words = [Hotword(entry["title"], entry["hot_num"]) for entry in entries]
        print("獲取熱詞成功", hot_words)
    else:
        print("獲取數據失敗!數據為:", resp_bean)
def get_video_data():
    """
    Request five videos from the ``yinyue`` tab and print them.

    Prints the failing response object when ``errno`` is non-zero;
    otherwise prints the number of videos followed by the list of
    ``VideoBean`` objects.

    :return: None
    """
    # shuaxin_id is filled with the current time in milliseconds. "%d" is a
    # printf-style placeholder, so it must be applied with the % operator;
    # str.format() has no "{}" field to fill and left a literal "%d" in the
    # URL.
    url = "https://haokan.baidu.com/videoui/api/videorec?tab=yinyue&act=pcFeed&pd=pc&num=5&shuaxin_id=%d" % int(
        time.time() * 1000)
    base = do_net(url)
    if base.errno != 0:
        print("獲取數據失敗!數據為:", base)
        return
    videos = [VideoBean(item) for item in base.data.response.get("videos")]
    print(len(videos))
    print(videos)
if __name__ == "__main__":
    # Script entry point: fetch the hot words first, then the video feed.
    for fetch in (get_hot_words, get_video_data):
        fetch()
總結
由於從首頁抓取視頻分類失敗,這部分待後續研究後再補上。
下一節,我們將把抓取到的數據寫入數據庫中。