百度熱搜數據爬取及分析

本文轉載自查看原文 2020-04-19 16:13 1480

一、網絡爬蟲設計方案

1、爬蟲名稱：百度汽車熱搜

2、內容：爬取百度不同汽車的熱搜指數

3、概述：首先查找源代碼，分析請求方式和url地址，再使用requests模塊獲取網頁源代碼，再使用BeautifulSoup解析得到所需要的數據，然後使用matplotlib實現數據可視化分析，最後進行小結。

難點：回歸直線

二、主題頁面的結構特征分析
1.主題頁面的結構與特征分析

獲取電動汽車、小型汽車、中級車、豪華車、SUV這五類汽車每一類的前五名的熱搜車型和其對應的指數。

2.Htmls頁面解析

三、網絡爬蟲程序設計
1.數據爬取與采集

對數據進行清洗和處理

.數據分析與可視化（例如：數據柱形圖、直方圖、散點圖、盒圖、分布圖）

.根據數據之間的關係，分析兩個變量之間的相關係數，畫出散點圖，並建立變量之間的回歸方程（一元或多元）

數據持久化

四、結論：人們大多喜歡中級車，且SUV類型較多。

小結：經過這次作業，加強了自己對舊知識的掌握以及部分新知識的學習，認識到自己在Python方面還有許多不足。

源代碼：

'''
獲取百度汽車熱搜數據

'''
import requests
from bs4 import BeautifulSoup
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import re
import numpy as np

from sklearn.linear_model import LinearRegression

# Configure matplotlib so that Chinese text and the minus sign are displayed
# correctly in the generated figures.
matplotlib.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Car category name -> numeric id; the id is what gets substituted into the
# ranking URL when each category is requested.
car_type_dic = {
    '電動汽車': 1676,
    '小型車': 1544,
    '中級車': 1545,
    '豪華車': 1548,
    'SUV汽車': 1542,
}

# 獲取數據
def get_data():
    """Scrape the Baidu hot-search ranking page of every car category.

    One request is sent per entry of ``car_type_dic``. Every ``<tr>`` of the
    returned page that carries a ``first`` (rank), ``keyword`` (name) and
    ``last`` (index) cell becomes one record. The combined table is written
    to ``data.xlsx`` in the working directory and also returned.

    Returns:
        pandas.DataFrame with the columns ``類型`` (category), ``排名``
        (rank, int), ``名稱`` (car name) and ``熱搜指數`` (hot-search index,
        int). The frame is empty when no row could be parsed.

    Raises:
        requests.RequestException: on a network failure, a timeout, or an
            HTTP error status.
    """
    columns = ['類型', '排名', '名稱', '熱搜指數']

    # Browser-like User-Agent so the request is not served as a bot.
    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/69.0.3497.100 Safari/537.36'}

    # Compiled once; used to pull the first run of digits out of a cell.
    digits = re.compile(r'\d+')

    # Records are collected in a plain list and turned into a DataFrame once
    # at the end: DataFrame.append no longer exists in pandas >= 2.0 and
    # copied the whole frame on every call.
    rows = []

    for key, value in car_type_dic.items():
        url = "http://top.baidu.com/buzz?b="+str(value)+"&c=18"

        # A timeout keeps an unresponsive server from hanging the script.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()

        # The page is GBK-encoded. Declaring the encoding on the response is
        # robust regardless of what requests guessed from the headers, unlike
        # re-encoding r.text through ISO-8859-1.
        r.encoding = 'gbk'
        soup = BeautifulSoup(r.text, "html.parser")

        for each_topic in soup.find_all('tr'):
            car_rank = each_topic.find('td', class_='first')
            car_name = each_topic.find('td', class_='keyword')
            car_index = each_topic.find('td', class_='last')

            # Header rows and other rows lacking one of the cells are skipped.
            if car_rank is None or car_name is None or car_index is None:
                continue

            rank_match = digits.search(car_rank.get_text())
            index_match = digits.search(car_index.get_text())
            # A cell without any digit cannot be converted; skip the row
            # instead of failing the whole scrape.
            if rank_match is None or index_match is None:
                continue

            rows.append({
                '排名': int(rank_match.group()),
                # Strip the "search" link text and line breaks from the cell.
                '名稱': car_name.get_text().replace('search', '').replace('\n', ''),
                '熱搜指數': int(index_match.group()),
                '類型': key,
            })

    df = pd.DataFrame(rows, columns=columns)

    # Persist the scraped table.
    df.to_excel('data.xlsx', index=False)

    return df


#畫出每個類型前十名熱搜平均值的柱狀圖
def mean_plt(df):
    """Draw a bar chart of the mean top-ten hot-search index per category.

    For every distinct value of the ``類型`` column, the ``熱搜指數`` values
    of the rows ranked 1 through 10 are averaged. The chart is saved as
    ``mean.png`` and then shown.

    Args:
        df: DataFrame with at least the columns ``類型``, ``排名`` and
            ``熱搜指數``.
    """
    type_list = df['類型'].unique()

    # `<= 10` so that rank 10 is part of the "top ten"; a strict `< 10`
    # would only average ranks 1-9.
    mean_list = [
        df.loc[(df['排名'] <= 10) & (df['類型'] == car_type), '熱搜指數'].mean()
        for car_type in type_list
    ]

    plt.figure(figsize=(10, 4), dpi=80)

    x = range(len(type_list))
    plt.bar(x, mean_list, width=0.5, color=['b', 'r', 'g', 'y', 'c'])

    # Label each bar position with its category name.
    plt.xticks(x, type_list)
    plt.xlabel('汽車類型')
    plt.ylabel('汽車類型平均熱搜指數')
    # No legend is drawn: the bars carry no label, so a legend call would
    # only emit a warning; the x tick labels already identify each bar.

    plt.savefig('mean.png')
    plt.show()

# 畫出電動汽車的折線圖
def data_plt(df):
    """Draw a line chart of hot-search index against rank for electric cars.

    Rows whose ``類型`` is ``電動汽車`` and whose ``排名`` is below 10 are
    plotted. The chart is saved as ``data.png`` and then shown.

    Args:
        df: DataFrame with at least the columns ``類型``, ``排名`` and
            ``熱搜指數``.
    """
    selected = df.loc[(df['排名'] < 10) & (df['類型'] == '電動汽車')]

    # Use the real rank values as x coordinates. A fixed range(9) would label
    # the "rank" axis 0-8 and make plt.plot fail whenever the number of
    # selected rows is not exactly nine.
    x = selected['排名'].astype(int)
    y = selected['熱搜指數']

    plt.plot(x, y, lw=1, c='red', marker='s', ms=4, label='電動汽車折線圖')
    # One tick per plotted rank.
    plt.xticks(x)
    plt.xlabel('電動汽車排名')
    plt.ylabel('電動汽車熱搜指數')
    plt.legend()
    plt.savefig('data.png')
    plt.show()


# 回歸方程
def reg_plt(df):
    """Fit and plot a simple linear regression for the SUV category.

    The rank (``排名``) of the rows whose ``類型`` is ``SUV汽車`` is the
    explanatory variable and their ``熱搜指數`` is the response. The fitted
    equation is printed, then a scatter plot of the data with the fitted
    line drawn over it is shown.

    Args:
        df: DataFrame with at least the columns ``類型``, ``排名`` and
            ``熱搜指數``.
    """
    suv_rows = df.loc[df['類型'] == 'SUV汽車']

    # Both variables are reshaped to column vectors (n, 1) before fitting.
    ranks = np.asarray(suv_rows['排名'].values).reshape(-1, 1)
    heat = np.asarray(suv_rows['熱搜指數'].values).reshape(-1, 1)

    model = LinearRegression().fit(ranks, heat)

    slope = model.coef_[0][0]
    intercept = model.intercept_[0]
    print("一元回歸方程為:  Y = %.5fX + (%.5f)" % (slope, intercept))

    # Observations in black, fitted line in red.
    plt.scatter(ranks, heat, color='black')
    plt.plot(ranks, model.predict(ranks), color='red', linewidth=1)

    plt.xlabel('SUV汽車排名')
    plt.ylabel('SUV汽車熱搜指數')
    # NOTE(review): neither the scatter nor the line is given a label, so
    # this legend call has nothing to list.
    plt.legend()

    plt.show()

# 函數入口
def main():
    """Run the whole pipeline: scrape the data, then draw the three charts.

    The charts are produced in this order: the electric-car line chart, the
    per-category mean bar chart, and the SUV scatter plot with its
    regression line.
    """
    df = get_data()

    data_plt(df)
    mean_plt(df)
    reg_plt(df)


# Only run the scrape when executed as a script, so that importing this
# module does not trigger network requests or open plot windows.
if __name__ == "__main__":
    main()

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 爬取百度熱搜爬取百度熱搜榜以及數據分析爬取百度熱搜榜及數據分析與可視化處理爬取百度熱搜榜 Python爬取百度熱搜和數據處理爬取百度熱搜風雲榜數據爬取今日熱榜百度熱搜TOP10 爬取百度熱榜對微博熱搜的爬取及數據分析百度地圖爬取數據