Python爬取百度熱搜和數據處理

本文轉載自查看原文 2020-04-24 02:13 1737

一、主題式網絡爬蟲設計方案
1.主題式網絡爬蟲名稱：爬取百度熱搜
2.主題式網絡爬蟲爬取的內容與數據特征分析：百度熱搜排行，標題，熱度
3.主題式網絡爬蟲設計方案概述：先瀏覽目標網站，查找所需數據並與頁面內容比對，然後再編寫代碼進行爬取。難點在於文件的生成和讀取。

二、主題頁面的結構特征分析
1.主題頁面的結構與特征分析

2.HTML頁面解析

三、網絡爬蟲程序設計

1.數據爬取與採集

import requests

from bs4 import BeautifulSoup

 

def get_html(url, headers, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP request headers passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The response body, decoded with the encoding detected from its
        content.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode with the encoding guessed from the body instead of the one
    # declared in the HTTP headers.
    response.encoding = response.apparent_encoding
    return response.text

 

def get_pages(html):
    """Print rank, title and search index for every row of the ranking table.

    Rows lacking any of the three expected cells (``td`` elements with
    class ``first``, ``keyword`` or ``last``) are skipped.

    Args:
        html: HTML text of the ranking page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # chr(12288) is the full-width space (U+3000); it is passed as the fill
    # character for the centred title field.
    tplt = "排名：{0:^4}\t標題：{1:{3}^15}\t熱度：{2:^8}"
    fill = chr(12288)

    # The first <tr> is skipped (presumably the table header row).
    for each_topic in soup.find_all('tr')[1:]:
        # Look each cell up once and reuse the result:
        # 'first' = rank, 'keyword' = title, 'last' = search index.
        cells = [
            each_topic.find('td', class_=css_class)
            for css_class in ('first', 'keyword', 'last')
        ]
        if any(cell is None for cell in cells):
            continue

        topic_rank, topic_name, topic_times = (
            cell.get_text().replace(' ', '').replace('\n', '')
            for cell in cells
        )
        print(tplt.format(topic_rank, topic_name, topic_times, fill))

 

def main():
    """Fetch the Baidu hot-search ranking page and print its entries."""
    # Baidu hot-topics ranking list URL
    ranking_url = 'http://top.baidu.com/buzz?b=1&fr=20811'
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    get_pages(get_html(ranking_url, request_headers))

 

# Script entry point: run the scraper only when this file is executed directly.
if __name__ == "__main__":
    main()

2.對數據進行清洗和處理

讀取文件（注意：上一步的爬蟲代碼只把結果打印到屏幕，並未生成 mmm.csv；需先將爬取結果保存為 mmm.csv，且包含「排名」、「標題」、「熱搜指數」三列，再執行以下代碼）

import pandas as pd
# Read the data file. read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
df = pd.read_csv('mmm.csv')
print(df)

# Drop the '標題' (title) column.
df = df.drop(columns='標題')
# Print the preview explicitly: a bare df.head() shows nothing when this
# code runs as a script.
print(df.head())

3.數據分析與可視化（以下繪圖代碼中的 plt 應為 matplotlib.pyplot，文中未給出導入語句，運行前需先導入）

# NOTE(review): `plt` is never imported in this listing; it is presumably
# matplotlib.pyplot -- confirm the import exists before running.
top5_labels = ['第1名', '第2名', '第3名', '第4名', '第5名']
top5_heat = [5953262, 2775714, 2443604, 2313987, 2209700]

# Draw the vertical bar chart. It is shown before the next chart is drawn so
# the two charts do not end up overlaid on the same axes.
plt.bar(top5_labels, top5_heat, label="百度熱搜前五名")
plt.legend()  # without a legend the label above is never displayed
plt.show()

# Draw the horizontal bar chart on its own figure.
plt.barh(top5_labels, top5_heat, label="百度熱搜前五名")
plt.legend()
plt.show()

# Draw the line chart
def two():
    """Plot the '熱搜指數' column against '排名' as a line with point markers.

    Reads the module-level ``df`` and shows the figure via ``plt``.
    """
    rank, heat = df['排名'], df['熱搜指數']
    plt.plot(rank, heat)
    plt.scatter(rank, heat)
    plt.xlabel('排名')
    plt.ylabel('熱度指數')
    plt.title("繪制折線圖")
    plt.show()


two()

# Draw the scatter plot
def sandian():
    """Show a scatter plot of the '熱搜指數' column against '排名'.

    Reads the module-level ``df`` and draws with ``plt``. The body is
    indented here; the original listing had it at column 0, which is an
    IndentationError.
    """
    x = df['排名']
    y = df['熱搜指數']
    plt.xlabel('排名')
    plt.ylabel('熱搜指數')
    plt.scatter(x, y, color="red", label="熱度分布數據", linewidth=2)
    plt.title("排名與熱搜指數散點圖")
    plt.legend()
    plt.show()


sandian()

4.將以上各部分的代碼匯總

import requests

from bs4 import BeautifulSoup

 

def get_html(url, headers, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        headers: HTTP request headers passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The response body, decoded with the encoding detected from its
        content.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode with the encoding guessed from the body instead of the one
    # declared in the HTTP headers.
    response.encoding = response.apparent_encoding
    return response.text

 

def get_pages(html):
    """Print rank, title and search index for every row of the ranking table.

    Rows lacking any of the three expected cells (``td`` elements with
    class ``first``, ``keyword`` or ``last``) are skipped.

    Args:
        html: HTML text of the ranking page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # chr(12288) is the full-width space (U+3000); it is passed as the fill
    # character for the centred title field.
    tplt = "排名：{0:^4}\t標題：{1:{3}^15}\t熱度：{2:^8}"
    fill = chr(12288)

    # The first <tr> is skipped (presumably the table header row).
    for each_topic in soup.find_all('tr')[1:]:
        # Look each cell up once and reuse the result:
        # 'first' = rank, 'keyword' = title, 'last' = search index.
        cells = [
            each_topic.find('td', class_=css_class)
            for css_class in ('first', 'keyword', 'last')
        ]
        if any(cell is None for cell in cells):
            continue

        topic_rank, topic_name, topic_times = (
            cell.get_text().replace(' ', '').replace('\n', '')
            for cell in cells
        )
        print(tplt.format(topic_rank, topic_name, topic_times, fill))

 

def main():
    """Fetch the Baidu hot-search ranking page and print its entries."""
    # Baidu hot-topics ranking list URL
    ranking_url = 'http://top.baidu.com/buzz?b=1&fr=20811'
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    get_pages(get_html(ranking_url, request_headers))

 

# Script entry point: run the scraper only when this file is executed directly.
if __name__ == "__main__":
    main()
import pandas as pd
# Read the data file. read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
df = pd.read_csv('mmm.csv')
print(df)
# Drop the '標題' (title) column.
df = df.drop(columns='標題')
# Print the preview explicitly: a bare df.head() shows nothing when this
# code runs as a script.
print(df.head())
# NOTE(review): `plt` is never imported in this listing; it is presumably
# matplotlib.pyplot -- confirm the import exists before running.
top5_labels = ['第1名', '第2名', '第3名', '第4名', '第5名']
top5_heat = [5953262, 2775714, 2443604, 2313987, 2209700]

# Draw the vertical bar chart. It is shown before the next chart is drawn so
# the two charts do not end up overlaid on the same axes.
plt.bar(top5_labels, top5_heat, label="百度熱搜前五名")
plt.legend()  # without a legend the label above is never displayed
plt.show()

# Draw the horizontal bar chart on its own figure.
plt.barh(top5_labels, top5_heat, label="百度熱搜前五名")
plt.legend()
plt.show()
# Draw the line chart
def two():
    """Plot the '熱搜指數' column against '排名' as a line with point markers.

    Reads the module-level ``df`` and shows the figure via ``plt``.
    """
    rank, heat = df['排名'], df['熱搜指數']
    plt.plot(rank, heat)
    plt.scatter(rank, heat)
    plt.xlabel('排名')
    plt.ylabel('熱度指數')
    plt.title("繪制折線圖")
    plt.show()


two()

# Draw the scatter plot
def sandian():
    """Show a scatter plot of the '熱搜指數' column against '排名'.

    Reads the module-level ``df`` and draws with ``plt``. The body is
    indented here; the original listing had it at column 0, which is an
    IndentationError.
    """
    x = df['排名']
    y = df['熱搜指數']
    plt.xlabel('排名')
    plt.ylabel('熱搜指數')
    plt.scatter(x, y, color="red", label="熱度分布數據", linewidth=2)
    plt.title("排名與熱搜指數散點圖")
    plt.legend()
    plt.show()


sandian()

四、結論

1.所得結論：熱度越高排名越靠前。

2.小結：通過這次作業，我發現自己在Python學習方面的能力有待提高，學好Python對我來說是一項挑戰。在此次任務中，我查閱了資料並觀看了教學視頻，但仍然沒能很好地完成任務，學習能力還有待提高。希望今後改善學習方法，爭取學好Python。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 爬取百度熱搜百度熱搜數據爬取及分析爬取百度熱搜榜及數據分析與可視化處理爬取百度熱搜榜爬取百度熱搜榜以及數據分析爬取百度熱搜風雲榜數據爬取今日熱榜百度熱搜TOP10 爬取百度熱榜 python 爬取百度圖片 Python爬取百度圖片