爬取百度熱搜

本文轉載自查看原文 2020-04-18 13:00 595

一、網絡爬蟲設計方案

1、爬蟲名稱：百度熱搜

2、內容：爬取百度熱搜排行榜和熱度

3、概述：首先查看頁面源代碼，使用requests發送請求，並用BeautifulSoup解析頁面、提取數據；然後對數據進行清洗和處理，再使用matplotlib、seaborn等工具對數據進行可視化，最後進行小結。

難點：回歸直線

二、頁面結構與特征分析

1、

2、查看源代碼

三、爬蟲程序設計

1、數據爬取與采集

#導入相關庫
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np 
import scipy as sp
import seaborn as sns
 
def get_html(url, headers, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as a string, decoded with the encoding that
        ``requests`` detects from the content.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    # Without a timeout, requests.get can block indefinitely on a stalled server.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    r.raise_for_status()
    # Prefer the encoding detected from the body over the header-declared one.
    r.encoding = r.apparent_encoding
    return r.text
 
def get_pages(html):
    """Parse the ranking table in *html*, print each entry and return them.

    Every ``<tr>`` after the first one is inspected. A row is used only when
    it contains all three cells ``td.first`` (rank), ``td.keyword`` (title)
    and ``td.last`` (search index); other rows are skipped.

    Args:
        html: HTML source of the ranking page.

    Returns:
        A list of ``(rank, title, search_index)`` string tuples in page
        order. Spaces and newlines are stripped from every value.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # The title column is padded with chr(12288) (full-width space) as the
    # fill character, passed in as format argument 3.
    tplt = "排名：{0:^4}\t標題：{1:{3}^15}\t熱度：{2:^8}"
    rows = []
    for each_topic in soup.find_all('tr')[1:]:
        # Look each cell up once, in the order rank / title / search index.
        cells = [each_topic.find('td', class_=css)
                 for css in ('first', 'keyword', 'last')]
        if any(cell is None for cell in cells):
            continue
        topic_rank, topic_name, topic_times = (
            cell.get_text().replace(' ', '').replace('\n', '')
            for cell in cells
        )
        print(tplt.format(topic_rank, topic_name, topic_times, chr(12288)))
        rows.append((topic_rank, topic_name, topic_times))
    return rows
 
def main():
    """Download the Baidu hot-topics ranking page and hand it to get_pages."""
    # Baidu hot-topics ranking list URL, requested with a browser-like User-Agent.
    page_source = get_html(
        'http://top.baidu.com/buzz?b=1&fr=20811',
        {'User-Agent': 'Mozilla/5.0'},
    )
    get_pages(page_source)
 
if __name__ == '__main__':
    main()

# Save the data to an Excel file.
# NOTE(review): this frame contains only the three label strings as rows;
# the entries printed by get_pages() are not captured here — confirm intent.
df = pd.DataFrame(["排名", "標題", "熱度"])
print(df)
# The path must end in ".xlsx" (no trailing dot) so pandas can pick a writer
# and so it matches the file read back by pd.read_excel later on.
df.to_excel('D:/BaiDu.xlsx')
print('Done!')

爬取的數據

提取前五存入Excel中

2、進行數據清洗和處理

# Data cleaning
BaiDu = pd.read_excel('D:/BaiDu.xlsx')
# Duplicates: flag every row that repeats an earlier row.
print(BaiDu.duplicated())
# Null / missing values: the DataFrame method is isnull(), not "insnull".
print(BaiDu.isnull())
# Outliers: print summary statistics for inspection.
print(BaiDu.describe())

結果：

3、數據可視化

# Pie chart
# The name must be `labels`, because that is what plt.pie() is given below.
labels = '1', '2', '3', '4', '5'  # top five ranks
sizes = 5344506, 3774958, 3552070, 3485324, 2844965  # heat values
colors = 'yellow', 'green', 'gold', 'lightskyblue', 'lightcoral'  # wedge colours
explode = 0, 0.1, 0, 0, 0  # offset the second wedge from the centre
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=50)
plt.axis('equal')
plt.show()

結果：

# Bar chart
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  # font that can render Chinese
# Render minus signs correctly with that font.
plt.rcParams['axes.unicode_minus'] = False
name_list = ['1', '2', '3', '4', '5']
num_list = [5344506, 3774958, 3552070, 3485324, 2844965]
# Colours are given as an explicit list: current matplotlib no longer accepts
# a string such as 'rgb' as a sequence of single-letter colours.
plt.bar(range(len(num_list)), num_list, color=['r', 'g', 'b'],
        tick_label=name_list)
plt.show()

結果

# Scatter plot
# NOTE(review): x covers 1..4 only and y is identical to x; presumably the
# five ranks and their heat values were intended — confirm before relying on it.
x = np.arange(1, 5)
y = x
fig = plt.figure()
ax1 = fig.add_subplot(111)
# Title
ax1.set_title('BaiDu')
# X-axis label
plt.xlabel('rank')
# Y-axis label
plt.ylabel('hot')
# Draw the points; the label is attached here so the legend can pick it up.
ax1.scatter(x, y, c='r', marker='o', label='x1')
# Legend
plt.legend()
plt.show()

結果

#回歸直線

#導入相關庫
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np 
import scipy as sp
import seaborn as sns
 
def get_html(url, headers, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as a string, decoded with the encoding that
        ``requests`` detects from the content.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    # Without a timeout, requests.get can block indefinitely on a stalled server.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    r.raise_for_status()
    # Prefer the encoding detected from the body over the header-declared one.
    r.encoding = r.apparent_encoding
    return r.text
 
def get_pages(html):
    """Parse the ranking table in *html*, print each entry and return them.

    Every ``<tr>`` after the first one is inspected. A row is used only when
    it contains all three cells ``td.first`` (rank), ``td.keyword`` (title)
    and ``td.last`` (search index); other rows are skipped.

    Args:
        html: HTML source of the ranking page.

    Returns:
        A list of ``(rank, title, search_index)`` string tuples in page
        order. Spaces and newlines are stripped from every value.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # The title column is padded with chr(12288) (full-width space) as the
    # fill character, passed in as format argument 3.
    tplt = "排名：{0:^4}\t標題：{1:{3}^15}\t熱度：{2:^8}"
    rows = []
    for each_topic in soup.find_all('tr')[1:]:
        # Look each cell up once, in the order rank / title / search index.
        cells = [each_topic.find('td', class_=css)
                 for css in ('first', 'keyword', 'last')]
        if any(cell is None for cell in cells):
            continue
        topic_rank, topic_name, topic_times = (
            cell.get_text().replace(' ', '').replace('\n', '')
            for cell in cells
        )
        print(tplt.format(topic_rank, topic_name, topic_times, chr(12288)))
        rows.append((topic_rank, topic_name, topic_times))
    return rows
 
def main():
    """Download the Baidu hot-topics ranking page and hand it to get_pages."""
    # Baidu hot-topics ranking list URL, requested with a browser-like User-Agent.
    page_source = get_html(
        'http://top.baidu.com/buzz?b=1&fr=20811',
        {'User-Agent': 'Mozilla/5.0'},
    )
    get_pages(page_source)
 
if __name__ == '__main__':
    main()

# Save the data to an Excel file.
# NOTE(review): this frame contains only the three label strings as rows;
# the entries printed by get_pages() are not captured here — confirm intent.
df = pd.DataFrame(["排名", "標題", "熱度"])
print(df)
# The path must end in ".xlsx" (no trailing dot) so pandas can pick a writer
# and so it matches the file read back by pd.read_excel later on.
df.to_excel('D:/BaiDu.xlsx')
print('Done!')
# Data cleaning
BaiDu = pd.read_excel('D:/BaiDu.xlsx')
# Duplicates: flag every row that repeats an earlier row.
print(BaiDu.duplicated())
# Null / missing values: the DataFrame method is isnull(), not "insnull".
print(BaiDu.isnull())
# Outliers: print summary statistics for inspection.
print(BaiDu.describe())
# Pie chart
# The name must be `labels`, because that is what plt.pie() is given below.
labels = '1', '2', '3', '4', '5'  # top five ranks
sizes = 5344506, 3774958, 3552070, 3485324, 2844965  # heat values
colors = 'yellow', 'green', 'gold', 'lightskyblue', 'lightcoral'  # wedge colours
explode = 0, 0.1, 0, 0, 0  # offset the second wedge from the centre
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=50)
plt.axis('equal')
plt.show()
# Bar chart
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  # font that can render Chinese
# Render minus signs correctly with that font.
plt.rcParams['axes.unicode_minus'] = False
name_list = ['1', '2', '3', '4', '5']
num_list = [5344506, 3774958, 3552070, 3485324, 2844965]
# Colours are given as an explicit list: current matplotlib no longer accepts
# a string such as 'rgb' as a sequence of single-letter colours.
plt.bar(range(len(num_list)), num_list, color=['r', 'g', 'b'],
        tick_label=name_list)
plt.show()
# Scatter plot
# NOTE(review): x covers 1..4 only and y is identical to x; presumably the
# five ranks and their heat values were intended — confirm before relying on it.
x = np.arange(1, 5)
y = x
fig = plt.figure()
ax1 = fig.add_subplot(111)
# Title
ax1.set_title('BaiDu')
# X-axis label
plt.xlabel('rank')
# Y-axis label
plt.ylabel('hot')
# Draw the points; the label is attached here so the legend can pick it up.
ax1.scatter(x, y, c='r', marker='o', label='x1')
# Legend
plt.legend()
plt.show()
# Use seaborn's dark-grid theme.
sns.set_style('darkgrid')

student = pd.read_excel('D:/BaiDu.xlsx')
# `height` sets the size of each facet; it replaces the former `size`
# argument, which current seaborn no longer accepts.
# NOTE(review): col_order is passed without a `col` variable — presumably
# col='class' was intended; confirm against the workbook's columns.
g = sns.FacetGrid(student, col_order='class', height=7)
# edgecolor is the marker outline colour, color is the marker fill colour.
# NOTE(review): assumes the workbook has 'momheight' and 'height' columns,
# while the axis labels below say hot/rank — TODO confirm the column names.
g.map(plt.scatter, 'momheight', 'height', s=140, linewidth=.7,
      edgecolor='red', color='#ff8000')
# Axis labels (x, y).
g.set_axis_labels('hot', 'rank')
plt.show()
# Box plot
sns.set(style="ticks")
tips = sns.load_dataset("tips")
# Draw the plot.
# NOTE(review): x="hot" / y="rank" must be columns of the frame passed as
# `data`; presumably the scraped workbook was meant rather than seaborn's
# "tips" sample dataset — confirm.
sns.boxplot(x="hot", y="rank", data=tips, palette="PRGn")
sns.despine(offset=10, trim=True)

# Save and show the figure. matplotlib.pyplot is imported as `plt` in this
# file; the bare name `pyplot` is not defined.
plt.savefig("GroupBoxplots.png")
plt.show()

四、結論：人們對娛樂方面更感興趣，且也同時關注着全國疫情的情況。

小結：經過爬蟲學習后，發現很多很實用很有趣的工具。回歸方程對於我來說還是有難度的！

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 爬取百度熱搜榜百度熱搜數據爬取及分析爬取今日熱榜百度熱搜TOP10 Python爬取百度熱搜和數據處理爬取百度熱搜榜以及數據分析爬取百度熱搜風雲榜數據爬取百度熱搜榜及數據分析與可視化處理爬取百度熱榜今日百度熱搜前十名看你想看的，不受打擾地工作（瀏覽器屏蔽百度熱搜，去百度熱搜）