爬取縱橫中文網點擊榜月榜數據

本文轉載自查看原文 2020-04-24 22:42 714

一、主題式網絡爬蟲設計方案
1.主題式網絡爬蟲名稱
爬取縱橫中文網點擊榜月榜

2.主題式網絡爬蟲爬取的內容與數據特征分析
爬取網站的“排名”，“類別”，“書名”，“狀態”，“字數”，“點擊數”，“作者”

3.主題式網絡爬蟲設計方案概述（包括實現思路與技術難點）
思路：通過分析網頁源代碼，找出數據所在的標簽，通過爬蟲讀取數據保存到xlsx文件中，讀取文件，對數據進行清洗和處理，數據分析與可視化處理。

技術難點：對數據處理及其他方面的操作較為生疏。

二、主題頁面的結構特征分析
1.主題頁面的結構與分析：各字段所在標簽的 class 分別為——“排名”：“rank_listnum”，“類別”：“rankpage_list_box”，“書名”：“rankpage_list3”，“狀態”：“rankpage_list4”，“字數”：“rankpage_list5”，“點擊數”：“rankpage_list6”，“作者”：“rankpage_list7”。

2.HTML頁面解析

三、網絡爬蟲程序設計

爬蟲程序主體要包括以下各部分，要附源代碼及較詳細注釋，並在每部分程序後面提供輸出結果的截圖。
1.數據爬取與采集：

import requests
#引入requests庫下載網頁
from bs4 import BeautifulSoup
#BeautifulSoup解析網頁
import pandas as pd
#引入pandas數據可視化
# Monthly click-ranking page of zongheng.com.
url = "http://www.zongheng.com/rank/details.html?rt=5&d=0&r=&i=2&c=0"
# Browser-like User-Agent sent with the request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/69.0.3497.100 Safari/537.36'}
# Fetch the page. The original built `headers` but never passed it to
# requests.get(), so the custom User-Agent was never actually sent.
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=30)
# Stop on HTTP errors instead of silently parsing an error page.
r.raise_for_status()
# Decode the body with the encoding detected from its content.
r.encoding = r.apparent_encoding
# Page source as text.
x = r.text
# Parse the HTML with the lxml parser.
soup = BeautifulSoup(x, 'lxml')
# Dump the parsed document structure.
print(soup.prettify())

# One list per column: the stripped text of every element carrying the
# given CSS class. The order matches the index labels used for the
# DataFrame below (rank, category, title, status, word count, clicks, author).
b = [tag.get_text().strip() for tag in soup.find_all(class_="rank_listnum")]
z = [tag.get_text().strip() for tag in soup.find_all(class_="rankpage_list_box")]
c = [tag.get_text().strip() for tag in soup.find_all(class_="rankpage_list3")]
p = [tag.get_text().strip() for tag in soup.find_all(class_="rankpage_list4")]
u = [tag.get_text().strip() for tag in soup.find_all(class_="rankpage_list5")]
v = [tag.get_text().strip() for tag in soup.find_all(class_="rankpage_list6")]
q = [tag.get_text().strip() for tag in soup.find_all(class_="rankpage_list7")]
data = [b, z, c, p, u, v, q]
print(data)
# Each list becomes one labelled row; the transpose (df.T) turns the
# labels into columns.
df = pd.DataFrame(data, index=["排名","類別","書名","狀態","字數","點擊數","作者"])
print(df.T)

# Raw string: the backslashes in the Windows path are literal. The original
# relied on unrecognised escapes such as "\S", which newer Python versions
# warn about; the resulting path value is unchanged.
S = r"E:\SteamLibrary\縱橫中文網點擊榜月榜.xlsx"
# Save the transposed table to the workbook.
df.T.to_excel(S)

2.對數據進行清洗和處理：

# Load the scraped table back from the workbook.
# index_col=0 restores the row index that to_excel() wrote as the first
# column; without it that column comes back as an extra "Unnamed: 0" data
# column, which makes every row unique for duplicated() and shows up in
# describe(). The raw string keeps the path backslashes literal.
df = pd.read_excel(r'E:\SteamLibrary\縱橫中文網點擊榜月榜.xlsx', index_col=0)
# print() so the preview is also visible when run as a plain script.
print(df.head())
# Drop the columns that the later analysis does not use, in one call.
df.drop(columns=['書名', '作者', '狀態', '字數', '類別'], inplace=True)
print(df.head())

# Data cleaning
print('\n====各列是否有缺失值情況如下：====')
# Per-column count of missing values (all zeros means nothing is missing).
print(df.isnull().sum())
# Per-row flag: True where the row repeats an earlier row.
print(df.duplicated())
# Rows that contain at least one missing value. The original expression
# `df[df.isnull().values==True]` masked the frame instead of selecting rows,
# and its result was discarded.
print(df[df.isnull().any(axis=1)])
print(df.isna().head())
# Descriptive statistics of the remaining columns.
print(df.describe())

3.數據分析與可視化：

import matplotlib.pyplot as plt
import matplotlib
# Select a font for the Chinese labels below (assumes SimHei is installed).
matplotlib.rcParams['font.sans-serif']=['SimHei']
# Hard-coded top-10 titles and their ranks 1..10.
x =['劍仙在此','遣返者的游戲','一劍獨尊','醫品宗師','祭煉山河','蓋世','長寧帝軍','劍道第一仙','千億贅婿','都市超級醫生']
y = [1,2,3,4,5,6,7,8,9,10]
plt.plot(x,y)
plt.xlabel("書名")
plt.ylabel("排名")
plt.title('點擊數')
plt.show()

# Regression plot of rank against click count.
# seaborn is imported here because this snippet used `sns` before the file's
# first `import seaborn` statement, which raised NameError.
import seaborn as sns
plt.rcParams['font.sans-serif']=['STSong']  # font for Chinese text
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.regplot(x=df.排名, y=df.點擊數)
# Show the regression figure on its own; without this the bar chart below
# was drawn on top of it on the same axes.
plt.show()

# Vertical bar chart of rank against click count.
plt.rcParams['axes.unicode_minus']=False  # render minus signs correctly
plt.bar(df.排名, df.點擊數, label="排名與點擊數柱狀圖")
plt.show()

# Scatter plot of rank against click count.
def Scatter_point():
    """Draw and show a scatter plot of the module-level df's rank and click columns."""
    ranks, clicks = df.排名, df.點擊數
    point_style = dict(color='yellow', s=25, marker="o")
    plt.scatter(ranks, clicks, **point_style)
    # Title and axis labels.
    plt.title("排名與點擊數-散點圖")
    plt.ylabel("點擊數")
    plt.xlabel("排名")
    plt.show()


Scatter_point()

# Line chart of rank against click count.
def draw():
    """Draw and show a line chart with point markers for the module-level df."""
    rank_col = df['排名']
    click_col = df['點擊數']
    # Line first, then the individual points over it.
    plt.plot(rank_col, click_col)
    plt.scatter(rank_col, click_col)
    # Axis labels and title.
    plt.xlabel('排名')
    plt.ylabel('點擊數')
    plt.title("繪制排名與點擊數折線圖")
    plt.show()


draw()

# Box plot and joint-distribution plots of rank against click count.
import seaborn as sns
# Workbook path; not used by the plots below, kept for compatibility.
# Raw string so the backslashes are literal (the original relied on
# unrecognised escapes such as "\S"); the value is unchanged.
file_path = r"E:\SteamLibrary\縱橫中文網點擊榜月榜.xlsx"
sns.boxplot(x='排名', y='點擊數', data=df)

# Joint plots in the original order: KDE (yellow), default, regression, hexbin.
for joint_options in ({'kind': 'kde', 'color': 'y'}, {}, {'kind': 'reg'}, {'kind': 'hex'}):
    sns.jointplot(x="排名", y='點擊數', data=df, **joint_options)

# Display the figures created above. The original never called show() here,
# so when run as a script later pyplot calls drew onto a leftover figure.
plt.show()

4.根據數據之間的關系，分析兩個變量之間的相關系數，畫出散點圖，並建立變量之間的回歸方程（一元或多元）：

import seaborn as sns
from scipy.optimize import leastsq
# Reload the full table from the workbook. read_excel already returns a
# DataFrame, so the extra pd.DataFrame(...) wrapper was redundant. The raw
# string keeps the path backslashes literal (the original relied on
# unrecognised escapes such as "\S"); the path value is unchanged.
df = pd.read_excel(r'E:\SteamLibrary\縱橫中文網點擊榜月榜.xlsx')
print(df.head())
# Independent variable: rank. Dependent variable: click count.
X = df.排名
Y = df.點擊數
def func(params, x):
    """Evaluate the quadratic a*x*x + b*x + c.

    params must unpack into exactly three coefficients (a, b, c).
    """
    a, b, c = params
    second_order = a * x * x
    first_order = b * x
    return second_order + first_order + c
def error(params,x,y):
    """Return the residual of the quadratic model: func(params, x) minus y."""
    predicted = func(params, x)
    return predicted - y
def main(a,b,c):
    """Leftover helper that is never called; it only binds a local p0.

    The arguments are ignored and nothing is returned.
    """
    p0 = [0,0,0]


# numpy is needed for np.linspace below; this snippet never imported it.
import numpy as np

# Initial guess for the coefficients (a, b, c). The original assigned p0 only
# inside main(), which is never called, so leastsq() raised NameError.
p0 = [0, 0, 0]
# Least-squares fit of the quadratic model; element 0 of the result holds
# the fitted parameters.
Para = leastsq(error, p0, args=(X, Y))
a, b, c = Para[0]
print(" a=",a," b=",b," c=",c)
plt.scatter(X,Y,color="green",label=u"評分分布",linewidth=2)
# Sample the fitted curve over the observed range of X. The original sampled
# 1..10082854, which presumably came from the click counts rather than the
# ranks and stretches the x-axis far beyond the scatter points.
x = np.linspace(X.min(), X.max(), 30)
y = a * x * x + b * x + c
plt.plot(x,y,color="red",label=u"回歸方程直線",linewidth=2)
plt.title("排名與點擊數關系圖")
plt.legend()
plt.grid()
plt.show()
def draw_scatterplot(surface, options, dataSet):
    """Render dataSet as a pycha scatter chart on surface and save it as a PNG.

    surface must provide write_to_png(); options and dataSet are passed
    straight to the pycha chart.

    NOTE(review): `pycha` is not imported anywhere in the visible code, so
    calling this function as-is would raise NameError - confirm the import.
    """
    chart = pycha.scatter.ScatterplotChart(surface, options)
    chart.addDataset(dataSet)
    chart.render()
    # Raw string keeps the Windows path backslashes literal (the original
    # relied on the unrecognised escape "\p"); the path value is unchanged.
    surface.write_to_png(r'D:\python\排名與點擊數關系圖.png')


# Rebind a, b and c to empty tuples.
a, b, c = (), (), ()

5.將以上各部分的代碼匯總，附上完整程序代碼：

import requests
#引入requests庫下載網頁
from bs4 import BeautifulSoup
#BeautifulSoup解析網頁
import pandas as pd
#引入pandas數據可視化
# Monthly click-ranking page of zongheng.com.
url = "http://www.zongheng.com/rank/details.html?rt=5&d=0&r=&i=2&c=0"
# Browser-like User-Agent sent with the request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/69.0.3497.100 Safari/537.36'}
# Fetch the page. The original built `headers` but never passed it to
# requests.get(), so the custom User-Agent was never actually sent.
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=30)
# Stop on HTTP errors instead of silently parsing an error page.
r.raise_for_status()
# Decode the body with the encoding detected from its content.
r.encoding = r.apparent_encoding
# Page source as text.
x = r.text
# Parse the HTML with the lxml parser.
soup = BeautifulSoup(x, 'lxml')
# Dump the parsed document structure.
print(soup.prettify())

# One list per column: the stripped text of every element carrying the
# given CSS class. The order matches the index labels used for the
# DataFrame below (rank, category, title, status, word count, clicks, author).
b = [tag.get_text().strip() for tag in soup.find_all(class_="rank_listnum")]
z = [tag.get_text().strip() for tag in soup.find_all(class_="rankpage_list_box")]
c = [tag.get_text().strip() for tag in soup.find_all(class_="rankpage_list3")]
p = [tag.get_text().strip() for tag in soup.find_all(class_="rankpage_list4")]
u = [tag.get_text().strip() for tag in soup.find_all(class_="rankpage_list5")]
v = [tag.get_text().strip() for tag in soup.find_all(class_="rankpage_list6")]
q = [tag.get_text().strip() for tag in soup.find_all(class_="rankpage_list7")]
data = [b, z, c, p, u, v, q]
print(data)
# Each list becomes one labelled row; the transpose (df.T) turns the
# labels into columns.
df = pd.DataFrame(data, index=["排名","類別","書名","狀態","字數","點擊數","作者"])
print(df.T)

# Raw string: the backslashes in the Windows path are literal. The original
# relied on unrecognised escapes such as "\S", which newer Python versions
# warn about; the resulting path value is unchanged.
S = r"E:\SteamLibrary\縱橫中文網點擊榜月榜.xlsx"
# Save the transposed table to the workbook.
df.T.to_excel(S)

# Load the scraped table back from the workbook.
# index_col=0 restores the row index that to_excel() wrote as the first
# column; without it that column comes back as an extra "Unnamed: 0" data
# column, which makes every row unique for duplicated() and shows up in
# describe(). The raw string keeps the path backslashes literal.
df = pd.read_excel(r'E:\SteamLibrary\縱橫中文網點擊榜月榜.xlsx', index_col=0)
# print() so the preview is also visible when run as a plain script.
print(df.head())
# Drop the columns that the later analysis does not use, in one call.
df.drop(columns=['書名', '作者', '狀態', '字數', '類別'], inplace=True)
print(df.head())


# Data cleaning
print('\n====各列是否有缺失值情況如下：====')
# Per-column count of missing values (all zeros means nothing is missing).
print(df.isnull().sum())
# Per-row flag: True where the row repeats an earlier row.
print(df.duplicated())
# Rows that contain at least one missing value. The original expression
# `df[df.isnull().values==True]` masked the frame instead of selecting rows,
# and its result was discarded.
print(df[df.isnull().any(axis=1)])
print(df.isna().head())
# Descriptive statistics of the remaining columns.
print(df.describe())
#用describe()命令顯示描述性統計指標

import matplotlib.pyplot as plt
import matplotlib
# Select a font for the Chinese labels below (assumes SimHei is installed).
matplotlib.rcParams['font.sans-serif']=['SimHei']
# Hard-coded top-10 titles and their ranks 1..10.
x =['劍仙在此','遣返者的游戲','一劍獨尊','醫品宗師','祭煉山河','蓋世','長寧帝軍','劍道第一仙','千億贅婿','都市超級醫生']
y = [1,2,3,4,5,6,7,8,9,10]
plt.plot(x,y)
plt.xlabel("書名")
plt.ylabel("排名")
plt.title('點擊數')
plt.show()

import seaborn as sns
# Regression plot of rank against click count.
plt.rcParams['font.sans-serif']=['STSong']  # font for Chinese text
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.regplot(x=df.排名, y=df.點擊數)
# Show the regression figure on its own; without this the bar chart below
# was drawn on top of it on the same axes.
plt.show()

# Vertical bar chart of rank against click count.
plt.rcParams['axes.unicode_minus']=False  # render minus signs correctly
plt.bar(df.排名, df.點擊數, label="排名與點擊數柱狀圖")
plt.show()


# Scatter plot of rank against click count.
def Scatter_point():
    """Draw and show a scatter plot of the module-level df's rank and click columns."""
    ranks, clicks = df.排名, df.點擊數
    point_style = dict(color='yellow', s=25, marker="o")
    plt.scatter(ranks, clicks, **point_style)
    # Title and axis labels.
    plt.title("排名與點擊數-散點圖")
    plt.ylabel("點擊數")
    plt.xlabel("排名")
    plt.show()


Scatter_point()


# Line chart of rank against click count.
def draw():
    """Draw and show a line chart with point markers for the module-level df."""
    rank_col = df['排名']
    click_col = df['點擊數']
    # Line first, then the individual points over it.
    plt.plot(rank_col, click_col)
    plt.scatter(rank_col, click_col)
    # Axis labels and title.
    plt.xlabel('排名')
    plt.ylabel('點擊數')
    plt.title("繪制排名與點擊數折線圖")
    plt.show()


draw()





# Box plot and joint-distribution plots of rank against click count.
import seaborn as sns

# Workbook path; not used by the plots below, kept for compatibility.
# Raw string so the backslashes are literal (the original relied on
# unrecognised escapes such as "\S"); the value is unchanged.
file_path = r"E:\SteamLibrary\縱橫中文網點擊榜月榜.xlsx"

sns.boxplot(x='排名', y='點擊數', data=df)

# Joint plots in the original order: KDE (yellow), default, regression, hexbin.
for joint_options in ({'kind': 'kde', 'color': 'y'}, {}, {'kind': 'reg'}, {'kind': 'hex'}):
    sns.jointplot(x="排名", y='點擊數', data=df, **joint_options)

# Display the figures created above. The original never called show() here,
# so when run as a script later pyplot calls drew onto a leftover figure.
plt.show()






#繪制排名與點擊數關系圖
import numpy as np
import seaborn as sns
from scipy.optimize import leastsq
# Reload the full table from the workbook. read_excel already returns a
# DataFrame, so the extra pd.DataFrame(...) wrapper was redundant. The raw
# string keeps the path backslashes literal (the original relied on
# unrecognised escapes such as "\S"); the path value is unchanged.
df = pd.read_excel(r'E:\SteamLibrary\縱橫中文網點擊榜月榜.xlsx')
print(df.head())
# Independent variable: rank. Dependent variable: click count.
X = df.排名
Y = df.點擊數
def func(params, x):
    """Evaluate the quadratic a*x*x + b*x + c.

    params must unpack into exactly three coefficients (a, b, c).
    """
    a, b, c = params
    second_order = a * x * x
    first_order = b * x
    return second_order + first_order + c
def error(params,x,y):
    """Return the residual of the quadratic model: func(params, x) minus y."""
    predicted = func(params, x)
    return predicted - y
def main(a,b,c):
    """Leftover helper that is never called; it only binds a local p0.

    The arguments are ignored and nothing is returned.
    """
    p0=[0,0,0]


# Initial guess for the coefficients (a, b, c). The original assigned p0 only
# inside main(), which is never called, so leastsq() raised NameError.
p0 = [0, 0, 0]
# Least-squares fit of the quadratic model; element 0 of the result holds
# the fitted parameters.
Para = leastsq(error, p0, args=(X, Y))
a, b, c = Para[0]
print(" a=",a," b=",b," c=",c)
plt.scatter(X,Y,color="green",label=u"評分分布",linewidth=2)
# Sample the fitted curve over the observed range of X. The original sampled
# 1..10082854, which presumably came from the click counts rather than the
# ranks and stretches the x-axis far beyond the scatter points.
x = np.linspace(X.min(), X.max(), 30)
y = a * x * x + b * x + c
plt.plot(x,y,color="red",label=u"回歸方程直線",linewidth=2)
plt.title("排名與點擊數關系圖")
plt.legend()
plt.grid()
plt.show()


# Rebind a, b and c to empty tuples.
a, b, c = (), (), ()

四、結論（10分）
1.經過對主題數據的分析與可視化，可以得到哪些結論？

結論：通過對數據的分析與可視化，可以更直觀地看到各數據之間的關系及其分布，讓數據分析變得更容易。繪制圖形時，需要先整理好正確的數據，這樣才能繪制出正確的圖表。

2.對本次程序設計任務完成的情況做一個簡單的小結。

小結：通過這次任務，我了解了數據的分析與可視化，掌握了不少庫的使用，但對網絡爬取掌握得還不夠熟練。我在B站等平台上向一些專業的老師求教，在此期間，我對Python有了更深的了解，也對這門課程更加感興趣了。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 利用python爬取龍虎榜數據及后續分析爬取豆瓣電影排行榜爬取bilibili綜合熱門排行榜及數據分析網絡爬蟲&起點中文網完本榜500部小說爬取B站熱門視頻排行榜 VitePress中文網爬取疫情數據數據爬取去哪兒網數據爬取爬取騰訊疫情數據