python爬蟲爬取福布斯排行榜並可視化


一、選題的背景

由於我對福布斯排行榜的印象還處於小時候的階段,所以我用網絡爬蟲去爬取福布斯排行榜來重新認識一下,並且去分析榜單上的人都來自哪些國家,和一共有多少資金。

二、主題式網絡爬蟲設計方案

1.主題式網絡爬蟲名稱

爬取福布斯排行榜並可視化

2.主題式網絡爬蟲爬取的內容與數據特征分析

爬取福布斯排行榜上的名單

3.主題式網絡爬蟲設計方案概述

先確定此次選題的主題內容,然后設計爬取程序爬取數據,並以xls(Excel)表格的形式儲存,接着利用xlrd庫讀取數據並進行分析以及清洗,之后進行圖形與圖像的繪制。最后,保存數據。

三、主題頁面的結構特征分析

1.主題頁面的結構特征分析

要爬取的網站是https://www.phb123.com/renwu/fuhao/shishi.html,是個靜態html網頁;榜單各分頁的地址形如 shishi_1.html 至 shishi_15.html。

 

 通過進入網頁按下f12進行檢查,找到所需要的標簽名——上圖中紅色箭頭所示。

四、網絡爬蟲程序設計
1.獲取數據
 1 # 讀取一頁的數據
 2 def loaddata(url):
 3    from bs4 import BeautifulSoup
 4    import requests
 5    headers = {
 6        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) '
 7                     'Chrome/72.0.3626.121 Safari/537.36'
 8    }
 9    f = requests.get(url,headers=headers)   #Get該網頁從而獲取該html內容
10    soup = BeautifulSoup(f.content, "lxml")  #用lxml解析器解析該網頁的內容
11    # print(f.content.decode())        #嘗試打印出網頁內容,看是否獲取成功
12    ranktable = soup.find_all('table',class_="rank-table" )[0]   #獲取排行榜表格
13    trlist = ranktable.find_all('tr') #獲取表格中所有tr標簽
14    trlist.pop(0) #去掉第一個元素
15    persionlist = []
16    for tr in trlist:
17       persion = {}
18       persion['num'] = tr.find_all('td')[0].string  #編號
19       persion['name'] = tr.find_all('td')[1].p.string #名稱
20       persion['money'] = tr.find_all('td')[2].string #財產
21       persion['company'] = tr.find_all('td')[3].string #企業
22       persion['country'] = tr.find_all('td')[4].a.string #國家
23       persionlist.append(persion)
24    print("頁面"+url+"爬取成功")
25    return persionlist

2.將所爬取到的數據轉入excel表格內

 1 ## 讀取所有福布斯排行榜數據
 2 def loadalldata():
 3    alldata = []
 4    for i in range(1,16,1):
 5       url = "https://www.phb123.com/renwu/fuhao/shishi_"+str(i)+".html"
 6       data = loaddata(url)
 7       alldata = alldata + data
 8    return alldata
 9 
10 ## 將爬取的數據保存到文件
def savedata(path, persionlist):
    """Write the crawled rows to a workbook saved at *path*.

    Row 0 receives the five column headings; each entry of *persionlist*
    follows on its own row in list order.

    Args:
        path: Destination file name handed to ``Workbook.save``.
        persionlist: List of dicts with the keys ``num``, ``name``,
            ``money``, ``company`` and ``country``.
    """
    import xlwt

    book = xlwt.Workbook()
    sheet = book.add_sheet('test')

    headings = ('排名', '姓名', '財富', '企業', '國家')
    for col, title in enumerate(headings):
        sheet.write(0, col, title)

    # Column order of the data matches the heading order above.
    fields = ('num', 'name', 'money', 'company', 'country')
    for row, person in enumerate(persionlist, start=1):
        for col, key in enumerate(fields):
            sheet.write(row, col, person[key])

    book.save(path)
    print("數據保存成功:"+path)

表格內容如下

3.對數據進行處理,取出前十位的數據

 1 ## 取出排行榜前十的姓名和財富數據 以兩個list返回
 2 def loadtop10(path):
 3     import xlrd
 4     book = xlrd.open_workbook(path)
 5     sheet1 = book.sheets()[0]
 6     namelist = sheet1.col_values(1)
 7     moneylist = sheet1.col_values(2)
 8     namelist = namelist[1:11]
 9     moneylist = moneylist[1:11]
10 
11     moneylist2 = []
12     for a in moneylist:
13         a = int(a[0:-3])
14         moneylist2.append(a)
15     print("取出排行榜前十的姓名和財富數據")
16     print(namelist)
17     print(moneylist2)
18     return namelist,moneylist2
19 
20 ## 統計排行榜中每個國家的上榜人數 以字典list返回
def countcountrynum(path):
    """Count people per country and keep the five most frequent countries.

    Args:
        path: Workbook file to open with ``xlrd``; column 4 of the first
            sheet is read as the country column.

    Returns:
        A list of ``{"name": ..., "count": ...}`` dicts: up to five
        countries in descending order of head count, followed by one
        final ``"其他"`` entry whose count is the number of people from
        all remaining countries (0 when there are none).
    """
    import xlrd
    from collections import Counter

    book = xlrd.open_workbook(path)
    sheet1 = book.sheets()[0]
    # Skip only the heading cell; slicing up to -1 would also discard the
    # last person on the list.
    countrylist = sheet1.col_values(4)[1:]
    print(countrylist)

    counts = Counter(countrylist)
    print([{"name": name, "count": count} for name, count in counts.items()])

    # most_common() sorts by descending count in one step.
    ranked = counts.most_common()
    dictlist2 = [{"name": name, "count": count} for name, count in ranked[:5]]
    # "其他" must add up people, not the number of leftover countries,
    # so that the shares of all entries sum to the whole list.
    othercount = sum(count for _, count in ranked[5:])
    dictlist2.append({"name": "其他", "count": othercount})

    print('獲取排行榜中每個國家的上榜人數')
    print(dictlist2)
    return dictlist2

取出后如下圖

 

 4.進行數據可視化

 1 ## 繪制條形圖和餅狀圖
 2 def drow():
 3    import matplotlib.pyplot as plt
 4    plt.rcParams['font.sans-serif'] = ['SimHei'] # 設置中文字體
 5    plt.figure('福布斯前十榜',figsize=(15,5))
 6 
 7    ## 讀取福布斯排行榜前十的數據
 8    listx,listy = loadtop10('rank.xls')
 9 
10    plt.title('福布斯前十榜', fontsize=16)
11    plt.xlabel('人物', fontsize=14)
12    plt.ylabel('金額/億美元', fontsize=14)
13    plt.tick_params(labelsize=10)
14    plt.grid(linestyle=':', axis='y')
15    a = plt.bar(listx, listy, color='dodgerblue', label='Apple', align='center')
16    # 設置標簽
17    for i in a:
18       h = i.get_height()
19       plt.text(i.get_x() + i.get_width() / 2, h, '%d' % int(h), ha='center', va='bottom')
20    ## -------------------------------------------------------------------------
21    dictlist = countcountrynum("rank.xls")
22    plt.figure('各國家上榜人數所占比例')
23    labels = []
24    sizes = []
25    for a in dictlist:
26       labels.append(a['name'])
27       sizes.append(a['count'])
28    explode = (0.1, 0, 0, 0, 0, 0)
29    plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
30    plt.title("各國家上榜人數所占比例", fontsize=16)
31    plt.axis('equal')  # 該行代碼使餅圖長寬相等
32 
33    plt.show()
34 
if __name__ == '__main__':
    # Pipeline: crawl every page, store the rows in rank.xls (relative
    # path), then plot the charts.
    records = loadalldata()
    savedata("rank.xls", records)
    drow()

效果如圖

5.完整代碼

  1 # 讀取一頁的數據
  2 def loaddata(url):
  3    from bs4 import BeautifulSoup
  4    import requests
  5    headers = {
  6        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) '
  7                     'Chrome/72.0.3626.121 Safari/537.36'
  8    }
  9    f = requests.get(url,headers=headers)   #Get該網頁從而獲取該html內容
 10    soup = BeautifulSoup(f.content, "lxml")  #用lxml解析器解析該網頁的內容
 11    # print(f.content.decode())        #嘗試打印出網頁內容,看是否獲取成功
 12    ranktable = soup.find_all('table',class_="rank-table" )[0]   #獲取排行榜表格
 13    trlist = ranktable.find_all('tr') #獲取表格中所有tr標簽
 14    trlist.pop(0) #去掉第一個元素
 15    persionlist = []
 16    for tr in trlist:
 17       persion = {}
 18       persion['num'] = tr.find_all('td')[0].string  #編號
 19       persion['name'] = tr.find_all('td')[1].p.string #名稱
 20       persion['money'] = tr.find_all('td')[2].string #財產
 21       persion['company'] = tr.find_all('td')[3].string #企業
 22       persion['country'] = tr.find_all('td')[4].a.string #國家
 23       persionlist.append(persion)
 24    print("頁面"+url+"爬取成功")
 25    return persionlist
 26 
 27 
 28 ## 讀取所有福布斯排行榜數據
 29 def loadalldata():
 30    alldata = []
 31    for i in range(1,16,1):
 32       url = "https://www.phb123.com/renwu/fuhao/shishi_"+str(i)+".html"
 33       data = loaddata(url)
 34       alldata = alldata + data
 35    return alldata
 36 
 37 ## 將爬取的數據保存到文件
 38 def savedata(path,persionlist):
 39    import xlwt
 40    workbook = xlwt.Workbook()
 41    worksheet = workbook.add_sheet('test')
 42    worksheet.write(0, 0, '排名')
 43    worksheet.write(0, 1, '姓名')
 44    worksheet.write(0, 2, '財富')
 45    worksheet.write(0, 3, '企業')
 46    worksheet.write(0, 4, '國家')
 47    for i in range(1,len(persionlist)+1,1):
 48       worksheet.write(i, 0, persionlist[i-1]['num'])
 49       worksheet.write(i, 1, persionlist[i-1]['name'])
 50       worksheet.write(i, 2, persionlist[i-1]['money'])
 51       worksheet.write(i, 3, persionlist[i-1]['company'])
 52       worksheet.write(i, 4, persionlist[i-1]['country'])
 53    workbook.save(path)
 54    print("數據保存成功:"+path)
 55 
 56 ## 取出排行榜前十的姓名和財富數據 以兩個list返回
 57 def loadtop10(path):
 58     import xlrd
 59     book = xlrd.open_workbook(path)
 60     sheet1 = book.sheets()[0]
 61     namelist = sheet1.col_values(1)
 62     moneylist = sheet1.col_values(2)
 63     namelist = namelist[1:11]
 64     moneylist = moneylist[1:11]
 65 
 66     moneylist2 = []
 67     for a in moneylist:
 68         a = int(a[0:-3])
 69         moneylist2.append(a)
 70     print("取出排行榜前十的姓名和財富數據")
 71     print(namelist)
 72     print(moneylist2)
 73     return namelist,moneylist2
 74 
 75 ## 統計排行榜中每個國家的上榜人數 以字典list返回
 76 def countcountrynum(path):
 77    import xlrd
 78    book = xlrd.open_workbook(path)
 79    sheet1 = book.sheets()[0]
 80    countrylist = sheet1.col_values(4)[1:-1]
 81    print(countrylist)
 82    countryset = list(set(countrylist))
 83    dictlist = []
 84    for country in countryset:
 85       obj = {"name":country,"count":0}
 86       dictlist.append(obj)
 87    ## 統計出每個國家對應的數量
 88    for obj in dictlist:
 89       for a in countrylist:
 90          if obj['name'] == a:
 91             obj['count'] = obj['count'] + 1
 92    print(dictlist)
 93    ## 將dictlist排序 數量多的放前面 8 5 6 9 3 2 4
 94    for i in range(0,len(dictlist),1):
 95       for j in range(0,len(dictlist)-i-1,1):
 96           if dictlist[j]['count'] < dictlist[j+1]['count']:
 97              temp = dictlist[j]
 98              dictlist[j] = dictlist[j+1]
 99              dictlist[j+1] = temp
100    dictlist2 = dictlist[0:5]
101    set2 = []
102    for a in dictlist2:
103       set2.append(a['name'])
104    othercount = 0;
105    for a in dictlist:
106       if a['name'] not in set2:
107          othercount = othercount + 1
108    dictlist2.append({"name":"其他","count":othercount})
109    print('獲取排行榜中每個國家的上榜人數')
110    print(dictlist2)
111    return dictlist2
112 
113 ## 繪制條形圖和餅狀圖
def drow(path='rank.xls'):
    """Draw the top-ten bar chart and the per-country pie chart.

    Args:
        path: Workbook to read the data from. Defaults to ``'rank.xls'``.

    The function opens two matplotlib figures and blocks in
    ``plt.show()`` until they are closed.
    """
    import matplotlib.pyplot as plt

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
    plt.figure('福布斯前十榜', figsize=(15, 5))

    # Bar chart: names on the x axis, fortunes on the y axis.
    listx, listy = loadtop10(path)

    plt.title('福布斯前十榜', fontsize=16)
    plt.xlabel('人物', fontsize=14)
    plt.ylabel('金額/億美元', fontsize=14)
    plt.tick_params(labelsize=10)
    plt.grid(linestyle=':', axis='y')
    # NOTE(review): label='Apple' looks like a leftover; it only matters if a
    # legend is ever added.
    bars = plt.bar(listx, listy, color='dodgerblue', label='Apple', align='center')
    # Print each bar's value just above it.
    for bar in bars:
        h = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2, h, '%d' % int(h), ha='center', va='bottom')

    # Pie chart: share of list entries per country.
    dictlist = countcountrynum(path)
    plt.figure('各國家上榜人數所占比例')
    labels = [entry['name'] for entry in dictlist]
    sizes = [entry['count'] for entry in dictlist]
    # pie() requires one explode offset per wedge, so derive the length from
    # the data; only the first wedge is pulled out.
    explode = [0.1 if index == 0 else 0 for index in range(len(sizes))]
    plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
    plt.title("各國家上榜人數所占比例", fontsize=16)
    plt.axis('equal')  # equal aspect ratio keeps the pie circular

    plt.show()
146 
if __name__ == '__main__':
    # Pipeline: crawl every page, store the rows in rank.xls (relative
    # path), then plot the charts.
    records = loadalldata()
    savedata("rank.xls", records)
    drow()

五、總結

1.經過對主題數據的分析與可視化,可以看出進入榜單的人中美國人最多,總占比38.8%;其次是中國,總占比為26.3%。

2.在這次設計過程中,進一步加深對網絡爬蟲和數據可視化原理的認知;需要改進的是對於可視化部分的掌握還是不夠,需要進一步了解。

 
 

 

 

 

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM