I. Background of the Topic
My impression of the Forbes rich list still dates from my childhood, so I decided to use a web crawler to scrape the current ranking and take a fresh look at it, and then to analyse which countries the people on the list come from and how much wealth they hold altogether.
II. Design Plan for the Themed Web Crawler
1. Name of the crawler
Scraping the Forbes rich list and visualizing the results
2. Content to be scraped and data characteristics
The list of people on the Forbes ranking: rank, name, wealth, company and country.
3. Overview of the design plan
First, determine the theme of the project; then design the scraping program, run it, and store the scraped data in CSV form; next, use the pandas library to analyse and clean the data; after that, draw the charts; finally, save the results.
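The implementation in Section IV actually stores the data as an .xls file with xlwt and does not use pandas, so the following is only a minimal sketch of the CSV-plus-pandas step described above. It assumes the scraped records are the list of dicts produced by loadalldata() in Section IV, with a money field such as '1139億美元' (an illustrative value); clean_and_save_csv is a hypothetical helper name.

import pandas as pd

def clean_and_save_csv(persionlist, path="rank.csv"):
    # persionlist: list of dicts with keys num/name/money/company/country (assumed format)
    df = pd.DataFrame(persionlist)
    # strip the non-numeric unit (e.g. "億美元") and convert the wealth figure to a number
    df["money"] = pd.to_numeric(df["money"].str.replace(r"[^\d.]", "", regex=True), errors="coerce")
    df = df.dropna(subset=["name", "money"])             # drop rows that failed to parse
    df.to_csv(path, index=False, encoding="utf-8-sig")   # utf-8-sig so Excel displays Chinese correctly
    return df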
III. Structural Analysis of the Target Page
1. Structural analysis of the target page
The site to be scraped is https://www.phb123.com/renwu/fuhao/shishi.html, an ordinary HTML page.
Opening the page and pressing F12 to inspect it shows the tags that are needed (the elements marked with red arrows in the original screenshot): the ranking sits in a table with class rank-table, and the td cells of each row hold the rank, name, wealth, company and country.
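Before writing the full crawler, the assumed structure can be verified with a few lines. This is only a sketch of such a check: it prints the header row of the rank-table table, and the selectors may need adjusting if the site has changed.

import requests
from bs4 import BeautifulSoup

url = "https://www.phb123.com/renwu/fuhao/shishi.html"
headers = {"User-Agent": "Mozilla/5.0"}          # minimal UA string; the crawler below sends a fuller one
html = requests.get(url, headers=headers).content
table = BeautifulSoup(html, "lxml").find("table", class_="rank-table")
if table is None:
    print("rank-table not found - the page structure may have changed")
else:
    header_row = table.find("tr")
    print([cell.get_text(strip=True) for cell in header_row.find_all(["th", "td"])])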
IV. Web Crawler Program Design
1. Fetching the data
# Load the data from one page of the ranking
def loaddata(url):
    from bs4 import BeautifulSoup
    import requests
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                     'Chrome/72.0.3626.121 Safari/537.36'
    }
    f = requests.get(url,headers=headers)    # GET the page to obtain its HTML
    soup = BeautifulSoup(f.content, "lxml")  # parse the page with the lxml parser
    # print(f.content.decode())              # print the raw page content to check that the request succeeded
    ranktable = soup.find_all('table',class_="rank-table")[0]  # the ranking table
    trlist = ranktable.find_all('tr')        # all tr tags in the table
    trlist.pop(0)                            # drop the header row
    persionlist = []
    for tr in trlist:
        persion = {}
        persion['num'] = tr.find_all('td')[0].string        # rank
        persion['name'] = tr.find_all('td')[1].p.string      # name
        persion['money'] = tr.find_all('td')[2].string       # wealth
        persion['company'] = tr.find_all('td')[3].string     # company
        persion['country'] = tr.find_all('td')[4].a.string   # country
        persionlist.append(persion)
    print("頁面"+url+"爬取成功")
    return persionlist
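To test loaddata on its own before scraping every page, it can be called on a single page and the first few records inspected; the URL below is the first paginated page used in the next step:

sample = loaddata("https://www.phb123.com/renwu/fuhao/shishi_1.html")
for persion in sample[:3]:    # print the first three records to confirm the field mapping
    print(persion)

Note that the parsing relies on each name cell containing a p tag and each country cell an a tag; if the page layout changes, those lookups return None and raise an AttributeError.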

2. Writing the scraped data to an Excel spreadsheet
# Load all pages of the Forbes ranking
def loadalldata():
    alldata = []
    for i in range(1,16,1):
        url = "https://www.phb123.com/renwu/fuhao/shishi_"+str(i)+".html"
        data = loaddata(url)
        alldata = alldata + data
    return alldata

# Save the scraped data to a file
def savedata(path,persionlist):
    import xlwt
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('test')
    worksheet.write(0, 0, '排名')
    worksheet.write(0, 1, '姓名')
    worksheet.write(0, 2, '財富')
    worksheet.write(0, 3, '企業')
    worksheet.write(0, 4, '國家')
    for i in range(1,len(persionlist)+1,1):
        worksheet.write(i, 0, persionlist[i-1]['num'])
        worksheet.write(i, 1, persionlist[i-1]['name'])
        worksheet.write(i, 2, persionlist[i-1]['money'])
        worksheet.write(i, 3, persionlist[i-1]['company'])
        worksheet.write(i, 4, persionlist[i-1]['country'])
    workbook.save(path)
    print("數據保存成功:"+path)
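Since the design plan in Section II mentions CSV storage while savedata above writes an .xls file, a standard-library alternative could look like the sketch below; savedata_csv is a hypothetical helper that writes the same five columns:

import csv

def savedata_csv(path, persionlist):
    # write the same five columns as savedata(), but as a CSV file
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(['排名', '姓名', '財富', '企業', '國家'])
        for persion in persionlist:
            writer.writerow([persion['num'], persion['name'], persion['money'],
                             persion['company'], persion['country']])
    print("數據保存成功:" + path)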
The resulting spreadsheet has one row per person and the five columns written above: 排名, 姓名, 財富, 企業 and 國家.
3. Processing the data: extracting the top-ten entries and counting entries per country
# Return the names and wealth figures of the top ten, as two lists
def loadtop10(path):
    import xlrd
    book = xlrd.open_workbook(path)
    sheet1 = book.sheets()[0]
    namelist = sheet1.col_values(1)
    moneylist = sheet1.col_values(2)
    namelist = namelist[1:11]
    moneylist = moneylist[1:11]

    moneylist2 = []
    for a in moneylist:
        a = int(a[0:-3])     # strip the trailing three-character unit (e.g. "億美元")
        moneylist2.append(a)
    print("取出排行榜前十的姓名和財富數據")
    print(namelist)
    print(moneylist2)
    return namelist,moneylist2

# Count how many people from each country are on the list; return a list of dicts
def countcountrynum(path):
    import xlrd
    book = xlrd.open_workbook(path)
    sheet1 = book.sheets()[0]
    countrylist = sheet1.col_values(4)[1:]   # every country value after the header row
    print(countrylist)
    countryset = list(set(countrylist))
    dictlist = []
    for country in countryset:
        obj = {"name":country,"count":0}
        dictlist.append(obj)
    # count the number of entries for each country
    for obj in dictlist:
        for a in countrylist:
            if obj['name'] == a:
                obj['count'] = obj['count'] + 1
    print(dictlist)
    # sort dictlist so that larger counts come first (simple bubble sort)
    for i in range(0,len(dictlist),1):
        for j in range(0,len(dictlist)-i-1,1):
            if dictlist[j]['count'] < dictlist[j+1]['count']:
                temp = dictlist[j]
                dictlist[j] = dictlist[j+1]
                dictlist[j+1] = temp
    dictlist2 = dictlist[0:5]
    set2 = []
    for a in dictlist2:
        set2.append(a['name'])
    othercount = 0
    for a in dictlist:
        if a['name'] not in set2:
            othercount = othercount + a['count']   # sum the people from the remaining countries
    dictlist2.append({"name":"其他","count":othercount})
    print('獲取排行榜中每個國家的上榜人數')
    print(dictlist2)
    return dictlist2
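The per-country tally above is a frequency count followed by a hand-written bubble sort; collections.Counter can serve as a cross-check. A sketch assuming the same rank.xls layout (country in column 4); countcountrynum_check is a hypothetical name:

from collections import Counter
import xlrd

def countcountrynum_check(path, top=5):
    # frequency count of the country column, grouped into the top countries plus "其他"
    sheet = xlrd.open_workbook(path).sheets()[0]
    counts = Counter(sheet.col_values(4)[1:])            # skip the header row
    top_items = counts.most_common(top)
    other = sum(counts.values()) - sum(c for _, c in top_items)
    return [{"name": n, "count": c} for n, c in top_items] + [{"name": "其他", "count": other}]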
The extracted top-ten names and wealth figures, and the per-country counts, are printed to the console by the two functions above.
4. Visualizing the data
# Draw the bar chart and the pie chart
def drow():
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # use a font that can display the Chinese labels
    plt.figure('福布斯前十榜',figsize=(15,5))

    # load the top-ten data from the ranking
    listx,listy = loadtop10('rank.xls')

    plt.title('福布斯前十榜', fontsize=16)
    plt.xlabel('人物', fontsize=14)
    plt.ylabel('金額/億美元', fontsize=14)
    plt.tick_params(labelsize=10)
    plt.grid(linestyle=':', axis='y')
    a = plt.bar(listx, listy, color='dodgerblue', align='center')
    # write each bar's value above it
    for i in a:
        h = i.get_height()
        plt.text(i.get_x() + i.get_width() / 2, h, '%d' % int(h), ha='center', va='bottom')
    # -------------------------------------------------------------------------
    dictlist = countcountrynum("rank.xls")
    plt.figure('各國家上榜人數所占比例')
    labels = []
    sizes = []
    for a in dictlist:
        labels.append(a['name'])
        sizes.append(a['count'])
    explode = (0.1, 0, 0, 0, 0, 0)
    plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
    plt.title("各國家上榜人數所占比例", fontsize=16)
    plt.axis('equal')  # keep the pie chart circular

    plt.show()

if __name__ == '__main__':

    # scrape the data
    data = loadalldata()
    # save the data (creates rank.xls in the same directory as the .py file)
    savedata("rank.xls",data)
    # display the data
    drow()
The result is a bar chart of the ten largest fortunes and a pie chart showing the share of list members from each country.
5. Complete code
# Load the data from one page of the ranking
def loaddata(url):
    from bs4 import BeautifulSoup
    import requests
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                     'Chrome/72.0.3626.121 Safari/537.36'
    }
    f = requests.get(url,headers=headers)    # GET the page to obtain its HTML
    soup = BeautifulSoup(f.content, "lxml")  # parse the page with the lxml parser
    # print(f.content.decode())              # print the raw page content to check that the request succeeded
    ranktable = soup.find_all('table',class_="rank-table")[0]  # the ranking table
    trlist = ranktable.find_all('tr')        # all tr tags in the table
    trlist.pop(0)                            # drop the header row
    persionlist = []
    for tr in trlist:
        persion = {}
        persion['num'] = tr.find_all('td')[0].string        # rank
        persion['name'] = tr.find_all('td')[1].p.string      # name
        persion['money'] = tr.find_all('td')[2].string       # wealth
        persion['company'] = tr.find_all('td')[3].string     # company
        persion['country'] = tr.find_all('td')[4].a.string   # country
        persionlist.append(persion)
    print("頁面"+url+"爬取成功")
    return persionlist


# Load all pages of the Forbes ranking
def loadalldata():
    alldata = []
    for i in range(1,16,1):
        url = "https://www.phb123.com/renwu/fuhao/shishi_"+str(i)+".html"
        data = loaddata(url)
        alldata = alldata + data
    return alldata

# Save the scraped data to a file
def savedata(path,persionlist):
    import xlwt
    workbook = xlwt.Workbook()
    worksheet = workbook.add_sheet('test')
    worksheet.write(0, 0, '排名')
    worksheet.write(0, 1, '姓名')
    worksheet.write(0, 2, '財富')
    worksheet.write(0, 3, '企業')
    worksheet.write(0, 4, '國家')
    for i in range(1,len(persionlist)+1,1):
        worksheet.write(i, 0, persionlist[i-1]['num'])
        worksheet.write(i, 1, persionlist[i-1]['name'])
        worksheet.write(i, 2, persionlist[i-1]['money'])
        worksheet.write(i, 3, persionlist[i-1]['company'])
        worksheet.write(i, 4, persionlist[i-1]['country'])
    workbook.save(path)
    print("數據保存成功:"+path)

# Return the names and wealth figures of the top ten, as two lists
def loadtop10(path):
    import xlrd
    book = xlrd.open_workbook(path)
    sheet1 = book.sheets()[0]
    namelist = sheet1.col_values(1)
    moneylist = sheet1.col_values(2)
    namelist = namelist[1:11]
    moneylist = moneylist[1:11]

    moneylist2 = []
    for a in moneylist:
        a = int(a[0:-3])     # strip the trailing three-character unit (e.g. "億美元")
        moneylist2.append(a)
    print("取出排行榜前十的姓名和財富數據")
    print(namelist)
    print(moneylist2)
    return namelist,moneylist2

# Count how many people from each country are on the list; return a list of dicts
def countcountrynum(path):
    import xlrd
    book = xlrd.open_workbook(path)
    sheet1 = book.sheets()[0]
    countrylist = sheet1.col_values(4)[1:]   # every country value after the header row
    print(countrylist)
    countryset = list(set(countrylist))
    dictlist = []
    for country in countryset:
        obj = {"name":country,"count":0}
        dictlist.append(obj)
    # count the number of entries for each country
    for obj in dictlist:
        for a in countrylist:
            if obj['name'] == a:
                obj['count'] = obj['count'] + 1
    print(dictlist)
    # sort dictlist so that larger counts come first (simple bubble sort)
    for i in range(0,len(dictlist),1):
        for j in range(0,len(dictlist)-i-1,1):
            if dictlist[j]['count'] < dictlist[j+1]['count']:
                temp = dictlist[j]
                dictlist[j] = dictlist[j+1]
                dictlist[j+1] = temp
    dictlist2 = dictlist[0:5]
    set2 = []
    for a in dictlist2:
        set2.append(a['name'])
    othercount = 0
    for a in dictlist:
        if a['name'] not in set2:
            othercount = othercount + a['count']   # sum the people from the remaining countries
    dictlist2.append({"name":"其他","count":othercount})
    print('獲取排行榜中每個國家的上榜人數')
    print(dictlist2)
    return dictlist2

# Draw the bar chart and the pie chart
def drow():
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # use a font that can display the Chinese labels
    plt.figure('福布斯前十榜',figsize=(15,5))

    # load the top-ten data from the ranking
    listx,listy = loadtop10('rank.xls')

    plt.title('福布斯前十榜', fontsize=16)
    plt.xlabel('人物', fontsize=14)
    plt.ylabel('金額/億美元', fontsize=14)
    plt.tick_params(labelsize=10)
    plt.grid(linestyle=':', axis='y')
    a = plt.bar(listx, listy, color='dodgerblue', align='center')
    # write each bar's value above it
    for i in a:
        h = i.get_height()
        plt.text(i.get_x() + i.get_width() / 2, h, '%d' % int(h), ha='center', va='bottom')
    # -------------------------------------------------------------------------
    dictlist = countcountrynum("rank.xls")
    plt.figure('各國家上榜人數所占比例')
    labels = []
    sizes = []
    for a in dictlist:
        labels.append(a['name'])
        sizes.append(a['count'])
    explode = (0.1, 0, 0, 0, 0, 0)
    plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
    plt.title("各國家上榜人數所占比例", fontsize=16)
    plt.axis('equal')  # keep the pie chart circular

    plt.show()

if __name__ == '__main__':

    # scrape the data
    data = loadalldata()
    # save the data (creates rank.xls in the same directory as the .py file)
    savedata("rank.xls",data)
    # display the data
    drow()
V. Summary
1. The analysis and visualization of the data show that most of the people on the list are from the United States, which accounts for 38.8% of the total, followed by China at 26.3%.
2. This project deepened my understanding of how web crawlers and data visualization work. What still needs improvement is my grasp of the visualization part, which I need to study further.