Python爬蟲與數據圖表的實現


要求:

1. 參考教材實例20,編寫Python爬蟲程序,獲取江西省所有高校的大學排名數據記錄,並打印輸出。

2. 使用numpy和matplotlib等庫分析數據,並繪製南昌大學、華東交通大學、江西理工大學三個高校的總分排名、生源質量(新生高考成績得分)、培養結果(畢業生就業率)、頂尖成果(高被引論文·篇)等四個指標構成的多指標柱形圖。

3. 對江西各高校的頂尖成果(高被引論文數量)進行分析,使用matplotlib繪製各高校頂尖成果數構成的餅狀圖,並突出江西理工大學所在的餅狀塊。

實例代碼:

import requests
from bs4 import BeautifulSoup
import numpy as np
import matplotlib.pyplot as plt

# Module-level accumulator: fillUnivList appends one list of cell values per
# table row, and the print/draw functions read rows back out of it by index.
allUniv = []
def getHTMLText(url):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or an empty string when the request fails
        (connection problem, timeout after 30 seconds, invalid URL, or an
        HTTP error status reported by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures map to the empty-string result; anything
        # else (KeyboardInterrupt, programming errors) is allowed to surface.
        return ""
    # Force UTF-8 instead of relying on the encoding guessed from the headers.
    r.encoding = 'utf-8'
    return r.text

def fillUnivList(soup):
    """Collect the cell values of every table row in *soup* into ``allUniv``.

    Each ``<tr>`` that contains at least one ``<td>`` contributes one list
    holding the ``.string`` of each of its cells, in document order. Rows
    without any ``<td>`` are skipped.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        The total number of rows stored in ``allUniv`` after this call.
    """
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if cells:
            allUniv.append([cell.string for cell in cells])
    return len(allUniv)

def printUnivList(num):
    """Print a header line followed by every Jiangxi row among the first *num*.

    A row is printed when its column 2 (province) equals "江西". The printed
    fields are columns 0-3 as stored, plus columns 4, 5 and 9 converted with
    ``str``; they appear under the matching header labels.

    Args:
        num: Number of leading rows of ``allUniv`` to examine.
    """
    # The header and the data lines share one centred, tab-separated layout.
    row_fmt = "{0:^4}\t{1:^20}\t{2:^5}\t{3:^8}\t{4:^8}\t{5:^8}\t{6:^8}"
    print(row_fmt.format("排名", "學校名稱", "省市", "總分", "生源質量", "培養結果", "頂尖成果"))
    for idx in range(num):
        row = allUniv[idx]
        if row[2] != "江西":
            continue
        print(row_fmt.format(
            row[0], row[1], row[2], row[3],
            str(row[4]), str(row[5]), str(row[9]),
        ))

def drawBarChart(num):
    """Show a grouped bar chart comparing three universities on four indicators.

    For each of the first *num* rows of ``allUniv`` whose name (column 1)
    matches one of the three universities, columns 3, 4, 5 and 9 are converted
    to ``float`` (a ``%`` sign is stripped from column 5 first) and used as
    that university's bar heights.

    Args:
        num: Number of leading rows of ``allUniv`` to scan.
    """
    jxust_vals, ncu_vals, ecjtu_vals = [], [], []
    # Checked in this order for every row; a row may only match one name.
    buckets = (
        ("江西理工大學", jxust_vals),
        ("南昌大學", ncu_vals),
        ("華東交通大學", ecjtu_vals),
    )
    for idx in range(num):
        row = allUniv[idx]
        for school, values in buckets:
            if row[1] == school:
                values.extend([
                    float(row[3]),
                    float(row[4]),
                    float(str(row[5]).replace('%', '')),
                    float(row[9]),
                ])

    indicator_names = ['總分', '生源質量', '培養結果', "頂尖成果"]
    # A 0.8-wide group is divided into four slots, three of which hold bars.
    group_width, slots = 0.8, 4
    bar_width = group_width / slots
    # Each series sits one bar width to the right of the previous one.
    first_x = list(range(len(indicator_names)))
    second_x = [pos + bar_width for pos in first_x]
    third_x = [pos + bar_width for pos in second_x]

    plt.subplots()
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = 'SimHei'
    plt.bar(first_x, jxust_vals, width=bar_width, label='江西理工大學',
            tick_label=indicator_names, fc='r')
    plt.bar(second_x, ncu_vals, width=bar_width, label='南昌大學', fc='y')
    plt.bar(third_x, ecjtu_vals, width=bar_width, label='華東交通大學', fc='b')
    plt.legend()
    plt.show()

def drawBar(num):
    """Show a pie chart of column 9 for every Jiangxi university.

    Despite its name, this function draws a pie chart. Rows among the first
    *num* entries of ``allUniv`` whose column 2 equals "江西" each become one
    wedge, labelled with the university name (column 1) and sized by column 9
    exactly as stored. The wedge for 江西理工大學 is pulled out of the pie.

    Args:
        num: Number of leading rows of ``allUniv`` to scan.
    """
    jiangxi_rows = [
        allUniv[idx] for idx in range(num) if allUniv[idx][2] == "江西"
    ]
    wedge_sizes = [row[9] for row in jiangxi_rows]
    wedge_labels = [row[1] for row in jiangxi_rows]
    # Only the highlighted university gets a non-zero offset from the centre.
    wedge_offsets = [
        0.5 if label == "江西理工大學" else 0 for label in wedge_labels
    ]

    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = 'SimHei'
    _, axes = plt.subplots()
    axes.pie(wedge_sizes, explode=wedge_offsets, labels=wedge_labels,
             autopct='%1.1f%%', shadow=True, startangle=90)
    # Equal aspect ratio keeps the pie circular.
    axes.axis('equal')
    plt.legend()
    plt.show()

def main():
    """Fetch the 2018 ranking page, parse it, then print and plot the results.

    The page is downloaded and parsed with the ``html.parser`` backend, its
    table rows are loaded into ``allUniv``, and the resulting row count is
    handed to the printing step and the two chart steps in turn.
    """
    ranking_url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html"
    page = BeautifulSoup(getHTMLText(ranking_url), "html.parser")
    row_count = fillUnivList(page)
    # Text listing first, then the bar chart, then the pie chart.
    for step in (printUnivList, drawBarChart, drawBar):
        step(row_count)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

 

江西省高校排名結果如下:

 

三校部分數據對比如下:

 

江西各高校的頂尖成果(高被引論文數量)對比分析如下:

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM