參考鏈接:https://www.makcyun.top/web_scraping_withpython2.html
#!/usr/bin/env python # -*- coding: utf-8 -*-
from io import StringIO
from multiprocessing.pool import Pool

import pandas as pd
import requests
from sqlalchemy import create_engine

# Database connection settings
# Pieces of the MySQL connection URL.
HOSTNAME = '127.0.0.1'
PORT = '3306'
DATABASE = 'top500'
USERNAME = 'root'
PASSWORD = 'root'

# SQLAlchemy URL for the mysqlconnector driver; utf8mb4 is requested through
# the query string.
SQLALCHEMY_DATABASE_URI = (
    "mysql+mysqlconnector://{username}:{password}@{host}:{port}/{db}?charset=utf8mb4"
).format(
    db=DATABASE,
    host=HOSTNAME,
    port=PORT,
    username=USERNAME,
    password=PASSWORD,
)

# NOTE(review): the two settings below are not read anywhere in the visible
# code. "SQLALCHEMT" looks like a misspelling of "SQLALCHEMY" but is kept
# because it is a module-level name -- confirm before renaming.
SQLALCHEMY_TRACK_MODIFICATIONS = False
SQLALCHEMT_ENCODING = 'utf8mb4'

# Module-level engine used by the MySQL writer; echo=True turns on SQLAlchemy's
# statement logging.
engine = create_engine(SQLALCHEMY_DATABASE_URI, echo=True)

# Fetch the web page data
def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, connection error or timeout).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failures are reported the same way as a non-200
        # answer, so callers only have to check for None.
        return None
    if response.status_code == 200:
        return response.text
    return None


# Save to a CSV file
def save_csv(html):
    """Append the first HTML table found in *html* to ``top500.csv``.

    Args:
        html: Page source as a string, or ``None`` when the download failed.
            ``None`` is reported with ``fail`` and nothing is written.
    """
    if html is None:
        # The page fetcher signals a failed download with None; parsing it
        # would only raise inside pandas.
        print('fail')
        return

    # Recent pandas versions deprecate passing literal HTML to read_html, so
    # the text is wrapped in a file-like object.
    tables = pd.read_html(StringIO(html))

    # Take the first table on the page and drop the row labelled 0. Dropping
    # it leaves the CSV without column names; keeping it would write the
    # column names again on every append.
    tb = tables[0].drop([0])
    # tb.columns = ['rank', 'site', 'system', 'cores', 'rmax', 'rpeak', 'power']  # rename the columns
    tb.to_csv(r'top500.csv', mode='a', encoding='utf_8_sig', index=True, header=False)
def save_mysql(html):
    """Append the first HTML table found in *html* to the ``top500`` table.

    The ``top500`` table has to exist beforehand, its field names must match
    the column names assigned below one-to-one, and the fields must be long
    enough for the values.

    Args:
        html: Page source as a string, or ``None`` when the download failed.
            ``None`` is reported with ``fail`` and nothing is written.
    """
    if html is None:
        # Nothing to parse; report it the same way as a failed insert.
        print('fail')
        return

    # Recent pandas versions deprecate passing literal HTML to read_html, so
    # the text is wrapped in a file-like object.
    tables = pd.read_html(StringIO(html))
    tb = tables[0].drop([0])
    tb.columns = ['rank', 'site', 'system', 'cores', 'rmax', 'rpeak', 'power']
    try:
        tb.to_sql('top500', con=engine, if_exists='append', index=False)
    except Exception as exc:
        # Keep the per-page best-effort behaviour, but show the cause and let
        # KeyboardInterrupt/SystemExit propagate (a bare except hid both).
        print('fail', exc)
    else:
        print('success')


def main(offset):
    """Fetch page number *offset* of the November 2018 TOP500 list and store it."""
    url = 'https://www.top500.org/list/2018/11/?page=' + str(offset)
    html = get_one_page(url)
    # save_csv(html)
    save_mysql(html)


if __name__ == '__main__':
    # Pages 1-5 are processed in parallel; the context manager releases the
    # worker processes once map() has returned.
    with Pool() as pool:
        pool.map(main, range(1, 6))
csv文件效果:
csv文件待優化的地方:加上列名
mysql效果:
問題:
1.不論是csv文件還是mysql表格數據,根據rank字段進行排序時,排序結果竟然不怎麼準確
2.site字段的最后部分數據是國家,這個需要想辦法給剝離出來,再弄一列數據展示