爬取網站: 滬港通 https://www.hkexnews.hk/sdw/search/mutualmarket.aspx?t=sh&t=sh
和深港通 https://www.hkexnews.hk/sdw/search/mutualmarket.aspx?t=sh&t=sz
(兩個 URL 只有最後一個字母不一樣)
# coding=utf-8
"""Scrape daily Stock Connect shareholding tables from HKEXnews.

For each of the two mutual-market pages ("sh" and "sz") the script drives a
headless Chrome session, sets the shareholding date field for every day of the
past year, submits the search form and parses the result table into a pandas
DataFrame.
"""
import datetime
import pdb
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

BASE_URL = "https://www.hkexnews.hk/sdw/search/mutualmarket.aspx?t="
EXCHANGES = ("sh", "sz")
REQUEST_DELAY_SECONDS = 2

# CSS classes of the four <td> cells read from every result row, in the same
# order as _RAW_COLUMNS.
_CELL_CLASSES = (
    "col-stock-code",
    "col-stock-name",
    "col-shareholding",
    "col-shareholding-percent",
)
_RAW_COLUMNS = ["code", "name", "shareholding", "shareholding_percent"]
_OUTPUT_COLUMNS = ["shareholding", "shareholding_percent", "Symbol", "Tradedate"]


def get_browser():
    """Return a new headless Chrome WebDriver instance."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    return webdriver.Chrome(options=chrome_options)


def _one_year_before(day):
    """Return the date one calendar year before *day*."""
    try:
        return day.replace(year=day.year - 1)
    except ValueError:
        # 29 February has no counterpart in the previous (non-leap) year.
        return day.replace(year=day.year - 1, day=28)


def _search_date(browser, date):
    """Fill the date field with *date* ("YYYY/MM/DD") and submit the search."""
    # Pass the date as a script argument instead of formatting it into the JS.
    browser.execute_script(
        "document.getElementById('txtShareholdingDate').value = arguments[0];",
        date,
    )
    browser.find_element(By.ID, "txtShareholdingDate").click()
    browser.find_element(By.ID, "btnSearch").click()


def _cell_text(row, css_class):
    """Return the text of the "mobile-list-body" div inside one table cell."""
    return (
        row.find("td", {"class": css_class})
        .find("div", {"class": "mobile-list-body"})
        .get_text()
    )


def _parse_rows(html):
    """Extract [code, name, shareholding, percent] text rows from page HTML.

    Raises:
        ValueError: if the result table (or its tbody) is not in the page.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"id": "mutualmarket-result"})
    body = table.find("tbody") if table is not None else None
    if body is None:
        raise ValueError("result table 'mutualmarket-result' not found in page")
    return [
        [_cell_text(tr, css_class) for css_class in _CELL_CLASSES]
        for tr in body.find_all("tr")
    ]


def _build_frame(rows, trade_date):
    """Convert raw text rows into a typed DataFrame for one trade date.

    The result has the columns shareholding (float), shareholding_percent
    (float, NaN when the cell is empty), Symbol and Tradedate.
    """
    df = pd.DataFrame(rows, columns=_RAW_COLUMNS)
    # The symbol is taken from the six characters before the last character of
    # the name cell; any "#" in that slice is replaced by "0".
    df["Symbol"] = df["name"].str[-7:-1].str.replace("#", "0", regex=False)
    # Drop the trailing character of the percentage text (presumably "%");
    # an empty cell becomes NaN.
    df["shareholding_percent"] = (
        df["shareholding_percent"]
        .apply(lambda x: x[:-1] if len(x) > 0 else np.nan)
        .astype("float64")
    )
    # Remove thousands separators before converting to float.
    df["shareholding"] = (
        df["shareholding"].str.replace(",", "", regex=False).astype("float64")
    )
    df["Tradedate"] = trade_date
    return df.drop(columns=["code", "name"])


def get_shareholding(*, debug=False):
    """Scrape one year of daily shareholding data for both exchanges.

    Args:
        debug: when true, drop into pdb after each successfully parsed date so
            the per-date frame (local name ``df``) can be inspected.

    Returns:
        A DataFrame concatenating every per-date frame that was scraped
        successfully (columns: shareholding, shareholding_percent, Symbol,
        Tradedate); empty if nothing could be scraped.
    """
    today = datetime.date.today()
    # Covers [one year ago, yesterday]. The end point is computed explicitly
    # instead of using date_range's closed=/inclusive= keyword, whose name
    # differs between pandas versions.
    date_list = pd.date_range(
        start=_one_year_before(today),
        end=today - datetime.timedelta(days=1),
        freq="D",
    ).strftime("%Y/%m/%d")

    frames = []
    for exchange in EXCHANGES:
        browser = get_browser()
        try:
            browser.get(BASE_URL + exchange)
            for date in date_list:
                try:
                    _search_date(browser, date)
                    rows = _parse_rows(browser.page_source)
                    df = _build_frame(rows, date.replace("/", "-"))
                except Exception as er:
                    # Best effort: one failing date must not abort the run.
                    print("[{} {}] {}".format(exchange, date, er))
                    continue
                frames.append(df)
                if debug:
                    pdb.set_trace()
                # Pause between requests.
                time.sleep(REQUEST_DELAY_SECONDS)
        finally:
            # Always end the session so no headless Chrome process leaks.
            browser.quit()

    if not frames:
        return pd.DataFrame(columns=_OUTPUT_COLUMNS)
    return pd.concat(frames, ignore_index=True)


if __name__ == "__main__":
    # Run with the pdb breakpoint enabled so each day's frame can be inspected.
    get_shareholding(debug=True)
注意:
1. 這裡用 pdb 設置了斷點, 每個日期的輸出結果是 pandas 的 DataFrame 類型; 實際使用時一般會把它導入數據庫, 這裡就不做演示了 (公司通常都有自己封裝好的入庫方法)
2. 這個網站提供從當前日期往前推一個自然年的每日數據, 頁面上可以手動選擇日期, 再點擊 SEARCH 按鈕查詢;
這裡用無界面 Chrome 瀏覽器 + 執行 js 填入日期 + click 確認 來實現這一操作
Windows 系統下需要手動下載 chromedriver.exe, 版本號須與本地 Chrome 瀏覽器匹配; 在瀏覽器地址欄輸入 chrome://version/, 第一行就是版本號
chromedriver.exe 的本地路徑通過 webdriver 的 executable_path 參數指定, 如果放在和模塊同一目錄下可以省略 (這裡就省略了)
下載地址: http://chromedriver.storage.googleapis.com/index.html
3. 這個網站的原始數據格式有些特殊, 代碼裡已經做了相應處理, 轉換成常見的數據格式; 在斷點處輸入 df.head() 可預覽數據的前五行
返回數據示例