本次主要是抓取開盤啦股票概念數據
採用多進程、requests完成數據的爬取
採用Pandas庫完成數據比對,實現MySQL數據存儲
具體代碼如下:
# -*- coding: utf-8 -*-
"""Crawl Kaipanla stock tags and concepts and store the new rows in MySQL.

The stock universe comes from the Tushare ``stock_basic`` endpoint.  One POST
request per stock is sent to the Kaipanla PC API, the JSON replies are parsed
into a tag table and a concept table, and only rows that are not yet present
in the database are appended.
"""
import json
import time
from multiprocessing import Pool

import pandas as pd
import requests
import tushare as ts
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from sqlalchemy import create_engine

# NOTE(review): the credentials below are hard-coded in the script; they
# should be moved to configuration / environment variables before sharing.
TUSHARE_TOKEN = 'ac16b470869c5d82db5033ae9288f77b282d2b5519507d6d2c72fdd7'
DB_URL = 'mysql://root:123456@127.0.0.1/quant?charset=utf8'

KPL_URL = 'https://pchq.kaipanla.com/w1/api/index.php'
KPL_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0'}

# Seconds to wait for a single HTTP response before giving up on that stock.
REQUEST_TIMEOUT = 30
# Number of worker processes used by the crawler.
PROCESS_COUNT = 4


# ==================== Tushare stock code retrieval ====================
def getCode():
    """Return the symbols of all listed and suspended stocks from Tushare.

    Returns:
        list: the ``symbol`` column of the combined ``stock_basic`` result
        for ``list_status`` ``'L'`` and ``'P'``.
    """
    print("-------------------------------------------")
    print("開始從Tushare接口獲取股票列表數據")
    # Initialise the tushare.pro client.
    pro = ts.pro_api(TUSHARE_TOKEN)
    fields = 'ts_code,symbol,name,area,exchange,list_status,list_date'
    # 'L' is passed for listed stocks and 'P' for suspended ones.
    l_list = pro.stock_basic(list_status='L', fields=fields)
    p_list = pro.stock_basic(list_status='P', fields=fields)
    stock_list = pd.concat([l_list, p_list], axis=0, ignore_index=True)
    return stock_list['symbol'].tolist()


# ==================== Kaipanla PC plate crawler ====================
def _fetch_pages(data_list):
    """POST every form in ``data_list`` and return the response bodies.

    A failed request is reported and skipped so one bad stock does not abort
    the rest of the batch.
    """
    html_list = []
    # Certificate verification is switched off for these requests (as in the
    # original script), so silence the warning urllib3 emits for each call.
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    with requests.Session() as session:
        for data in data_list:
            try:
                html = session.post(url=KPL_URL, data=data,
                                    headers=KPL_HEADERS, verify=False,
                                    timeout=REQUEST_TIMEOUT).text
            except requests.RequestException as spider_error:
                print("html抓取過程報錯,錯誤信息為:%s" % spider_error)
            else:
                html_list.append(html)
    return html_list


def _parse_pages(html_list):
    """Parse response bodies into ``(stock_tag, stock_concept)`` DataFrames.

    Each body is decoded once.  A body that is not valid JSON or lacks the
    expected keys is reported and skipped.
    """
    tag_rows = []
    concept_rows = []
    for html in html_list:
        try:
            payload = json.loads(html)
            code = payload['trend']['code']
            day = payload['trend']['day']
            tag = payload["pankou"]["tag"]
        except (ValueError, KeyError, TypeError) as parser_error:
            print("html解析過程報錯,錯誤信息為:%s" % parser_error)
            continue
        tag_rows.append({'symbol': code, 'tag': tag, 'in_date': day})

        cept_list = payload.get("stockplate")
        if not cept_list:
            print("%s概念數據請求為空,請知悉" % code)
            continue
        try:
            for cept in cept_list:
                concept_rows.append({'symbol': code, 'concept': cept[0],
                                     'in_date': day})
        except (IndexError, KeyError, TypeError) as parser_error:
            print("html解析過程報錯,錯誤信息為:%s" % parser_error)

    # Passing ``columns`` keeps the schema stable even when nothing was parsed.
    stock_tag = pd.DataFrame(tag_rows, columns=['symbol', 'tag', 'in_date'])
    stock_concept = pd.DataFrame(concept_rows,
                                 columns=['symbol', 'concept', 'in_date'])
    return stock_tag, stock_concept


def _store_new_rows(new_rows, table, key_cols, engine):
    """Append to ``table`` the rows of ``new_rows`` whose key is not stored yet.

    The previous approach (concatenate old and new rows, then
    ``drop_duplicates(keep=False)``) also kept rows that existed only in the
    database and inserted them a second time; comparing keys explicitly
    avoids that.

    Returns:
        pandas.DataFrame: the rows that were actually written.
    """
    old_rows = pd.read_sql('select * from %s' % table, engine)
    fresh = new_rows.drop_duplicates(subset=key_cols, keep='first')
    known = set(old_rows[key_cols].itertuples(index=False, name=None))
    is_new = pd.Series(
        [key not in known
         for key in fresh[key_cols].itertuples(index=False, name=None)],
        index=fresh.index, dtype=bool)
    fresh = fresh[is_new]
    if not fresh.empty:
        fresh.to_sql(table, engine, if_exists='append', index=False)
    return fresh


def Kplspider(data_list):
    """Crawl, parse and store tag/concept data for one batch of stocks.

    Args:
        data_list: list of POST form dicts, one per stock.
    """
    print("-------------------------------------------")
    html_list = _fetch_pages(data_list)

    print("-------------------------------------------")
    print("股票標簽、所屬概念數據開始解析")
    stock_tag, stock_concept = _parse_pages(html_list)

    # Engine used by pandas to read from and write to the database.
    engine = create_engine(DB_URL)
    try:
        # Store the tag data.
        stock_tag = _store_new_rows(stock_tag, 'is_belong_zyj',
                                    ['symbol', 'tag'], engine)
        print(stock_tag)
        print("本次存儲開盤啦標簽數據%s條" % stock_tag.shape[0])

        # Store the concept data.
        stock_concept = _store_new_rows(stock_concept, 'belong_concept',
                                        ['symbol', 'concept'], engine)
        print(stock_concept)
        print("本次存儲開盤啦概念數據%s條" % stock_concept.shape[0])
    finally:
        engine.dispose()


# ==================== Entry point ====================
def main():
    """Build one request form per stock and crawl them in parallel."""
    print("開盤啦股票標簽及概念爬蟲程序開始執行")
    print("-------------------------------------")
    start = time.time()
    code_list = getCode()
    # Current date in YYYYMMDD form, sent as the 'Day' form field.
    cur_date = time.strftime("%Y%m%d", time.localtime())

    # Build the POST forms.
    data_list = [
        {'c': 'PCArrangeData', 'a': 'GetHQPlate', 'StockID': code,
         'Day': cur_date, 'SelType': '1, 2, 3, 8, 9, 5, 6, 7',
         'UserID': 399083, 'Token': '71aef0e806e61ad3169ddc9473e37886'}
        for code in code_list
    ]

    # Split the forms into one batch per worker.  Mapping over a single
    # tuple element, as before, handed the whole list to one process.
    batches = [data_list[i::PROCESS_COUNT] for i in range(PROCESS_COUNT)]
    batches = [batch for batch in batches if batch]

    try:
        with Pool(processes=PROCESS_COUNT) as pool:
            pool.map(Kplspider, batches)
    except Exception as error:
        print("進程執行過程報錯,錯誤信息為:%s" % error)

    end = time.time()
    print('開盤啦股票標簽及概念爬蟲程序共執行%0.2f秒.' % ((end - start)))
    print("開盤啦股票標簽及概念爬蟲程序執行完成")


if __name__ == '__main__':
    main()
執行效果展示:
MySQL存儲數據展示: