爬取了同花順上概念板塊的成分股
主要是發現了各個量化的網站沒有這個數據源,很多策略無法展開。這份數據應該挺多人需要的吧。把鏈接掛這里了,需要的可以下載
http://pan.baidu.com/s/1eSGSS5W
數據有4列,分別是板塊代碼,板塊名字,成分股代碼以及對應的公司
源碼貼出來了,初步學習爬蟲,寫得十分難看。。。。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 17 19:41:44 2017

@author: Mr.ZeroW

Scrape the constituent stocks of 10jqka (同花順) concept sectors.

Outputs two CSV files:
  gnbk.csv       - sector code (index) and sector name
  chengfengu.csv - four columns [BK_ID, BK_NAME, S_ID, S_NAME]:
                   sector code, sector name, stock code, company name
"""

# Sector detail pages are paginated, e.g.
#   http://q.10jqka.com.cn/gn/detail/order/desc/page/1/ajax/1/code/300018
# so the page count must be discovered before the per-page URLs can be built.
import time
import urllib.request

import pandas as pd
from lxml import etree


def _fetch_html(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    The site serves GBK/GB2312-encoded pages; undecodable bytes are
    dropped ('ignore') instead of raising, because some pages contain
    stray bytes.  (Original code mixed strict 'gb2312' and 'GBK' decodes,
    so a single bad byte could abort the whole run.)
    """
    with urllib.request.urlopen(url) as f:
        text = f.read().decode('GBK', 'ignore')
    return etree.HTML(text)


def scrape_sector_list():
    """Return a DataFrame of sector names indexed by sector code."""
    html = _fetch_html('http://q.10jqka.com.cn/gn/')
    links = html.xpath('/html/body/div[2]/div[1]/div//div//div//a')
    names = [a.text for a in links]
    hrefs = html.xpath('/html/body/div[2]/div[1]/div//div//div//a/@href')
    # The sector code is the second-to-last path segment of each link,
    # e.g. .../gn/detail/code/300018/ -> '300018'.
    codes = [h.split('/')[-2] for h in hrefs]
    return pd.DataFrame({'Name': names}, index=codes)


def _scrape_one_sector(bk_code):
    """Return (stock_codes, company_names) for one sector, all pages.

    The original duplicated this logic across a try/except IndexError;
    here the "no pager element" case is an explicit emptiness check.
    """
    url = 'http://q.10jqka.com.cn/gn/detail/code/' + bk_code + '/'
    html = _fetch_html(url)
    # Pager text looks like "1/5"; absent on single-page sectors.
    pager = html.xpath('//*[@id="m-page"]/span/text()')
    codes, names = [], []
    if pager:
        page_count = int(pager[0].split('/')[-1])
        for page in range(1, page_count + 1):
            curl = ('http://q.10jqka.com.cn/gn/detail/order/desc/page/'
                    + str(page) + '/ajax/1/code/' + bk_code)
            page_html = _fetch_html(curl)
            # The ajax endpoint returns a bare table fragment.
            codes += page_html.xpath('/html/body/table/tbody/tr/td[2]/a/text()')
            names += page_html.xpath('/html/body/table/tbody/tr/td[3]/a/text()')
    else:
        # Single-page sector: the table lives inside the #maincont div
        # of the page we already fetched (original re-downloaded it).
        codes += html.xpath('//*[@id="maincont"]/table/tbody/tr/td[2]/a/text()')
        names += html.xpath('//*[@id="maincont"]/table/tbody/tr/td[3]/a/text()')
    return codes, names


def main():
    """Scrape all sectors, then every constituent stock, and save CSVs."""
    gnbk = scrape_sector_list()
    gnbk.to_csv('gnbk.csv')
    print('板塊名稱以及代碼已爬取,存儲文件名:gnbk.csv')

    data = pd.read_csv('gnbk.csv')
    bk_id, bk_name, s_id, s_name = [], [], [], []
    print('爬取開始!')
    start = time.time()
    for count, (bk_code, name) in enumerate(
            zip(data.iloc[:, 0].astype(str), data.iloc[:, 1].astype(str)), 1):
        print('%d: %s' % (count, name))
        codes, names = _scrape_one_sector(bk_code)
        s_id += codes
        s_name += names
        # Repeat the sector code/name once per constituent stock row.
        bk_id.extend([bk_code] * len(codes))
        bk_name.extend([name] * len(names))

    cdata = pd.DataFrame(
        dict(BK_ID=bk_id, BK_NAME=bk_name, S_ID=s_id, S_NAME=s_name))
    cdata.to_csv('chengfengu.csv')
    end = time.time()
    # BUG FIX: the original printed time.ctime(end) under 開始時間 (start
    # time) and time.ctime(start) under 結束時間 (end time) — swapped.
    print('爬取結束!!\n開始時間:%s\n結束時間:%s\n'
          % (time.ctime(start), time.ctime(end)))


if __name__ == '__main__':
    main()