Stock prices are constantly buffeted by rumors, and rumors are hard for a quantitative process to control (short of monitoring every source on the internet around the clock, which is unrealistic).

Retail investors are the main consumers of such rumors, so my idea is to build a crawler aimed at retail investors.

The approach: East Money's Guba forum has an "Ask the Board Secretary" (問董秘) section, a channel where retail investors communicate with listed companies. Once a rumor about a stock starts to spread, retail investors often go there to ask the company to confirm or deny it.

On inspection, 問董秘 is an aggregation of SSE e-Interaction and SZSE Easy IR, but it has two problems: it does not update promptly, and it only shows questions that have already been answered. By the time the board secretary has replied and the reply has propagated to the platform, the impact of the event (rumor) may already have played out.

So I ultimately read the raw questions directly from SSE e-Interaction (http://sns.sseinfo.com/) and SZSE Easy IR (http://irm.cninfo.com.cn/szse/index.html).

The program has two parts: the first uses Python to crawl the comments from the SSE and SZSE platforms; the second uses the jieba package for text analysis, filtering out the comments and high-frequency words of hot stocks.

Note: the program only screens for hot stocks' comments and high-frequency words; it does no part-of-speech analysis. It surfaces roughly 10-20 stocks and their hot comments per day, and a human still has to analyze the specific events behind the comments.

Breaking events are low-frequency events, so on most days the screened stocks and comments will lead nowhere. But once a major rumor does appear, the program can serve as a timely alert.
Part 1: Crawling
1. Source 1: the SSE e-Interaction platform
2. Source 2: SZSE Easy IR
3. Other notes: only the CSI 800 is crawled, with constituents read from a local database. If you copy the code, change the stock pool in get_index800 (the "1.0 load the CSI 800 constituents" step in main); a stand-in sketch for readers without that database follows this list.
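The constituent list and trading calendar come from my local Wind database (get_index800 and get_tradecalendar in the script below). For readers without that database, hypothetical stand-ins could look like the following; the tickers and the weekday-only calendar are illustrative, not part of the original setup:

```python
import datetime

def get_index800(today):
    # Hypothetical stand-in: hardcode your own universe instead of
    # querying a local Wind database; format is "ticker.exchange".
    return ['600000.SH', '000001.SZ', '002642.SZ']

def get_tradecalendar(today):
    # Hypothetical stand-in: the last two weeks of weekdays, oldest
    # first. A crude approximation -- it ignores exchange holidays.
    days = [today - datetime.timedelta(days=k) for k in range(13, -1, -1)]
    return [d for d in days if d.weekday() < 5]
```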
The crawler script:

```python
# -*- coding:utf-8 -*-
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import bs4
import re
import datetime
import pandas as pd
import jieba.analyse
import MySQLdb as mdb
import requests

# one headless Chrome instance shared by all SSE page loads
chrome_options = webdriver.chrome.options.Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=chrome_options)


def main():
    # 0.0 date initialization: find the trading day before today
    today = datetime.datetime.now().date()
    tradedays = get_tradecalendar(today)
    if today not in tradedays:
        lastday = tradedays[-1]
    else:
        lastday = tradedays[tradedays.index(today) - 1]

    # 1.0 load the CSI 800 constituents (change the stock pool here)
    members_800 = get_index800(today)
    h = 0
    print 'crawler start'
    # SSE e-Interaction and SZSE Easy IR
    Failed_reading = []
    Data = pd.DataFrame([])
    for i in members_800:
        code = i[0:6]
        exchange = i[7:]
        h = h + 1
        print h, code
        if exchange == 'SH':
            try:
                x = get_ask_from_SHExchange(code, lastday, today)
                if len(x) == 0:
                    continue
                x['code'] = i
                Data = Data.append(x)
            except:
                Failed_reading.append(i)
        elif exchange == 'SZ':
            x = get_ask_from_SZExchange(code, lastday, today)
            if len(x) == 0:
                continue
            x['code'] = i
            Data = Data.append(x)
        else:
            continue
    Data.index = range(len(Data.index))
    Data.to_csv('comments_' + today.strftime("%Y%m%d") + '.xls', index=False, encoding='utf-8_sig')
    print 'failed reading:', Failed_reading


def main1():
    # quick single-stock test of the SSE reader
    today = datetime.datetime.now().date()
    tradedays = get_tradecalendar(today)
    if today not in tradedays:
        today = tradedays[-1]
    lastday = tradedays[tradedays.index(today) - 1]
    print get_ask_from_SHExchange('600000', lastday, today)


### read questions from SSE e-Interaction
def get_ask_from_SHExchange(code, yesterday, today):
    driver.get("http://sns.sseinfo.com")
    # search by stock code and open the company page
    driver.find_element_by_id("com_search_txt").send_keys(code)
    driver.find_element_by_id("to_companyByCode").click()
    time.sleep(1)
    # switch to the "latest questions" tab
    driver.find_element_by_link_text(u'最新提問').click()
    time.sleep(1)
    # scroll to the bottom so more questions get loaded
    js = "var q=document.documentElement.scrollTop=100000"
    driver.execute_script(js)
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, features='lxml')
    Tem = []
    for i in soup.find_all('div', attrs={'class': "m_feed_item m_question"}):
        soup_i = BeautifulSoup(str(i), features='lxml')
        question = soup_i.find('div', {'class': 'm_feed_txt'}).a.nextSibling.strip()
        webtime = soup_i.find('div', {'class': 'm_feed_from'}).span.string.strip()
        if u'昨天' in webtime:  # e.g. "昨天08:50"
            t = (datetime.datetime.now() - datetime.timedelta(days=1)).date()
        elif u'前' in webtime:  # e.g. "8分鍾前"
            t = (datetime.datetime.now()).date()
        else:
            # absolute form "06月21日 ...": the page omits the year, so assume
            # the current year; posts straddling New Year are still mis-dated
            # and should be filtered downstream
            t = webtime[0:6]
            t = datetime.datetime.strptime(u"%d年%s" % (today.year, t), u"%Y年%m月%d日").date()
        Tem.append([t, question])
    Data = pd.DataFrame(Tem, columns=['t', 'ask'])
    Data = Data[(Data.t >= yesterday) & (Data.t <= today)].copy()
    Data.drop_duplicates(inplace=True)
    return Data


### SZSE Easy IR ==> read questions from the last trading day through today
def get_ask_from_SZExchange(code, yesterday, today):
    url = r"http://irm.cninfo.com.cn/ircs/interaction/lastQuestionforSzseSsgs.do?condition.type=2&condition.stockcode=" + code + "&condition.stocktype=S"
    r = requests.get(url)
    soup = BeautifulSoup(r.text, features='lxml')
    x = []
    for i in soup.find_all(re.compile('p')):  # tags whose name matches 'p'
        if (type(i.a) == bs4.element.Tag) & (type(i.span) == bs4.element.Tag):
            t = i.span.string.strip()[1:12]
            t = datetime.datetime.strptime(t, u"%Y年%m月%d日").date()
            comment = i.a.string.strip()
            x.append([t, comment])
    x = pd.DataFrame(x, columns=['t', 'ask'])
    x = x[(x.t >= yesterday) & (x.t <= today)].copy()
    x.drop_duplicates(inplace=True)
    return x


### post-processing scratch code, kept for reference
'''
questions = ""
for i in x.ask:
    questions = questions + i

print questions

for word in useless_words:
    questions = questions.replace(word, '')

questions = questions.strip()
print questions
for x in jieba.analyse.extract_tags(questions, topK=8, withWeight=True, allowPOS=()):
    print x[0], x[1]
'''


def get_index800(today):
    # CSI 800 = CSI 300 + CSI 500 constituents over the last 10 days
    start = (today - datetime.timedelta(days=10)).strftime("%Y%m%d")
    end = today.strftime("%Y%m%d")
    csi300 = get_indexmembers(start, end, '000300.SH')
    csi300 = sorted(set(csi300.ticker.values))
    csi500 = get_indexmembers(start, end, '000905.SH')
    csi500 = sorted(set(csi500.ticker.values))
    index800 = []
    index800.extend(csi300)
    index800.extend(csi500)
    return index800


def get_indexmembers(start, end, index_code):
    if index_code == '000300.SH': index_code = 'csi300'
    if index_code == '000905.SH': index_code = 'csi500'
    sql = ("select distinct ticker from Research.windIndexWgtsSSE where tradedate>='" + start +
           "' and tradedate<='" + end + "' and " + index_code + ">0 ;")
    cnn = mdb.connect('10.10.40.310', 'report', 'raP1_Hdr2', 'Wind')
    cnn = cnn.cursor(mdb.cursors.SSDictCursor)
    cnn.execute(sql)
    dictionary = cnn.fetchall()
    table = pd.DataFrame(list(dictionary))
    return table


def get_tradecalendar(today):
    start = (today - datetime.timedelta(days=10)).strftime("%Y%m%d")
    end = today.strftime("%Y%m%d")
    sql = ("select distinct str_to_date(trade_days,'%Y%m%d') as trade_days from Wind.ASHARECALENDAR "
           "where trade_days>='" + start + "' and trade_days<='" + end +
           "' and s_info_exchmarket='SZSE' order by trade_days;")
    cnn = mdb.connect('10.10.40.310', 'report', 'raP1_Hdr2', 'Wind')
    cnn = cnn.cursor(mdb.cursors.SSDictCursor)
    cnn.execute(sql)
    dictionary = cnn.fetchall()
    table = pd.DataFrame(list(dictionary))
    table = sorted(set(table.trade_days.values))
    return table


if __name__ == "__main__":
    main()
```
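The SSE page omits the year in absolute timestamps, which the script papers over by assuming the current year. A hypothetical helper (parse_sse_webtime is my name, not in the original) that also handles the year boundary could look like this, assuming posts are never dated in the future:

```python
import datetime

def parse_sse_webtime(webtime, now=None):
    # Normalize the three timestamp forms seen on SSE e-Interaction
    # into a date: "昨天08:50" (yesterday), "8分鍾前" (minutes ago),
    # and the absolute "06月21日 09:30".
    if now is None:
        now = datetime.datetime.now()
    if u'昨天' in webtime:                    # "yesterday HH:MM"
        return (now - datetime.timedelta(days=1)).date()
    if u'前' in webtime:                      # relative "... ago" -> today
        return now.date()
    # absolute form: the first 6 characters are "MM月DD日"; the year is
    # omitted, so assume no post is from the future and roll a
    # future-looking month/day back one year (e.g. a December post
    # read in early January)
    t = datetime.datetime.strptime(u"%d年%s" % (now.year, webtime[0:6]),
                                   u"%Y年%m月%d日").date()
    if t > now.date():
        t = t.replace(year=t.year - 1)
    return t
```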
Part 2: Text processing
The goal is to catch hot events; the idea is to pick securities that have many comments and whose different comments share repeated words.
1. Selection criteria: at least 3 comments over the last two days, and a top keyword that recurs across comments at least 3 times (see the stripped-down sketch after this list).
2. Since some users post the same comment repeatedly, exact duplicates are dropped.
3. When one user posts several similar comments, one could also scrape commenter nicknames and keep only the longest comment per user. Given how slow the SZSE crawl already is, this is not done here; it is left as a future improvement.
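To make criterion 1 concrete, here is a stripped-down sketch of the screening rule on its own; the is_hot helper and its threshold defaults are illustrative, not part of the production script below:

```python
# -*- coding: utf-8 -*-
from collections import Counter
import jieba

def is_hot(comments, min_comments=3, min_repeats=3):
    # A ticker qualifies when it drew enough comments AND some word
    # recurs across *different* comments. Each word is counted at most
    # once per comment, mirroring the per-comment dedup in the script.
    comments = set(comments)              # drop verbatim duplicate posts
    if len(comments) < min_comments:
        return False
    counts = Counter()
    for c in comments:
        # count each token once per comment; skip single characters
        counts.update(set(w for w in jieba.cut(c) if len(w) > 1))
    return any(n >= min_repeats for n in counts.values())
```

For example, is_hot([u'重組是真的嗎', u'請問重組進展', u'重組何時公告']) should return True, since 重組 recurs in all three comments.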
The screening script:

```python
# -*- coding:utf-8 -*-
import pandas as pd
import jieba
import jieba.analyse
import MySQLdb as mdb
import datetime


def main():
    today = datetime.datetime.now().date()
    # stop words: greetings, filler/modal words, punctuation, etc.
    # (each security's short name is appended temporarily below)
    useless_words = ['謝謝', '上市', '股價', '有限公司', '你好', '的', '董秘', '公司', '管理層',
                     '請問', '嗎', '我', '您好', '有沒有', '有何', '方案', '貴司', '貴公司',
                     '問下', '信心', '感謝', '們', '是否', '多久', '應該', '應當', '建議',
                     '最好', '為何', '為什么', '多少', '!', ',', '。', '?', '、', '…',
                     '……', '...', ',', '哪些', '已', '貴', '披露', '公告', '影響', '股東',
                     '關心', '客觀']
    useless_words.extend(['潛力', '業務', '投資者', '是不是', '如何', '17', '18', '20'])
    data = pd.read_csv('comments_' + today.strftime("%Y%m%d") + '.xls')
    # count comments per security and keep those with at least 3
    grouped = data.groupby(['code']).size()
    grouped = pd.DataFrame(grouped, columns=['ask_num'])
    grouped['code'] = grouped.index
    grouped.index = range(len(grouped))
    data = pd.merge(data, grouped, on=['code'])
    data = data[data.ask_num >= 3].copy()
    codes = sorted(set(data.code.values))
    # look up security names
    Names = get_names_from_wind(codes)
    data = pd.merge(data, Names, on=['code'])

    data.sort_values(by=['ask_num'], ascending=False, inplace=True)
    # columns now: t, code, ask, ask_num, name
    Lists = []
    for code in codes:
        data_i = data[data.code == code].copy()
        n = data_i.ask_num.max()
        name_i = data_i.name.max()
        # tokenize each comment, dropping words repeated inside a single
        # comment, so only repetition across comments gets counted
        questions = ''
        for j in data_i.ask:
            keywords = []
            for k in jieba.cut(j):
                keywords.append(k)
            keywords = set(keywords)
            questions = questions + ' '.join(keywords)
        if code == '002642.SZ':  # debug output for one ticker
            print questions
        # strip meaningless words (greetings, company names, modal
        # particles, and the stock's own short name)
        name_i_ = [x for x in jieba.cut(name_i, cut_all=True, HMM=True)]  # a list; a generator could not be iterated twice
        for j in name_i_:
            useless_words.append(j)
        for word in useless_words:
            questions = questions.replace(word, '')
        for j in name_i_:
            useless_words.pop(useless_words.index(j))
        questions = questions.strip()  # only removes surrounding whitespace

        keywords = jieba.analyse.extract_tags(questions, topK=5, withWeight=True, allowPOS=())
        if code == '002642.SZ':  # debug output for one ticker
            print questions
            for x in keywords:
                print x[0], x[1]
        if len(keywords) < 3:  # too few distinct words left to rank
            continue

        keyword1 = keywords[0][0]; keyword1_weight = keywords[0][1]
        keyword2 = keywords[1][0]; keyword2_weight = keywords[1][1]
        keyword3 = keywords[2][0]; keyword3_weight = keywords[2][1]
        # register the keywords with jieba: extract_tags can surface new
        # coinages that jieba.cut would otherwise fail to segment
        keywords = [keyword1, keyword2, keyword3]
        for word in keywords:
            jieba.add_word(word, tag=None)
        # count how often keyword1/2/3 appear
        words_all = []
        for x in jieba.cut(questions, cut_all=True, HMM=True):
            words_all.append(x)

        appeared_times = [0, 0, 0]
        for i in range(len(keywords)):
            for word in words_all:
                if len(word) == 0:  # skip empty tokens (fast path)
                    continue
                elif word == keywords[i]:
                    appeared_times[i] = appeared_times[i] + 1
        Lists.append([code, name_i, n,
                      keyword1, appeared_times[0], keyword1_weight,
                      keyword2, appeared_times[1], keyword2_weight,
                      keyword3, appeared_times[2], keyword3_weight])
    results = pd.DataFrame(Lists, columns=['code', 'name', 'num',
                                           'keyword1', 'n1', 'weight1',
                                           'keyword2', 'n2', 'weight2',
                                           'keyword3', 'n3', 'weight3'])
    # keep securities where some keyword recurs across comments
    results = results[(results.n1 > 2) | (results.n2 > 2) | (results.n3 > 2)].copy()
    results.sort_values(by=['num'], ascending=False, inplace=True)
    results.index = range(len(results.index))
    print results

    results = results[['code', 'name', 'num', 'keyword1', 'n1', 'keyword2', 'n2', 'keyword3', 'n3']].copy()
    results.to_csv('keywords_' + today.strftime("%Y%m%d") + '.xls', index=False, encoding='utf-8_sig')

    # export the raw comments of the flagged securities as well
    data = pd.read_csv('comments_' + today.strftime("%Y%m%d") + '.xls')
    tem = results[['code', 'name']].copy()
    data = pd.merge(data, tem, on=['code'])
    data.to_csv('comments_abnormal' + today.strftime("%Y%m%d") + '.xls', index=False, encoding='utf-8_sig')


### codes is a list of wind codes such as '600000.SH'
def get_names_from_wind(codes):
    codes = "','".join(codes)
    codes = "('" + codes + "')"
    sql = "select s_info_windcode as code,s_info_name as name from Wind.ASHAREDESCRIPTION where s_info_windcode in " + codes + ";"
    cnn = mdb.connect('10.10.40.310', 'report', 'raP1_Hdr2', 'Wind', charset='utf8')
    cnn = cnn.cursor(mdb.cursors.SSDictCursor)
    cnn.execute(sql)
    dictionary = cnn.fetchall()
    table = pd.DataFrame(list(dictionary))
    return table


if __name__ == '__main__':
    main()
```
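Finally, a minimal usage sketch for the daily routine: after both scripts have run, load the keyword file they produced for manual review. The file name follows the scripts above; the loop itself is illustrative:

```python
# -*- coding: utf-8 -*-
import datetime
import pandas as pd

today = datetime.datetime.now().date()
results = pd.read_csv('keywords_' + today.strftime("%Y%m%d") + '.xls',
                      encoding='utf-8-sig')
# one line per flagged security: comment count plus the three keywords
# and how often each recurs across comments
for _, row in results.iterrows():
    print("%s %s: %d comments | %s(%d) %s(%d) %s(%d)" % (
        row['code'], row['name'], row['num'],
        row['keyword1'], row['n1'],
        row['keyword2'], row['n2'],
        row['keyword3'], row['n3']))
```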