給大家分享一個早前爬取東方財富網股票信息的爬蟲程序,回頭來看做了好多改進,特別是數據處理部分使用了heapq模塊,方便快捷一步到位...
# _*_ coding:utf-8 _*_

import heapq
import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup

# Seconds to wait for any single HTTP response before giving up.
REQUEST_TIMEOUT = 30

# Position of the net profit margin inside a record's 'data' list:
# [name, market cap, net assets, net profit, P/E, P/B,
#  gross margin, net margin, ROE]
NET_MARGIN_INDEX = 7


class GPINFO(object):
    """Collect basic financial indicators for stocks listed on eastmoney.com.

    On construction ``BaseData`` is filled either from today's record file
    (when that file already exists) or by crawling the site, in which case
    every accepted record is also appended to the record file.

    Each record is a dict with the keys ``num`` (stock code), ``name``,
    ``detail_url`` and ``data`` (the whitespace-split text of the first row
    of the indicator table on the stock's detail page).
    """

    def __init__(self):
        self.Url = 'http://quote.eastmoney.com/stocklist.html'
        self.BaseData = []
        self.Date = time.strftime('%Y%m%d')
        # One record file per day, created in the current working directory.
        self.Record = 'basedata' + self.Date
        # NOTE(review): an existing record file is trusted as complete, so a
        # crawl that was interrupted earlier today is not resumed.
        if os.path.exists(self.Record):
            print ('record exist...')
            self.BaseData = self.get_base_data_from_record()
        else:
            print ('fuck-get data again...')
            self.get_data()

    def write_record(self, text):
        """Append ``text`` plus a newline to the record file, UTF-8 encoded."""
        with open(self.Record, 'ab') as f:
            f.write((text + '\n').encode('utf-8'))

    def get_base_data_from_record(self):
        """Return the list of records stored in the record file.

        The file holds one JSON document per line. Blank lines are skipped
        so that a stray empty line does not abort loading.
        """
        records = []
        with open(self.Record, 'rb') as f:
            for raw_line in f:
                line = raw_line.decode('utf-8').strip()
                if not line:
                    continue
                records.append(json.loads(line))
        return records

    def _fetch_indicators(self, url):
        """Return the indicator row of the detail page at ``url``.

        The result is the whitespace-split text of the first table row inside
        the ``cwzb`` div, or ``None`` when the page cannot be fetched or does
        not contain that table.
        """
        try:
            html = requests.get(url, timeout=REQUEST_TIMEOUT).content
        except Exception as e:
            # Deliberately broad: one unreachable page must not end the crawl.
            print ('perhaps timeout:', e)
            return None

        soup = BeautifulSoup(html, 'lxml')
        try:
            return soup.find('div', class_='cwzb').tbody.tr.get_text().split()
        except AttributeError as e:
            # find()/tbody/tr yield None when the expected markup is absent.
            print ('error:', e)
            return None

    def get_data(self):
        """Crawl the stock list and collect indicators for each wanted stock.

        Only stocks whose code starts with ``00`` or ``60`` are visited.
        Stocks whose indicator row contains a bare ``-`` token (treated by
        the original author as delisted) are dropped. Every accepted record
        is appended to ``BaseData`` and written to the record file; the
        number of collected records is printed at the end.

        Raises:
            RuntimeError: if the list page has no ``quotebody`` container.
            requests.RequestException: if the list page cannot be fetched.
        """
        listing_html = requests.get(self.Url, timeout=REQUEST_TIMEOUT).content
        soup = BeautifulSoup(listing_html, 'lxml')

        container = soup.find('div', class_='quotebody')
        if container is None:
            raise RuntimeError(
                'quotebody container not found in ' + self.Url)

        for a in container.find_all('a', {'target': '_blank'}):
            # Link text is expected to look like "name(code)".
            parts = a.get_text().split('(')
            if len(parts) < 2:
                continue
            name = parts[0]
            num = parts[1].strip(')')
            if not num.startswith(('00', '60')):
                continue

            detail_url = a.get('href')
            if not detail_url:
                continue

            cwzb_list = self._fetch_indicators(detail_url)
            if cwzb_list is None or '-' in cwzb_list:
                continue

            record_d = {
                'num': num,
                'name': name,
                'detail_url': detail_url,
                'data': cwzb_list,
            }
            self.BaseData.append(record_d)
            self.write_record(json.dumps(record_d))

        print (len(self.BaseData))


def _net_margin(record):
    """Return the record's net profit margin as a float, or None if unusable."""
    try:
        return float(record['data'][NET_MARGIN_INDEX].strip('%'))
    except (KeyError, IndexError, TypeError, AttributeError, ValueError):
        return None


def main():
    """Print the indicator rows of the ten stocks with the highest net margin."""
    test = GPINFO()
    result = test.BaseData

    # Records without a parseable net margin are left out of the ranking
    # instead of aborting it.
    scored = []
    for record in result:
        margin = _net_margin(record)
        if margin is not None:
            scored.append((margin, record))

    top_10 = heapq.nlargest(10, scored, key=lambda pair: pair[0])
    for _, record in top_10:
        print(record['data'])


if __name__ == '__main__':
    main()
程序主函數部分是為了獲取凈利率前10名的股票信息,打印結果如下:
['綿石投資', '52.2億', '14.0億', '1.25億', '30.90', '3.73', '42.25%', '2047.04%', '9.27%'] ['國投安信', '556億', '270億', '21.1億', '19.80', '2.12', '5.90%', '487.53%', '7.79%'] ['川投能源', '379億', '202億', '28.0億', '10.16', '1.91', '37.01%', '402.64%', '14.58%'] ['ST明科', '47.6億', '9.25億', '5.11千萬', '68.00', '5.14', '2.38%', '345.11%', '5.68%'] ['華聯控股', '93.6億', '31.5億', '4.76億', '14.54', '3.74', '46.25%', '328.53%', '20.88%'] ['上海九百', '68.2億', '12.3億', '1.61億', '31.67', '5.56', '54.00%', '297.99%', '13.21%'] ['凱瑞德', '46.7億', '1.14億', '3.27千萬', '107.10', '40.94', '16.07%', '294.19%', '33.41%'] ['魯信創投', '172億', '38.6億', '3.32億', '38.48', '4.64', '28.67%', '244.43%', '9.26%'] ['博聞科技', '35.0億', '6.56億', '2.23千萬', '117.65', '5.36', '-16.07%', '215.27%', '3.41%'] ['萬澤股份', '71.8億', '13.7億', '6.87千萬', '78.38', '5.29', '22.57%', '203.15%', '5.13%']
