1. 總述
慕課中這段代碼的功能是首先從東方財富網上獲得所有股票的代碼,再利用我們所獲得的股票代碼輸入url中進入百度股票頁面爬取該只股票的詳細信息。
import requests
from bs4 import BeautifulSoup
import traceback
import re


def getHTMLText(url):
    """Fetch *url* and return its decoded HTML text, or "" on any request failure."""
    try:
        # A timeout keeps the crawler from hanging forever on a dead host.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # The encoding guessed from the body is more reliable than the
        # (often missing) charset in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Callers treat "" as "page unavailable" and skip the page; a narrow
        # except avoids swallowing KeyboardInterrupt/SystemExit.
        return ""


def getStockList(lst, stockURL):
    """Append every stock code (e.g. 'sh600000', 'sz000001') found on *stockURL* to *lst*."""
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all('a'):
        # Some anchors have no href, and most hrefs contain no stock code;
        # search defensively instead of using exceptions for control flow.
        href = a.attrs.get('href', '')
        match = re.search(r'[s][hz]\d{6}', href)
        if match is not None:
            lst.append(match.group(0))


def getStockInfo(lst, stockURL, fpath):
    """Scrape the detail page of each code in *lst* and append one dict per line to *fpath*."""
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})

            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            # .text here is BeautifulSoup's Tag.text; split() drops the
            # whitespace-separated suffix after the stock name.
            infoDict.update({'股票名稱': name.text.split()[0]})

            # <dt>/<dd> pairs carry the field names and values respectively.
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for key_tag, val_tag in zip(keyList, valueList):
                infoDict[key_tag.text] = val_tag.text

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except Exception:
            # Best-effort crawl: log the failure and move on to the next stock.
            traceback.print_exc()
            continue


def main():
    """Collect stock codes from the listing page, then fetch and store each stock's details."""
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'http://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == '__main__':
    main()
2. 具體分析
2.1 獲取源碼
這段代碼的功能就是使用requests庫直接獲得網頁的所有源代碼。
def getHTMLText(url):
    """Fetch *url* and return its decoded HTML text, or "" on any request failure."""
    try:
        # A timeout keeps the crawler from hanging forever on a dead host.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # The encoding guessed from the body is more reliable than the
        # (often missing) charset in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Callers treat "" as "page unavailable"; a narrow except avoids
        # swallowing KeyboardInterrupt/SystemExit like a bare `except:` would.
        return ""
2.2 獲取股票代碼
在源碼中可以看到每支股票都對應着一個6位數字的代碼,這部分要做的工作就是獲取這個代碼編號。這個編號在a標簽中,所以首先用BeautifulSoup選出所有的a標簽,接下來我們再用attrs['href']來獲取a標簽的href屬性值,最后用正則表達式篩選出我們想要的代碼值。
def getStockList(lst, stockURL):
    """Append every Shanghai/Shenzhen stock code found on *stockURL* to *lst*.

    Codes look like 'sh600000' or 'sz000001' (exchange prefix + 6 digits)
    and live in the href attributes of <a> tags on the listing page.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    for a in soup.find_all('a'):
        # Anchors without an href, or whose href holds no stock code, are
        # skipped explicitly -- no bare `except:` needed for control flow.
        href = a.attrs.get('href', '')
        match = re.search(r'[s][hz]\d{6}', href)
        if match is not None:
            # re.search yields a single Match, so we append a plain string,
            # not a list (the original used findall(...)[0] for the same reason).
            lst.append(match.group(0))
2.3 獲取股票信息
同樣的原理,最后用字典來保存。
def getStockInfo(lst, stockURL, fpath):
    """Scrape the detail page of each stock code in *lst* and append one dict per line to *fpath*.

    Pages that fail to download (getHTMLText returned "") are skipped;
    parse errors are logged via traceback and the crawl continues.
    """
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})

            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            # .text here is BeautifulSoup's Tag.text (not a requests method);
            # split() drops the whitespace-separated suffix after the name.
            infoDict.update({'股票名稱': name.text.split()[0]})

            # <dt>/<dd> pairs carry the field names and values respectively;
            # zip pairs them without manual indexing.
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for key_tag, val_tag in zip(keyList, valueList):
                infoDict[key_tag.text] = val_tag.text

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except Exception:
            # Best-effort crawl: log the failure and move on to the next stock.
            traceback.print_exc()
            continue
3. 增加進度條顯示
進度條的顯示只需要首先將count賦值為0,然后在下面的位置加入如下語句即可;其中\r是轉義字符,作用是讓光標回到行首而不換行,使百分比在同一行原地刷新。
# NOTE(review): incomplete excerpt from getStockInfo -- the enclosing `try:`
# and the `count = 0` initialisation sit outside this listing, so it cannot
# run standalone. `count` is bumped on both the success and failure paths so
# the percentage always advances. '\r' moves the cursor back to the start of
# the line and end='' suppresses the newline, making the percentage refresh
# in place rather than scroll.
1 with open(fpath, 'a', encoding='utf-8') as f: 2 f.write(str(infoDict) + '\n') 3 count = count+1 4 print('\r當前速度:{:.2f}%'.format(count*100/len(lst)), end='') 5 except: 6 count = count + 1 7 print('\r當前速度:{:.2f}%'.format(count * 100 / len(lst)), end='') 8 traceback.print_exc() 9 continue