1. 股票數據定向爬蟲
https://gupiao.baidu.com/stock
http://quote.eastmoney.com/stock_list.html
2. 實例編寫
2.1 獲取HTML頁面
def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or "" on failure.

    A 30-second timeout bounds the request.  The response encoding is
    guessed from the page body so non-UTF-8 pages decode correctly.
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn HTTP 4xx/5xx status codes into exceptions.
        r.raise_for_status()
        # apparent_encoding inspects the content; more reliable than headers.
        r.encoding = r.apparent_encoding
        print("url:", r.request.url)
        return r.text
    except requests.RequestException:
        # Network or HTTP failure degrades to ""; callers test for it.
        return ""
2.2 獲取股票列表信息(bs4+正則)
def getStockList(lst, stockURL):
    """Append every stock code (sh/sz + 6 digits) found on *stockURL* to *lst*.

    The listing page links each stock in the href attribute of an <a> tag;
    a regex extracts the exchange prefix plus the 6-digit code.
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    # Compile once; the pattern is applied to every anchor on the page.
    code_re = re.compile(r"[s][hz]\d{6}")
    for anchor in soup.find_all('a'):
        # Some <a> tags carry no href, and most hrefs are not stock links —
        # test explicitly instead of driving control flow with exceptions.
        href = anchor.attrs.get('href', '')
        match = code_re.search(href)
        if match:
            lst.append(match.group(0))
    print(lst)
2.3 獲取股票信息主體
def getStockInfo(lst, stockURL, fpath):
    """Scrape each stock page and append one info-dict line to *fpath*.

    For every code in *lst* the page stockURL + code + ".html" is fetched;
    the name and the <dt>/<dd> field pairs are collected into a dict and
    written as one line so partial runs still leave usable output.
    """
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            # The stock name is the first element with class "bets-name";
            # split() strips surrounding whitespace/newlines.
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名稱': name.text.split()[0]})
            # Remaining fields come as parallel <dt> (key) / <dd> (value) pairs.
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for key_tag, val_tag in zip(keyList, valueList):
                infoDict[key_tag.text] = val_tag.text
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except Exception:
            # Report the failure but keep crawling the remaining stocks.
            traceback.print_exc()
            continue
3. 完整代碼
# -*- coding: utf-8 -*-
"""
Targeted stock-data crawler: collects stock codes from the East Money
listing page, then scrapes per-stock detail pages into a text file.

Created on Sat Feb 1 00:40:47 2020
@author: douzi
"""
import requests
from bs4 import BeautifulSoup
# traceback reports scraping exceptions without aborting the whole run
import traceback
import re


def getHTMLText(url, code='utf-8'):
    """Fetch *url* and return the page text decoded as *code*, or "" on failure.

    For a targeted crawler the page encodings are known in advance, so the
    costly apparent_encoding detection is skipped in favour of *code*.
    """
    try:
        r = requests.get(url, timeout=30)
        # Turn HTTP 4xx/5xx status codes into exceptions.
        r.raise_for_status()
        r.encoding = code
        print("url:", r.request.url)
        return r.text
    except requests.RequestException:
        # Network or HTTP failure degrades to ""; callers test for it.
        return ""


def getStockList(lst, stockURL):
    """Append every stock code (sh/sz + 6 digits) found on *stockURL* to *lst*."""
    # The listing page is GB2312-encoded.
    html = getHTMLText(stockURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    # Compile once; the pattern is applied to every anchor on the page.
    code_re = re.compile(r"[s][hz]\d{6}")
    for anchor in soup.find_all('a'):
        # Some <a> tags carry no href, and most hrefs are not stock links —
        # test explicitly instead of driving control flow with exceptions.
        href = anchor.attrs.get('href', '')
        match = code_re.search(href)
        if match:
            lst.append(match.group(0))
    print(lst)


def getStockInfo(lst, stockURL, fpath):
    """Scrape each stock page and append one info-dict line to *fpath*.

    Prints a carriage-return progress percentage after every stock,
    successful or not, so the console shows a single updating line.
    """
    count = 0
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
            # The stock name is the first element with class "bets-name";
            # split() strips surrounding whitespace/newlines.
            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名稱': name.text.split()[0]})
            # Remaining fields come as parallel <dt> (key) / <dd> (value) pairs.
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for key_tag, val_tag in zip(keyList, valueList):
                infoDict[key_tag.text] = val_tag.text
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
            count = count + 1
            # \r rewinds the cursor so the percentage overwrites in place.
            print('\r當前速度: {:.2f}%'.format(count * 100 / len(lst)), end='')
        except Exception:
            count = count + 1
            print('\r當前速度: {:.2f}%'.format(count * 100 / len(lst)), end='')
            # Report the failure but keep crawling the remaining stocks.
            traceback.print_exc()
            continue


def main():
    """Crawl the stock list, then scrape each stock's detail page to disk."""
    # Source of the stock-code list.
    stock_list_url = "http://quote.eastmoney.com/stock_list.html"
    # Base URL of the per-stock detail pages.
    stock_info_url = "https://gupiao.baidu.com/stock/"
    output_file = ".//Result_stock.txt"
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == "__main__":
    main()