Python Crawler Notes (6): Web Crawling in Practice (2) - A Targeted Stock Data Crawler


1. Targeted stock data crawler

The crawler combines two sites: Baidu Gupiao serves the detail page for each individual stock, while the East Money page lists all stock codes.

https://gupiao.baidu.com/stock

http://quote.eastmoney.com/stock_list.html

2. Writing the example

2.1 Fetching the HTML page

def getHTMLText(url):
    try:
        r = requests.get(url, timeout = 30)
        r.raise_for_status()                 # raise an HTTPError for 4xx/5xx responses
        r.encoding = r.apparent_encoding     # guess the encoding from the page content
        print("url:", r.request.url)
        return r.text
    except:
        return ""                            # return an empty string on any failure

2.2 Getting the stock list (bs4 + regular expressions)

def getStockList(lst, stockURL):
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    # Individual stock links live in <a> tags
    a = soup.find_all('a')
    for i in a:
        try:
            # The link sits in the href attribute of each <a> tag;
            # we want sh/sz followed by 6 digits, extracted with a regular expression
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue
    print(lst)
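
To see what the regular expression extracts, the small sketch below runs it against hand-written hrefs (the sample URLs are invented for illustration; real hrefs on the East Money list page follow the same sh/sz plus 6 digits pattern):

import re

# Hypothetical hrefs in the style of the East Money stock list page
samples = [
    "http://quote.eastmoney.com/sh600000.html",    # Shanghai-listed code
    "http://quote.eastmoney.com/sz000001.html",    # Shenzhen-listed code
    "http://quote.eastmoney.com/center/list.html", # no stock code, so no match
]
for href in samples:
    print(re.findall(r"[s][hz]\d{6}", href))
# prints ['sh600000'], then ['sz000001'], then []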

2.3 Getting each stock's detail information

def getStockInfo(lst, stockURL, fpath):
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            # Record one stock's information as key-value pairs
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class':'stock-bets'})
            
            # Get the stock name
            name = stockInfo.find_all(attrs={'class':'bets-name'})[0]
            # Add it to the dictionary;
            # split() splits on whitespace, so [0] keeps just the name
            infoDict.update({'股票名稱' : name.text.split()[0]})
            
            # The remaining fields sit in <dt>/<dd> tag pairs; store them as key-value pairs
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
                
            # Append the collected information to the output file
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
                
        # Use traceback to print the exception details, then move on to the next stock
        except:
            traceback.print_exc()
            continue
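
The name extraction and the <dt>/<dd> pairing are easier to see on a tiny, hand-made HTML fragment. The snippet below is only a sketch: the fragment and its field values are invented, and the real Baidu Gupiao page carries many more pairs inside the stock-bets <div>.

from bs4 import BeautifulSoup

# Hypothetical fragment mimicking the structure getStockInfo expects
fragment = """
<div class="stock-bets">
  <a class="bets-name">平安银行 (000001)</a>
  <dl><dt>今开</dt><dd>10.00</dd></dl>
  <dl><dt>成交量</dt><dd>1.2万手</dd></dl>
</div>
"""
soup = BeautifulSoup(fragment, 'html.parser')
stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
infoDict = {'股票名稱': stockInfo.find_all(attrs={'class': 'bets-name'})[0].text.split()[0]}
for dt, dd in zip(stockInfo.find_all('dt'), stockInfo.find_all('dd')):
    infoDict[dt.text] = dd.text
print(infoDict)   # {'股票名稱': '平安银行', '今开': '10.00', '成交量': '1.2万手'}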

3. Complete code

# -*- coding: utf-8 -*-
"""
Created on Sat Feb  1 00:40:47 2020

@author: douzi
"""

import requests
from bs4 import BeautifulSoup
# the traceback module is used to report exception details
import traceback
import re


def getHTMLText(url, code = 'utf-8'):
    try:
        r = requests.get(url, timeout = 30)
        r.raise_for_status()
#        r.encoding = r.apparent_encoding   # a targeted crawler can hard-code the encoding instead
        r.encoding = code
        
        print("url:", r.request.url)
        return r.text
    except:
        return ""


def getStockList(lst, stockURL):
    html = getHTMLText(stockURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    # Individual stock links live in <a> tags
    a = soup.find_all('a')
    for i in a:
        try:
            # The link sits in the href attribute of each <a> tag;
            # we want sh/sz followed by 6 digits, extracted with a regular expression
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue
    print(lst)


def getStockInfo(lst, stockURL, fpath):
    count = 0
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            # Record one stock's information as key-value pairs
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class':'stock-bets'})
            
            # Get the stock name
            name = stockInfo.find_all(attrs={'class':'bets-name'})[0]
            # Add it to the dictionary;
            # split() splits on whitespace, so [0] keeps just the name
            infoDict.update({'股票名稱' : name.text.split()[0]})
            
            # The remaining fields sit in <dt>/<dd> tag pairs; store them as key-value pairs
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val
                
            # Append the collected information to the output file
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
                count = count + 1
                # '\r' returns the cursor to the start of the line so the progress is updated in place
                print('\rProgress: {:.2f}%'.format(count * 100 / len(lst)), end='')
        # Use traceback to print the exception details, then move on to the next stock
        except:
            count = count + 1
            print('\rProgress: {:.2f}%'.format(count * 100 / len(lst)), end='')
            traceback.print_exc()
            continue


def main():
    # URL of the stock list page
    stock_list_url = "http://quote.eastmoney.com/stock_list.html"
    # Base URL of the individual stock detail pages
    stock_info_url = "https://gupiao.baidu.com/stock/"
    output_file = ".//Result_stock.txt"
    slist = []
    # Get the list of stock codes
    getStockList(slist, stock_list_url)
    # Crawl each stock's detail information
    getStockInfo(slist, stock_info_url, output_file)


if __name__ == "__main__":
    main()    
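
Each line of Result_stock.txt is the str() of one infoDict, so the records can be loaded back as dictionaries with ast.literal_eval. A minimal sketch for reading the output (the path matches output_file above):

import ast

records = []
with open(".//Result_stock.txt", encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            records.append(ast.literal_eval(line))
print(len(records), "stocks loaded")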
