Scraping Shanghai and Shenzhen A-Share Stock Data


First, fetch the stock codes from Eastmoney (東方財富網).

Then download each stock's historical trading data from NetEase Finance (網易財經).

import requests
import random
from bs4 import BeautifulSoup as bs
import time
import os
import redis   # needed by get_stock_names() below
import re
import json

def get_stock_names():
    """
    通過東方財富網上爬取股票的名稱代碼,並存入redis數據庫和本地txt文檔
    """
    rds = redis.from_url('redis://:666666@192.168.3.98:6379', db=1, decode_responses=True)   # 連接redis db1

    url = "http://quote.eastmoney.com/stocklist.html"
    headers = {
            'Referer': 'http://quote.eastmoney.com/center/gridlist.html',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    response = requests.get(url, headers=headers).content.decode('utf-8')   # decode the raw bytes explicitly; the page historically declared gbk, so switch codecs if the text comes out garbled
    soup = bs(response, 'lxml')
    all_ul = soup.find('div', id='table_wrapper-table').find_all('ul')   # the two <ul> tags that hold the listings
    with open('stock_names.txt', 'w+', encoding='utf-8') as f:  
        for ul in all_ul:
            all_a = ul.find_all('a')            # all <a> tags under this <ul>
            for a in all_a:
                rds.rpush('stock_names', a.text)       # a.text is the link text; rpush appends it to the right end of the Redis list
                f.write(a.text + '\n')
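A minimal usage sketch, assuming a reachable Redis instance at the URL above (otherwise comment out the rds lines and rely on the txt output alone):

get_stock_names()
with open('stock_names.txt', encoding='utf-8') as f:
    names = [line.strip() for line in f if line.strip()]
print(len(names), names[:5])   # entry count and a small sample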


def get_data(stocklist, outfile=r'D:\PycharmProjects\web_scraping\stockdata'):
    headers = {
        'Referer': 'http://quotes.money.163.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    #filelist = [os.path.splitext(file)[0] for file in os.listdir(r'D:\PycharmProjects\web_scraping\stockdata')]
    for stock_code, stock_name in stocklist:
        #if stock_code in filelist: continue
        try:
            #stock_code = stock_name.split('(')[1].split(')')[0]
            # Some of the codes fetched from Eastmoney belong to funds, for which no data is available, so funds are filtered out.
            # Shanghai stocks start with 6 or 9; Shenzhen stocks start with 0, 2 or 3. Some funds also start with 2: codes beginning with 201/202/203/204 are funds.
            # In the download URL the code is prefixed with 0 for Shanghai and 1 for Shenzhen.
            if int(stock_code[0]) in [0, 2, 3, 6, 9]:
                if int(stock_code[0]) in [6, 9]:
                    stock_code_new = '0' + stock_code
                elif int(stock_code[0]) in [0, 2, 3]:
                    if not int(stock_code[:3]) in [201, 202, 203, 204]:
                        stock_code_new = '1' + stock_code
                    else: continue
                else: continue
            else: continue

            stock_url = 'http://quotes.money.163.com/trade/lsjysj_{}.html'.format(stock_code)
            response = requests.get(stock_url, headers=headers).text
            soup = bs(response, 'lxml')
            start_time = soup.find('input', {'name': 'date_start_type'}).get('value').replace('-', '')  # earliest trading date listed on the page
            end_time = soup.find('input', {'name': 'date_end_type'}).get('value').replace('-', '')  # latest trading date listed on the page
            time.sleep(random.choice([1, 2]))  # pause 1-2 seconds between requests
            download_url = "http://quotes.money.163.com/service/chddata.html?code={}&start={}&end={}&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP".format(stock_code_new, start_time, end_time)
            data = requests.get(download_url, headers=headers)
            file_name = os.path.join(outfile, '{}.csv'.format(stock_code))
            with open(file_name, 'wb') as f:
                for chunk in data.iter_content(chunk_size=10000):  # stream the response body to disk in chunks
                    if chunk:
                        f.write(chunk)
            print("{}數據已下載".format(stock_code))

        except Exception as e:
            print("{}({})數據下載報錯".format(stock_name, stock_code))
            print(e)
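For reference, the exchange-prefix rule from the comments above can be pulled out into a small, testable helper. This normalize_code function is a sketch and not part of the original script:

def normalize_code(stock_code):
    """Return the NetEase API code for a 6-digit A-share code, or None for funds."""
    if stock_code[0] in '69':               # Shanghai listings get a leading 0
        return '0' + stock_code
    if stock_code[0] in '023' and stock_code[:3] not in ('201', '202', '203', '204'):
        return '1' + stock_code             # Shenzhen listings get a leading 1, fund prefixes excluded
    return None                             # everything else is skipped

assert normalize_code('600000') == '0600000'   # Shanghai stock
assert normalize_code('000001') == '1000001'   # Shenzhen stock
assert normalize_code('201000') is None        # fund, filtered out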



# Two ways to list all files under a directory as absolute paths
# Method 1: os.walk
def file_name(file_dir):
    paths = []
    for root, dirs, files in os.walk(file_dir):
        for file in files:
            if os.path.splitext(file)[1] == '.jpeg':   # keep only .jpeg files
                paths.append(os.path.join(root, file))
    return paths

# Method 2: os.listdir with explicit recursion
def listdir(path, list_name):
    for file in os.listdir(path):  # os.listdir does not descend into subdirectories, so recurse into them manually
        file_path = os.path.join(path, file)
        if os.path.isdir(file_path):
            listdir(file_path, list_name)
        elif os.path.splitext(file_path)[1]=='.jpeg':
            list_name.append(file_path)
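
A quick sanity check that both methods return the same set of files (the directory path here is illustrative):

jpegs_walk = file_name(r'D:\PycharmProjects\web_scraping\images')
jpegs_recursive = []
listdir(r'D:\PycharmProjects\web_scraping\images', jpegs_recursive)
print(sorted(jpegs_walk) == sorted(jpegs_recursive))   # True when both see the same tree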


stocklist = []  # 3,770 stocks in total; only codes starting with '0', '3' or '6'
max_page = 189
for i in range(max_page):
    # the URL must stay on one logical line; embedded newlines and indentation would corrupt the request
    url = ('http://1.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112405721872315676919_1566176986516'
           '&pn={}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3'
           '&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2'
           '&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152'
           '&_=1566176986517').format(i + 1)
    response = requests.get(url).content.decode('utf-8')
    # the response is JSONP: strip the jQuery callback wrapper and the trailing ');'
    json_text = re.sub(r'jQuery112405721872315676919_1566176986516\(', '', response)[:-2]
    json_text = json.loads(json_text)
    for fi in json_text['data']['diff']:
        stocklist.append([fi['f12'], fi['f14']])   # f12 = stock code, f14 = stock name
        
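The callback name is hard-coded above; a slightly more general way to unwrap any JSONP response is sketched below (strip_jsonp is not part of the original script):

def strip_jsonp(text):
    """Extract the JSON payload from a response shaped like callback({...});"""
    m = re.search(r'^[^(]*\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(m.group(1) if m else text)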

# Download the data
get_data(stocklist, outfile=r'D:\PycharmProjects\web_scraping\stockdata')
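
The CSVs served by the chddata endpoint have historically been gbk-encoded with Chinese column headers; a hedged loading sketch (the file name is illustrative, and the codec may need adjusting):

import pandas as pd

df = pd.read_csv(r'D:\PycharmProjects\web_scraping\stockdata\600000.csv', encoding='gbk')  # switch the codec if your files differ
print(df.head())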

  


