Scraping East Money (東方財富) financial statement data with a Python crawler
The complete scraper follows. It interactively asks for the reporting period, report type, and page range, then downloads the chosen table from East Money's data-center API page by page and appends it to a CSV file.

```python
import csv
import json
import os
import re
import time

import requests

# Save all output under this folder (create it first if it does not exist)
file_path = r'C:\Users\admir\Desktop\銀行競爭\報表數據'
if not os.path.exists(file_path):
    os.makedirs(file_path)
os.chdir(file_path)


# 1 Choose the reporting period and report type
def set_table():
    # 1.1 Reporting period
    # int(float(...)) truncates the input; float() is applied first because
    # int() raises on strings like '2018.0' while float() does not
    year = int(float(input('Enter the year to query (four digits, 2007-2020):\n')))
    quarter = int(float(input('Enter the quarter (1: Q1 report, 2: interim report, 3: Q3 report, 4: annual report):\n')))
    while quarter < 1 or quarter > 4:
        quarter = int(float(input('Invalid quarter, please re-enter:\n')))
    month = '{:02d}'.format(quarter * 3)  # quarter-end month
    # Quarter-end day: June and September end on the 30th, March and December on the 31st
    if month in ('06', '09'):
        day = 30
    else:
        day = 31
    date = '{}-{}-{}'.format(year, month, day)

    # 1.2 Report type
    tables = int(input('Enter the number of the report type to query '
                       '(1: 業績報表; 2: 業績快報表; 3: 業績預告表; 4: 預約披露時間表; '
                       '5: 資產負債表; 6: 利潤表; 7: 現金流量表):\n'))
    dict_tables = {1: '業績報表', 2: '業績快報表', 3: '業績預告表', 4: '預約披露時間表',
                   5: '資產負債表', 6: '利潤表', 7: '現金流量表'}
    dict_codes = {1: 'YJBB', 2: 'YJKB', 3: 'YJYG', 4: 'YYPL',
                  5: 'ZCFZB', 6: 'LRB', 7: 'XJLLB'}
    category = dict_codes[tables]

    # The 'type' field of the JS request: tables 1-4 use the prefix 'YJBB20_',
    # the last three use 'CWBB_'. Set the type, st, sr and filter parameters here.
    # (`filter` shadows the builtin; the name is kept from the original script.)
    if tables == 1:
        category_type = 'YJBB20_'
        st = 'latestnoticedate'
        sr = -1
        filter = "(securitytypecode in ('058001001','058001002'))(reportdate=^%s^)" % date
    elif tables == 2:
        category_type = 'YJBB20_'
        st = 'ldate'
        sr = -1
        filter = "(securitytypecode in ('058001001','058001002'))(rdate=^%s^)" % date
    elif tables == 3:
        category_type = 'YJBB20_'
        st = 'ndate'
        sr = -1
        # Note: the original script hard-codes this date instead of using `date`
        filter = "(IsLatest='T')(enddate=^2018-06-30^)"
    elif tables == 4:
        category_type = 'YJBB20_'
        st = 'frdate'
        sr = 1
        filter = "(securitytypecode ='058001001')(reportdate=^%s^)" % date
    else:
        category_type = 'CWBB_'
        st = 'noticedate'
        sr = -1
        filter = '(reportdate=^%s^)' % date
    category_type = category_type + category

    yield {
        'date': date,
        'category': dict_tables[tables],
        'category_type': category_type,
        'st': st,
        'sr': sr,
        'filter': filter
    }


# 2 Choose the page range to crawl
def page_choose(page_all):
    start_page = int(input('Enter the first page to download:\n'))
    while True:
        nums = input('Enter how many pages to download (press Enter to download all):\n')
        # A number means "this many pages"; an empty string means "all pages"
        if nums.isdigit():
            end_page = start_page + int(nums)
            break
        elif nums == '':
            end_page = int(page_all.group(1))
            break
        print('Invalid page count, please re-enter')
    print('*' * 80)
    # Return the page range for the main loop
    yield {
        'start_page': start_page,
        'end_page': end_page
    }


# 3 Fetch one page of the table
def get_table(date, category_type, st, sr, filter, page):
    params = {
        'type': category_type,  # report type, e.g. 'CWBB_LRB'
        'token': '70f12f2f4f091e459a279469fe49eca5',
        'st': st,               # sort field
        'sr': sr,               # sort direction
        'p': page,              # page number
        'ps': 50,               # records per page
        'js': 'var LFtlXDqn={pages:(tp),data: (x)}',
        'filter': filter,
        # 'rt': 51294261       # not required
    }
    url = 'http://dcfm.eastmoney.com/em_mutisvcexpandinterface/api/js/get?'
    response = requests.get(url, params=params).text

    # Total number of pages
    pat = re.compile(r'var.*?{pages:(\d+),data:.*?')
    page_all = re.search(pat, response)
    print(page_all.group(1))

    # Extract the data list: json.loads() works on the [...] part but fails on
    # the whole {...} wrapper, so capture everything after 'data: '
    pattern = re.compile(r'var.*?data: (.*)}', re.S)
    items = re.search(pattern, response)
    data = json.loads(items.group(1))
    return page_all, data, page


# Write the CSV header row (method 1: the csv module, the most common approach)
def write_header(data, category):
    with open('{}.csv'.format(category), 'a', encoding='utf_8_sig', newline='') as f:
        headers = list(data[0].keys())
        writer = csv.writer(f)
        writer.writerow(headers)


def write_table(data, page, category):
    print('\nDownloading page %s of the table' % page)
    for d in data:
        with open('{}.csv'.format(category), 'a', encoding='utf_8_sig', newline='') as f:
            w = csv.writer(f)
            w.writerow(d.values())


def main(date, category_type, st, sr, filter, page):
    page_all, data, page = get_table(date, category_type, st, sr, filter, page)
    write_table(data, page, category)  # `category` is set in the __main__ block


if __name__ == '__main__':
    # Get the query parameters, then the total page count from a first request
    for i in set_table():
        date = i.get('date')
        category = i.get('category')
        category_type = i.get('category_type')
        st = i.get('st')
        sr = i.get('sr')
        filter = i.get('filter')
    constant = get_table(date, category_type, st, sr, filter, 1)
    page_all = constant[0]
    for i in page_choose(page_all):
        start_page = i.get('start_page')
        end_page = i.get('end_page')
    # Write the header row once, then crawl the chosen page range
    write_header(constant[1], category)
    start_time = time.time()
    for page in range(start_page, end_page):
        main(date, category_type, st, sr, filter, page)
    end_time = time.time() - start_time
    print('Download complete')
    print('Elapsed time: {:.1f} s'.format(end_time))
```
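The trickiest step is that the endpoint returns a JavaScript assignment rather than bare JSON, so the page count and the record list must be cut out with regular expressions before `json.loads()`. Below is a minimal sketch of that step in isolation, using the income-statement parameters from the script above; the endpoint, token, and `filter` value date back to the 2018 article and may no longer respond, and the slightly tighter `data: (\[.*\])}` pattern is my own variant of the article's regex.

```python
import json
import re

import requests

params = {
    'type': 'CWBB_LRB',   # income statement (利潤表)
    'token': '70f12f2f4f091e459a279469fe49eca5',
    'st': 'noticedate',
    'sr': -1,
    'p': 1,
    'ps': 50,
    'js': 'var LFtlXDqn={pages:(tp),data: (x)}',
    'filter': '(reportdate=^2018-06-30^)',
}
url = 'http://dcfm.eastmoney.com/em_mutisvcexpandinterface/api/js/get?'
text = requests.get(url, params=params).text
# text looks like: var LFtlXDqn={pages:42,data: [{...},{...},...]}

pages = int(re.search(r'pages:(\d+)', text).group(1))
records = json.loads(re.search(r'data: (\[.*\])}', text, re.S).group(1))
print(pages, len(records))
if records:
    print(sorted(records[0].keys()))   # inspect the available columns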
Full source code: https://github.com/makcyun/eastmoney_spider
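One closing note on the CSV-writing step: `write_header()` takes the columns from `data[0].keys()` while `write_table()` writes `d.values()`, which silently misaligns columns if the API ever returns records with differing keys or key order. If that assumption is in doubt, `csv.DictWriter` pins each value to a named column. A minimal sketch of that alternative (the function name `write_rows` is mine, not the project's):

```python
import csv

def write_rows(records, category):
    """Append parsed API records to <category>.csv, keyed by column name
    instead of relying on dict ordering."""
    if not records:
        return
    fieldnames = list(records[0].keys())
    with open('{}.csv'.format(category), 'a', encoding='utf_8_sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        if f.tell() == 0:        # file is empty, so write the header once
            writer.writeheader()
        writer.writerows(records)
```

Called once per page, e.g. `write_rows(data, category)`, this would replace the `write_header()`/`write_table()` pair and also drop the per-row reopening of the file.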
- Published: 2018-10-13
- Original article: https://kuaibao.qq.com/s/20181013G1EQ5V00?refer=cp_1026
