Scraping Eastmoney (東方財富) shareholder-count data for 2013-2019 with Python 3


#!/usr/bin/env python
# -*- coding:utf-8 -*-


import re
import csv
import time
import random
import requests
import json


class spider_DongFangCaiFu(object):
    def __init__(self):
        # bookkeeping for resuming the crawl after an interruption
        self.sync_log_dict = {
            # index into the report-date list below
            "category_num": 0,
            # page number to resume from
            "page_num": 1,
            # total number of pages
            "total_page": 100,
        }
        # report dates (year-end, 2013-2019)
        self.start_date_list=[
            "2019-12-31",
            "2018-12-31",
            "2017-12-31",
            "2016-12-31",
            "2015-12-31",
            "2014-12-31",
            "2013-12-31",
        ]

    # fetch and parse the list pages for every report date
    def parse_detail(self):
        for one_date_index in range(self.sync_log_dict["category_num"], len(self.start_date_list)):
            # pause briefly between report dates
            time.sleep(random.random())
            start_date = self.start_date_list[one_date_index]
            # the endpoint is paged; stop after at most 60 pages per report date
            for two_page_index in range(self.sync_log_dict["page_num"], 60):
                time.sleep(random.uniform(1, 3))
                url = "http://data.eastmoney.com/DataCenter_V3/gdhs/GetList.ashx?"
                params = {
                    "reportdate": start_date,      # report date (year-end)
                    "market": "",
                    "changerate": "",
                    "range": "",
                    "pagesize": "50",              # records per page
                    "page": str(two_page_index),   # current page number
                    "sortRule": "-1",
                    "sortType": "NoticeDate",
                    "js": "var%20DzSDuvmw",
                    "param": "",
                    "rt": "51196634",
                }
                headers = {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
                }

                response = requests.get(url, headers=headers, params=params, verify=False, timeout=5)
                res_text = response.text
                # the response is a JS variable assignment; extract the "data" array
                data_list = re.search(r'"data":(\[.*\])', res_text).group(1)
                # an empty array ("[]") means there are no more pages for this report date
                if len(data_list) <= 2:
                    break
                # Python 3: parse the JSON array and append one record per line to dfcf.txt
                records = json.loads(data_list)
                with open('dfcf.txt', 'a+', encoding="utf-8") as file:
                    for record in records:
                        # ensure_ascii=False keeps Chinese text readable in the file,
                        # e.g. json.dumps('中國', ensure_ascii=False) -> '"中國"'
                        line = json.dumps(record, ensure_ascii=False)
                        file.write(line + '\n')

if __name__ == '__main__':
    run = spider_DongFangCaiFu()
    run.parse_detail()
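
After a run, dfcf.txt contains one JSON object per line. Below is a minimal sketch of loading that file for analysis, assuming pandas is installed (the file name dfcf.txt comes from the script above; the actual column names depend on whatever fields the endpoint returned):

import json
import pandas as pd

# read the line-delimited JSON written by the scraper
records = []
with open('dfcf.txt', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            records.append(json.loads(line))

df = pd.DataFrame(records)
print(df.shape)    # number of records and fields
print(df.head())   # first few rows for a quick sanity check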

