爬取要求:
網頁爬取范圍:5875頁-尾頁;
基金名稱點擊進去的二級頁面,需要提取《基金類型》、《管理類型》兩個字段;
私募基金管理人名稱點進去的二級頁面,需要提取《登記時間》、《成立時間》字段;
二級頁面的四個字段跟到列表頁后面形成表格。
代碼:
import codecs
import csv
from lxml import etree
import requests
import random
import json
import time
import pandas as pd
import threading
# 輸入毫秒級的時間,轉出正常格式的時間
def timeStamp(timeNum):
timeStamp = float(timeNum / 1000)
timeArray = time.localtime(timeStamp)
otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
return otherStyleTime
def save(rows, path='證券.csv'):
    """Append rows to the output CSV file.

    The csv module requires a text-mode file opened with ``newline=''``;
    the original ``codecs.open(..., 'ab', encoding=...)`` mixed binary
    append with an encoder and produced wrong line endings on Windows.

    Args:
        rows: iterable of row tuples to append.
        path: CSV file path; defaults to the original hard-coded name.
    """
    with open(path, 'a', encoding='utf8', newline='') as f:
        csv.writer(f).writerows(rows)
# Page numbers that failed to scrape; __main__ retries them until empty.
baocuo_list = []
def craw(num):
    """Scrape page ``num`` of the AMAC private-fund list plus detail pages.

    Posts to the JSON list API, then for each fund fetches two second-level
    pages: the fund detail page (基金類型, 備案時間) and the manager detail
    page (登記時間, 成立時間). The assembled rows are appended to the CSV
    via save(). On any failure the page number is appended to the global
    ``baocuo_list`` so __main__ can retry it.
    """
    rows = []
    try:
        print('開始爬取=========', num)
        headers = {
            'Accept': 'application/json,text/javascript,*/*; q=0.01',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'keep-alive',
            'Host': 'gs.amac.org.cn',
            'Content-Type': 'application/json;charset=UTF-8',
            'Origin': 'http://gs.amac.org.cn',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'http://gs.amac.org.cn/amac-infodisc/res/pof/fund/index.html',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Mobile Safari/537.36'
        }
        # The rand query parameter defeats server-side caching.
        url = ("http://gs.amac.org.cn/amac-infodisc/api/pof/fund?rand="
               + str(random.random()) + "&page=" + str(num) + "&size=20")
        response = requests.post(url=url, data=json.dumps({}), headers=headers,
                                 timeout=30)
        data_list = json.loads(response.text)["content"]
        for data in data_list:
            fund_name = data['fundName']
            manager_name = data['managerName']
            mandator_name = data['mandatorName']
            establishDate = timeStamp(data['establishDate'])
            # Keep only the date part (the original [:11] left a trailing space).
            putOnRecordDate = str(establishDate)[:10]
            # Fund detail page URL comes straight from the list payload.
            url = 'http://gs.amac.org.cn/amac-infodisc/res/pof/fund/' + data['url']
            # managerUrl starts with '../'; drop those 3 chars for an absolute URL.
            manager_url = ('http://gs.amac.org.cn/amac-infodisc/res/pof/'
                           + data.get('managerUrl')[3:])
            response = requests.get(url=url, headers=headers, timeout=30)
            response.encoding = 'utf-8'
            # 管理類型 is already present in the list API payload.
            manager_type = data['managerType']
            text = etree.HTML(response.text)
            # 基金類型: the info table renders as one "label:value" blob; split
            # on the full-width colon and take the token after '基金類型'.
            basic_type = text.xpath(
                '/ html / body / div[3] / div / div[2] / div[1] / div / table / tbody'
            )[0].xpath('string(.)').strip().split(':')
            a = 0
            for i in basic_type:
                if '基金類型' in i:
                    a = basic_type.index(i)
            basic_type = basic_type[a + 1].split()[0].strip()
            # 備案時間 (filing date): 4th row, 2nd cell of the fund info table.
            beian_time = text.xpath(
                '/ html / body / div[3] / div / div[2] / div[1] / div / table / tbody / tr[4] / td[2]'
            )[0].xpath('string(.)').replace('\r\n', '').replace(" ", "").replace("\t", "")
            # Manager detail page: 成立時間 (row 6) and 登記時間 (row 5).
            response = requests.get(url=manager_url, headers=headers, timeout=30)
            response.encoding = 'utf-8'
            text = etree.HTML(response.text)
            establish_time = text.xpath(
                '/ html / body / div[3] / div / div[4] / div[2] / div[2] / table / tbody / tr[6] / td[2]'
            )[0].xpath('string(.)').replace('\r\n', '').replace(" ", "").replace("\t", "").split(':')[-1]
            register_time = text.xpath(
                '/html/body/div[3]/div/div[4]/div[2]/div[2]/table/tbody/tr[5]/td[2]'
            )[0].xpath('string(.)').replace('\r\n', '').replace(" ", "").replace("\t", "").split(':')[-1]
            rows.append((fund_name, manager_name, mandator_name, putOnRecordDate,
                         beian_time, basic_type, manager_type, register_time,
                         establish_time))
        # Clear this page from the retry list on success. The original used
        # pop(num), but list.pop takes an *index* — with page numbers like
        # 5874 it raised IndexError, so pages were never cleared.
        if num in baocuo_list:
            baocuo_list.remove(num)
        if rows:
            save(rows)
        print('爬取完成==========', num)
    except Exception as e:
        print('爬不了的======', num)
        print('爬不了的原因======', e)
        baocuo_list.append(num)
if __name__ == '__main__':
    # Write the header row. Text mode with newline='' is what the csv module
    # expects. NOTE: the file is opened in append mode, so rerunning the
    # script appends another header (kept from the original design).
    with open('證券.csv', 'a', encoding='utf8', newline='') as f:
        csv.writer(f).writerow(
            ["基金名稱", "私募基金管理人名稱", "托管人名稱", "成立時間", "備案時間", "基金類型", "管理類型", "登記時間", "成立時間-管理人"])
    # Each thread is joined immediately after starting, so pages are crawled
    # one at a time — effectively sequential, as in the original.
    for num in range(5874, 6620):
        t = threading.Thread(target=craw, args=(num,))
        t.start()
        t.join()
    print(baocuo_list)
    # Retry failed pages until none remain. Iterate over a snapshot because
    # craw() removes/appends entries in baocuo_list while we loop — the
    # original iterated the live list and could skip elements.
    while baocuo_list:
        for i in list(baocuo_list):
            print('重新爬取===========', i)
            t = threading.Thread(target=craw, args=(i,))
            t.start()
            t.join()