爬取要求:
網頁爬取范圍:5875頁-尾頁;
基金名稱點擊進去的二級頁面,需要提取《基金類型》、《管理類型》兩個字段;
私募基金管理人名稱點進去的二級頁面,需要提取《登記時間》、《成立時間》字段;
二級頁面的四個字段跟到列表頁后面形成表格。
代碼:
import codecs
import csv
from lxml import etree
import requests
import random
import json
import time
import pandas as pd
import threading
# 輸入毫秒級的時間,轉出正常格式的時間
def timeStamp(timeNum):
timeStamp = float(timeNum / 1000)
timeArray = time.localtime(timeStamp)
otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
return otherStyleTime
def save(rows, path='證券.csv'):
    """Append rows to the output CSV file.

    The csv module requires a text-mode file opened with ``newline=''``;
    the original ``codecs.open(..., 'ab', encoding=...)`` mixed binary
    append with an encoder and produced wrong line endings on Windows.

    Args:
        rows: iterable of row tuples to append.
        path: CSV file path; defaults to the original hard-coded name.
    """
    with open(path, 'a', encoding='utf8', newline='') as f:
        csv.writer(f).writerows(rows)
# Page numbers that failed to scrape; __main__ retries them until empty.
baocuo_list = []
def craw(num):
    """Scrape page ``num`` of the AMAC private-fund list plus detail pages.

    Posts to the JSON list API, then for each fund fetches two second-level
    pages: the fund detail page (基金類型, 備案時間) and the manager detail
    page (登記時間, 成立時間). The assembled rows are appended to the CSV
    via save(). On any failure the page number is appended to the global
    ``baocuo_list`` so __main__ can retry it.
    """
    rows = []
    try:
        print('開始爬取=========', num)
        headers = {
            'Accept': 'application/json,text/javascript,*/*; q=0.01',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'keep-alive',
            'Host': 'gs.amac.org.cn',
            'Content-Type': 'application/json;charset=UTF-8',
            'Origin': 'http://gs.amac.org.cn',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'http://gs.amac.org.cn/amac-infodisc/res/pof/fund/index.html',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Mobile Safari/537.36'
        }
        # The rand query parameter defeats server-side caching.
        url = ("http://gs.amac.org.cn/amac-infodisc/api/pof/fund?rand="
               + str(random.random()) + "&page=" + str(num) + "&size=20")
        response = requests.post(url=url, data=json.dumps({}), headers=headers,
                                 timeout=30)
        data_list = json.loads(response.text)["content"]
        for data in data_list:
            fund_name = data['fundName']
            manager_name = data['managerName']
            mandator_name = data['mandatorName']
            establishDate = timeStamp(data['establishDate'])
            # Keep only the date part (the original [:11] left a trailing space).
            putOnRecordDate = str(establishDate)[:10]
            # Fund detail page URL comes straight from the list payload.
            url = 'http://gs.amac.org.cn/amac-infodisc/res/pof/fund/' + data['url']
            # managerUrl starts with '../'; drop those 3 chars for an absolute URL.
            manager_url = ('http://gs.amac.org.cn/amac-infodisc/res/pof/'
                           + data.get('managerUrl')[3:])
            response = requests.get(url=url, headers=headers, timeout=30)
            response.encoding = 'utf-8'
            # 管理類型 is already present in the list API payload.
            manager_type = data['managerType']
            text = etree.HTML(response.text)
            # 基金類型: the info table renders as one "label:value" blob; split
            # on the full-width colon and take the token after '基金類型'.
            basic_type = text.xpath(
                '/ html / body / div[3] / div / div[2] / div[1] / div / table / tbody'
            )[0].xpath('string(.)').strip().split(':')
            a = 0
            for i in basic_type:
                if '基金類型' in i:
                    a = basic_type.index(i)
            basic_type = basic_type[a + 1].split()[0].strip()
            # 備案時間 (filing date): 4th row, 2nd cell of the fund info table.
            beian_time = text.xpath(
                '/ html / body / div[3] / div / div[2] / div[1] / div / table / tbody / tr[4] / td[2]'
            )[0].xpath('string(.)').replace('\r\n', '').replace(" ", "").replace("\t", "")
            # Manager detail page: 成立時間 (row 6) and 登記時間 (row 5).
            response = requests.get(url=manager_url, headers=headers, timeout=30)
            response.encoding = 'utf-8'
            text = etree.HTML(response.text)
            establish_time = text.xpath(
                '/ html / body / div[3] / div / div[4] / div[2] / div[2] / table / tbody / tr[6] / td[2]'
            )[0].xpath('string(.)').replace('\r\n', '').replace(" ", "").replace("\t", "").split(':')[-1]
            register_time = text.xpath(
                '/html/body/div[3]/div/div[4]/div[2]/div[2]/table/tbody/tr[5]/td[2]'
            )[0].xpath('string(.)').replace('\r\n', '').replace(" ", "").replace("\t", "").split(':')[-1]
            rows.append((fund_name, manager_name, mandator_name, putOnRecordDate,
                         beian_time, basic_type, manager_type, register_time,
                         establish_time))
        # Clear this page from the retry list on success. The original used
        # pop(num), but list.pop takes an *index* — with page numbers like
        # 5874 it raised IndexError, so pages were never cleared.
        if num in baocuo_list:
            baocuo_list.remove(num)
        if rows:
            save(rows)
        print('爬取完成==========', num)
    except Exception as e:
        print('爬不了的======', num)
        print('爬不了的原因======', e)
        baocuo_list.append(num)
if __name__ == '__main__':
    # Write the header row. Text mode with newline='' is what the csv module
    # expects. NOTE: the file is opened in append mode, so rerunning the
    # script appends another header (kept from the original design).
    with open('證券.csv', 'a', encoding='utf8', newline='') as f:
        csv.writer(f).writerow(
            ["基金名稱", "私募基金管理人名稱", "托管人名稱", "成立時間", "備案時間", "基金類型", "管理類型", "登記時間", "成立時間-管理人"])
    # Each thread is joined immediately after starting, so pages are crawled
    # one at a time — effectively sequential, as in the original.
    for num in range(5874, 6620):
        t = threading.Thread(target=craw, args=(num,))
        t.start()
        t.join()
    print(baocuo_list)
    # Retry failed pages until none remain. Iterate over a snapshot because
    # craw() removes/appends entries in baocuo_list while we loop — the
    # original iterated the live list and could skip elements.
    while baocuo_list:
        for i in list(baocuo_list):
            print('重新爬取===========', i)
            t = threading.Thread(target=craw, args=(i,))
            t.start()
            t.join()