Python爬虫
Python爬虫在我们的生活中应用很广: 大数据分析、量化投资、研究各地的房价、调查B站所有UP主等, 都需要收集大量的数据. 人生处处皆数学, 人生处处皆Python, 所以别再问"学习数学有什么用?", 也别再问"学习Python有什么用?". 下面直接上例子:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import random
import time

from requests import get
from requests.exceptions import RequestException
# Pool of User-Agent strings; one is picked at random per request.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
]

# Pool of Referer header values; one is picked at random per request.
referer_list = [
    'http://fund.eastmoney.com/110022.html',
    'http://fund.eastmoney.com/110023.html',
    'http://fund.eastmoney.com/',
    'http://fund.eastmoney.com/110025.html',
]
def get_html(baseUrl, timeout=10):
    """Fetch ``baseUrl`` with a GET request and return the response body as text.

    A User-Agent and a Referer are picked at random from the module-level
    ``user_agent_list`` and ``referer_list`` on every call.

    Args:
        baseUrl: URL to request.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without a timeout the call could block
            indefinitely on an unresponsive server.

    Returns:
        The response text when the status code is 200; ``None`` when the
        status is anything else or the request raised ``RequestException``.
    """
    headers = {
        'User-Agent': random.choice(user_agent_list),
        'Referer': random.choice(referer_list),
    }
    # Keep the try body to the one call that can raise a network error.
    try:
        resp = get(baseUrl, headers=headers, timeout=timeout)
    except RequestException:
        print("没有爬取到相应的内容")
        return None
    if resp.status_code == 200:
        return resp.text
    print("没有爬取到相应的内容")
    return None
if __name__ == "__main__":
    # Current time in milliseconds, appended as the "_" query parameter.
    rt = int(round(time.time() * 1000))
    baseUrl = (
        "http://api.fund.eastmoney.com/f10/lsjz"
        "?callback=jQuery183006797018539211241_1593855325551"
        "&fundCode=004070&pageIndex=1&pageSize=20"
        "&startDate=2020-06-01&endDate=2020-07-01"
        "&_=" + str(rt)
    )
    print(baseUrl)
    # Fetch the page and show the raw response text (None on failure).
    data = get_html(baseUrl)
    print(data)
返回的数据可能是html/json/list/tuple等格式, 这里只介绍针对JSON格式的处理, 其他格式的解析就不赘述了. 以上爬虫返回的是JSON格式(是我最喜欢的格式^_^), 如下:
{
"Data":{
"LSJZList":[
{
"FSRQ":"2020-07-01",
"DWJZ":"1.0396",
"LJJZ":"1.0396",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"1.88",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-30",
"DWJZ":"1.0204",
"LJJZ":"1.0204",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"3.26",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-29",
"DWJZ":"0.9882",
"LJJZ":"0.9882",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-2.96",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-24",
"DWJZ":"1.0183",
"LJJZ":"1.0183",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.12",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-23",
"DWJZ":"1.0195",
"LJJZ":"1.0195",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"0.97",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-22",
"DWJZ":"1.0097",
"LJJZ":"1.0097",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"2.68",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-19",
"DWJZ":"0.9833",
"LJJZ":"0.9833",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"3.35",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-18",
"DWJZ":"0.9514",
"LJJZ":"0.9514",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"1.03",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-17",
"DWJZ":"0.9417",
"LJJZ":"0.9417",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.32",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-16",
"DWJZ":"0.9447",
"LJJZ":"0.9447",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"1.08",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-15",
"DWJZ":"0.9346",
"LJJZ":"0.9346",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.11",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-12",
"DWJZ":"0.9356",
"LJJZ":"0.9356",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.38",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-11",
"DWJZ":"0.9392",
"LJJZ":"0.9392",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.53",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-10",
"DWJZ":"0.9442",
"LJJZ":"0.9442",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.83",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-09",
"DWJZ":"0.9521",
"LJJZ":"0.9521",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"1.04",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-08",
"DWJZ":"0.9423",
"LJJZ":"0.9423",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.43",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-05",
"DWJZ":"0.9464",
"LJJZ":"0.9464",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"0.94",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-04",
"DWJZ":"0.9376",
"LJJZ":"0.9376",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.37",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-03",
"DWJZ":"0.9411",
"LJJZ":"0.9411",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-1.01",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-02",
"DWJZ":"0.9507",
"LJJZ":"0.9507",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.27",
"SGZT":"开放申购",
"SHZT":"开放赎回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
}
],
"FundType":"001",
"SYType":null,
"isNewType":false,
"Feature":"030,031,050,051,054"
},
"ErrCode":0,
"ErrMsg":null,
"TotalCount":21,
"Expansion":null,
"PageSize":20,
"PageIndex":1
}
json格式处理
解析以上数据
import json


def _strip_jsonp(text):
    """Return the JSON payload of ``text``, dropping a ``callback(...)`` wrapper if any.

    Text that already starts with ``{`` or ``[`` is returned unchanged
    (apart from surrounding whitespace).
    """
    text = text.strip()
    if text.startswith(("{", "[")):
        return text
    start = text.find("(")
    end = text.rfind(")")
    if start != -1 and end > start:
        return text[start + 1:end]
    return text


# ``data`` is None when the fetch failed, so json.loads(None) would raise
# TypeError; fall back to an empty result instead.
# NOTE(review): the request URL passes ``callback=jQuery...``, so the body may
# arrive as JSONP rather than bare JSON -- confirm against a live response.
jsonText = json.loads(_strip_jsonp(data)) if data else {}
# "Data" may be missing or null (e.g. in an error response); treat it as empty.
infos = (jsonText.get('Data') or {}).get('LSJZList') or []
for info in infos:
    print(info)
    FSRQ = info['FSRQ']  # date
    DWJZ = info['DWJZ']  # unit net value
    LJJZ = info['LJJZ']  # cumulative net value
    JZZZL = info['JZZZL']  # growth rate
    print(FSRQ)
    # The numeric fields are still strings at this point; show their type.
    print(type(DWJZ))
    print(type(LJJZ))
    print(type(JZZZL))
将数据转成list或pandas的DataFrame
import pandas as pd


def _to_float(value):
    """Convert a numeric string such as ``"1.0396"`` to float.

    ``None`` and the empty string become NaN instead of raising, because
    ``float("")`` raises ValueError.
    NOTE(review): the sample response uses "" for other missing fields;
    presumably the numeric fields can be blank too -- confirm.
    """
    if value is None or value == "":
        return float("nan")
    return float(value)


titleList = ['FSRQ', 'DWJZ', 'LJJZ', 'JZZZL']
infosList = []  # one [FSRQ, DWJZ, LJJZ, JZZZL] row per record
indexList = []  # FSRQ of each record, used as the DataFrame index
for info in infos:
    FSRQ = info['FSRQ']  # date
    DWJZ = _to_float(info['DWJZ'])  # unit net value
    LJJZ = _to_float(info['LJJZ'])  # cumulative net value
    JZZZL = _to_float(info['JZZZL'])  # growth rate
    print(FSRQ)
    print(DWJZ)
    print(LJJZ)
    print(JZZZL)
    indexList.append(FSRQ)
    infosList.append([FSRQ, DWJZ, LJJZ, JZZZL])
# FSRQ is kept both as a column and as the index; rows are sorted by the index.
df = pd.DataFrame(infosList, index=indexList, columns=titleList).sort_index()
print(df)
完整代码查看gitee