Python爬蟲
Python爬蟲在我們生活中應用很廣：大數據分析、量化投資、研究各地的房價、調查B站所有UP主等等，都需要收集大量的數據。人生處處皆數學，人生處處皆Python，所以別再問「學習數學有什麼用？」，也別再問「學習Python有什麼用？」。下面直接上例子：
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import random
import time

from requests import get
from requests.exceptions import RequestException
# Pool of User-Agent strings; one is picked at random for each request.
user_agent_list = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
]

# Pool of Referer URLs; one is picked at random for each request.
referer_list = [
    'http://fund.eastmoney.com/110022.html',
    'http://fund.eastmoney.com/110023.html',
    'http://fund.eastmoney.com/',
    'http://fund.eastmoney.com/110025.html',
]
def get_html(baseUrl, timeout=10):
    """Fetch ``baseUrl`` and return the response body as text.

    Each call sends a User-Agent and a Referer header chosen at random
    from ``user_agent_list`` and ``referer_list``.

    Args:
        baseUrl: URL to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the script forever.

    Returns:
        The body text when the server answers with status 200; ``None``
        when the request fails or any other status comes back (a notice
        is printed in both cases).
    """
    headers = {
        'User-Agent': random.choice(user_agent_list),
        'Referer': random.choice(referer_list),
    }
    # Keep the try body to the one call that can raise a network error.
    try:
        resp = get(baseUrl, headers=headers, timeout=timeout)
    except RequestException:
        print("沒有爬取到相應的內容")
        return None
    if resp.status_code == 200:
        return resp.text
    print("沒有爬取到相應的內容")
    return None
if __name__ == "__main__":
    # Millisecond timestamp appended as the trailing "_" query parameter.
    timestamp_ms = int(round(time.time() * 1000))
    # Historical net-value endpoint for fund 004070, June 2020, first page
    # of 20 records; the literal is split only for readability.
    baseUrl = (
        "http://api.fund.eastmoney.com/f10/lsjz"
        "?callback=jQuery183006797018539211241_1593855325551"
        "&fundCode=004070&pageIndex=1&pageSize=20"
        "&startDate=2020-06-01&endDate=2020-07-01&_="
    ) + str(timestamp_ms)
    print(baseUrl)
    data = get_html(baseUrl)
    print(data)
數據返回的可能是html/json/list/tuple，這裡介紹針對json格式的處理，其他格式的解析就不贅述了。以上爬蟲返回的是JSON格式（是我最喜歡的格式），如下：
{
"Data":{
"LSJZList":[
{
"FSRQ":"2020-07-01",
"DWJZ":"1.0396",
"LJJZ":"1.0396",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"1.88",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-30",
"DWJZ":"1.0204",
"LJJZ":"1.0204",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"3.26",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-29",
"DWJZ":"0.9882",
"LJJZ":"0.9882",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-2.96",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-24",
"DWJZ":"1.0183",
"LJJZ":"1.0183",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.12",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-23",
"DWJZ":"1.0195",
"LJJZ":"1.0195",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"0.97",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-22",
"DWJZ":"1.0097",
"LJJZ":"1.0097",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"2.68",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-19",
"DWJZ":"0.9833",
"LJJZ":"0.9833",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"3.35",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-18",
"DWJZ":"0.9514",
"LJJZ":"0.9514",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"1.03",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-17",
"DWJZ":"0.9417",
"LJJZ":"0.9417",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.32",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-16",
"DWJZ":"0.9447",
"LJJZ":"0.9447",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"1.08",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-15",
"DWJZ":"0.9346",
"LJJZ":"0.9346",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.11",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-12",
"DWJZ":"0.9356",
"LJJZ":"0.9356",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.38",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-11",
"DWJZ":"0.9392",
"LJJZ":"0.9392",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.53",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-10",
"DWJZ":"0.9442",
"LJJZ":"0.9442",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.83",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-09",
"DWJZ":"0.9521",
"LJJZ":"0.9521",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"1.04",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-08",
"DWJZ":"0.9423",
"LJJZ":"0.9423",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.43",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-05",
"DWJZ":"0.9464",
"LJJZ":"0.9464",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"0.94",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-04",
"DWJZ":"0.9376",
"LJJZ":"0.9376",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.37",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-03",
"DWJZ":"0.9411",
"LJJZ":"0.9411",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-1.01",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
},
{
"FSRQ":"2020-06-02",
"DWJZ":"0.9507",
"LJJZ":"0.9507",
"SDATE":null,
"ACTUALSYI":"",
"NAVTYPE":"1",
"JZZZL":"-0.27",
"SGZT":"開放申購",
"SHZT":"開放贖回",
"FHFCZ":"",
"FHFCBZ":"",
"DTYPE":null,
"FHSP":""
}
],
"FundType":"001",
"SYType":null,
"isNewType":false,
"Feature":"030,031,050,051,054"
},
"ErrCode":0,
"ErrMsg":null,
"TotalCount":21,
"Expansion":null,
"PageSize":20,
"PageIndex":1
}
json格式處理
解析以上數據
import json

# get_html() returns None when the request fails, so normalise to a string
# before parsing instead of letting json.loads(None) raise TypeError.
payload = (data or "").strip()

# NOTE(review): the request URL carries a ``callback`` parameter, so the
# server presumably may answer with JSONP, i.e. ``callback({...})`` --
# confirm against a live response. Unwrap it when the body does not already
# start like a JSON document.
if payload and payload[0] not in "{[":
    open_at, close_at = payload.find("("), payload.rfind(")")
    if 0 <= open_at < close_at:
        payload = payload[open_at + 1:close_at]

try:
    jsonText = json.loads(payload)
except ValueError:
    # json.JSONDecodeError is a ValueError subclass; this also covers an
    # empty body.
    print("返回內容不是有效的JSON")
    jsonText = {}

# Fall back to an empty record list when the expected keys are missing, so
# the code that consumes ``infos`` keeps working.
infos = []
if isinstance(jsonText, dict):
    data_section = jsonText.get('Data')
    if isinstance(data_section, dict):
        infos = data_section.get('LSJZList') or []

for info in infos:
    print(info)
    FSRQ = info['FSRQ']  # date
    DWJZ = info['DWJZ']  # unit net value
    LJJZ = info['LJJZ']  # cumulative net value
    JZZZL = info['JZZZL']  # growth rate
    print(FSRQ)
    # The API delivers the numeric fields as strings; show that explicitly.
    print(type(DWJZ))
    print(type(LJJZ))
    print(type(JZZZL))
將數據轉成List或pandas
import pandas as pd


def _to_float(value):
    """Convert an API field to float, mapping missing values to NaN.

    The sample response uses both ``""`` and ``null`` for absent fields;
    ``float()`` raises on either, which would abort the whole conversion.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        return float('nan')


infosList = []
indexList = []
titleList = ['FSRQ', 'DWJZ', 'LJJZ', 'JZZZL']
for info in infos:
    FSRQ = info['FSRQ']  # date
    DWJZ = info['DWJZ']  # unit net value
    LJJZ = info['LJJZ']  # cumulative net value
    JZZZL = info['JZZZL']  # growth rate

    # Convert each numeric field once and reuse it for printing and storage.
    unit_value = _to_float(DWJZ)
    total_value = _to_float(LJJZ)
    growth_rate = _to_float(JZZZL)

    print(FSRQ)
    print(unit_value)
    print(total_value)
    print(growth_rate)

    # The date is both the row label and the first column.
    indexList.append(FSRQ)
    infosList.append([FSRQ, unit_value, total_value, growth_rate])

# Sort by the date index so rows run oldest to newest.
df = pd.DataFrame(infosList, index=indexList, columns=titleList).sort_index()
print(df)
完整代碼查看gitee