Python抓取第一網貸中國網貸理財每日收益率指數

本文轉載自查看原文 2017-02-14 22:13 863 爬蟲/ python/ 數據爬蟲/ 數據采集

鏈接：http://www.p2p001.com/licai/index/id/147.html

所需獲取數據的鏈接類似於：http://www.p2p001.com/licai/shownews/id/454.html

庫：

requests （HTTP for Humans，用來發送 HTTP 請求）

re （正則）

pandas （用來處理數據）

BeautifulSoup （用來解析網頁文本）

此次抓取的邏輯思路在代碼之後

上代碼：

# -*- coding: utf-8 -*-
import requests
import re
import pandas
from bs4 import BeautifulSoup
# Browser-like User-Agent string sent with every request.
# The value must NOT repeat the header name: requests builds the
# "User-Agent: <value>" line itself from the dictionary key below.
user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)'
headers = {'User-Agent': user_agent}
# Collect the links to the daily reports and return them as a list.
def get_newsurl(pages=22, timeout=10):
    """Return the hrefs of the daily-report pages found on the index pages.

    Index pages 1..``pages`` are fetched in order and every tag whose
    ``href`` matches ``licai/shownews`` contributes its ``href`` value.

    Args:
        pages: Number of index pages to walk (the original script used 22).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of href strings in the order they were encountered.
        Pages that could not be downloaded are reported and skipped.
    """
    url1 = 'http://www.p2p001.com/licai/index/id/147/p/'
    url2 = '.html'
    link_pattern = re.compile('licai/shownews')
    newsurl = []
    for num in range(1, pages + 1):
        url = url1 + str(num) + url2
        try:
            r1 = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Report and move on to the next page. The previous while-loop
            # only advanced its counter on success, so one failing page was
            # retried forever (and the bare except swallowed Ctrl-C too).
            print('wrong %s' % url)
            continue
        s1 = BeautifulSoup(r1.text, 'lxml')
        newsurl.extend(x['href'] for x in s1.find_all(href=link_pattern))
    return newsurl
# Scrape every daily report and return the collected data as a dictionary.
def get_info(timeout=10):
    """Download each daily report and collect its date and rate columns.

    For every link returned by ``get_newsurl()`` the page is fetched, the
    text node containing '統計日期' supplies the date, and the ``<td>`` cells
    at positions 10-15 supply the six rate values.

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A dict mapping column name to a list of scraped values. All lists
        have the same length, so the dict can be passed to a DataFrame
        constructor. Pages that fail to download or lack the expected
        content are reported and skipped.
    """
    url = 'http://www.p2p001.com'
    columns = ('Date', '綜合', '1個月', '1-3個月', '3-6個月',
               '6-12個月', '12個月及以上')
    p = {name: [] for name in columns}
    date_pattern = re.compile('統計日期')
    for y in get_newsurl():
        main_url = url + y
        try:
            r2 = requests.get(main_url, headers=headers, timeout=timeout)
        except requests.RequestException:
            print('wrong %s' % main_url)
            continue
        s2 = BeautifulSoup(r2.text, 'lxml')
        date_text = s2.find(text=date_pattern)
        rate = s2.find_all('td')
        # Validate before appending anything: a partially appended row would
        # leave the columns with different lengths.
        if date_text is None or len(rate) < 16:
            print('unexpected layout %s' % main_url)
            continue
        # [5:] drops the first five characters of the matched text --
        # presumably the '統計日期' label plus a separator; verify against
        # the live page.
        row = [date_text[5:]]
        # Cells 10..15 are taken in the same order as the rate columns.
        # NOTE(review): these indices depend on the page's table layout.
        row.extend(cell.string for cell in rate[10:16])
        for name, value in zip(columns, row):
            p[name].append(value)
    return p
# Store the scraped data in a pandas DataFrame.
# The module is imported as plain `pandas` (no `pd` alias), so the
# constructor has to be referenced through that name.
p = pandas.DataFrame(get_info())