For work reasons I need to scrape data from a few websites for statistics, including the AMAC (中基協) website and part of the data on Tianyancha (天眼查).
一、The AMAC (中基協) website
Scraping approach:
1. Inspect the target endpoint: http://gs.amac.org.cn/amac-infodisc/api/pof/manager?rand=0.9775162173180119&page=%s&size=50
Note the random number string (an anti-scraping measure that changes on every refresh), plus the page number and page size; these parts can be used to assemble the crawl URL.
A loop walks through all the listing pages, using the requests library to fetch each page and random.random() to generate the random number.
The endpoint returns JSON, so it can be parsed directly with the Response object's .json() method (a sketch of the approximate response shape follows this list).
2. The save function stores the detail fields of each record on the page; adapt it to whichever fields you need.
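For reference, the JSON body returned by the endpoint looks roughly like the sketch below. This is reconstructed only from the fields the code reads, not the full schema, and the numbers are illustrative, not real data:

# Approximate shape of the JSON returned by the manager endpoint,
# inferred from the fields used in the script below; the real
# response contains more keys than shown here.
sample_response = {
    "content": [
        {
            "id": "...",
            "managerName": "...",
            "artificialPersonName": "...",
            "regAdrAgg": "...",
            "registerNo": "...",
        },
        # ... up to 50 records per page
    ],
    "totalElements": 21337,  # illustrative count only
    "totalPages": 427,
    "size": 50,
}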
import requests
import random

def save(school_datas):
    # Print the fields of interest for each manager record;
    # swap the print for file or database output as needed.
    for data1 in school_datas:
        id = data1['id']
        managerName = data1['managerName']
        artificialPersonName = data1['artificialPersonName']
        regAdrAgg = data1['regAdrAgg']
        registerNo = data1['registerNo']
        print(id, managerName, artificialPersonName, regAdrAgg, registerNo)

# Headers mimicking a normal browser request to the listing page
header = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Host': 'gs.amac.org.cn',
    'Origin': 'http://gs.amac.org.cn',
    'Referer': 'http://gs.amac.org.cn/amac-infodisc/res/pof/manager/managerList.html',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

for i in range(0, 427):
    print("page %s =====================" % i)
    r = random.random()  # fresh random token for each request
    url = "http://gs.amac.org.cn/amac-infodisc/api/pof/manager?rand=%s&page=%s&size=50" % (r, i)
    # The response is JSON, so .json() parses it directly
    data = requests.post(url, json={'rand': str(r), 'page': str(i), 'size': '50'}, headers=header).json()

    print("records on this page ->", len(data['content']))
    print("total records ->", data['totalElements'])
    print("page size ->", data['size'])
    print("total pages ->", data['totalPages'])

    school_datas = data['content']
    save(school_datas)
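As a variant on the above (my own sketch, not part of the original script), the page count can be read from the first response instead of hard-coding 427, and the records can be appended to a CSV file rather than printed. The amac_managers.csv filename and the fetch_page helper are assumptions for illustration:

import csv
import random
import requests

URL = "http://gs.amac.org.cn/amac-infodisc/api/pof/manager?rand=%s&page=%s&size=50"

def fetch_page(page):
    # One POST per page, with a fresh random token each time
    r = random.random()
    return requests.post(URL % (r, page),
                         json={'rand': str(r), 'page': str(page), 'size': '50'}).json()

first = fetch_page(0)
total_pages = first["totalPages"]  # read the page count instead of hard-coding 427

with open("amac_managers.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "managerName", "artificialPersonName", "regAdrAgg", "registerNo"])
    for page in range(total_pages):
        data = first if page == 0 else fetch_page(page)
        for rec in data["content"]:
            writer.writerow([rec["id"], rec["managerName"], rec["artificialPersonName"],
                             rec["regAdrAgg"], rec["registerNo"]])

Writing with utf-8-sig keeps the Chinese fields readable when the CSV is opened in Excel.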
二、Tianyancha (天眼查)
Scraping approach:
1. Read the association's list of all registered fund and investment companies in Guangzhou (roughly 7,000 of them) and submit each name to Tianyancha's search page one by one: (page=requests.get(url1))
2. This returns a search-results page (tree=html.fromstring(page.text)). The results are ranked by how closely the name matches, so the first hit can be taken as the company from the list.
3. Parsing the first hit (href1=tree.xpath("//*[@id='web-content']/div/div/div/div[1]/div[3]/div/div[2]/div[1]/a/@href")) yields the link to the static detail page it points at; scraping that page gives the full profile of each company (fields are read with XPath).
4. Save the extracted data to an Excel spreadsheet. (If scraping the desktop search page gets blocked, consider the mobile site!)
Note: Tianyancha's anti-scraping measures are strict, and frequent requests will trip them, so time.sleep() is used to slow the crawl down. If time cost matters, a proxy pool is a better way to scrape this site (see the sketch after the code below).
import xlrd
import requests
from lxml import html
import xlwt
import time

# Open the source workbook; use a raw string (r'...') or doubled
# backslashes, since \ is an escape character in string literals.
data = xlrd.open_workbook(r'C:\Users\lin\Desktop\wx\j.xlsx')
table = data.sheet_by_index(0)

# Create the target workbook
book = xlwt.Workbook()
sheet1 = book.add_sheet('tyc_data')
print(table.nrows)

for i in range(0, 1264):  # adjust the start index to resume a crawl partway through
    t1 = table.cell(i, 1).value  # company name: row i, column 2
    print("cell value:", t1)

    url1 = 'https://www.tianyancha.com/search?key=%s' % t1
    print("search URL:", url1)
    time.sleep(5)  # throttle so the anti-scraping rules are not triggered
    page = requests.get(url1)
    print("search response:", page)

    tree = html.fromstring(page.text)
    # First search hit -> link to the company's static detail page
    href1 = tree.xpath("//*[@id='web-content']/div/div/div/div[1]/div[3]/div/div[2]/div[1]/a/@href")
    print(type(href1), len(href1))
    print(href1)

    if len(href1) == 0:
        # No search hit: write placeholders so the row is not left empty
        all_data = (1, 1, 1, 1, 1, 1, 1, 1)
    else:
        url2 = href1[0]
        print("detail page link:", url2)
        page2 = requests.get(url2)
        print(page2)

        print("extracting target data")
        tree2 = html.fromstring(page2.text)
        # The eight fields, roughly in this order: company name, registration
        # authority, registration date, registered capital, approval date,
        # industry, own risk, surrounding risk.
        all_data = []
        all_data.append(tree2.xpath("//*[@id='company_web_top']/div[2]/div[2]/div[1]/span[1]/text()")[0])
        all_data.append(tree2.xpath("//*[@id='_container_baseInfo']/div/div[2]/table/tbody/tr[5]/td[2]/text()")[0])
        all_data.append(tree2.xpath("//*[@id='_container_baseInfo']/div/div[1]/table/tbody/tr/td[2]/div[2]/div[2]/div/text()")[0])
        all_data.append(tree2.xpath("//*[@id='_container_baseInfo']/div/div[1]/table/tbody/tr/td[2]/div[1]/div[2]/div/text()")[0])
        all_data.append(tree2.xpath("//*[@id='_container_baseInfo']/div/div[2]/table/tbody/tr[4]/td[4]/text()")[0])
        all_data.append(tree2.xpath("//*[@id='_container_baseInfo']/div/div[2]/table/tbody/tr[3]/td[4]/text()")[0])
        # These two XPaths return lists; take the first match (or '' if none)
        risk1 = tree2.xpath("//*[@id='web-content']/div/div/div[2]/div/div[2]/div/div[2]/div[3]/div/div[2]/div[1]/div[1]/span[1]/text()")
        risk2 = tree2.xpath("//*[@id='web-content']/div/div/div[2]/div/div[2]/div/div[2]/div[3]/div/div[2]/div[2]/div/span[1]/text()")
        all_data.append(risk1[0] if risk1 else '')
        all_data.append(risk2[0] if risk2 else '')
        print(all_data)

    # Write the eight fields into row i of the output sheet
    for j2, item in zip(range(8), all_data):
        print(i, j2, item)
        sheet1.write(i, j2, item)

    # Save after every row so progress survives a crash or a ban
    book.save('C:\\Users\\lin\\Desktop\\wx\\j2.xlsx')
    print('the excel save success: %s' % i)
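For the proxy-pool route mentioned in the notes above, here is a minimal sketch of rotating proxies with requests. The PROXIES list is a hypothetical placeholder you would fill from your own pool, and get_with_proxy is an illustrative helper, not something from the original script:

import random
import time
import requests

# Hypothetical proxy list; replace with live proxies from your own pool.
PROXIES = [
    "http://1.2.3.4:8080",
    "http://5.6.7.8:3128",
]

def get_with_proxy(url, retries=3):
    """Fetch url through a randomly chosen proxy, retrying on failure."""
    for _ in range(retries):
        proxy = random.choice(PROXIES)
        try:
            return requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=10)
        except requests.RequestException:
            time.sleep(1)  # brief pause before trying another proxy
    raise RuntimeError("all proxies failed for %s" % url)

# e.g. page = get_with_proxy('https://www.tianyancha.com/search?key=%s' % t1)

With a healthy pool, requests come from many IPs, so the time.sleep(5) throttle above can likely be shortened.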
Since these sites keep changing, the scraping strategy will likely need constant adjustment; each problem calls for its own analysis.