要爬取的鏈接:
(點我試試)
要爬取的頁面:
要爬取的內容:
先研究下,
如圖,每一個用戶信息在一個li標簽里面,靠css選擇器就能很容易獲取到,但是看源碼卻發現那些關鍵的數字有字體反爬。如下圖:
然后看到這篇(點我試試)博客后,完成了這只小爬蟲~
代碼:
import requests
from pyquery import PyQuery as pq
from urllib.parse import urljoin
import json


def parsingChar(type, data):
    """Decode font-obfuscated digits back to their real values.

    The site serves digits as private-use unicode glyphs whose mapping
    depends on which of five custom woff fonts the page uses.

    :param type: font family id of the current page, e.g. 'h57yip2q'
    :param data: string possibly containing obfuscated glyph characters
    :return: the same string with every obfuscated glyph replaced by its digit
    :raises KeyError: if a glyph is missing from the table for ``type``
                      (callers catch this, see handling_detail)
    """
    fontscn_h57yip2q = {
        '\\uabcf': '4', '\\uaced': '3', '\\uaedd': '8', '\\uaede': '0',
        '\\uafcd': '6', '\\ubdaa': '5', '\\ubdcd': '1', '\\ubfad': '9',
        '\\uccda': '2', '\\ucfbe': '7',
    }
    fontscn_3jqwe90k = {
        '\\uaacb': '4', '\\uabcd': '3', '\\uacdd': '0', '\\uaefb': '8',
        '\\uafbc': '6', '\\ubbca': '1', '\\ubdca': '5', '\\ubfee': '9',
        '\\uccac': '2', '\\ucfba': '7',
    }
    fontscn_yuh4hy4p = {
        '\\uaabd': '5', '\\uaadd': '0', '\\uacde': '9', '\\uadaa': '2',
        '\\uadac': '1', '\\uadcb': '7', '\\uaeed': '8', '\\ubebb': '3',
        '\\ucbdc': '6', '\\ucccf': '4',
    }
    fontscn_qw2f1m1o = {
        '\\uabcb': '4', '\\uaccd': '3', '\\uacda': '0', '\\uaeff': '8',
        '\\uafbb': '6', '\\ubdca': '1', '\\ubdcc': '5', '\\ubfef': '9',
        '\\uccaa': '2', '\\ucfba': '7',
    }
    fontscn_yx77i032 = {
        '\\uabce': '4', '\\uaccd': '6', '\\uaeda': '8', '\\uaefe': '0',
        '\\uafed': '3', '\\ubaaa': '5', '\\ubddd': '1', '\\ubfad': '2',
        '\\ubfae': '9', '\\uc44f': '7',
    }
    woff_dict = {'h57yip2q': fontscn_h57yip2q,
                 '3jqwe90k': fontscn_3jqwe90k,
                 'yuh4hy4p': fontscn_yuh4hy4p,
                 'qw2f1m1o': fontscn_qw2f1m1o,
                 'yx77i032': fontscn_yx77i032}
    decoded = []
    # unicode_escape-encode each character: an obfuscated glyph becomes a
    # byte string such as b'\uabcf' that can be matched against the tables.
    new_data = [ch.encode('unicode_escape') for ch in data]
    for i in new_data:
        if len(str(i)) > 5:
            # str(i)[3:-1] turns e.g. b'\uabcf' into the table key '\uabcf'
            decoded.append(woff_dict[type][str(i)[3:-1]])
        else:
            # plain ASCII character ('w', '.', a real digit, ...) — keep as is
            decoded.append(str(i)[2:-1])
    return ''.join(decoded)


def handling_detail(word, type):
    """Normalize a user's stats line by decoding its obfuscated digits.

    :param word: raw stats string, e.g. '<glyphs>粉絲 <glyphs>關注 <glyphs>作品'
    :param type: font family id of the current page
    :return: the stats string with real digits, or None when parsing fails
             (best-effort: the error is logged, not raised)
    """
    try:
        words = word.split(' ')
        # Fan count may be abbreviated with a 'w' (万 = 10k) suffix.
        if 'w粉絲' in words[0]:
            fans = words[0].replace('w粉絲', '').strip()
            fans = parsingChar(type, fans) + 'w粉絲'
        else:
            fans = words[0].replace('粉絲', '').strip()
            fans = parsingChar(type, fans) + '粉絲'
        follows = words[1].strip().replace('關注', '')
        follows = parsingChar(type, follows)
        works = words[2].strip().replace('作品', '')
        works = parsingChar(type, works)
        # NOTE: was a local named `all`, shadowing the builtin — renamed.
        detail = fans + follows + '關注' + works + '作品'
        return detail
    except Exception:
        # Was a bare `except:` which also swallowed KeyboardInterrupt/SystemExit.
        # Keep the original best-effort contract: log and return None.
        print(word, 'handling_detail error')
        return None


def judge(html):
    """Detect which of the five known obfuscation fonts a page uses.

    :param html: raw HTML source of the search page
    :return: the font family id found in the source, or None if none matches
             (a None here makes parsingChar raise KeyError, which
             handling_detail absorbs)
    """
    for i in ['h57yip2q', '3jqwe90k', 'yuh4hy4p', 'qw2f1m1o', 'yx77i032']:
        if i in html:
            return i


def getList(key, page):
    """Scrape one page of Kuaishou author search results.

    :param key: search keyword
    :param page: 1-based result page number
    :return: dict keyed by user name, each value holding the user's page url,
             decoded stats line, signature and avatar url
    """
    results = {}  # was named `all`, shadowing the builtin
    url = 'https://live.kuaishou.com/search/author?keyword=' + key + '&page=' + str(page)
    original_url = 'https://live.kuaishou.com'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    r = requests.get(url=url, headers=headers)
    type = judge(r.text)  # which obfuscation font this page uses
    html = pq(r.text)
    lis = html('.search-detail ul .author-card').items()
    for li in lis:
        # Plain pyquery CSS selection of the user card fields.
        name = li('.profile-card-user-info-intro').attr('title').strip()
        detail = li('.profile-card-user-info-counts').text().strip()
        detail = handling_detail(detail, type)  # decode the obfuscated counts
        sign = li('.profile-card-user-info-description.notlive').text().strip()
        user_url = li('.profile-card-user-info a').attr('href').strip()
        user_url = urljoin(original_url, user_url)  # absolute profile url
        user_img = li('img').attr('src').strip()  # avatar url
        results[name] = {'user_url': user_url, 'detail': detail,
                         'sign': sign, 'user_img': user_img}
    return results


if __name__ == '__main__':
    key = '技能'
    merged = {}
    # Bug fix: the original appended one json.dump per page (mode 'a'),
    # producing concatenated JSON documents that no standard parser can
    # read back. Merge all pages and write a single valid JSON document.
    for i in range(1, 11):
        merged.update(getList(key, i))
    with open('kuaishou.json', 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, sort_keys=True, indent=4)
結果: