一只快手小爬蟲(已失效)


要爬取的鏈接:

(點我試試)

 

要爬取的頁面:

 

要爬取的內容:

 

先研究下,

 

 

 

如圖,每一個用戶信息在一個li標簽里面,靠css選擇器就能很容易獲取到,但是看源碼卻發現那些關鍵的數字有字體反爬。如下圖:

 

然后看到這篇(點我試試)博客后,完成了這只小爬蟲~

代碼:

import requests
from pyquery import PyQuery as pq
from urllib.parse import urljoin
import json

def parsingChar(type, data):
    """Decode Kuaishou's font-obfuscated digits back to plain characters.

    The site renders numbers with one of five custom woff fonts; each font
    maps private-use unicode codepoints to real digits.  Every character of
    *data* is translated through the table for the given font; characters
    not present in the table pass through unchanged.

    :param type: font identifier taken from the page source, one of
                 'h57yip2q', '3jqwe90k', 'yuh4hy4p', 'qw2f1m1o', 'yx77i032'.
                 (Name shadows the ``type`` builtin; kept for caller
                 compatibility.)
    :param data: string whose digits may be obfuscated codepoints
    :return: *data* with every mapped codepoint replaced by its real digit
    """
    fontscn_h57yip2q = {
        '\\uabcf':'4',
        '\\uaced':'3',
        '\\uaedd':'8',
        '\\uaede':'0',
        '\\uafcd':'6',
        '\\ubdaa':'5',
        '\\ubdcd':'1',
        '\\ubfad':'9',
        '\\uccda':'2',
        '\\ucfbe':'7',
    }
    fontscn_3jqwe90k = {
        '\\uaacb':'4',
        '\\uabcd':'3',
        '\\uacdd':'0',
        '\\uaefb':'8',
        '\\uafbc':'6',
        '\\ubbca':'1',
        '\\ubdca':'5',
        '\\ubfee':'9',
        '\\uccac':'2',
        '\\ucfba':'7',
    }
    fontscn_yuh4hy4p = {
        '\\uaabd':'5',
        '\\uaadd':'0',
        '\\uacde':'9',
        '\\uadaa':'2',
        '\\uadac':'1',
        '\\uadcb':'7',
        '\\uaeed':'8',
        '\\ubebb':'3',
        '\\ucbdc':'6',
        '\\ucccf':'4',
    }
    fontscn_qw2f1m1o = {
        '\\uabcb':'4',
        '\\uaccd':'3',
        '\\uacda':'0',
        '\\uaeff':'8',
        '\\uafbb':'6',
        '\\ubdca':'1',
        '\\ubdcc':'5',
        '\\ubfef':'9',
        '\\uccaa':'2',
        '\\ucfba':'7',
    }
    fontscn_yx77i032 = {
        '\\uabce':'4',
        '\\uaccd':'6',
        '\\uaeda':'8',
        '\\uaefe':'0',
        '\\uafed':'3',
        '\\ubaaa':'5',
        '\\ubddd':'1',
        '\\ubfad':'2',
        '\\ubfae':'9',
        '\\uc44f':'7',
    }
    woff_dict = {'h57yip2q': fontscn_h57yip2q, '3jqwe90k': fontscn_3jqwe90k, 'yuh4hy4p': fontscn_yuh4hy4p,
                 'qw2f1m1o': fontscn_qw2f1m1o, 'yx77i032': fontscn_yx77i032}
    mapping = woff_dict[type]
    chars = []
    for ch in data:
        # '\uabcf' -> the escape text r'\uabcf', which is what the tables
        # above are keyed on.
        escaped = ch.encode('unicode_escape').decode('ascii')
        # Fall back to the character itself so unmapped input (plain digits,
        # the 'w' unit marker, stray whitespace, accented letters, ...)
        # passes through instead of raising KeyError as the old length-based
        # check did for any unexpected escaped character.
        chars.append(mapping.get(escaped, ch))
    return ''.join(chars)


def handling_detail(word, type):
    """Normalize the fan/follow/works counter string of one user card.

    :param word: raw counters text, three fields separated by two spaces,
                 e.g. '<glyphs>w粉絲  <glyphs>關注  <glyphs>作品'
    :param type: current page's font identifier (passed to parsingChar;
                 shadows the ``type`` builtin, kept for caller compatibility)
    :return: the counters with obfuscated digits decoded, re-joined as
             '<fans>[w]粉絲<follows>關注<works>作品', or None when the
             input cannot be parsed
    """
    try:
        parts = word.split('  ')
        # Fan count: the unit may be 'w粉絲' (tens of thousands) or '粉絲'.
        if 'w粉絲' in parts[0]:
            fans = parsingChar(type, parts[0].replace('w粉絲', '').strip()) + 'w粉絲'
        else:
            fans = parsingChar(type, parts[0].replace('粉絲', '').strip()) + '粉絲'
        # Following count.
        follows = parsingChar(type, parts[1].strip().replace('關注', ''))
        # Works (video) count.
        works = parsingChar(type, parts[2].strip().replace('作品', ''))
        return fans + follows + '關注' + works + '作品'
    except (AttributeError, IndexError, KeyError):
        # Malformed detail string or unknown font glyph — log and keep the
        # crawl going (caller tolerates a None detail).
        print(word, 'handling_detail error')

def judge(html):
    """Identify which of the five obfuscation fonts the page uses.

    :param html: raw HTML source of the search-result page
    :return: the first known font identifier found in *html*, or None
    """
    known_fonts = ('h57yip2q', '3jqwe90k', 'yuh4hy4p', 'qw2f1m1o', 'yx77i032')
    return next((font for font in known_fonts if font in html), None)


def getList(key, page):
    '''
    Fetch and parse one page of Kuaishou author-search results.

    :param key: search keyword
    :param page: 1-based result page number
    :return: dict keyed by user name; each value holds the user's homepage
             url, decoded fans/follows/works detail, signature and avatar url
    '''
    results = {}
    url = 'https://live.kuaishou.com/search/author?keyword='+key+'&page='+str(page)
    original_url = 'https://live.kuaishou.com'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    # timeout keeps the crawler from hanging forever on a stalled connection
    r = requests.get(url=url, headers=headers, timeout=10)
    type = judge(r.text)  # which obfuscation font this page was served with
    html = pq(r.text)
    lis = html('.search-detail ul .author-card').items()

    for li in lis:
        # pyquery CSS selectors pull the user fields out of each card
        name = li('.profile-card-user-info-intro').attr('title').strip()
        # user name
        detail = li('.profile-card-user-info-counts').text().strip()
        detail = handling_detail(detail, type)  # decode the font-obfuscated counters
        # fans / works counters (the anti-scraping part)
        sign = li('.profile-card-user-info-description.notlive').text().strip()
        # signature
        user_url = li('.profile-card-user-info a').attr('href').strip()
        user_url = urljoin(original_url, user_url)
        # streamer homepage url (made absolute)
        user_img = li('img').attr('src').strip()
        # avatar url
        results[name] = {'user_url': user_url, 'detail': detail, 'sign': sign, 'user_img': user_img}
    return results


if __name__ == '__main__':
    key = '技能'
    results = {}
    # Merge all pages into one dict and dump once: appending a separate
    # json.dump per page (as an 'a'-mode loop would) concatenates multiple
    # JSON documents into one file, which is not valid JSON.
    for i in range(1, 11):
        results.update(getList(key, i))
    with open('kuaishou.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, sort_keys=True, indent=4)
        # save the collected user records as a single JSON document

 結果:

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM