實戰-快手H5字體反爬


實戰-快手H5字體反爬

前言

快手H5端的粉絲數是字體反爬,抓到的html文本是亂碼 <SPAN STYLE='FONT-FAMILY: kwaiFont;'>&#xebe9;&#xea80;&#xf6d0;&#xe7c7;&#xed42;&#xeb5e;</SPAN> 可以看到對應的字體格式為 kwaiFont

經過一頓分析操作,發現每次返回的ttf文件內容每次都不太一樣,無法自己做一份映射模板, 那么就不做模板了。可以通過OCR 或者 KNN 進行內容識別。本人采用 OCR 方式進行識別。這里推薦一個很吊的 OCRddddocr.

流程分析

  1. 找到對應ttf文件
  2. 分析ttf文件,將每個字體轉換成圖片
  3. 圖片識別成文本
  4. 亂碼映射

直接上代碼

import re
import ddddocr
import requests
from lxml import etree
from io import BytesIO
from fontTools.ttLib import TTFont
from fontTools.pens.basePen import BasePen
from reportlab.graphics.shapes import Path
from reportlab.lib import colors
from reportlab.graphics import renderPM
from reportlab.graphics.shapes import Group, Drawing


class ReportLabPen(BasePen):
    """
    繪圖
    """

    def __init__(self, glyph_set, path=None):
        BasePen.__init__(self, glyph_set)
        if path is None:
            path = Path()
        self.path = path

    def _moveTo(self, p):
        (x, y) = p
        self.path.moveTo(x, y)

    def _lineTo(self, p):
        (x, y) = p
        self.path.lineTo(x, y)

    def _curveToOne(self, p1, p2, p3):
        (x1, y1) = p1
        (x2, y2) = p2
        (x3, y3) = p3
        self.path.curveTo(x1, y1, x2, y2, x3, y3)

    def _closePath(self):
        self.path.closePath()


class KuaiShouSpider(object):
    """
    快手爬蟲
    """

    def __init__(self):
        # OCR 識別類
        self.ocr = ddddocr.DdddOcr()

    def ttf_2_word_map(self, ttf_content, fmt="png"):
        """
        ttf內容轉文本
        :param ttf_content:
        :param fmt:
        :return:
        """
        font = TTFont(BytesIO(ttf_content))
        gs = font.getGlyphSet()
        glyphNames = font.getGlyphNames()

        uniMap = font['cmap'].tables[0].ttFont.getBestCmap()
        key_map = dict()
        for k, v in uniMap.items():
            key_map[v] = hex(k)

        data_dict = dict()
        for i in glyphNames:
            # 跳過'.notdef', '.null'
            if i[0] == '.':
                continue

            g = gs[i]
            pen = ReportLabPen(gs, Path(fillColor=colors.black, strokeWidth=5))
            g.draw(pen)
            w = 800
            h = 800
            g = Group(pen.path)
            g.translate(0, 0)

            d = Drawing(w, h)
            d.add(g)
            img = renderPM.drawToString(d, fmt)
            data = self.ocr.classification(img)
            if data == '十':
                data = '+'
            elif data in [',', '。']:
                data = '.'
            key = key_map[i]
            data_dict[key] = data

        return data_dict
    
    @staticmethod
    def uni_code_2_word(uni_code, word_map):
        """
        unicode 轉 文本
        :param uni_code:
        :param word_map:
        :return:
        """

        def _sub(num):
            num = num.group()
            num = re.findall(r'\d+', num)[0]
            num = str(hex(int(num)))
            return word_map[num]

        data = re.sub('&#(.+?);', _sub, uni_code)

        return data

    def get_user_info(self):
        """
        獲取用戶數據
        :return:
        """

        url = 'https://c.kuaishou.com/fw/user/ounixiong?fid=0&cc=share_copylink&followRefer=151&shareMethod=TOKEN&kpn=KUAISHOU&subBiz=PROFILE&shareId=16509009682073&shareToken=X-7IIolIHVVgN2bx&shareResourceType=PROFILE_OTHER&shareMode=APP&originShareId=16509009682073&appType=21&shareObjectId=136457866&shareUrlOpened=0&timestamp=1633759010452&captchaToken=HEADCgp6dC5jYXB0Y2hhEscCX569ztU1Y9XCAVp1Q5Rsm1H8fPYfPZBHvTyg5mwPyIQrJSR_j2mphorguzP9cB2sNWhg61OwW_LQEBvnHRS47j0GpmjIBOeqJ9j9kIbNTsXgNSQYZxkdToAm25EKa4ZLXOmE9ez5Bl-UMzRs4P2_g6SzI3fBs1yFvI7_eLd_yFogwimBE5eyopG9qDDm5lFPfSPm0GI6IhqLKpA1VBZd9cjZxsxq4jGlld1vYRxOFyfJis4oFSVM8fpDArN32KQ2pqejgjV8kK42jW-kpg4fl-1g5iWmqSczszEvEdB9s4l3QmQBfztuDSPbGf0yfY-whf93nOynaRmSeLH49sHSaPr_nwcGvjNjqeFdZoTpf2VBLV7mWvkVdthG0yV5Y6BqDPWSr57Js-dvLIcYlyq3gLbNxQOsulNch6o-HQ7dw2CZY006z-_eGhLniyxQb2WiE0ZVkCv0UGAb2gsoBTACTAIL'

        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': 'did=web_232e842d3bcd4eceb358abfcf31ec030; didv=1634614098000; sid=e7921611a1cbb9669d28ce19; Hm_lvt_86a27b7db2c5c0ae37fee4a8a35033ee=1634614100; Hm_lpvt_86a27b7db2c5c0ae37fee4a8a35033ee=1634614104',
            'DNT': '1',
            'Host': 'c.kuaishou.com',
            'Pragma': 'no-cache',
            'Referer': 'https://c.kuaishou.com/fw/user/ounixiong?fid=0&cc=share_copylink&followRefer=151&shareMethod=TOKEN&kpn=KUAISHOU&subBiz=PROFILE&shareId=16509009682073&shareToken=X-7IIolIHVVgN2bx&shareResourceType=PROFILE_OTHER&shareMode=APP&originShareId=16509009682073&appType=21&shareObjectId=136457866&shareUrlOpened=0&timestamp=1633759010452',
            'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
            'sec-ch-ua-mobile': '?1',
            'sec-ch-ua-platform': '"Android"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36',
        }

        response = requests.get(url, headers=headers)

        # 獲取網頁中的ttf文件
        try:
            ttf_file = re.findall(r'url\((https:.+?\.ttf)\)', response.text)[0]
        except Exception as err:
            print('網頁訪問異常')
            return

        ttf_data = requests.get(ttf_file)

        ttf_word = self.ttf_2_word_map(ttf_data.content)

        # 解析
        html = etree.HTML(response.text)

        fans_node = html.xpath('//span[contains(text(),"粉絲")]/preceding-sibling::span[1]')[0]
        focus_node = html.xpath('//span[contains(text(),"關注")]/preceding-sibling::span[1]')[0]
        fans = etree.tostring(fans_node).decode('utf-8')
        focus = etree.tostring(focus_node).decode('utf-8')
        fans = re.findall('>(.+?)<', fans)[0]
        focus = re.findall('>(.+?)<', focus)[0]
        fans = self.uni_code_2_word(fans, ttf_word)
        focus = self.uni_code_2_word(focus, ttf_word)
        print(fans)
        print(focus)


if __name__ == '__main__':
    spider = KuaiShouSpider()
    spider.get_user_info()

后記

可以考慮一下用 KNN的方式根據字體特征進行分類,准備好一些樣本,進行訓練.


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM