實戰-快手H5字體反爬
前言
快手H5端的粉絲數是字體反爬,抓到的html文本是亂碼 <SPAN STYLE='FONT-FAMILY: kwaiFont;'></SPAN>
可以看到對應的字體格式為 kwaiFont。
經過一頓分析操作,發現每次返回的 ttf 文件內容都不太一樣,無法自己做一份映射模板,那么就不做模板了。可以通過 OCR 或者 KNN 進行內容識別。本人采用 OCR 方式進行識別。這里推薦一個很吊的 OCR 庫 ddddocr。
流程分析
- 找到對應ttf文件
- 分析ttf文件,將每個字體轉換成圖片
- 圖片識別成文本
- 亂碼映射
直接上代碼
import re
import ddddocr
import requests
from lxml import etree
from io import BytesIO
from fontTools.ttLib import TTFont
from fontTools.pens.basePen import BasePen
from reportlab.graphics.shapes import Path
from reportlab.lib import colors
from reportlab.graphics import renderPM
from reportlab.graphics.shapes import Group, Drawing
class ReportLabPen(BasePen):
    """A fontTools pen that records a glyph outline into a reportlab Path.

    Each drawing callback forwards its point coordinates to the underlying
    Path object so the glyph can later be rendered to an image.
    """

    def __init__(self, glyph_set, path=None):
        BasePen.__init__(self, glyph_set)
        # Draw into the caller-supplied Path, or start a fresh one.
        self.path = Path() if path is None else path

    def _moveTo(self, pt):
        x, y = pt
        self.path.moveTo(x, y)

    def _lineTo(self, pt):
        x, y = pt
        self.path.lineTo(x, y)

    def _curveToOne(self, ctrl1, ctrl2, end):
        # Cubic Bézier segment: two control points plus the end point.
        self.path.curveTo(ctrl1[0], ctrl1[1], ctrl2[0], ctrl2[1], end[0], end[1])

    def _closePath(self):
        self.path.closePath()
class KuaiShouSpider(object):
    """Kuaishou H5 spider that decodes the font-obfuscated counters.

    The page ships a per-request .ttf whose glyph/codepoint assignment is
    randomized, so a static mapping is impossible; instead every glyph is
    rendered to an image and recognized with OCR (ddddocr).
    """

    def __init__(self):
        # OCR engine used to recognize the rendered glyph images.
        self.ocr = ddddocr.DdddOcr()

    def ttf_2_word_map(self, ttf_content, fmt="png"):
        """Render every glyph of the ttf to an image and OCR it.

        :param ttf_content: raw bytes of the downloaded .ttf file
        :param fmt: image format handed to reportlab's renderPM (default png)
        :return: dict mapping hex codepoint strings (e.g. '0xe31c') to the
                 recognized character
        """
        font = TTFont(BytesIO(ttf_content))
        glyph_set = font.getGlyphSet()
        # getBestCmap() returns {codepoint: glyph name}; invert it so we can
        # look up the codepoint of each glyph we render.
        # (Replaces the original circular font['cmap'].tables[0].ttFont hop.)
        key_map = {name: hex(code) for code, name in font.getBestCmap().items()}
        data_dict = dict()
        for name in font.getGlyphNames():
            # Skip control glyphs such as '.notdef' and '.null'.
            # startswith() also tolerates an empty glyph name.
            if name.startswith('.'):
                continue
            pen = ReportLabPen(glyph_set, Path(fillColor=colors.black, strokeWidth=5))
            glyph_set[name].draw(pen)
            group = Group(pen.path)
            group.translate(0, 0)
            drawing = Drawing(800, 800)
            drawing.add(group)
            img = renderPM.drawToString(drawing, fmt)
            data = self.ocr.classification(img)
            # Correct systematic OCR confusions observed with this font.
            if data == '十':
                data = '+'
            elif data in (',', '。'):
                data = '.'
            data_dict[key_map[name]] = data
        return data_dict

    @staticmethod
    def uni_code_2_word(uni_code, word_map):
        """Replace HTML numeric character references with decoded characters.

        :param uni_code: html fragment containing '&#NNN;' references
        :param word_map: mapping produced by ttf_2_word_map (hex str -> char)
        :return: decoded plain-text string
        """
        def _sub(match):
            # '&#57356;' -> 57356 -> '0xe00c' -> looked-up character.
            code = int(re.findall(r'\d+', match.group())[0])
            return word_map[hex(code)]

        return re.sub('&#(.+?);', _sub, uni_code)

    def get_user_info(self):
        """Fetch the profile page, decode the obfuscated counters, print them.

        :return: None (fans/follow counts are printed)
        """
        # NOTE: the original URL and Referer contained '0×tamp=' — the
        # '&times' of '&timestamp' had been mangled into '×' by an HTML-entity
        # round trip; the parameter is restored here.
        url = 'https://c.kuaishou.com/fw/user/ounixiong?fid=0&cc=share_copylink&followRefer=151&shareMethod=TOKEN&kpn=KUAISHOU&subBiz=PROFILE&shareId=16509009682073&shareToken=X-7IIolIHVVgN2bx&shareResourceType=PROFILE_OTHER&shareMode=APP&originShareId=16509009682073&appType=21&shareObjectId=136457866&shareUrlOpened=0&timestamp=1633759010452&captchaToken=HEADCgp6dC5jYXB0Y2hhEscCX569ztU1Y9XCAVp1Q5Rsm1H8fPYfPZBHvTyg5mwPyIQrJSR_j2mphorguzP9cB2sNWhg61OwW_LQEBvnHRS47j0GpmjIBOeqJ9j9kIbNTsXgNSQYZxkdToAm25EKa4ZLXOmE9ez5Bl-UMzRs4P2_g6SzI3fBs1yFvI7_eLd_yFogwimBE5eyopG9qDDm5lFPfSPm0GI6IhqLKpA1VBZd9cjZxsxq4jGlld1vYRxOFyfJis4oFSVM8fpDArN32KQ2pqejgjV8kK42jW-kpg4fl-1g5iWmqSczszEvEdB9s4l3QmQBfztuDSPbGf0yfY-whf93nOynaRmSeLH49sHSaPr_nwcGvjNjqeFdZoTpf2VBLV7mWvkVdthG0yV5Y6BqDPWSr57Js-dvLIcYlyq3gLbNxQOsulNch6o-HQ7dw2CZY006z-_eGhLniyxQb2WiE0ZVkCv0UGAb2gsoBTACTAIL'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': 'did=web_232e842d3bcd4eceb358abfcf31ec030; didv=1634614098000; sid=e7921611a1cbb9669d28ce19; Hm_lvt_86a27b7db2c5c0ae37fee4a8a35033ee=1634614100; Hm_lpvt_86a27b7db2c5c0ae37fee4a8a35033ee=1634614104',
            'DNT': '1',
            'Host': 'c.kuaishou.com',
            'Pragma': 'no-cache',
            'Referer': 'https://c.kuaishou.com/fw/user/ounixiong?fid=0&cc=share_copylink&followRefer=151&shareMethod=TOKEN&kpn=KUAISHOU&subBiz=PROFILE&shareId=16509009682073&shareToken=X-7IIolIHVVgN2bx&shareResourceType=PROFILE_OTHER&shareMode=APP&originShareId=16509009682073&appType=21&shareObjectId=136457866&shareUrlOpened=0&timestamp=1633759010452',
            'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
            'sec-ch-ua-mobile': '?1',
            'sec-ch-ua-platform': '"Android"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36',
        }
        response = requests.get(url, headers=headers)
        # Locate the per-request ttf file referenced by the page's CSS.
        try:
            ttf_file = re.findall(r'url\((https:.+?\.ttf)\)', response.text)[0]
        except IndexError:
            # No ttf URL in the page: the request was blocked or redirected.
            print('網頁訪問異常')
            return
        ttf_data = requests.get(ttf_file)
        ttf_word = self.ttf_2_word_map(ttf_data.content)
        # The counter spans immediately precede the "粉絲"/"關注" label spans.
        html = etree.HTML(response.text)
        fans_node = html.xpath('//span[contains(text(),"粉絲")]/preceding-sibling::span[1]')[0]
        focus_node = html.xpath('//span[contains(text(),"關注")]/preceding-sibling::span[1]')[0]
        fans = etree.tostring(fans_node).decode('utf-8')
        focus = etree.tostring(focus_node).decode('utf-8')
        # Keep only the obfuscated entity text between the span tags.
        fans = re.findall('>(.+?)<', fans)[0]
        focus = re.findall('>(.+?)<', focus)[0]
        fans = self.uni_code_2_word(fans, ttf_word)
        focus = self.uni_code_2_word(focus, ttf_word)
        print(fans)
        print(focus)
# Script entry point: run a single scrape when executed directly.
if __name__ == '__main__':
    KuaiShouSpider().get_user_info()
后記
可以考慮一下用 KNN 的方式根據字體特征進行分類,準備好一些樣本,進行訓練。