import requests
from bluextracter import Extractor
def main(url='https://m.huicaiba.com/ask/5426118.html'):
    """Download ``url`` and print the HTML-formatted content extracted from it.

    The page is fetched with ``requests``, decoded as UTF-8, and handed to
    ``Extractor.extract`` together with its URL.  The extractor's
    ``format_text`` attribute is then printed.

    Args:
        url: Address of the page to fetch and run through the extractor.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    extractor = Extractor()  # instantiate the extraction class

    # Without a timeout ``requests`` may block forever on an unresponsive host.
    resp = requests.get(url, timeout=10)
    # Fail loudly on HTTP errors instead of extracting from an error page.
    resp.raise_for_status()
    resp.encoding = 'utf-8'  # set the page-source encoding manually
    source = resp.text

    extractor.extract(url, source)

    # Other result attributes the original debugging notes printed after
    # ``extract`` (labels translated from those notes):
    #   extractor.score            - score
    #   extractor.title            - title
    #   extractor.link_text_ratio  - text/link ratio
    #   extractor.img_count        - number of images
    #   extractor.text_count       - content character count
    #   extractor.clean_text       - plain-text content
    #   extractor.top_node         - raw HTML element; the notes serialised it
    #                                with etree.tostring(...) and unescape(...),
    #                                neither of which is imported by this file.
    print('html內容:', extractor.format_text)  # content formatted with HTML tags


if __name__ == '__main__':
    main()