from bluextracter import Extractor
def main(url='https://m.huicaiba.com/ask/5426118.html'):
    """Download *url* and print the HTML-formatted content extracted from it.

    The page is fetched with ``requests``, decoded as UTF-8, and handed to
    ``Extractor.extract(url, source)``. Afterwards the extractor's
    ``format_text`` attribute is printed.

    Args:
        url: Address of the page to download and run through the extractor.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Imported locally: this script uses ``requests`` but the file's
    # top-level imports do not bring it into scope.
    import requests

    extractor = Extractor()  # instantiate the extractor class

    # A timeout keeps the script from hanging forever on an unresponsive host.
    resp = requests.get(url, timeout=10)
    # Fail loudly instead of running the extractor on an HTTP error page.
    resp.raise_for_status()
    resp.encoding = 'utf-8'  # manually force the encoding used for resp.text
    source = resp.text

    extractor.extract(url, source)

    # Other result attributes referenced by the original example
    # (uncomment as needed):
    # print('score:', extractor.score)
    # print('title:', extractor.title)
    # print('link/text ratio:', extractor.link_text_ratio)
    # print('image count:', extractor.img_count)
    # print('content character count:', extractor.text_count)
    # print('plain text content:', extractor.clean_text)

    # Content formatted with HTML tags.
    print('html内容:', extractor.format_text)

    # extractor.top_node is the original HTML as an element; the original
    # example serialized it like this (``etree`` and ``unescape`` are not
    # imported in this file -- presumably lxml.etree and html.unescape,
    # TODO confirm):
    # top_node = extractor.top_node
    # cc = etree.tostring(top_node, encoding='utf-8').decode('utf-8')
    # print(unescape(cc))


if __name__ == '__main__':
    main()