數據清洗之微博內容清洗


獲取文字加表情(img標簽的alt屬性)

#!/usr/bin/env python  
# encoding: utf-8
from functools import reduce
from lxml import html
from bs4 import BeautifulSoup
# Sample markup used by the demo: an emoticon <img> wrapped in a <span>
# (the emoticon's text lives in its alt attribute) followed by plain text.
# NOTE(review): this assignment rebinds `html`, hiding the `lxml.html` module
# imported above under the same name.
html="""
<div><span class="url-icon"><img alt="[饞嘴]" src="//h5.sinaimg.cn/m/emoticon/icon/default/d_chanzui-ad3f4f182c.png" style="width:1em; height:1em;"/></span>聽着就很好吃​</div>
"""

def main():
    """Parse the module-level sample markup and print its cleaned text.

    The first ``<div>`` of the sample is located with BeautifulSoup and
    handed to ``parse_div``; the resulting string is written to stdout.

    NOTE(review): a later definition in this file reuses the name ``main``,
    so this function is replaced once the module has finished loading.
    """
    soup = BeautifulSoup(html, 'html.parser')
    print(parse_div(soup.find('div')))
# Characters removed from text nodes: newline, ASCII space and the
# zero-width space (U+200B).
_STRIP_TABLE = str.maketrans('', '', '\n \u200b')


def parse_div(div_tags):
    """Flatten a BeautifulSoup tag into one string, inlining ``alt`` values.

    Walks ``div_tags.contents`` in order:

    * string children are kept after removing newlines, ASCII spaces and
      zero-width spaces (U+200B), so the output agrees with the lxml-based
      cleaner in this file, which strips the same three characters;
    * a child tag carrying an ``alt`` attribute contributes that attribute's
      value unchanged (its own children are not visited);
    * any other child tag is flattened recursively.

    Args:
        div_tags: A tag-like object exposing ``contents``; its non-string
            children must provide ``has_attr``, ``get`` and ``contents``.

    Returns:
        The concatenated text, or ``''`` when the tag has no children.
    """
    pieces = []
    for node in div_tags.contents:
        if isinstance(node, str):
            # One pass over the text instead of a chain of replace() calls.
            pieces.append(node.translate(_STRIP_TABLE))
        elif node.has_attr('alt'):
            pieces.append(node.get('alt', ''))
        else:
            pieces.append(parse_div(node))
    return ''.join(pieces)
# Preferred solution: one XPath query gathers the text nodes and alt values.
def main(self, htmlstr):
    """Return the cleaned text of an HTML fragment, with ``alt`` values inlined.

    Args:
        self: Unused; kept so the existing two-argument call shape still works.
        htmlstr: HTML markup to clean.

    Returns:
        The text nodes and ``alt`` attribute values found under the parsed
        root, joined in the order the XPath query returns them, with
        newlines, ASCII spaces and zero-width spaces (U+200B) removed.
        An empty string when ``htmlstr`` is empty or whitespace-only.
    """
    # lxml refuses to parse an empty document, so answer directly.
    if not htmlstr or not htmlstr.strip():
        return ''

    # Imported locally under an alias because the module-level name `html`
    # is rebound to the sample markup string, which has no `fromstring`.
    from lxml import html as lxml_html

    strip_table = str.maketrans('', '', '\n \u200b')
    root = lxml_html.fromstring(htmlstr)
    nodes = root.xpath(".//text()|.//@alt")
    return ''.join(node.translate(strip_table) for node in nodes)



if __name__ == '__main__':
    # By this point `main` refers to the two-argument definition, so a bare
    # main() call raises TypeError. Run the BeautifulSoup demo directly on
    # the sample markup instead and print the cleaned text.
    print(parse_div(BeautifulSoup(html, 'html.parser').find('div')))






免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM