Python 使用 spaCy 進行 NLP 處理


原文:http://mp.weixin.qq.com/s/sqa-Ca2oXhvcPHJKg9PuVg

import spacy

# Load the "en_core_web_sm" pipeline and parse a sample sentence into a Doc.
nlp = spacy.load("en_core_web_sm")
doc = nlp("The big grey dog ate all of the chocalate,but fortunately he wasn't sick!")

# Naive approach: split the raw text on whitespace only.
print(doc.text.split())

# Using each token's .orth_ attribute instead; punctuation is recognized
# as separate tokens.
print([token.orth_ for token in doc])

# Attributes with a trailing underscore return strings; the same attribute
# without the underscore returns a number.
print([(token, token.orth_, token.orth) for token in doc])

# Tokenize while dropping punctuation and whitespace tokens.
# Uses the logical `or` with explicit parentheses rather than a bitwise `|`
# that relied on operator precedence to group correctly under `not`.
print([token.orth_ for token in doc if not (token.is_punct or token.is_space)])

# Lemmatization: normalize each inflected word to its base form.
practice = "practice practiced practicing"
nlp_practice = nlp(practice)
print(
    [token.lemma_ for token in nlp_practice]
)

# Part-of-speech tagging: the .pos_ and .tag_ attributes give access to the
# coarse-grained and the fine-grained POS tag of a token, respectively.
doc2 = nlp("Conor's dog's toy was hidden under the man's sofa in the woman's house")
pos_tags = [(token, token.tag_) for token in doc2]
print(pos_tags)

# The possessive 's is tagged "POS". That tag can be used to extract owners
# and the things they own: the token before the marker is the owner, the
# token after it is the possession.
# NOTE: Token.nbor raises IndexError at a document boundary; that cannot
# happen here because this sentence neither starts nor ends with 's.
owners_possessions = []
for token, tag in pos_tags:
    if tag == "POS":
        owner = token.nbor(-1)
        possession = token.nbor(1)
        owners_possessions.append((owner, possession))

print(owners_possessions)

# The same extraction written as a single comprehension.
print([(token.nbor(-1), token.nbor(1)) for token, tag in pos_tags if tag == "POS"])

# Named-entity recognition. PERSON is self-explanatory; NORP is a nationality
# or religious group; GPE marks a location (city, country, and so on); DATE
# marks a specific date or date range; ORDINAL marks a word or number that
# expresses some kind of order.
wiki_obama = (
    "Barack Obama is an American politician who served as the 44th "
    "President of the United States from 2009 to 2017. "
    "He is the first African American to have served as president, "
    "as well as the first born outside the contiguous United States."
)
nlp_obama = nlp(wiki_obama)
print(
    [(ent, ent.label_, ent.label) for ent in nlp_obama.ents]
)

# Split the text into sentences and print each one, numbered from 1.
for number, sentence in enumerate(nlp_obama.sents, start=1):
    line = "Sentence number {}:{}".format(number, sentence)
    print(line)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM