02 NLTK 分句、分詞、詞幹提取、詞形還原


NLTK 分句、分詞、詞幹提取、詞形還原

 

print("==========案例1:分句、分詞===============")
import nltk.tokenize as tk

# Sample paragraph: three sentences ending in "?", "!" and "." respectively.
doc = (
    "Are you curious about tokenization? "
    "Let's see how it works! "
    "We need to analyze a couple of sentences "
    "with punctuations to see it in action."
)

print(doc)

# Sentence-level split: tk.sent_tokenize(doc)
# Original author's note -- Q: how does sent_tokenize() know where a sentence
# ends?  A: (1) the next sentence starts with a capital letter; (2) the
# sentence ends with a punctuation mark.
# NOTE(review): this is a simplification; the NLTK docs describe
# sent_tokenize as backed by the pre-trained Punkt model -- confirm there.
tokens = tk.sent_tokenize(doc)
# Print each sentence with a 1-based, width-2 running number.
for number, sentence in enumerate(tokens, start=1):
    print(f"{number:2d}", sentence)

print("-----------------------------")

# Word-level split: tk.word_tokenize(doc)
tokens = tk.word_tokenize(doc)
# Print each token with a 1-based, width-2 running number.
for number, piece in enumerate(tokens, start=1):
    print(f"{number:2d}", piece)


# Split on words and punctuation: tk.WordPunctTokenizer().tokenize(doc)
tokens = tk.WordPunctTokenizer().tokenize(doc)
# Print each token with a 1-based, width-2 running number.
for number, piece in enumerate(tokens, start=1):
    print(f"{number:2d}", piece)
    
print("=============案例2:詞干提取、詞型還原===================")

# The three stemmer implementations compared below.
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

# nltk.stem itself provides the lemmatizer used further down.
import nltk.stem as ns


# Sample vocabulary shared by the stemming and lemmatization demos.
words = [
    'table',
    'probably',
    'wolves',
    'playing',
    'is',
    'dog',
    'the',
    'beaches',
    'grounded',
    'dreamt',
    'envision',
]
print(words)

print("----------詞干提取-------------")
# Stem: the core part of a noun or verb once the pieces tied to number and
# tense are removed.  A stem is not necessarily a valid word.

pt_stemmer = pt.PorterStemmer()             # Porter stemmer
lc_stemmer = lc.LancasterStemmer()          # Lancaster stemmer
sb_stemmer = sb.SnowballStemmer("english")  # Snowball stemmer (English)

# One row per word: the word, then its Porter / Lancaster / Snowball stems,
# each right-aligned in an 8-character column.
for term in words:
    stems = [
        stemmer.stem(term)
        for stemmer in (pt_stemmer, lc_stemmer, sb_stemmer)
    ]
    print(" ".join(f"{cell:>8}" for cell in (term, *stems)))


print("----------詞型還原器---------------")
# Lemmatization: plural noun -> singular noun; participle -> base verb.
# Original author's note: unlike a stem, the lemma is always a valid word.

lemmatizer = ns.WordNetLemmatizer()
# One row per word: the word, its lemma when treated as a noun (pos='n'),
# and its lemma when treated as a verb (pos='v'), in 8-character columns.
for term in words:
    noun_form, verb_form = (
        lemmatizer.lemmatize(term, pos=tag) for tag in ('n', 'v')
    )
    print(f"{term:>8} {noun_form:>8} {verb_form:>8}")

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱 yoyou2525@163.com 刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM