jieba & NLTK Study Notes


  Chinese Word Segmentation - jieba

import re

import jieba
 
news_CN = '''
央視315晚會曝光湖北省知名的神丹牌、蓮田牌“土雞蛋”實為普通雞蛋冒充,同時在商標上玩貓膩,
分別注冊“鮮土”、注冊“好土”商標,讓消費者誤以為是“土雞蛋”。3月15日晚間,新京報記者就此
事致電湖北神丹健康食品有限公司方面,其工作人員表示不知情,需要了解清楚情況,截至發稿暫未
取得最新回應。新京報記者還查詢發現,湖北神丹健康食品有限公司為農業產業化國家重點龍頭企
業、高新技術企業,此前曾因涉嫌虛假宣傳“中國最大的蛋品企業”而被罰6萬元。
'''

# Clean the string
string = re.sub(r'[^\w]', '', news_CN)   # strip all non-word characters (punctuation, whitespace) with a regex; this cleaned string is used below

# Segmentation
seg_list = jieba.cut(string, cut_all=False, HMM=False)  # accurate mode (the default); cut_all=True gives full mode
#seg_list = jieba.cut_for_search(string, HMM=False)  # search-engine mode, finer granularity
#jieba.lcut(), jieba.lcut_for_search()  # return a list directly; the cut variants return generators
print('/'.join(seg_list))

# Return each word's position in the original text
seg_list = jieba.tokenize(u'自然語言處理非常有用')  # yields tuples like ('自然語言', 0, 4), ...

# POS tagging
import jieba.posseg as psg
seg_list = psg.cut(news_CN)
'''psg.POSTokenizer(tokenizer=None)
#the tokenizer argument can be a jieba.Tokenizer(dictionary=DEFAULT_DICT) instance  # build a custom tokenizer, e.g. to use several dictionaries at the same time
#jieba.posseg.dt is the default POS-tagging tokenizer
'''
print(' '.join(['{0}/{1}'.format(w, t) for w, t in seg_list]))
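
# A minimal sketch (not in the original note): an independent Tokenizer/POSTokenizer pair
# (the default dictionary is used when no path is given), so words added to it do not
# affect the global jieba.dt / jieba.posseg.dt tokenizers.
my_tokenizer = jieba.Tokenizer()
my_tokenizer.add_word('土雞蛋')
my_posseg = psg.POSTokenizer(tokenizer=my_tokenizer)
print(' '.join('{0}/{1}'.format(w, t) for w, t in my_posseg.cut('神丹牌土雞蛋')))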

 
'''
path = ''
file=open(path,'r')
jieba.load_userdict(file)
file.close()
'''
# Load custom words
'''
userdict.txt
One word per line.
Each line has up to three fields: word, frequency (optional), POS tag (optional),
separated by spaces; the order must not be changed.
file_name
If it is a path, or a file opened in binary mode, the file must be UTF-8 encoded.
'''
# Default jieba dictionary location: {basepath}\Lib\site-packages\jieba\dict.txt
#If you don't know a new word's frequency or POS, a line like "word 3 n" will do
jieba.set_dictionary('./data/dict.txt.big')  # load a (bigger) system dictionary
jieba.load_userdict(['神丹牌','蓮田牌','土雞蛋','新京報'])  # load a user dictionary; also accepts filename='userdict.txt'
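# A minimal sketch (hypothetical file name): write a small userdict.txt in the
# "word frequency POS" format described above and load it from disk (the file must be UTF-8).
with open('my_userdict.txt', 'w', encoding='utf-8') as f:
    f.write('神丹牌 3 nz\n蓮田牌 3 nz\n土雞蛋 3 n\n')
jieba.load_userdict('my_userdict.txt')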
jieba.add_word('自定義詞', freq=None, tag=None)  # modify the dictionary dynamically
jieba.del_word('自定義詞')
jieba.get_FREQ('神丹牌')
jieba.suggest_freq(('龍頭企業','高新技術企業'), True)  # tune a word's frequency so it can (or cannot) be segmented out

seg_list = jieba.cut(string, cut_all=False)  # accurate mode; string is the cleaned text from above
print('/'.join(seg_list))  # words defined in the dictionaries are no longer split apart

# Keyword extraction based on TF-IDF
import jieba.analyse as aly
#aly.TFIDF(idf_path=None)
aly.set_idf_path('./data/idf.txt.big')  # load a custom IDF corpus
aly.set_stop_words('./data/stop_words.utf8')  # load a stop-word list
keywords = aly.extract_tags(news_CN, topK=10, withWeight=True, allowPOS=())  # allowPOS keeps only the listed POS tags; empty means no filtering
keywords = aly.textrank(news_CN, topK=10, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v'))  # for textrank an empty allowPOS would filter out everything


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
corpus = [
        'This is the first document.',
        'This is the second document.',
        'And the third one.'
        ]
words = CountVectorizer().fit_transform(corpus)   # term-count sparse matrix
tfidf = TfidfTransformer().fit_transform(words)   # TF-IDF weights as a sparse matrix
print(tfidf)
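
# TfidfVectorizer combines CountVectorizer and TfidfTransformer in one step; with default
# parameters it produces the same matrix as the two-step pipeline above (a short sketch).
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf2 = TfidfVectorizer().fit_transform(corpus)
print(tfidf2)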


# Parallel segmentation
# Splits the input by newline and segments the lines in multiple processes, based on Python's built-in multiprocessing module; not supported on Windows.
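# A sketch of the parallel workflow (jieba.enable_parallel / jieba.disable_parallel; POSIX only):
#jieba.enable_parallel(4)      # segment with 4 worker processes
#words = jieba.lcut(news_CN)   # cut/lcut/cut_for_search now run line by line in parallel
#jieba.disable_parallel()      # return to the default single-process mode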

  English Tokenization - NLTK

import nltk

# ------------------------------------
#Split into sentences first, then tokenize each sentence
text = "And now for something completely different. I love you."
sentences = nltk.sent_tokenize(text)
words = []
for sent in sentences:
    words.append(nltk.word_tokenize(sent))
    #words_tagged += nltk.pos_tag(nltk.word_tokenize(sent))

# ------------------------------------
#Tokenization
words = nltk.word_tokenize("good good study, day day up!")

# ------------------------------------
#POS tagging
tagged = nltk.pos_tag(words)
print (tagged[0:6])

for word in tagged:
    if 'NNP' == word[1]:  # capitalized words tend to get tagged as proper nouns
        print(word)
        
#Named entity recognition
entities = nltk.chunk.ne_chunk(tagged)  # returns an nltk.Tree
print(entities)
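
# A short sketch: walk the Tree returned by ne_chunk and print only the labelled entities
# (for this toy sentence there may be none).
for subtree in entities.subtrees():
    if subtree.label() != 'S':  # skip the sentence root
        print(subtree.label(), ' '.join(token for token, tag in subtree.leaves()))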

# ------------------------------------
#Word frequency statistics
words = nltk.word_tokenize("good good study, day day up!")
fdist = nltk.FreqDist(words)
fdist.N()  # total number of tokens
fdist.B()  # vocabulary size (number of distinct words)
fdist['good']  # raw count of 'good'
fdist.freq('good') * 100  # relative frequency, as a percentage
fdist.tabulate(5, cumulative=False)  # counts of the 5 most common words
fdist.plot(5, cumulative=True)  # cumulative-count plot of the 5 most common words
#Bigram statistics
bgrams = nltk.bigrams(words)
bgfdist = nltk.FreqDist(list(bgrams))
bgfdist.plot(10)  # 10 most common bigrams

  Keyword Extraction Based on TF-IDF

  • jieba.analyse.extract_tags(sentence, topK=20, withWeight=False, allowPOS=())

  Parameter notes:
  1. sentence: the text to extract keywords from
  2. topK: how many of the highest TF-IDF-weighted keywords to return; default 20
  3. withWeight: whether to also return each keyword's weight; default False
  4. allowPOS: keep only words with the listed POS tags; default empty, i.e. no filtering

  • jieba.analyse.TFIDF(idf_path=None) creates a new TFIDF instance; idf_path is an IDF corpus file (a sketch using a standalone instance follows the example below)
#Set a custom inverse document frequency (IDF) corpus
#jieba.analyse.set_idf_path(file_name)
#Each line of the IDF file is "word idf_value", e.g. 勞動防護 13.900677652
#Set a custom stop-word corpus
#jieba.analyse.set_stop_words(file_name)

import jieba
import jieba.analyse
#Read the file into a string using UTF-8; the text file sits in the same directory as this script
content = open(u'人民的名義.txt', 'r', encoding='utf-8').read()
jieba.analyse.set_stop_words("stopwords.txt")
tags = jieba.analyse.extract_tags(content, topK=10, withWeight=True)
for tag in tags:
    print("tag:%s\t\t weight:%f" % (tag[0], tag[1]))
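
# A sketch using a standalone jieba.analyse.TFIDF instance (see the TFIDF bullet above);
# its stop words and IDF corpus are independent of the module-level extractor.
my_tfidf = jieba.analyse.TFIDF()
my_tfidf.set_stop_words("stopwords.txt")
print(my_tfidf.extract_tags(content, topK=10, withWeight=True))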

   English Tokenization - NLTK: Word-Form Normalization, Stemming, and Lemmatization

  Adapted from: NLTK英文分詞嘗試

  NLP Lemmatisation(詞性還原) 和 Stemming(詞干提取) NLTK pos_tag word_tokenize

  Python nltk.WordNetLemmatizer() Examples 

import re, time, collections, nltk
from sklearn.datasets import fetch_20newsgroups
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# -----------------------------------
''' Word-form normalization (expanding contractions) '''
# Regex that replaces anything other than letters, spaces and apostrophes with a space
pat_letter = re.compile(r'[^a-zA-Z \']+')
# Patterns for expanding common English contractions
pat_is = re.compile("(it|he|she|that|this|there|here)(\'s)", re.I)
pat_s = re.compile("(?<=[a-zA-Z])\'s")  # possessive 's after a letter
pat_s2 = re.compile("(?<=s)\'s?")  # possessive after a word ending in s
pat_not = re.compile("(?<=[a-zA-Z])n\'t")  # contraction of "not"
pat_would = re.compile("(?<=[a-zA-Z])\'d")  # contraction of "would"
pat_will = re.compile("(?<=[a-zA-Z])\'ll")  # contraction of "will"
pat_am = re.compile("(?<=[Ii])\'m")  # contraction of "am"
pat_are = re.compile("(?<=[a-zA-Z])\'re")  # contraction of "are"
pat_have = re.compile("(?<=[a-zA-Z])\'ve")  # contraction of "have"

def replace_abbreviations(text):
    new_text = pat_letter.sub(' ', text).strip().lower()
    new_text = pat_is.sub(r"\1 is", new_text)
    new_text = pat_s.sub("", new_text)
    new_text = pat_s2.sub("", new_text)
    new_text = pat_not.sub(" not", new_text)
    new_text = pat_would.sub(" would", new_text)
    new_text = pat_will.sub(" will", new_text)
    new_text = pat_am.sub(" am", new_text)
    new_text = pat_are.sub(" are", new_text)
    new_text = pat_have.sub(" have", new_text)
    new_text = new_text.replace('\'', ' ')
    return new_text
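
# Quick check of the expansion rules above; this should print roughly
# "he is gone is not he i would say they are fine"
print(replace_abbreviations("He's gone, isn't he? I'd say they're fine."))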

# -----------------------------------
''' Stemming '''
# Porter stemming algorithm
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem('leaves')  # returns 'leav'; the actual lemma is 'leaf'
porter_stemmer.stem('maximum')

# Lancaster stemming algorithm
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
lancaster_stemmer.stem('maximum')

# Snowball stemming algorithm
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')
snowball_stemmer.stem('maximum')

# -----------------------------------
''' Lemmatization (POS-aware) '''
def lemmatize_all(sentence, stopWords):
    # lemmatize() maps a word to its base form for the given POS
    wnl = WordNetLemmatizer()
    for word, tag in nltk.pos_tag(word_tokenize(sentence)):
        if word in stopWords:
            continue
        elif tag.startswith('NN'):
            #pos = nltk.corpus.wordnet.NOUN
            yield wnl.lemmatize(word, pos='n')
        elif tag.startswith('VB'):
            #pos = nltk.corpus.wordnet.VERB
            yield wnl.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):
            #pos = nltk.corpus.wordnet.ADJ
            yield wnl.lemmatize(word, pos='a')
        elif tag.startswith('R'):
            #pos = nltk.corpus.wordnet.ADV
            yield wnl.lemmatize(word, pos='r')
        else:
            #drop words with other POS tags
            continue
            #yield word

#Look up the description of a Penn Treebank POS tag
nltk.help.upenn_tagset('JJ')

# -----------------------------------
''' Word frequency counting '''
def word_frequency_count(contents, stopWords):
    word_count_dict = collections.defaultdict(int)
    for text in contents:
        new_text = replace_abbreviations(text)
        words = lemmatize_all(new_text, stopWords)
        word_dict = collections.Counter(words)
        for key in word_dict:
            word_count_dict[key] += word_dict[key]
    return word_count_dict


if __name__=='__main__':
    t0 = time.time()
    news = fetch_20newsgroups(subset='all')
    stopWords = set(stopwords.words('english')) | set(['the','a'])
    word_count_dict = word_frequency_count(news.data[:30], stopWords)
    #print('Top words:', collections.Counter(word_count_dict).most_common())
    word_items = list(word_count_dict.items())
    word_items.sort(key=lambda x:-x[1])
    print('Top words:', word_items[:50])
    print('Total time:', time.time() - t0)

  

  Example 1: Predicting gender from a name

from nltk.corpus import names
from nltk.classify import NaiveBayesClassifier
# Load the data: [(u'Aaron', 'male'), (u'Abbey', 'male'), ...]
data = ([(name, 'male') for name in names.words('male.txt')] + 
     [(name, 'female') for name in names.words('female.txt')])

# Feature extraction
def gender_features(word): 
    return {'last_letter': word[-1]}

train_set = [(gender_features(n), g) for (n,g) in data]

# Train the model
classifier = NaiveBayesClassifier.train(train_set) 
classifier.classify(gender_features('Frank'))
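
# A sketch (not in the original note): hold out part of the data to estimate accuracy
# and inspect the most informative features.
import random
from nltk.classify import accuracy
random.shuffle(data)
featuresets = [(gender_features(n), g) for (n, g) in data]
train_feats, test_feats = featuresets[500:], featuresets[:500]
clf = NaiveBayesClassifier.train(train_feats)
print('accuracy:', accuracy(clf, test_feats))
clf.show_most_informative_features(5)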

  Example 2: Estimating the proportion of positive and negative words in a review

from nltk.classify import NaiveBayesClassifier
positive_vocab = [ 'awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)' ]
negative_vocab = [ 'bad', 'terrible','useless', 'hate', ':(' ]
neutral_vocab = [ 'movie','the','sound','was','is','actors','did','know','words','not' ]

def word_feats(words):
    return dict([(word, True) for word in words])
 
positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]

train_set = negative_features + positive_features + neutral_features
classifier = NaiveBayesClassifier.train(train_set)

neg = 0
pos = 0
sentence = "Awesome movie, I liked it"
sentence = sentence.lower()
words = sentence.split(' ')
for word in words:
    classResult = classifier.classify(word_feats(word))
    if classResult == 'neg':
        neg = neg + 1
    if classResult == 'pos':
        pos = pos + 1

print('Positive: ' + str(float(pos) / len(words)))
print('Negative: ' + str(float(neg) / len(words)))

  


