git: https://github.com/linyi0604/MachineLearning
分別使用詞袋法和nltk自然語言處理包提供的文本特徵提取
"""Text feature extraction with a bag-of-words model and with NLTK.

Two sample sentences are processed in two ways:

1. scikit-learn's ``CountVectorizer`` turns them into term-count vectors.
2. NLTK tokenizes them, builds a sorted vocabulary, stems each token with
   the Porter stemmer and assigns part-of-speech tags.

The ``print`` calls are left commented out, each followed by the output
recorded in the original write-up; uncomment them to inspect the results.
"""

import nltk
from sklearn.feature_extraction.text import CountVectorizer

# One-time setup: word_tokenize needs the "punkt" tokenizer models and
# pos_tag needs the perceptron tagger model. Uncomment on the first run.
# NOTE(review): recent NLTK releases may ask for "punkt_tab" and
# "averaged_perceptron_tagger_eng" instead -- confirm for your version.
# nltk.download("punkt")
# nltk.download('averaged_perceptron_tagger')

sent1 = "The cat is walking in the bedroom."
sent2 = "A dog was running across the kitchen."

# --- Bag of words: convert the sentences into feature vectors -------------
count_vec = CountVectorizer()
sentences = [sent1, sent2]

# Print the resulting term-count matrix (one row per sentence).
# print(count_vec.fit_transform(sentences).toarray())
# [[0 1 1 0 1 1 0 0 2 1 0]
#  [1 0 0 1 0 0 1 1 1 0 1]]

# Print which word each column stands for.
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() returns an ndarray, hence the tolist() call.
# print(count_vec.get_feature_names_out().tolist())
# ['across', 'bedroom', 'cat', 'dog', 'in', 'is', 'kitchen', 'running',
#  'the', 'walking', 'was']

# --- NLTK: linguistic analysis of the same sentences ----------------------
# Tokenize and normalize each sentence; contractions are split as well,
# e.g. "aren't" -> "are" + "n't" and "I'm" -> "I" + "'m".
tokens1 = nltk.word_tokenize(sent1)
tokens2 = nltk.word_tokenize(sent2)
# print(tokens1)
# print(tokens2)
# ['The', 'cat', 'is', 'walking', 'in', 'the', 'bedroom', '.']
# ['A', 'dog', 'was', 'running', 'across', 'the', 'kitchen', '.']

# Build each sentence's vocabulary: unique tokens sorted in ASCII order
# (punctuation first, then uppercase, then lowercase).
vocab_1 = sorted(set(tokens1))
vocab_2 = sorted(set(tokens2))
# print(vocab_1)
# print(vocab_2)
# ['.', 'The', 'bedroom', 'cat', 'in', 'is', 'the', 'walking']
# ['.', 'A', 'across', 'dog', 'kitchen', 'running', 'the', 'was']

# Reduce every token to its stem with the Porter stemmer.
stemmer = nltk.stem.PorterStemmer()
stem_1 = [stemmer.stem(t) for t in tokens1]
stem_2 = [stemmer.stem(t) for t in tokens2]
# print(stem_1)
# print(stem_2)
# Output recorded in the original write-up (may differ across NLTK versions):
# ['the', 'cat', 'is', 'walk', 'in', 'the', 'bedroom', '.']
# ['A', 'dog', 'wa', 'run', 'across', 'the', 'kitchen', '.']

# Tag every token with its part of speech.
pos_tag_1 = nltk.tag.pos_tag(tokens1)
pos_tag_2 = nltk.tag.pos_tag(tokens2)
# print(pos_tag_1)
# print(pos_tag_2)
# [('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('walking', 'VBG'),
#  ('in', 'IN'), ('the', 'DT'), ('bedroom', 'NN'), ('.', '.')]
# [('A', 'DT'), ('dog', 'NN'), ('was', 'VBD'), ('running', 'VBG'),
#  ('across', 'IN'), ('the', 'DT'), ('kitchen', 'NN'), ('.', '.')]