In this section:
- Text analysis and keyword extraction
- Similarity computation
- The news data and the task
- TF-IDF keyword extraction
- LDA topic modelling
- News classification with a naive Bayes classifier

1. The data

import pandas as pd
import jieba

# Data: a small sample of news articles
df_news = pd.read_table('val.txt', names=['category', 'theme', 'URL', 'content'], encoding='utf-8')
df_news = df_news.dropna()   # drop every row that contains a NaN
print(df_news.head())

2. Word segmentation with the jieba library (continuing from the code above)

# jieba works on plain strings, so turn the content column into a list first
content = df_news.content.values.tolist()
print(content[1000])

content_S = []   # holds the segmented tokens of every article
for line in content:
    current_segment = jieba.lcut(line)   # jieba word segmentation
    if len(current_segment) > 1 and current_segment != ['\r\n']:   # skip empty or newline-only lines
        content_S.append(current_segment)
print(content_S[1000])

3. Convert the segmented result into a DataFrame

df_content = pd.DataFrame({"content_S": content_S})
print(df_content.head())

4. Clean the data (as the output above shows, it is still very noisy) by filtering it against a stopword list.
Note: stopwords are words that appear very frequently in the corpus but carry little meaning, such as "的".

# Remove stopwords from every segmented article
def drop_stopwords(contents, stopwords):
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
            all_words.append(str(word))   # also collect every kept word into one flat list
        contents_clean.append(line_clean)
    return contents_clean, all_words

# Load the stopword list
stopwords = pd.read_csv('stopwords.txt', index_col=False, sep='\t', quoting=3, names=["stopword"], encoding="utf-8")
print(stopwords.head(20))

# Apply the stopword filter
contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()
contents_clean, all_words = drop_stopwords(contents, stopwords)

# Put the cleaned articles into a DataFrame
df_content = pd.DataFrame({"contents_clean": contents_clean})
print(df_content.head())

5. Count word frequencies with all_words

import numpy as np

df_all_words = pd.DataFrame({"all_words": all_words})
print(df_all_words.head())

# Group by word and count occurrences, then sort in descending order
# (named aggregation replaces the old agg({"count": np.size}) dict form, which newer pandas versions reject)
words_count = df_all_words.groupby("all_words")["all_words"].agg(count=np.size)
words_count = words_count.reset_index().sort_values(by=["count"], ascending=False)
print(words_count.head())
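The same counts can be cross-checked with the standard library. A minimal sketch, assuming the all_words list produced in step 4 is still in scope:

from collections import Counter

# Count every word in the flat all_words list and show the five most frequent ones
word_freq = Counter(all_words)
print(word_freq.most_common(5))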
6. Extract keywords with jieba.analyse (TF-IDF)

Note: what is TF-IDF (term frequency - inverse document frequency)?
TF = number of times the word appears in the article / total number of words in the article
IDF = log(total number of documents in the corpus / (number of documents containing the word + 1))

from jieba import analyse

# Extract keywords from article 1000 with jieba.analyse
index = 1000
print(df_news["content"][index])            # the raw content of article 1000
content_S_str = "".join(content_S[index])   # join the segmented tokens back into one string
print(content_S_str)
print(" ".join(analyse.extract_tags(content_S_str, topK=5)))   # the top 5 keywords
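To make the two formulas concrete, here is a small self-contained sketch that scores one word against a three-document toy corpus (the documents and the word are made up for illustration only):

import math

# Toy corpus: each document is already a list of tokens
corpus = [["汽車", "發動機", "測試"],
          ["汽車", "上市"],
          ["股市", "上漲"]]
word = "發動機"
doc = corpus[0]

tf = doc.count(word) / len(doc)            # TF: occurrences in this article / article length
df = sum(1 for d in corpus if word in d)   # number of documents that contain the word
idf = math.log(len(corpus) / (df + 1))     # IDF: log(total documents / (documents containing the word + 1))
print(tf * idf)                            # the word's TF-IDF score in this document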
7. LDA: topic model

Required input format: a list of lists, i.e. the whole segmented corpus as a list of token lists; the cleaned contents_clean from step 4 already has exactly this shape.

import gensim                  # topic-modelling library
from gensim import corpora

# Build the word-to-id mapping (essentially a bag-of-words dictionary) from the cleaned corpus
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]   # each document as (word_id, count) pairs
print(dictionary)

# Fit an LDA model with 20 topics
lda = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)

# Print the five most important words of topic 1
print(lda.print_topic(1, topn=5))
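Two quick follow-ups, sketched under the assumption that the lda, dictionary and corpus objects from the block above are available: list every topic at once, and ask which topics a single document is made of.

# Top 5 words of each of the 20 topics
for topic_id, words in lda.print_topics(num_topics=20, num_words=5):
    print(topic_id, words)

# Topic distribution of the first document: a list of (topic_id, probability) pairs
print(lda.get_document_topics(corpus[0]))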
8. News classification

Step 1: first convert the dataset's labels into a numeric form that sklearn can recognize.

# Put the cleaned articles and their category labels into one DataFrame
df_train = pd.DataFrame({"contents_clean": contents_clean, "label": df_news["category"]})
print(df_train.tail())             # the last few rows
print(df_train.label.unique())     # the distinct label values

# sklearn only accepts numeric labels, so map each category string to an integer
label_mapping = {'汽車': 1, '財經': 2, '科技': 3, '健康': 4, '體育': 5,
                 '教育': 6, '文化': 7, '軍事': 8, '娛樂': 9, '時尚': 0}
df_train["label"] = df_train["label"].map(label_mapping)
print(df_train.head())
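Hand-writing the mapping works, but sklearn can also build it automatically. A sketch, not part of the original article, using LabelEncoder (the numeric_labels name is just for illustration):

from sklearn.preprocessing import LabelEncoder

# Encode each category string as an integer 0 .. n_classes-1
encoder = LabelEncoder()
numeric_labels = encoder.fit_transform(df_news["category"])                # one integer per article (illustrative)
print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))    # which integer each category received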
Step 2: convert the cleaned, segmented articles into the matrix form that the naive Bayes classifier expects, i.e. [[],[],[],[]...].
Note the format: the list-of-lists segmentation first has to be joined into strings like ["a b c", "a b c", ...], because the vectorizer only recognizes that form.

from sklearn.feature_extraction.text import CountVectorizer   # turns the word lists into count vectors
from sklearn.model_selection import train_test_split

def format_transform(x):   # x is a dataset (training set or test set)
    words = []
    for line_index in range(len(x)):
        try:
            words.append(" ".join(x[line_index]))
        except Exception:
            print("bad data format at row", line_index)
    return words

def vec_transform(words):
    vec = CountVectorizer(analyzer="word", max_features=4000, lowercase=False)
    return vec.fit(words)

# Split the data into a training set and a test set
x_train, x_test, y_train, y_test = train_test_split(df_train["contents_clean"].values, df_train["label"].values)

# Join every article's tokens into one space-separated string
words_train = format_transform(x_train)
words_test = format_transform(x_test)

# Fit the vectorizer on the training text only; the test text must later be transformed
# with this same fitted vectorizer rather than with a second one fitted on the test set
vec_train = vec_transform(words_train)
print(vec_train.transform(words_train))   # the sparse count matrix of the training set
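To see concretely what the joined-string format and the resulting matrix look like, here is a tiny self-contained sketch on three made-up documents (not the news data):

from sklearn.feature_extraction.text import CountVectorizer

toy_docs = ["汽車 發動機 測試", "汽車 上市", "股市 上漲 股市"]   # tokens joined by spaces, one string per document
vec = CountVectorizer(analyzer="word", lowercase=False)
matrix = vec.fit_transform(toy_docs)

print(vec.get_feature_names_out())   # the learned vocabulary (older sklearn versions: get_feature_names())
print(matrix.toarray())              # one row per document, one column per vocabulary word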
Step 3: train the model and report its accuracy on the test set.

from sklearn.naive_bayes import MultinomialNB   # multinomial naive Bayes, suitable for count features

# Train on the count vectors of the training text
nbm = MultinomialNB()
nbm.fit(vec_train.transform(words_train), y_train)

# Evaluate on the test text, transformed with the same fitted vectorizer
score = nbm.score(vec_train.transform(words_test), y_test)
print(score)   # about 0.8016 on this sample

Original article: https://blog.csdn.net/fenfenxhf/article/details/82954613
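As a closing aside: the original script also imports TfidfVectorizer as an alternative way to vectorize the text but never calls it. A sketch of that variant, assuming words_train, words_test, y_train and y_test from the steps above (the score may differ somewhat from the count-based 0.8016):

from sklearn.feature_extraction.text import TfidfVectorizer   # TF-IDF weights instead of raw counts
from sklearn.naive_bayes import MultinomialNB

# Fit the TF-IDF vectorizer on the training text only and reuse it for the test text
tfidf = TfidfVectorizer(analyzer="word", max_features=4000, lowercase=False)
tfidf.fit(words_train)

nbm_tfidf = MultinomialNB()
nbm_tfidf.fit(tfidf.transform(words_train), y_train)
print(nbm_tfidf.score(tfidf.transform(words_test), y_test))   # accuracy of the TF-IDF variant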