# -*- coding: UTF-8 -*-
import pandas as pd
import jieba
import jieba.analyse

# Widen pandas' column display so long text is not truncated.
pd.set_option('display.max_colwidth', 500)

# Load the data.
rows = pd.read_csv('datas1.csv', header=0, encoding='utf-8', dtype=str)

# Load the stop-word list.
jieba.analyse.set_stop_words('stoped.txt')

# All tokens across the corpus, used for word-frequency statistics.
segments = []
# Per-row tokens, used for association analysis.
results = []

# Iterate the column directly; astype(str) guards against missing values.
for content in rows['content'].astype(str):
    # Tokenization options:
    # words = jieba.cut(content)
    # TF-IDF keyword extraction; the number of keywords is set by topK,
    # but punctuation and digits cannot be filtered out:
    # words = jieba.analyse.extract_tags(content, topK=20)
    # TextRank keyword extraction, keeping only the specified parts of speech:
    words = jieba.analyse.textrank(content, topK=20, withWeight=False,
                                   allowPOS=('ns', 'n', 'vn', 'v'))
    splitedStr = ''
    for word in words:
        # Record the token globally.
        segments.append({'word': word, 'count': 1})
        splitedStr += word + ' '
    # Record the per-row result.
    results.append({'text': content, 'words': splitedStr})

# Convert the result lists into DataFrames.
dfSg = pd.DataFrame(segments)
# Word-frequency counts.
dfWord = dfSg.groupby('word')['count'].sum()
# Export to CSV.
dfWord.to_csv('keywords.csv', encoding='utf-8')

dfRet = pd.DataFrame(results)
dfRet.to_csv('result.csv', encoding='utf-8')
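
# Optional sanity check, a minimal sketch rather than part of the original
# script: dfWord as built above is an unsorted pandas Series, so sorting it
# descending before (or after) export makes the most frequent keywords easy
# to inspect. Names below reuse the script's own variables.
dfTop = dfWord.sort_values(ascending=False)
print(dfTop.head(20))  # print the 20 most frequent keywords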