工作中有的時候需要對文本進行拆分,然后分析詞頻,分詞用結巴分詞做了一個簡單的,代碼如下:
import pandas ##引入pandas包
from pandas import Series as sr, DataFrame as df ##從pandas包引入Series與DataFrame格式
from collections import Counter as cr ##引入Counter進行計數
import jieba.posseg as pseg ##引入結巴分詞詞性標注
# Path of the input file; fill in before running.
path = ''

# Read the file. `read_csv` is a pandas module-level function, not a
# DataFrame classmethod; `sep` must be an actual separator string
# ('\t' is typical for tab-separated exports).
data1 = pandas.read_csv(path, sep='\t')

# 0-based index of the column whose text should be segmented.
# (The original indexed with an undefined name `x` via the removed `.ix`.)
text_col = 0

# Collect one (word, POS-tag) pair per token across all rows.
# Building a plain list and constructing the DataFrame once avoids the
# quadratic cost — and the silent no-op — of per-row DataFrame.append,
# whose return value the original discarded.
pairs = []
for text in data1.iloc[:, text_col].astype(str):
    for token in pseg.cut(text):
        pairs.append((token.word, token.flag))

df1 = pandas.DataFrame(pairs, columns=['word', 'type'])

# Frequency of each (word, POS-tag) combination.
# groupby(...).count() on a frame whose only columns are the group keys
# returns an empty frame; .size() yields the intended counts.
df3 = df1.groupby(['word', 'type']).size().reset_index(name='count')
