工作中有的時候需要對文本進行拆分,然后分析詞頻,分詞用結巴分詞做了一個簡單的,代碼如下:
# Segment a text column with jieba POS tagging, then count the frequency of
# each (word, POS-tag) pair.
import pandas as pd                 # data handling
from collections import Counter    # counting helper (kept from original import list)
import jieba.posseg as pseg        # jieba segmentation with POS tagging

path = ''       # path of the input file to read
text_col = 0    # 0-based index of the column that holds the text to segment

# sep is the field separator of the input file; '\t' is typical for TSV/CSV
# exports mentioned in the original note.
data1 = pd.read_csv(path, sep='\t')

# Collect one (word, POS-tag) record per token.  Appending to a plain list and
# building the DataFrame once at the end is both correct and far cheaper than
# DataFrame.append inside a loop (append returned a new frame — the original
# discarded it, leaving df1 empty — and was removed in pandas 2.0).
records = []
for text in data1.iloc[:, text_col]:
    # str() guards against NaN / non-string cells, which pseg.cut rejects.
    for token in pseg.cut(str(text)):
        records.append((token.word, token.flag))

df1 = pd.DataFrame(records, columns=['word', 'type'])

# Frequency of each (word, type) pair.  .size() counts group members;
# .count() would have produced an empty frame here because the groupby keys
# are the only columns.
df3 = df1.groupby(['word', 'type']).size().reset_index(name='count')