R語言-文本挖掘主題模型文本分類

本文轉載自查看原文 2015-11-15 21:48 1053

####需要先安裝幾個R包，如果有這些包，可省略安裝包的步驟。
#install.packages("Rwordseg")
#install.packages("tm");
#install.packages("wordcloud");
#install.packages("topicmodels")

例子中所用數據

數據來源於Sogou（搜狗）實驗室。

數據網址：http://download.labs.sogou.com/dl/sogoulabdown/SogouC.mini.20061102.tar.gz

文件結構

└─Sample

├─C000007 汽車

├─C000008 財經

├─C000010 IT

├─C000013 健康

├─C000014 體育

├─C000016 旅游

├─C000020 教育

├─C000022 招聘

├─C000023 文化

└─C000024 軍事

采用Python對數據進行預處理為train.csv文件，並把每個文件文本數據處理為1行。

預處理python腳本
<ignore_js_op> combineSample.zip (720 Bytes, 下載次數: 96)

所需數據
<ignore_js_op> train.zip (130.2 KB, 下載次數: 164)
大家也可以用R直接將原始數據轉變成train.csv中的數據

文章所需stopwords
<ignore_js_op> StopWords.zip (2.96 KB, 下載次數: 114)

1. 讀取資料庫

# Load the pre-processed training data and the stop-word list.
# Later steps read the `text` and `type` columns of `csv`.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
csv <- read.csv("d://wb//train.csv", header = TRUE, stringsAsFactors = FALSE)
# read.table() returns a data frame; unlist() flattens it into a single vector.
mystopwords <- unlist(read.table("d://wb//StopWords.txt", stringsAsFactors = FALSE))

復制代碼

2. 數據預處理（中文分詞、stopwords處理）

library(tm);
# Remove digits: deletes every ASCII digit (0-9) and every full-width digit
# listed in the pattern from each element of `x`, and returns the result.
# NOTE: this definition masks tm::removeNumbers in the global environment.
removeNumbers <- function(x) {
  # Return the gsub() result directly so the value is visible; ending the
  # function on an assignment would return it invisibly.
  gsub("[0-9０１２３４５６７８９]", "", x)
}
# Strip digits from the `text` column, producing one list entry per document.
sample.words <- lapply(csv$text, removeNumbers)

復制代碼

# Chinese word segmentation; this step relies on the Rwordseg package.
wordsegment <- function(x) {
  # Attach Rwordseg on demand so that segmentCN() is available.
  library(Rwordseg)
  segmented <- segmentCN(x)
  segmented
}
# Segment every document in turn.
sample.words <- lapply(X = sample.words, FUN = wordsegment)

復制代碼

### Stop-word removal.
### Segment first and remove stop words afterwards, so that no information is
### lost to a global substring replacement.
#
# x:     vector of tokens for one document.
# words: vector of stop words.
# Returns the elements of `x` that are not in `words`, in their original
# order; NA tokens are dropped. Returns character(0) when `x` is empty.
removeStopWords <- function(x, words) {
  if (length(x) == 0L) {
    return(character(0))
  }
  # %in% never yields NA, so an NA entry in `words` cannot knock out every
  # token the way an element-wise `==` comparison does. It is also vectorised,
  # which avoids growing the result one element at a time.
  keep <- !is.na(x) & !(x %in% words)
  x[keep]
}
# Drop the stop words from each segmented document.
sample.words <- lapply(sample.words, removeStopWords, words = mystopwords)

復制代碼

3. wordcloud展示

# Build the corpus from the processed documents in sample.words.
corpus <- Corpus(VectorSource(sample.words))
# Store each document's category in the "cluster" metadata field.
meta(corpus, "cluster") <- csv$type
unique_type <- unique(csv$type)
# Build the document-term matrix with the word-length bounds set to c(2, Inf).
# The outer parentheses print the matrix summary after the assignment.
(sample.dtm <- DocumentTermMatrix(corpus, control = list(wordLengths = c(2, Inf))))

復制代碼

#install.packages("wordcloud"); ##需要wordcloud包的支持
library(wordcloud);
# Comparison word cloud across the individual documents.
sample.tdm <- TermDocumentMatrix(
  corpus,
  control = list(wordLengths = c(2, Inf))
)
tdm_matrix <- as.matrix(sample.tdm)
png(paste0("d://wb//sample_comparison", ".png"), width = 1500, height = 1500)
# One rainbow colour per document column (adjusted because of a colour issue).
comparison.cloud(tdm_matrix, colors = rainbow(ncol(tdm_matrix)))
title(main = "sample comparision")
dev.off()

復制代碼

# Comparison word cloud with the documents aggregated by category.
n <- nrow(csv)
# seq_len() yields an empty index when n is 0, unlike 1:n.
zz1 <- seq_len(n)
# For each category, sum the term counts over the documents in that category.
# drop = FALSE keeps a single-document category as a one-column matrix, so the
# row sums still work; which() ignores NA category labels.
cluster_matrix <- vapply(
  unique_type,
  function(type) {
    rowSums(tdm_matrix[, zz1[which(csv$type == type)], drop = FALSE])
  },
  numeric(nrow(tdm_matrix))
)
png(paste0("d://wb//sample_ cluster_comparison", ".png"), width = 800, height = 800)
# "Paired" palette, one colour per category (changed: too few colours before).
comparison.cloud(cluster_matrix, colors = brewer.pal(ncol(cluster_matrix), "Paired"))
title(main = "sample cluster comparision")
dev.off()

復制代碼

<ignore_js_op>

可以看出數據分布不均勻，culture、auto等數據很少。

# Draw one word cloud per category.
#
# cluster:  category label; documents whose csv$type equals it are pooled.
# maxwords: maximum number of words to plot (default 100).
# Writes d://wb//sample_<cluster>.png and returns the value of dev.off().
sample.cloud <- function(cluster, maxwords = 100) {
  words <- sample.words[which(csv$type == cluster)]
  allwords <- unlist(words)
  # Word frequencies, most frequent first.
  wordsfreq <- sort(table(allwords), decreasing = TRUE)
  wordsname <- names(wordsfreq)
  png(paste0("d://wb//sample_", cluster, ".png"), width = 600, height = 600)
  wordcloud(wordsname, wordsfreq, scale = c(6, 1.5), min.freq = 2,
            max.words = maxwords, colors = rainbow(100))
  title(main = paste("cluster:", cluster))
  dev.off()
}
lapply(unique_type, sample.cloud) # unique(csv$type)

復制代碼

<ignore_js_op>
<ignore_js_op>

4. 主題模型分析

library(slam)
summary(col_sums(sample.dtm))
# Mean tf-idf per term: the term frequency of each non-zero entry is divided
# by its document's total count and averaged per term, then multiplied by
# log2(number of documents / number of documents containing the term).
term_tfidf <- tapply(sample.dtm$v / row_sums(sample.dtm)[sample.dtm$i], sample.dtm$j, mean) *
  log2(nDocs(sample.dtm) / col_sums(sample.dtm > 0))
summary(term_tfidf)
# Keep only terms whose mean tf-idf is at least 0.1, then drop documents
# left with no terms at all.
sample.dtm <- sample.dtm[, term_tfidf >= 0.1]
sample.dtm <- sample.dtm[row_sums(sample.dtm) > 0, ]
library(topicmodels)
k <- 30       # number of topics
SEED <- 2010  # shared seed for all four fits
# Fit four topic models on the same matrix.
sample_TM <-
  list(
    VEM = LDA(sample.dtm, k = k, control = list(seed = SEED)),
    VEM_fixed = LDA(sample.dtm, k = k,
                    control = list(estimate.alpha = FALSE, seed = SEED)),
    Gibbs = LDA(sample.dtm, k = k, method = "Gibbs",
                control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)),
    CTM = CTM(sample.dtm, k = k,
              control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3)))
  )

復制代碼