options(warn=-1) # 環境設定 library(jiebaR) # 斷詞利器 library(tm) # 文字詞彙矩陣運算 library(slam) # 稀疏矩陣運算 library(wordcloud) # 文字雲 library(topicmodels) # 主題模型 library(igraph) # 主題模型關聯 # 起手式,結巴建立斷詞器 mixseg = worker() # 常用操作函數化,讀資料檔用 readSpeaker <- function(file,sep){ lines <- readLines(file) allString <- do.call(paste,as.list(lines)) sepString <- strsplit(allString,sep,fixed=TRUE) return(sepString[[1]]) } # 講者清單 speakers <- readSpeaker("speaker.txt",sep="-----") length(speakers);speakers[1] # 第一位講者分的如何? first_speaker <- mixseg <= speakers[1] first_speaker # 自幹一下斷詞過濾器(內建用perl的 regular expression 有bug!!!!!!!) cutter <- function(msg){ filter_words = c("的","在","與","及","等","是","the","and","in","a","at","he","is","of","He") pattern <- sprintf("[^%s]", paste(filter_words, collapse = "|^")) filter_seg <- grep(pattern, mixseg <= msg ,value=TRUE) return(filter_seg) } # 再把每篇文章向量化 segRes = lapply(speakers,cutter) tmWordsVec = sapply(segRes,function(ws) paste(ws,collapse = " ")) tmWordsVec[1] # 文字探勘基本結構,語料庫 & TDM corpus <- Corpus(VectorSource(tmWordsVec)) tdm = TermDocumentMatrix(corpus,control = list(wordLengths = c(1, Inf))) tdm # 看看一下詞頻分的如何 (因為看到分的不好才弄個過濾器的) dtm1 <- DocumentTermMatrix(corpus, control = list( wordLengths=c(1, Inf), # to allow long words removeNumbers = TRUE, weighting = weightTf, encoding = "UTF-8") ) # colnames(dtm1) findFreqTerms(dtm1, 10) # 看一下高频词, he沒法filter掉 # DTM (Document Term Matrix 顧名思義.....) m <- as.matrix(dtm1) head(m) # 文字分析最愛的....文字雲 v <- sort(colSums(m), decreasing=TRUE) myNames <- names(v) d <- data.frame(word=myNames, freq=v) pal2 <- brewer.pal(8,"Dark2") png(paste(getwd(), "/wordcloud100_", ".png", sep = ''), width=10, height=10, units="in", res=700) wordcloud(d$word,d$freq, scale=c(6,0.5), min.freq=mean(d$freq), max.words=100, random.order=FALSE, rot.per=.01, colors=pal2) dev.off() # 利用tfidf 來處理高頻詞高估,低頻詞低估 dtm = dtm1 term_tfidf <-tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) * log2(nDocs(dtm)/col_sums(dtm > 0)) l1= term_tfidf >= quantile(term_tfidf, 0.5) # second quantile, ie. median summary(col_sums(dtm)) dim(dtm);dtm <- dtm[,l1] dtm = dtm[row_sums(dtm)>0, ]; dim(dtm) summary(col_sums(dtm)) # 建立主題模型 (出處參考大陸強者 Ref 3) fold_num = 10 kv_num = seq(2,24) seed_num = 2015 try_num = 1 smp<-function(cross=fold_num,n,seed) { set.seed(seed) dd=list() aa0=sample(rep(1:cross,ceiling(n/cross))[1:n],n) for (i in 1:cross) dd[[i]]=(1:n)[aa0==i] return(dd) } selectK<-function(dtm,kv=kv_num,SEED=seed_num,cross=fold_num,sp) # change 60 to 15 { per_ctm=NULL log_ctm=NULL for (k in kv) { per=NULL loglik=NULL for (i in 1:try_num) #only run for 3 replications# { cat("R is running for", "topic", k, "fold", i, as.character(as.POSIXlt(Sys.time(), "Asia/Shanghai")),"\n") te=sp[[i]] tr=setdiff(1:dtm$nrow, te) # setdiff(nrow(dtm),te) ## fix here when restart r session # VEM = LDA(dtm[tr, ], k = k, control = list(seed = SEED)), # VEM_fixed = LDA(dtm[tr,], k = k, control = list(estimate.alpha = FALSE, seed = SEED)), # CTM = CTM(dtm[tr,], k = k, # control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3))) # Gibbs = LDA(dtm[tr,], k = k, method = "Gibbs", control = list(seed = SEED, burnin = 1000,thin = 100, iter = 1000)) per=c(per,perplexity(Gibbs,newdata=dtm[te,])) loglik=c(loglik,logLik(Gibbs,newdata=dtm[te,])) } per_ctm=rbind(per_ctm,per) log_ctm=rbind(log_ctm,loglik) } return(list(perplex=per_ctm,loglik=log_ctm)) } sp=smp(n=dtm$nrow, seed=seed_num) # n = nrow(dtm) system.time((ctmK=selectK(dtm=dtm,kv=kv_num,SEED=seed_num,cross=fold_num,sp=sp))) # 跑個模擬,挑一個好的主題數 m_per=apply(ctmK[[1]],1,mean) m_log=apply(ctmK[[2]],1,mean) k=c(kv_num) #df = ctmK[[1]] # perplexity matrix logLik = ctmK[[2]] # perplexity matrix # matplot(k, df, type = c("b"), xlab = "Number of topics", # ylab = "Perplexity", pch=1:try_num,col = 1, main = '') # legend("topright", legend = paste("fold", 1:try_num), col=1, pch=1:try_num) matplot(k, logLik, type = c("b"), xlab = "Number of topics", ylab = "Log-Likelihood", pch=1:try_num,col = 1, main = '') legend("topleft", legend = paste("fold", 1:try_num), col=1, pch=1:try_num) # 現成有四種調法 k = which(logLik ==max(logLik))+1 SEED <- 2015 jss_TM2 <- list( VEM = LDA(dtm, k = k, control = list(seed = SEED)), VEM_fixed = LDA(dtm, k = k, control = list(estimate.alpha = FALSE, seed = SEED)), Gibbs = LDA(dtm, k = k, method = "Gibbs", control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)), CTM = CTM(dtm, k = k, control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3))) ) # terms(模型,要的文字數) termsForSave1<- terms(jss_TM2[["VEM"]], 10) termsForSave2<- terms(jss_TM2[["VEM_fixed"]], 10) termsForSave3<- terms(jss_TM2[["Gibbs"]], 10) termsForSave4<- terms(jss_TM2[["CTM"]], 10) #'topic graphs' #tfs = as.data.frame(termsForSave1, stringsAsFactors = F);tfs tfs = as.data.frame(termsForSave2, stringsAsFactors = F);tfs #tfs = as.data.frame(termsForSave3, stringsAsFactors = F);tfs #tfs = as.data.frame(termsForSave4, stringsAsFactors = F);tfs adjacent_list = lapply(1:10, function(i) embed(tfs[,i], 2)[, 2:1]) edgelist = as.data.frame(do.call(rbind, adjacent_list), stringsAsFactors =F) topic = unlist(lapply(1:10, function(i) rep(i, 9))) edgelist$topic = topic g <-graph.data.frame(edgelist,directed=T ) l<-layout.fruchterman.reingold(g) # edge.color="black" nodesize = centralization.degree(g)$res V(g)$size = log( centralization.degree(g)$res ) nodeLabel = V(g)$name E(g)$color = unlist(lapply(sample(colors()[26:137], 10), function(i) rep(i, 9)));#unique(E(g)$color) #png( paste(getwd(), "/images/topic_graph_gibbs.png", sep=""), width=10, height=10, units="in", res=700) plot(g, vertex.label= nodeLabel, edge.curved=TRUE, vertex.label.cex =1.25, edge.arrow.size=0.2, layout=l) #dev.off()