Load Data and Packages

library(tm)
## Loading required package: NLP
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.2.5
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.2.5
library(cluster)
library(proxy)
## Warning: package 'proxy' was built under R version 3.2.5
## 
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## The following object is masked from 'package:base':
## 
##     as.matrix
download.file('https://github.com/ywchiu/rtibame/raw/master/news_big5.RData', destfile = 'news_big5.RData')
load('news_big5.RData')
mixseg = worker()   # initialize the jiebaR segmenter (default mixed-model worker)
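
Before segmenting, it helps to confirm what was loaded. The sketch below assumes only what the later code already relies on: that news is a data frame whose V2 column holds the article text.

str(news)
nrow(news)                                # should match the 147 documents reported by dim(dtm) below
substr(as.character(news$V2[1]), 1, 50)   # first 50 characters of the first article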

Build the Document-Term Matrix

# segment every article in news$V2 with the jieba worker
news.seg = lapply(as.character(news$V2), function(e) segment(code = e, jiebar = mixseg))
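
A quick look at the first segmented article confirms the worker splits the text into tokens; the exact tokens depend on jiebaR's default dictionary, so no output is shown here.

length(news.seg)          # one element per article
head(news.seg[[1]], 10)   # first ten tokens of the first article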

# tokenizer that re-segments a document's content with the jieba worker
jieba_tokenizer = function(d) {
  unlist(segment(d[[1]], mixseg))
}

# tokenizer that splits a document on whitespace; used when building the DTM,
# because the documents below are re-joined as space-separated tokens
space_tokenizer = function(x) {
  unlist(strsplit(as.character(x[[1]]), '[[:space:]]+'))
}
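
A minimal check of space_tokenizer on a hand-made string (illustrative only, not taken from the news data):

space_tokenizer("今天 天氣 很好")   # returns "今天" "天氣" "很好"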

doc = VCorpus(VectorSource(news.seg))                            # one corpus document per article
doc = unlist(tm_map(doc, jieba_tokenizer), recursive = FALSE)    # flatten to a list of token vectors, one per article
doc = lapply(doc, function(d) paste(d, collapse = ' '))          # re-join tokens with spaces
control.list = list(wordLengths = c(2, Inf), tokenize = space_tokenizer)   # keep terms of at least 2 characters, split on spaces
dtm = DocumentTermMatrix(Corpus(VectorSource(doc)), control = control.list)

dim(dtm)
## [1]   147 11761
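
To get a feel for the matrix before clustering, tm's findFreqTerms and inspect are handy; the frequency cutoff of 50 below is an arbitrary choice for illustration.

findFreqTerms(dtm, 50)   # terms occurring at least 50 times across the corpus
inspect(dtm[1:3, 1:6])   # counts for the first 3 documents and first 6 terms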

Hierarchical Clustering

dtm.remove = removeSparseTerms(dtm, 0.99)   # drop terms absent from more than 99% of documents
dtm.dist = proxy::dist(as.matrix(dtm.remove), method = "cosine")   # cosine distance between documents
dtm.mat = as.matrix(dtm.dist)
hc = hclust(dist(dtm.mat, method = "euclidean"), method = "ward.D2")   # Ward clustering on the rows of the distance matrix
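
Note that hc above is built from euclidean distances between the rows of the cosine-distance matrix. A common alternative is to feed the cosine distances to hclust directly; this is only a sketch (hc.cos is a name introduced here), and its clusters will differ from the table below.

hc.cos = hclust(dtm.dist, method = "ward.D2")   # cluster on the cosine distances themselves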

Plot the Clustering Result

plot(hc, hang = -0.01, cex = 0.7)
rect.hclust(hc, k = 4, border = "red")   # outline the four clusters on the dendrogram

fit = cutree(hc, k = 4)   # assign each document to one of four clusters
table(fit)
## fit
##  1  2  3  4 
## 38 18 83  8
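
To see what each cluster is about, one option is to sum the term counts of the documents in each cluster and list the most frequent terms. This is a sketch using the fit vector and dtm.remove from above; m, k and freq are helper names introduced here.

m = as.matrix(dtm.remove)
for (k in sort(unique(fit))) {
  freq = colSums(m[fit == k, , drop = FALSE])                              # total term counts within cluster k
  cat("cluster", k, ":", names(sort(freq, decreasing = TRUE))[1:10], "\n") # ten most frequent terms
}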