————-Clustering—————
library(tm)
#### set the working directory to yours
setwd("C:/Users/ngsook/Documents")
#### read in the data (8 topics)
#### Random Shuffle the data
#### Extract 'crude', 'interest', 'trade' classification
full_text <- read.table('r8-train-all-terms.txt', header=FALSE, sep='\t')
set.seed(10)
full_text_rand <- full_text[sample(1:nrow(full_text)),]
train_text <- full_text_rand[which(full_text_rand$V1 %in% c('crude', 'interest', 'trade')),]
y_label <- as.character(train_text$V1)
table(y_label)
## y_label
## crude interest trade
## 253 190 251
Create Corpus
corpus <- Corpus(VectorSource(train_text$V2)) #only V2 are selected
corpus <- tm_map(corpus, content_transformer(tolower)) #covernt to lower cases
corpus <- tm_map(corpus, removeNumbers) #remove digits
corpus <- tm_map(corpus, removeWords, stopwords('english'))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stemDocument) #word stemming
corpus <- tm_map(corpus, removeWords, stopwords('english')) #stopwords removal
corpus <- tm_map(corpus, stripWhitespace) #delete redundent whitespace "a b"-> "a b"
Create DTM Unigram+TFIDF as the feature set
dtm <- DocumentTermMatrix(corpus)
dim(dtm)
## [1] 694 5075
dtm_ti <- weightTfIdf(dtm)
dim(dtm_ti)
## [1] 694 5075
K-Means CLustering, 3 clusters
km_3 <- kmeans(dtm_ti,3)
head(km_3$cluster, 30)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 1 1 1 1 3 1 1 1 3 3 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
## 26 27 28 29 30
## 1 3 1 1 1
km_3$size
## [1] 524 32 138
table(y_label,km_3$cluster)
##
## y_label 1 2 3
## crude 221 32 0
## interest 53 0 137
## trade 250 0 1
Apply SVD (Singular Vector Decomposition)
install.packages(“rARPACK”)
library("rARPACK")
do truncated SVD, get top 500
dtm_svd <- svds(as.matrix(dtm_ti), 500)
d <- dtm_svd$d
plot(d^2)

dim(dtm_svd$u)
## [1] 694 500
dim(dtm_svd$v)
## [1] 5075 500
showing leading word vectors from v
terms <- colnames(dtm_ti)
for (i in 1:5) {
cat("Top 10 terms for vector ", i, ":\n", sep="")
ix <- rank(-abs(dtm_svd$v[,i])) <= 10
print(data.frame(term=terms[ix], weight=round(dtm_svd$v[ix,i], 2)))
cat("\n")
}
## Top 10 terms for vector 1:
## term weight
## 1 rais -0.27
## 2 prime -0.24
## 3 bank -0.24
## 4 pct -0.30
## 5 rate -0.33
## 6 fed -0.16
## 7 crude -0.17
## 8 bbl -0.21
## 9 wti -0.18
## 10 cts -0.18
##
## Top 10 terms for vector 2:
## term weight
## 1 rais 0.19
## 2 pct -0.20
## 3 rate -0.22
## 4 fed -0.21
## 5 post 0.22
## 6 crude 0.25
## 7 tender -0.14
## 8 bbl 0.37
## 9 wti 0.33
## 10 cts 0.32
##
## Top 10 terms for vector 3:
## term weight
## 1 billion -0.21
## 2 prime 0.16
## 3 say -0.17
## 4 bank 0.15
## 5 pct 0.19
## 6 rate 0.21
## 7 set -0.16
## 8 fed -0.59
## 9 custom -0.27
## 10 repurchas -0.38
##
## Top 10 terms for vector 4:
## term weight
## 1 billion 0.17
## 2 deficit 0.32
## 3 franc 0.16
## 4 mln 0.29
## 5 prime -0.16
## 6 trade 0.21
## 7 fed -0.29
## 8 repurchas -0.18
## 9 distil 0.15
## 10 jan 0.15
##
## Top 10 terms for vector 5:
## term weight
## 1 prime 0.27
## 2 bank 0.20
## 3 fall -0.13
## 4 rise -0.09
## 5 discount -0.37
## 6 tender -0.50
## 7 bbl -0.10
## 8 wti -0.10
## 9 bill -0.37
## 10 top -0.38
plot the documents in first two factors
x <- dtm_svd$u[,1] * dtm_svd$d[1] ## Concept 1, d = concept
y <- dtm_svd$u[,2] * dtm_svd$d[2] ## Concept 2, d = concept
plot(x, y, col=as.integer(factor(y_label)))

take top 300 concept
new <- dtm_svd$u[, 1:300] %*% diag(dtm_svd$d[1:300])
km_3svd <- kmeans(new, 3, nstart=50)
km_3svd$size
## [1] 104 36 554
table(y_label,km_3svd$cluster)
##
## y_label 1 2 3
## crude 0 32 221
## interest 104 4 82
## trade 0 0 251
In real case, clustering is employed when y_labels are not available.
Thus this evaluation is not appropriate sometimes
Clustering provides only reasonable (high-level) clusters