————-Clustering—————

library(tm)

#### set the working directory to yours
setwd("C:/Users/ngsook/Documents")

#### read in the data (8 topics)
#### Random Shuffle the data
#### Extract 'crude', 'interest', 'trade' classification
full_text <- read.table('r8-train-all-terms.txt', header=FALSE, sep='\t')
set.seed(10)
full_text_rand <- full_text[sample(1:nrow(full_text)),]
train_text <- full_text_rand[which(full_text_rand$V1 %in% c('crude', 'interest', 'trade')),]

y_label <- as.character(train_text$V1)
table(y_label)
## y_label
##    crude interest    trade 
##      253      190      251

Create Corpus

corpus <- Corpus(VectorSource(train_text$V2)) #only V2 are selected
corpus <- tm_map(corpus, content_transformer(tolower)) #covernt to lower cases
corpus <- tm_map(corpus, removeNumbers) #remove digits
corpus <- tm_map(corpus, removeWords, stopwords('english'))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stemDocument) #word stemming
corpus <- tm_map(corpus, removeWords, stopwords('english')) #stopwords removal
corpus <- tm_map(corpus, stripWhitespace) #delete redundent whitespace "a  b"-> "a b"

Create DTM Unigram+TFIDF as the feature set

dtm <- DocumentTermMatrix(corpus)
dim(dtm)
## [1]  694 5075
dtm_ti <- weightTfIdf(dtm)
dim(dtm_ti)
## [1]  694 5075

K-Means CLustering, 3 clusters

km_3 <- kmeans(dtm_ti,3)
head(km_3$cluster, 30)
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  1  1  1  1  3  1  1  1  3  3  1  1  1  1  1  2  1  1  1  1  1  1  1  1  1 
## 26 27 28 29 30 
##  1  3  1  1  1
km_3$size
## [1] 524  32 138
table(y_label,km_3$cluster)
##           
## y_label      1   2   3
##   crude    221  32   0
##   interest  53   0 137
##   trade    250   0   1

Apply SVD (Singular Vector Decomposition)

install.packages(“rARPACK”)

library("rARPACK")

do truncated SVD, get top 500

dtm_svd <- svds(as.matrix(dtm_ti), 500)
d <- dtm_svd$d
plot(d^2)

dim(dtm_svd$u)
## [1] 694 500
dim(dtm_svd$v)
## [1] 5075  500

showing leading word vectors from v

terms <- colnames(dtm_ti)

for (i in 1:5) {
  cat("Top 10 terms for vector ", i, ":\n", sep="")
  ix <- rank(-abs(dtm_svd$v[,i])) <= 10
  print(data.frame(term=terms[ix], weight=round(dtm_svd$v[ix,i], 2)))
  cat("\n")
}
## Top 10 terms for vector 1:
##     term weight
## 1   rais  -0.27
## 2  prime  -0.24
## 3   bank  -0.24
## 4    pct  -0.30
## 5   rate  -0.33
## 6    fed  -0.16
## 7  crude  -0.17
## 8    bbl  -0.21
## 9    wti  -0.18
## 10   cts  -0.18
## 
## Top 10 terms for vector 2:
##      term weight
## 1    rais   0.19
## 2     pct  -0.20
## 3    rate  -0.22
## 4     fed  -0.21
## 5    post   0.22
## 6   crude   0.25
## 7  tender  -0.14
## 8     bbl   0.37
## 9     wti   0.33
## 10    cts   0.32
## 
## Top 10 terms for vector 3:
##         term weight
## 1    billion  -0.21
## 2      prime   0.16
## 3        say  -0.17
## 4       bank   0.15
## 5        pct   0.19
## 6       rate   0.21
## 7        set  -0.16
## 8        fed  -0.59
## 9     custom  -0.27
## 10 repurchas  -0.38
## 
## Top 10 terms for vector 4:
##         term weight
## 1    billion   0.17
## 2    deficit   0.32
## 3      franc   0.16
## 4        mln   0.29
## 5      prime  -0.16
## 6      trade   0.21
## 7        fed  -0.29
## 8  repurchas  -0.18
## 9     distil   0.15
## 10       jan   0.15
## 
## Top 10 terms for vector 5:
##        term weight
## 1     prime   0.27
## 2      bank   0.20
## 3      fall  -0.13
## 4      rise  -0.09
## 5  discount  -0.37
## 6    tender  -0.50
## 7       bbl  -0.10
## 8       wti  -0.10
## 9      bill  -0.37
## 10      top  -0.38

plot the documents in first two factors

x <- dtm_svd$u[,1] * dtm_svd$d[1] ## Concept 1, d = concept
y <- dtm_svd$u[,2] * dtm_svd$d[2] ## Concept 2, d = concept
plot(x, y, col=as.integer(factor(y_label)))

take top 300 concept

new <- dtm_svd$u[, 1:300] %*% diag(dtm_svd$d[1:300])

km_3svd <- kmeans(new, 3, nstart=50)
km_3svd$size
## [1] 104  36 554
table(y_label,km_3svd$cluster)
##           
## y_label      1   2   3
##   crude      0  32 221
##   interest 104   4  82
##   trade      0   0 251

Verify the performance of K-Means Before SVD

km_3$tot.withinss
## [1] 434.6238
km_3$betweenss
## [1] 19.80204
#### Verify the performance of K-Means after SVD
#### Lower the better
km_3svd$tot.withinss
## [1] 384.4065
#### Highest the better
km_3svd$betweenss
## [1] 19.45678

In real case, clustering is employed when y_labels are not available.

Thus this evaluation is not appropriate sometimes

Clustering provides only reasonable (high-level) clusters