library(textmineR)
## Warning: package 'textmineR' was built under R version 4.2.2
## Loading required package: Matrix
## 
## Attaching package: 'textmineR'
## The following object is masked from 'package:Matrix':
## 
##     update
## The following object is masked from 'package:stats':
## 
##     update
library(tm)
## Warning: package 'tm' was built under R version 4.2.2
## Loading required package: NLP
#Load data
df <- read.csv("H:/My Drive/Text Mining/5/Class_5/BONUS_task/2.Bonus_textreviews/textreviews.csv",header=TRUE, stringsAsFactors=FALSE, fileEncoding="latin1")

First, I load the data as a data frame.
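
As a quick sanity check, it helps to look at the size and structure of the data frame before building the corpus. This is only a minimal sketch; I do not assume anything about the exact column names in textreviews.csv.

# Dimensions and column structure of the raw reviews data
dim(df)
str(df, nchar.max = 60)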

docs <- VCorpus(VectorSource(t(df)))

Then I convert it to a corpus.

toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
# Replace non-word characters (e.g. quotes, apostrophes, hyphens) with spaces
docs <- tm_map(docs, toSpace, "\\W")

# Remove single-letter words
docs <- tm_map(docs, toSpace, "\\b[A-Za-z]\\b")

# Preliminary cleaning: text cleaning and stop word removal ----
## Remove punctuation
docs <- tm_map(docs, removePunctuation)
## Remove numbers
docs <- tm_map(docs, removeNumbers)
## Lower all words
docs <- tm_map(docs, content_transformer(tolower))
## Remove all stop words
docs <- tm_map(docs, removeWords, stopwords("english"))
## Strip white space
docs <- tm_map(docs, stripWhitespace)

After that, I clean the corpus: punctuation, numbers and English stop words are removed, all text is lower-cased, and extra whitespace is stripped.
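
To verify that the transformations behaved as intended, the content of a single cleaned document can be printed; a minimal check using the content() accessor from tm/NLP:

# First document after cleaning: lower-case, no punctuation, numbers or English stop words
content(docs[[1]])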

# Create document-term matrix ----
dtm <- DocumentTermMatrix(docs)

Then I convert the corpus to a document-term matrix (DTM).
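
Printing the DTM object gives a quick overview of the number of documents, the vocabulary size and the sparsity; a small check:

# Printing the DTM shows documents, terms and overall sparsity
dtm
dim(dtm)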

tdm <- as.TermDocumentMatrix(t(dtm), weighting = weightTf)
tdms <- removeSparseTerms(tdm, sparse = 0.97)

From the DTM, I convert to a term-document matrix (TDM) and then remove sparse terms. I would note that when experimenting with the data before writing this R Markdown, I found that the clusters are not clear if sparse terms are not removed.
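
The effect of removeSparseTerms() can be seen directly by comparing the dimensions before and after; a minimal check (with sparse = 0.97, terms absent from more than 97% of documents are dropped):

# Terms x documents before and after removing sparse terms
dim(tdm)
dim(tdms)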

#Elbow Method for finding the optimal number of clusters
set.seed(123)
# Compute and plot wss for k = 2 to k = 15.
k.max <- 15
wss <- sapply(1:k.max,
              function(k){kmeans(dist(tdms, method = "euclidean"), k, nstart = 50, iter.max = 15)$tot.withinss})
wss
##  [1] 61660.59 38446.98 25361.17 22420.42 20734.14 19193.59 17779.96 17120.55
##  [9] 15273.01 14269.60 13629.98 12773.97 11972.29 11234.49 10437.58
plot(1:k.max, wss,
     type="b", pch = 19, frame = FALSE, 
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")

After that, I use the elbow method to decide how many clusters to use. Based on the plot, 3 looks like a good choice.
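
As a cross-check of the elbow plot, one could also look at the average silhouette width for a few values of k using the cluster package. This is only a sketch that follows the same clustering-on-the-distance-matrix approach as above, and it can be slow for larger vocabularies:

library(cluster)
set.seed(123)
d <- dist(tdms, method = "euclidean")
# Average silhouette width for k = 2..6; higher values indicate better-separated clusters
sapply(2:6, function(k) {
  km <- kmeans(d, centers = k, nstart = 25)
  mean(silhouette(km$cluster, d)[, "sil_width"])
})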

library(cluster)
kfit <- kmeans(dist(tdms, method = "euclidean"), 3)

clusplot(as.matrix(dist(tdms, method = "euclidean")), kfit$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0,
         main = "2D Representation of Clusters")

# Append the cluster label of each term to the term-document matrix
m <- as.matrix(tdms)
final_data <- cbind(m, cluster = kfit$cluster)
final_data <- as.data.frame(final_data)
c1 <- final_data[final_data$cluster == 1,]
c1$cluster <- NULL
c1 <- t(c1)
c2 <- final_data[final_data$cluster == 2,]
c2$cluster <- NULL
c2 <- t(c2)
c3 <- final_data[final_data$cluster == 3,]
c3$cluster <- NULL
c3 <- t(c3)

After that, I split the terms into the 3 clusters so that each cluster can be shown as a word cloud.
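
The same split could be written more compactly with split(), which returns one matrix of term frequencies per cluster (terms as rows); a sketch equivalent to the subsetting above, using a hypothetical term_clusters variable:

# Split the rows (terms) of the TDM matrix by their cluster label
term_clusters <- split(as.data.frame(m), kfit$cluster)
sapply(term_clusters, nrow)   # number of terms in each cluster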

library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.2.2
## Loading required package: RColorBrewer
#Cluster 1
# Total frequency of each term in cluster 1
freq <- colSums(c1)
# Limit words in word cloud by specifying maximum number of words
wordcloud(names(freq), freq, max.words=50, rot.per=0.2, colors = brewer.pal(6, "Dark2"))

Cluster 1 has only one word; it should be the most important word for its category, so I take this to be the Game category.
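
The cluster sizes can also be checked numerically, which confirms how many terms ended up in each cluster and which single term forms cluster 1; a minimal check:

# Number of terms assigned to each cluster
table(kfit$cluster)
# The term(s) that make up cluster 1
rownames(m)[kfit$cluster == 1]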

#Cluster 2
# Total frequency of each term in cluster 2
freq <- colSums(c2)
# Limit words in word cloud by specifying maximum number of words
wordcloud(names(freq), freq, max.words=50, rot.per=0.2, colors = brewer.pal(6, "Dark2"))

Cluster 2 contains words such as "size" and "dress". Based on this, I guess it is the Clothes category, perhaps skewed slightly towards women's clothing.

#Cluster 3
# Total frequency of each term in cluster 3
freq <- colSums(c3)
# Limit words in word cloud by specifying maximum number of words
wordcloud(names(freq), freq, max.words=50, rot.per=0.2, colors = brewer.pal(6, "Dark2"))

For cluster 3, I also see words like "wear", "fabric" and "fit", so I think it is also about clothes.
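
To back up the word-cloud reading with numbers, the most frequent terms in clusters 2 and 3 can be listed directly; a small check reusing the c2 and c3 matrices built above:

# Top 10 terms by total frequency in clusters 2 and 3
head(sort(colSums(c2), decreasing = TRUE), 10)
head(sort(colSums(c3), decreasing = TRUE), 10)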

Based on the cluster plot shown above, clusters 2 and 3 are close to each other, and I think they are both about clothes. So, as a conclusion for this exercise, I think there are two categories here: Clothes and Game.