library(tm)         # text mining
library(RWeka)      # collection of machine learning algorithms
library(dendextend) # Dendrogram (hierarchical clustering)
library(cluster)    # clustering 
library(clValid)    # Calculating Dunn index

1 Abstract

In the following report, I provide a clustering analysis of 274 text documents from the Project Gutenberg website that cover a range of topics. The goal of this analysis is to identify groups of similar documents and to suggest a possible topic for each group. Various clustering and text mining techniques are used throughout.

2 Introduction

In this project I analyze 274 text documents that come from various possible topics, such as amphibia, birds, fish, insects, mammals, reptilia, rodentia, Italy paintings, Greece history, India history, Egypt, the US Constitution, CIA world factbooks, and science fiction novels of 1930-31. Not all of the above topics are necessarily present in our data set, and one of the goals of this analysis is to find out which topics it actually covers.

A brief description of our analysis:

  • creating the corpus

  • cleaning the data

  • descriptive statistics of the corpus

  • preparing Document-Term Matrix for further analysis

  • between/within cluster analysis

3 Analysis

3.1 Creating the corpus

First, I create a corpus with 274 documents. Looking at the document names, we can see that many of them share common prefixes. Below is a table of document counts, sorted in decreasing order, based on the first 7 characters of the file names.

# Path to the folder containing the text files
TEXTDOCS = work_directory

# Source directory
ds = DirSource(TEXTDOCS)

# Create a corpus
doc.corpus = Corpus(ds)

# Types of documents based on the file names
sort(table(substr(as.vector(names(doc.corpus)),1,7)), decreasing = TRUE)
## 
## pg37009 pg7353. pg14473 pg23755 pg35490 pg37856 pg38032 pg21138 pg33852 
##      35      25      23      20      20      19      19      18      17 
## pg36903 pg33874 pg37809 pg34044 pg33574 pg33967 pg24506 pg31050 pg33560 
##      17      10       6       5       4       4       3       3       3 
## pg34672 pg31513 pg35838 pg31011 pg31175 pg31221 pg31293 pg31334 pg31830 
##       3       2       2       1       1       1       1       1       1 
## pg32505 pg32653 pg34326 pg34523 pg34579 pg34604 pg34787 pg35413 pg37742 
##       1       1       1       1       1       1       1       1       1 
## pg37823 
##       1

Clearly, there is a file naming pattern. One of my hypotheses is that there is a relationship between the file name and the topic; we will check this hypothesis later.

3.2 Cleaning the data

I have created a corpus-cleaning function that performs the following steps:

  1. stripWhitespace - removes extra whitespace from the corpus
  2. removePunctuation - removes punctuation marks from the corpus
  3. removeNumbers - removes numbers from the corpus
  4. content_transformer(tolower) - converts all text to lower case
  5. removeWords - removes stop words from the stopwords("en") list; I also added the words "one", "two", "three", "four", and "five" because they appear quite frequently in the corpus
  6. stemDocument - stems the words in the corpus using Porter's stemming algorithm
# Creating a corpus cleaning function
clean_corpus = function(corpus){
  corpus = tm_map(corpus, stripWhitespace) 
  corpus = tm_map(corpus, removePunctuation) 
  corpus = tm_map(corpus, removeNumbers)
  corpus = tm_map(corpus, content_transformer(tolower))
  corpus = tm_map(corpus, removeWords, c(stopwords("en"), "one", "two", "three", "five", "four"))
  corpus = tm_map(corpus, stemDocument)
  return(corpus)
}
# Applying the cleaning function to our corpus
clean_corp = clean_corpus(doc.corpus)
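
To sanity-check the cleaning, we can compare the first few lines of a document before and after; a minimal sketch using tm's content() accessor:

# Comparing raw vs. cleaned text for one document (a sketch)
writeLines(head(content(doc.corpus[[1]]), 3))
writeLines(head(content(clean_corp[[1]]), 3))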

3.3 Descriptive statistics

# Creating Document-Term Matrix
DTM = DocumentTermMatrix(clean_corp)

# Remove sparse terms from the corpus
DTM = removeSparseTerms(DTM, 0.9)

# Creating histograms and boxplots
dtms = as.matrix(DTM)
dtms_freq = as.matrix(rowSums(dtms))
dtms_freq1 = dtms_freq[order(dtms_freq),]
freq_sd = sd(dtms_freq)      # avoid masking base::sd
freq_mean = mean(dtms_freq)  # avoid masking base::mean

The following is a brief summary of the cleaned data set:

  • The smallest text file has 11 words (pg32653.1.txt), while the biggest one has 1007 words (pg37009.11.txt).

  • The mean and standard deviation of the document word counts are 725.54 and 197.8, respectively.

  • The boxplot and histogram below show how the data is distributed by file size.

par(mfrow = c(1,2))
# Creating histogram and boxplot
hist(dtms_freq,
     main = "Histogram",
     col = "green",
     col.main = "dodgerblue")
boxplot(dtms_freq, 
        main = "Boxplot", 
        col = "green",
        col.main = "dodgerblue")
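
The figures quoted above can be recovered directly from dtms_freq, the one-column matrix of document word counts built earlier; a minimal sketch:

# Recovering the summary statistics from dtms_freq (a sketch)
rownames(dtms_freq)[which.min(dtms_freq)]  # smallest document
rownames(dtms_freq)[which.max(dtms_freq)]  # largest document
c(mean = mean(dtms_freq), sd = sd(dtms_freq))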

After cleaning the corpus, I created a Document-Term Matrix (DTM) that records the frequency of each word in every file. The full DTM has high sparsity (97%), so I reduced it to 74% by removing sparse terms.
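
The sparsity percentages can be read directly off the printed DTM objects, since tm includes them in the object summary. A minimal sketch, keeping the unreduced matrix under the hypothetical name DTM_full:

# Hypothetical DTM_full: the matrix before removeSparseTerms()
DTM_full = DocumentTermMatrix(clean_corp)
DTM_full  # printing reports dimensions and sparsity (~97% here)
DTM       # after removeSparseTerms(DTM, 0.9) the sparsity drops (~74% here)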

3.4 Between/Within Cluster Analysis

From the summary statistics, we discovered that the data set has high variability in file size (the standard deviation is 197.8 words). Thus, to better compare similarity between files, I normalize each document in the corpus to unit length using the Euclidean norm. After that, I create a distance matrix between all documents in the corpus, again using the Euclidean distance. To build the hierarchical clustering (dendrogram), the ward.D method is used.

# Converting DTM to matrix
DTM_m = as.matrix(DTM)

# Normalizing each document to unit Euclidean length
# (the Frobenius norm of a row vector equals its Euclidean norm)
for (i in 1:nrow(DTM_m)) {
  DTM_m[i,] = as.matrix(DTM_m[i,])/norm(as.matrix(DTM_m[i,]), type ="F")
}

# Calculating distances between files
dist_uni = dist(DTM_m, method = "euclidean")

# Clustering using "ward.D" method
hc_uni = hclust(dist_uni, method="ward.D")
hc_uni_d = as.dendrogram(hc_uni)
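
As an aside, the normalization loop above can be replaced by a single vectorized expression; a sketch, equivalent as long as no document row is all zeros:

# Vectorized alternative to the per-row loop (a sketch):
# sqrt(rowSums(DTM_m^2)) is the vector of Euclidean row norms, and R's
# column-major recycling divides each row by its own norm
DTM_norm = DTM_m / sqrt(rowSums(DTM_m^2))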

# Finding the optimal number of clusters using the Dunn index
# (the index needs at least two clusters, so start at k = 2)
k = 50
mat = matrix(0, nrow = k - 1, ncol = 2)
for (i in 2:k) {
  members = cutree(hc_uni, i)
  dunn_index = dunn(distance = dist_uni, clusters = members)
  mat[i - 1, 1] = i
  mat[i - 1, 2] = dunn_index
}

# Plot number of clusters vs Dunn Index
plot(mat, 
     type = 'b',
     xlab = "Number of Cluster", 
     ylab = "Dunn Index",
     pch = 16,
     col = "red",
     main = "Dunn's Index vs Number of clusters",
     col.main = "dodgerblue")
points(mat, col = "green")

Based on the Dunn index, we can see that the optimal number of clusters is 20.
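
The same conclusion can be reached programmatically by selecting the k that maximizes the index; a minimal sketch using the mat matrix built above:

# Picking the number of clusters with the highest Dunn index (a sketch)
best_k = mat[which.max(mat[, 2]), 1]
best_k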

# Plot hcd
plot(hc_uni_d, main = "Method Ward",leaflab = "none", col.main = "dodgerblue")

# Add cluster rectangles 
ncl = 20
rect.dendrogram(hc_uni_d, k = ncl, border = "blue", xpd = FALSE, lower_rect = 0)

# Cluster membership for each document
clward1 = as.data.frame(cutree(hc_uni_d, ncl))

# Creating a list of clusters with their files
cl = list()
for (i in 1:ncl) {
  cl[[paste("cl_", i, sep = "")]] = rownames(subset(clward1, clward1 == i))
}
cl
## $cl_1
##  [1] "pg14473.1.txt"  "pg14473.10.txt" "pg14473.11.txt" "pg14473.12.txt"
##  [5] "pg14473.13.txt" "pg14473.14.txt" "pg14473.15.txt" "pg14473.16.txt"
##  [9] "pg14473.17.txt" "pg14473.18.txt" "pg14473.19.txt" "pg14473.2.txt" 
## [13] "pg14473.20.txt" "pg14473.21.txt" "pg14473.22.txt" "pg14473.23.txt"
## [17] "pg14473.3.txt"  "pg14473.4.txt"  "pg14473.5.txt"  "pg14473.6.txt" 
## [21] "pg14473.7.txt"  "pg14473.8.txt"  "pg14473.9.txt"  "pg24506.1.txt" 
## [25] "pg24506.2.txt"  "pg24506.3.txt" 
## 
## $cl_2
##  [1] "pg21138.1.txt"  "pg21138.10.txt" "pg21138.11.txt" "pg21138.12.txt"
##  [5] "pg21138.13.txt" "pg21138.14.txt" "pg21138.15.txt" "pg21138.16.txt"
##  [9] "pg21138.17.txt" "pg21138.18.txt" "pg21138.2.txt"  "pg21138.3.txt" 
## [13] "pg21138.4.txt"  "pg21138.5.txt"  "pg21138.6.txt"  "pg21138.7.txt" 
## [17] "pg21138.8.txt"  "pg21138.9.txt" 
## 
## $cl_3
##  [1] "pg23755.1.txt"  "pg23755.10.txt" "pg23755.11.txt" "pg23755.12.txt"
##  [5] "pg23755.13.txt" "pg23755.14.txt" "pg23755.15.txt" "pg23755.16.txt"
##  [9] "pg23755.17.txt" "pg23755.18.txt" "pg23755.19.txt" "pg23755.2.txt" 
## [13] "pg23755.20.txt" "pg23755.3.txt"  "pg23755.4.txt"  "pg23755.5.txt" 
## [17] "pg23755.6.txt"  "pg23755.7.txt"  "pg23755.8.txt"  "pg23755.9.txt" 
## 
## $cl_4
##  [1] "pg31011.1.txt"  "pg31221.1.txt"  "pg31830.1.txt"  "pg32505.1.txt" 
##  [5] "pg32653.1.txt"  "pg34523.1.txt"  "pg34604.1.txt"  "pg34787.1.txt" 
##  [9] "pg35413.1.txt"  "pg37742.1.txt"  "pg37809.1.txt"  "pg37823.1.txt" 
## [13] "pg38032.19.txt"
## 
## $cl_5
## [1] "pg31050.1.txt" "pg31050.2.txt" "pg31050.3.txt" "pg33560.1.txt"
## [5] "pg33560.2.txt" "pg33560.3.txt"
## 
## $cl_6
##  [1] "pg31175.1.txt" "pg31293.1.txt" "pg31334.1.txt" "pg33967.1.txt"
##  [5] "pg33967.2.txt" "pg33967.3.txt" "pg33967.4.txt" "pg34326.1.txt"
##  [9] "pg35838.1.txt" "pg35838.2.txt" "pg37809.2.txt" "pg37809.3.txt"
## [13] "pg37809.4.txt" "pg37809.5.txt" "pg37809.6.txt"
## 
## $cl_7
##  [1] "pg31513.1.txt"  "pg31513.2.txt"  "pg33574.1.txt"  "pg33574.2.txt" 
##  [5] "pg33574.3.txt"  "pg33574.4.txt"  "pg35490.18.txt" "pg35490.19.txt"
##  [9] "pg35490.2.txt"  "pg35490.20.txt"
## 
## $cl_8
##  [1] "pg33852.1.txt"  "pg33852.2.txt"  "pg33852.3.txt"  "pg33852.4.txt" 
##  [5] "pg33852.5.txt"  "pg33852.6.txt"  "pg33874.10.txt" "pg34579.1.txt" 
##  [9] "pg37009.1.txt"  "pg37009.2.txt"  "pg37009.35.txt" "pg37009.4.txt" 
## [13] "pg37009.5.txt"  "pg37009.7.txt" 
## 
## $cl_9
##  [1] "pg33852.10.txt" "pg33852.11.txt" "pg33852.12.txt" "pg33852.13.txt"
##  [5] "pg33852.14.txt" "pg33852.15.txt" "pg33852.16.txt" "pg33852.7.txt" 
##  [9] "pg33852.8.txt"  "pg33852.9.txt" 
## 
## $cl_10
## [1] "pg33852.17.txt" "pg36903.16.txt" "pg36903.17.txt"
## 
## $cl_11
## [1] "pg33874.1.txt" "pg33874.2.txt" "pg33874.3.txt" "pg33874.4.txt"
## [5] "pg33874.5.txt" "pg33874.6.txt" "pg33874.7.txt" "pg33874.8.txt"
## [9] "pg33874.9.txt"
## 
## $cl_12
## [1] "pg34044.1.txt" "pg34044.2.txt" "pg34044.3.txt" "pg34044.4.txt"
## [5] "pg34044.5.txt"
## 
## $cl_13
##  [1] "pg34672.1.txt"  "pg34672.2.txt"  "pg34672.3.txt"  "pg37856.18.txt"
##  [5] "pg37856.8.txt"  "pg38032.13.txt" "pg38032.14.txt" "pg38032.15.txt"
##  [9] "pg38032.16.txt" "pg38032.17.txt" "pg38032.18.txt"
## 
## $cl_14
##  [1] "pg35490.1.txt"  "pg35490.10.txt" "pg35490.11.txt" "pg35490.12.txt"
##  [5] "pg35490.13.txt" "pg35490.14.txt" "pg35490.15.txt" "pg35490.16.txt"
##  [9] "pg35490.17.txt" "pg35490.3.txt"  "pg35490.4.txt"  "pg35490.5.txt" 
## [13] "pg35490.6.txt"  "pg35490.7.txt"  "pg35490.8.txt"  "pg35490.9.txt" 
## 
## $cl_15
##  [1] "pg36903.1.txt"  "pg36903.10.txt" "pg36903.11.txt" "pg36903.12.txt"
##  [5] "pg36903.13.txt" "pg36903.14.txt" "pg36903.15.txt" "pg36903.2.txt" 
##  [9] "pg36903.3.txt"  "pg36903.4.txt"  "pg36903.5.txt"  "pg36903.6.txt" 
## [13] "pg36903.7.txt"  "pg36903.8.txt"  "pg36903.9.txt" 
## 
## $cl_16
##  [1] "pg37009.10.txt" "pg37009.13.txt" "pg37009.15.txt" "pg37009.16.txt"
##  [5] "pg37009.18.txt" "pg37009.19.txt" "pg37009.20.txt" "pg37009.21.txt"
##  [9] "pg37009.22.txt" "pg37009.23.txt" "pg37009.25.txt" "pg37009.26.txt"
## [13] "pg37009.27.txt" "pg37009.29.txt" "pg37009.3.txt"  "pg37009.34.txt"
## [17] "pg37009.6.txt"  "pg37009.8.txt"  "pg37009.9.txt" 
## 
## $cl_17
##  [1] "pg37009.11.txt" "pg37009.12.txt" "pg37009.14.txt" "pg37009.17.txt"
##  [5] "pg37009.24.txt" "pg37009.28.txt" "pg37009.30.txt" "pg37009.31.txt"
##  [9] "pg37009.32.txt" "pg37009.33.txt"
## 
## $cl_18
##  [1] "pg37856.1.txt"  "pg37856.10.txt" "pg37856.11.txt" "pg37856.12.txt"
##  [5] "pg37856.13.txt" "pg37856.14.txt" "pg37856.15.txt" "pg37856.16.txt"
##  [9] "pg37856.17.txt" "pg37856.19.txt" "pg37856.2.txt"  "pg37856.3.txt" 
## [13] "pg37856.4.txt"  "pg37856.5.txt"  "pg37856.6.txt"  "pg37856.7.txt" 
## [17] "pg37856.9.txt" 
## 
## $cl_19
##  [1] "pg38032.1.txt"  "pg38032.10.txt" "pg38032.11.txt" "pg38032.12.txt"
##  [5] "pg38032.2.txt"  "pg38032.3.txt"  "pg38032.4.txt"  "pg38032.5.txt" 
##  [9] "pg38032.6.txt"  "pg38032.7.txt"  "pg38032.8.txt"  "pg38032.9.txt" 
## 
## $cl_20
##  [1] "pg7353.1.txt"  "pg7353.10.txt" "pg7353.11.txt" "pg7353.12.txt"
##  [5] "pg7353.13.txt" "pg7353.14.txt" "pg7353.15.txt" "pg7353.16.txt"
##  [9] "pg7353.17.txt" "pg7353.18.txt" "pg7353.19.txt" "pg7353.2.txt" 
## [13] "pg7353.20.txt" "pg7353.21.txt" "pg7353.22.txt" "pg7353.23.txt"
## [17] "pg7353.24.txt" "pg7353.25.txt" "pg7353.3.txt"  "pg7353.4.txt" 
## [21] "pg7353.5.txt"  "pg7353.6.txt"  "pg7353.7.txt"  "pg7353.8.txt" 
## [25] "pg7353.9.txt"

We can notice that most of the clusters consist of files sharing the same name prefix, which lends support to our hypothesis from the introduction. Now, let's try to understand the possible topic of each cluster by analyzing its most frequent words.
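
This can be checked more directly by cross-tabulating cluster membership against the file-name prefix; a minimal sketch using the clward1 data frame from above:

# Cross-tabulating file-name prefix vs. cluster membership (a sketch)
prefix = sub("\\..*$", "", rownames(clward1))  # e.g. "pg37009.11.txt" -> "pg37009"
table(prefix, cluster = clward1[, 1])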

3.5 Clusters’ topics

# Creating a sub-corpus for each cluster
for (i in 1:ncl) {
  name = paste("cl_corp_", i, sep = "")
  assign(name, clean_corp[match(cl[[i]], names(clean_corp))])
} 

Tdm = list()

# Creating a list of term-document matrices (TDMs), one per cluster
for (i in 1:ncl) {
  cl_tdm_i = TermDocumentMatrix(get(paste("cl_corp_", i, sep = "")))
  Tdm[[paste("cluster_", i, sep = "")]] = as.matrix(cl_tdm_i)
}

# Plotting the most common words in each cluster
par(mfrow = c(1,2))
for (i in 1:ncl) {
  cl_m = as.matrix(Tdm[[i]])
  # Take the 10 most frequent stems, then re-sort ascending for a horizontal barplot
  barplot(sort(sort(rowSums(cl_m), decreasing = TRUE)[1:10], decreasing = FALSE),
          las = 2,
          horiz = TRUE,
          main = paste("Most common words for cluster", i, sep = " "),
          cex.main = 0.8,
          cex.names = 0.8,
          col.main = "dodgerblue")
}

So, based on the top 10 most frequent (stemmed) words, we can form an idea of the topic of each cluster.
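
For a text-only summary, the same top-10 stems per cluster can be extracted from the Tdm list directly; a minimal sketch:

# Extracting the 10 most frequent stems per cluster as plain text (a sketch)
top_terms = lapply(Tdm, function(m) names(sort(rowSums(m), decreasing = TRUE))[1:10])
top_terms$cluster_1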