Import Data

#read the data into a data frame
df1 <- read.table('leukemia_array.txt')
str(df1)
## 'data.frame':    22283 obs. of  16 variables:
##  $ p1_good: num  1292 232 169 1589 128 ...
##  $ p2_good: num  932 259 348 1985 131 ...
##  $ p3_good: num  1172.9 236.3 363.3 2050.3 60.1 ...
##  $ p4_good: num  1212.5 365.8 205.8 1782.1 75.1 ...
##  $ p5_good: num  1280 243 255 1521 50 ...
##  $ p6_good: num  1351.3 260.9 243.2 2153.3 26.9 ...
##  $ p7_good: num  2242 192 234 2399 114 ...
##  $ p8_good: num  1411.4 195.9 122.4 1537.4 64.7 ...
##  $ p1_poor: num  1405 215 422 2956 102 ...
##  $ p2_poor: num  1070.6 83.9 253.8 4594.6 171.3 ...
##  $ p3_poor: num  1563.5 278.3 768.5 1558.7 57.5 ...
##  $ p4_poor: num  1475 526 141 3221 211 ...
##  $ p5_poor: num  1212 276 254 2044 102 ...
##  $ p6_poor: num  1418 280 147.6 1539 88.8 ...
##  $ p7_poor: num  1516 294 611 3532 196 ...
##  $ p8_poor: num  2279 238 893 3671 207 ...
#transpose the data frame so that genes are in columns and patients in rows (the orientation needed for PCA and clustering)
df1 <- t(df1)
str(df1)
##  num [1:16, 1:22283] 1292 932 1173 1212 1280 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:16] "p1_good" "p2_good" "p3_good" "p4_good" ...
##   ..$ : chr [1:22283] "1007_s_at" "1053_at" "117_at" "121_at" ...
#Add prognosis label

prognosis <- c(rep("good", 8), rep("poor", 8))
prognosis <- as.data.frame(prognosis)

df2 <- cbind(prognosis, df1)
df2$prognosis <- as.factor(df2$prognosis)

Cluster Analysis

all_data <- df1

#create a subgroup of the data containing the top 100 genes driving PCA

sv <- svd(df1)
V <- sv$v     # right singular vectors: one loading per gene per component
k <- 1        # rank genes by the first component
X <- t(df1)   # genes in rows, patients in columns
ord1 <- order(abs(V[, k]), decreasing = TRUE)
x1 <- as.matrix(X[ord1[1:100], ])   # keep the 100 genes with the largest absolute loadings
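
#as a quick check (a sketch; df1 is not centered before the SVD, so these are only rough variance shares), the squared singular values show how dominant the first component is
round(sv$d^2 / sum(sv$d^2), 3)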

#center and scale data

all_data <- scale(all_data)
top100 <- scale(t(x1))

Basic Clustering

#install.packages("factoextra")
#install.packages("cluster")

library("cluster")
library("factoextra")

#compute and visualize a distance matrix on both the full dataset and top 100 genes

#low distance (more similar): red, high distance (more dissimilar): blue, intermediate distance: white

distmat <- get_dist(all_data, stand = TRUE, method = "pearson")
fviz_dist(distmat, 
   gradient = list(low = "#FF0000", mid = "white", high = "blue"))

distmat2 <- get_dist(top100, stand = TRUE, method = "pearson")
fviz_dist(distmat2, 
   gradient = list(low = "#FF0000", mid = "white", high = "blue"))
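
#sanity check (a sketch, assuming get_dist defines the Pearson distance as 1 - correlation between rows; all_data is already standardized, so stand = TRUE is a no-op here): recompute the distance matrix by hand and compare
manual_d <- as.dist(1 - cor(t(all_data)))
all.equal(as.vector(manual_d), as.vector(distmat))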

#find the optimal number of clusters k (fviz_nbclust searches up to k.max = 10 by default)

library("factoextra")
fviz_nbclust(all_data, kmeans, method = "gap_stat")

#this method suggests that a single cluster is optimal, which is not very useful. We choose k = 2 instead, because we already know that the prognostic ratings split the patients into two groups

#perform k-means clustering on both datasets (k=2) and visualize

kmeanz <- kmeans(all_data, 2, nstart = 25)
# Visualize
library("factoextra")
fviz_cluster(kmeanz, data = all_data, ellipse.type = "convex", trace=TRUE)+
  theme_minimal()
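
#sanity check (a sketch, assuming the rows of all_data are still in the original p1_good..p8_poor order): cross-tabulate the k-means assignments against the known prognosis labels
table(cluster = kmeanz$cluster, prognosis = prognosis$prognosis)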

kmeanzz <- kmeans(top100, 2, nstart = 25)
# Visualize
library("factoextra")
fviz_cluster(kmeanzz, data = top100, ellipse.type = "convex", trace=TRUE)+
  theme_minimal()

#investigate the effect of using three clusters instead of two

kmeanz1 <- kmeans(all_data, 3, nstart = 25)
# Visualize
library("factoextra")
fviz_cluster(kmeanz1, data = all_data, ellipse.type = "convex", trace=TRUE)+
  theme_minimal()

kmeanzz2 <- kmeans(top100, 3, nstart = 25)
# Visualize
library("factoextra")
fviz_cluster(kmeanzz2, data = top100, ellipse.type = "convex", trace=TRUE)+
  theme_minimal()

#perform PAM (Partitioning Around Medoids) clustering: a robust alternative to k-means that is less sensitive to outliers

# Compute PAM on all data
library("cluster")
pam.res <- pam(all_data, 2)
# Visualize
fviz_cluster(pam.res)
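
#unlike k-means centroids, PAM medoids are actual observations; list which patients serve as the medoids (id.med holds their row indices)
rownames(all_data)[pam.res$id.med]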

# Compute PAM on top 100 genes
library("cluster")
pam.res1 <- pam(top100, 2)
# Visualize
fviz_cluster(pam.res1)

#Try PAM with 3 clusters

#all data
library("cluster")
pam.res <- pam(all_data, 3)
# Visualize
fviz_cluster(pam.res)

#top 100 genes
library("cluster")
pam.res1 <- pam(top100, 3)
# Visualize
fviz_cluster(pam.res1)

#Hierarchical Clustering

#compute dissimilarity matrix for all data
eu.d <- dist(all_data, method = "euclidean")
# Hierarchical clustering using Ward's method
res.hc <- hclust(eu.d, method = "ward.D2" )
# Cut tree into 2 groups
grp <- cutree(res.hc, k = 2)
# Visualize
plot(res.hc, cex = 0.6) # plot tree
rect.hclust(res.hc, k = 2, border = c("yellow","blue")) # add rectangle
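
#as with k-means above, a quick cross-tabulation (assuming the original row order) shows how the two dendrogram groups line up with prognosis
table(grp, prognosis = prognosis$prognosis)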

#compute dissimilarity matrix for top 100 genes
eu.d100 <- dist(top100, method = "euclidean")
# Hierarchical clustering using Ward's method
res.hc <- hclust(eu.d100, method = "ward.D2" )
# Cut tree into 2 groups
grp <- cutree(res.hc, k = 2)

# Visualize
plot(res.hc, cex = 0.6) # plot tree
rect.hclust(res.hc, k = 2, border = c("yellow","blue")) # add rectangle

#a prettier visualization, using factoextra

library("factoextra")
# Compute hierarchical clustering and cut into 2 clusters for all data
res <- hcut(all_data, k = 2, stand = TRUE)

# Visualize
fviz_dend(res, rect = TRUE, cex = 0.5,
          k_colors = c("purple","orange"), rect_fill = TRUE, color_labels_by_k = FALSE)

# Compute hierarchical clustering and cut into 2 clusters for the top 100 genes
res <- hcut(top100, k = 2, stand = TRUE)
# Visualize
fviz_dend(res, rect = TRUE, cex = 0.5,
          k_colors = c("purple","orange"), rect_fill = TRUE, color_labels_by_k = FALSE)

#Assessing clustering tendency

library("factoextra")

clustAn <- get_clust_tendency(all_data, n = 15,
                   gradient = list(low = "steelblue",  high = "white"))

clustAn$hopkins_stat
## [1] 0.4218175
#This Hopkins statistic is close to 0.5, the value expected for uniformly distributed (unstructured) data, indicating that the data is not significantly clusterable

clustAn1 <- get_clust_tendency(top100, n = 15,
                   gradient = list(low = "steelblue",  high = "white"))

#The Hopkins statistic for the top 100 genes is somewhat lower, but still fairly close to 0.5

clustAn1$hopkins_stat
## [1] 0.3848475
clustAn$plot

clustAn1$plot
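
#as a reference point (a sketch): uniformly random data of the same dimensions should give a Hopkins statistic near 0.5, the "no structure" baseline
set.seed(123)
rand_data <- matrix(runif(length(all_data)), nrow = nrow(all_data))
get_clust_tendency(rand_data, n = 15)$hopkins_stat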

#A silhouette measures how similar an object is to the other objects in its own cluster versus those in the neighboring cluster. Silhouette values range from 1 to -1: values close to 1 indicate that the object is well clustered (similar to the other objects in its group), while values close to -1 indicate that the object is poorly clustered and could potentially be assigned to a different cluster.

#Obtain a silhouette plot of the PAM clustering (the base kmeans object is not supported by fviz_silhouette)
pam.res <- pam(all_data, 2)
fviz_silhouette(pam.res)
##   cluster size ave.sil.width
## 1       1   13          0.30
## 2       2    3          0.08
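
#a quick sketch: list any patients with a negative silhouette width, i.e. observations that arguably sit in the wrong cluster (silhouette() keeps the original row order)
sil <- silhouette(pam.res)
rownames(all_data)[sil[, "sil_width"] < 0]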

#Obtain a silhouette plot of the hierarchical clustering
res.hc <- eclust(all_data, "hclust", k = 2, graph = FALSE) 
fviz_silhouette(res.hc)
##   cluster size ave.sil.width
## 1       1   12          0.29
## 2       2    4          0.04

#once again, we see that the objects are generally not well clustered (as expected for a dataset that does not cluster well); in the hierarchical clustering silhouette plot, the last observation is particularly badly clustered
#install.packages("clValid")
library("clValid")
intern <- clValid(all_data, nClust = 2:4, 
              clMethods = c("hierarchical", "kmeans", "pam"),
              validation = "internal")
# Summary
summary(intern)
## 
## Clustering Methods:
##  hierarchical kmeans pam 
## 
## Cluster sizes:
##  2 3 4 
## 
## Validation Measures:
##                                  2       3       4
##                                                   
## hierarchical Connectivity   2.9290 10.3337 11.4115
##              Dunn           0.9749  0.8315  0.9103
##              Silhouette     0.3463  0.1939  0.1838
## kmeans       Connectivity   8.2504 10.3337 14.1405
##              Dunn           0.6503  0.8315  0.8158
##              Silhouette     0.2276  0.1939  0.1635
## pam          Connectivity   5.8869  7.3869 10.1492
##              Dunn           0.7182  0.7994  0.8582
##              Silhouette     0.2581  0.1989  0.1916
## 
## Optimal Scores:
## 
##              Score  Method       Clusters
## Connectivity 2.9290 hierarchical 2       
## Dunn         0.9749 hierarchical 2       
## Silhouette   0.3463 hierarchical 2
#The connectivity indicates the degree of connectedness of the clusters, as determined by the k-nearest neighbors. The neighbSize argument specifies the number of neighbors to use. The connectivity has a value between 0 and infinity and should be minimized.

#The Silhouette value measures the degree of confidence in a particular clustering assignment and lies in the interval [-1,1], with well-clustered observations having values near 1 and poorly clustered observations having values near -1. 

#The Dunn Index is the ratio between the smallest distance between observations not in the same cluster to the largest intra-cluster distance. It has a value between 0 and infinity and should be maximized.
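
#sanity check (a sketch): recompute the Dunn index for the winning hierarchical k = 2 solution directly with clValid's dunn(); note that clValid uses "average" linkage by default, not ward.D2
hc_avg <- hclust(dist(all_data), method = "average")
dunn(distance = dist(all_data), clusters = cutree(hc_avg, k = 2))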

#from all of this, it is clear that hierarchical clustering with k = 2 outperforms the other clustering methods on all three validation measures

Advanced Clustering

library("cluster")
#fuzzy clustering on all data
res.f <- fanny(all_data, k=2, diss=FALSE, memb.exp = 1.2, metric = "euclidean", 
      stand = FALSE, maxit = 500)
#fuzzy clustering on all data with increased fuzziness
res.f3 <- fanny(all_data, k=2, diss=FALSE, memb.exp = 1.3, metric = "euclidean", 
      stand = FALSE, maxit = 500)

#fuzzy clustering on top 100 genes
res.f2 <- fanny(top100, k=2, diss=FALSE, memb.exp = 2, metric = "euclidean", 
      stand = FALSE, maxit = 500)

#fuzzy clustering on top 100 genes with increased fuzziness
res.f4 <- fanny(top100, k=2, diss=FALSE, memb.exp = 2.5, metric = "euclidean", 
      stand = FALSE, maxit = 500)
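
#each row of the fanny membership matrix sums to 1 across the clusters; peek at the first few patients
round(head(res.f$membership, 3), 2)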

library(factoextra)
fviz_cluster(res.f, ellipse.type = "norm",
             ellipse.level = 0.68)

fviz_cluster(res.f2, ellipse.type = "norm",
             ellipse.level = 0.68)

fviz_cluster(res.f3, ellipse.type = "norm",
             ellipse.level = 0.68)

fviz_cluster(res.f4, ellipse.type = "norm",
             ellipse.level = 0.68)

#silhouette plots
fviz_silhouette(res.f, label = TRUE)
##   cluster size ave.sil.width
## 1       1   11          0.32
## 2       2    5          0.03

fviz_silhouette(res.f2, label = TRUE)
##   cluster size ave.sil.width
## 1       1   10          0.36
## 2       2    6          0.39

fviz_silhouette(res.f3, label = TRUE)
##   cluster size ave.sil.width
## 1       1   11          0.32
## 2       2    5          0.03

fviz_silhouette(res.f4, label = TRUE)
##   cluster size ave.sil.width
## 1       1    9          0.39
## 2       2    7          0.29

#for the full dataset, increasing the fuzziness leaves the silhouette widths essentially unchanged; the top-100-gene clusterings, however, show noticeably higher and more balanced silhouette widths than the full data

#visualize the membership matrix with a correlation-style plot, to see each patient's degree of membership in each of the 2 clusters
#install.packages("corrplot")
library(corrplot)
corrplot(res.f$membership, is.corr = FALSE)

corrplot(res.f2$membership, is.corr = FALSE)

corrplot(res.f3$membership, is.corr = FALSE)

corrplot(res.f4$membership, is.corr = FALSE)
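
#Dunn's partition coefficient (returned by fanny) summarizes overall fuzziness: values near 1/k indicate a very fuzzy solution, values near 1 an essentially crisp one
res.f$coeff
res.f4$coeff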