Basic Clustering
#install.packages("factoextra")
#install.packages("cluster")
library("cluster")
library("factoextra")
#compute and visualize a distance matrix on both the full dataset and top 100 genes
#low distance (more similar): red, high distance (more dissimilar): blue, intermediate distance: white
distmat <- get_dist(all_data, stand = TRUE, method = "pearson")
fviz_dist(distmat,
          gradient = list(low = "#FF0000", mid = "white", high = "blue"))

distmat2 <- get_dist(top100, stand = TRUE, method = "pearson")
fviz_dist(distmat2,
          gradient = list(low = "#FF0000", mid = "white", high = "blue"))
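#get_dist returns a "dist" object; as a quick sanity check (an addition, not part of the original analysis), it can be coerced to a matrix to inspect the values for a few samples
round(as.matrix(distmat)[1:4, 1:4], 2)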

#find the optimal number of clusters k (fviz_nbclust searches up to k = 10 by default)
library("factoextra")
fviz_nbclust(all_data, kmeans, method = "gap_stat")

#the gap statistic suggests that a single cluster is optimal, which is not very useful; we choose 2 clusters instead, because we already know that our patients' prognostic ratings split them into 2 groups
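#as a cross-check (an addition, not part of the original analysis), the elbow and average-silhouette criteria can also be inspected; on small datasets they often disagree with the gap statistic
fviz_nbclust(all_data, kmeans, method = "wss")
fviz_nbclust(all_data, kmeans, method = "silhouette")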
#perform k-means clustering on both datasets (k=2) and visualize
kmeanz <- kmeans(all_data, 2, nstart = 25)
# Visualize
fviz_cluster(kmeanz, data = all_data, ellipse.type = "convex") +
  theme_minimal()

kmeanzz <- kmeans(top100, 2, nstart = 25)
# Visualize
fviz_cluster(kmeanzz, data = top100, ellipse.type = "convex") +
  theme_minimal()
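#if the prognostic ratings are available as a factor aligned with the rows of all_data (here called prognosis, a hypothetical vector), a cross-tabulation would show how well the k-means partition recovers them
#table(kmeans = kmeanz$cluster, prognosis = prognosis)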

#investigate the effect of using three clusters instead of 2
kmeanz1 <- kmeans(all_data, 3, nstart = 25)
# Visualize
fviz_cluster(kmeanz1, data = all_data, ellipse.type = "convex") +
  theme_minimal()

kmeanzz2 <- kmeans(top100, 3, nstart = 25)
# Visualize
fviz_cluster(kmeanzz2, data = top100, ellipse.type = "convex") +
  theme_minimal()
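#to see how the k = 2 and k = 3 partitions relate (a sanity check added here), cross-tabulate the assignments; this shows which 2-cluster group gets split at k = 3
table(k2 = kmeanz$cluster, k3 = kmeanz1$cluster)
table(k2 = kmeanzz$cluster, k3 = kmeanzz2$cluster)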

#perform PAM (Partitioning Around Medoids) clustering: a robust alternative to k-means that is less sensitive to outliers
# Compute PAM on all data
library("cluster")
pam.res <- pam(all_data, 2)
# Visualize
fviz_cluster(pam.res)

# Compute PAM on top 100 genes
library("cluster")
pam.res1 <- pam(top100, 2)
# Visualize
fviz_cluster(pam.res1)
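#the pam object stores the medoids (actual samples used as cluster centers) and the hard assignments; inspecting them (an addition to the original analysis) shows which samples anchor each cluster
pam.res$id.med      # row indices of the medoid samples
pam.res$clustering  # cluster assignment for every sample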

#Try PAM with 3 clusters
#all data
library("cluster")
pam.res <- pam(all_data, 3)
# Visualize
fviz_cluster(pam.res)

#top 100 genes
library("cluster")
pam.res1 <- pam(top100, 3)
# Visualize
fviz_cluster(pam.res1)

#Hierarchical Clustering
#compute dissimilarity matrix for all data
eu.d <- dist(all_data, method = "euclidean")
# Hierarchical clustering using Ward's method
res.hc <- hclust(eu.d, method = "ward.D2" )
# Cut tree into 2 groups
grp <- cutree(res.hc, k = 2)
# Visualize
plot(res.hc, cex = 0.6) # plot tree
rect.hclust(res.hc, k = 2, border = c("yellow","blue")) # add rectangle

#compute dissimilarity matrix for top 100 genes
eu.d100 <- dist(top100, method = "euclidean")
# Hierarchical clustering using Ward's method
res.hc <- hclust(eu.d100, method = "ward.D2" )
# Cut tree into 2 groups
grp <- cutree(res.hc, k = 2)
# Visualize
plot(res.hc, cex = 0.6) # plot tree
rect.hclust(res.hc, k = 2, border = c("yellow","blue")) # add rectangle
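#as a quick agreement check (not part of the original analysis), cross-tabulate the hierarchical groups with the earlier k-means partition on the same data
table(hclust = grp, kmeans = kmeanzz$cluster)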

#a prettier vizualisation, using factoextra
library("factoextra")
# Compute hierarchical clustering and cut into 2 clusters for all data
res <- hcut(all_data, k = 2, stand = TRUE)
# Visualize
fviz_dend(res, rect = TRUE, cex = 0.5,
          k_colors = c("purple","orange"), rect_fill = TRUE, color_labels_by_k = FALSE)

# Compute hierarchical clustering and cut into 2 clusters for the top 100 genes
res <- hcut(top100, k = 2, stand = TRUE)
# Visualize
fviz_dend(res, rect = TRUE, cex = 0.5,
          k_colors = c("purple","orange"), rect_fill = TRUE, color_labels_by_k = FALSE)

#Assessing clustering tendency
library("factoextra")
clustAn <- get_clust_tendency(all_data, n = 15,
                              gradient = list(low = "steelblue", high = "white"))
clustAn$hopkins_stat
## [1] 0.4218175
#this Hopkins statistic is close to 0.5, indicating that the data is not significantly clusterable (under the convention used here, values near 0.5 correspond to uniformly distributed data, while values close to 0 indicate clusterable data)
clustAn1 <- get_clust_tendency(top100, n = 15,
                               gradient = list(low = "steelblue", high = "white"))
#The Hopkins statistic decreases somewhat for the top 100 genes, but remains quite high
clustAn1$hopkins_stat
## [1] 0.3848475
clustAn$plot

clustAn1$plot

#A silhouette measures how similar an object is to the other objects in its own cluster versus those in the neighboring cluster. Silhouette values range from -1 to 1: values close to 1 indicate that the object is well clustered, i.e. similar to the other objects in its group; values close to -1 indicate that the object is poorly clustered and could potentially be assigned to a different cluster.
#Obtain a silhouette plot of the PAM clustering (the raw kmeans() output carries no silhouette information, so PAM is used here)
pam.res <- pam(all_data, 2)
fviz_silhouette(pam.res)
## cluster size ave.sil.width
## 1 1 13 0.30
## 2 2 3 0.08
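#fviz_silhouette reads the silhouette information stored in the pam object; as a cross-check (an addition to the original analysis), the same widths can be computed directly with cluster::silhouette()
sil <- silhouette(pam.res$clustering, dist(all_data))
summary(sil)$clus.avg.widths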

#Obtain a silhouette plot of the hierarchical clustering
res.hc <- eclust(all_data, "hclust", k = 2, graph = FALSE)
fviz_silhouette(res.hc)
## cluster size ave.sil.width
## 1 1 12 0.29
## 2 2 4 0.04

#once again, we see that the objects are generally not well clustered, as would be expected for a dataset that does not cluster well; in the hierarchical clustering silhouette plot, the last observation is particularly badly clustered
#install.packages("clValid")
library("clValid")
intern <- clValid(all_data, nClust = 2:4,
                  clMethods = c("hierarchical", "kmeans", "pam"),
                  validation = "internal")
# Summary
summary(intern)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 2 3 4
##
## Validation Measures:
## 2 3 4
##
## hierarchical Connectivity 2.9290 10.3337 11.4115
## Dunn 0.9749 0.8315 0.9103
## Silhouette 0.3463 0.1939 0.1838
## kmeans Connectivity 8.2504 10.3337 14.1405
## Dunn 0.6503 0.8315 0.8158
## Silhouette 0.2276 0.1939 0.1635
## pam Connectivity 5.8869 7.3869 10.1492
## Dunn 0.7182 0.7994 0.8582
## Silhouette 0.2581 0.1989 0.1916
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 2.9290 hierarchical 2
## Dunn 0.9749 hierarchical 2
## Silhouette 0.3463 hierarchical 2
#The connectivity indicates the degree of connectedness of the clusters, as determined by the k-nearest neighbors. The neighbSize argument specifies the number of neighbors to use. The connectivity has a value between 0 and infinity and should be minimized.
#The Silhouette value measures the degree of confidence in a particular clustering assignment and lies in the interval [-1,1], with well-clustered observations having values near 1 and poorly clustered observations having values near -1.
#The Dunn Index is the ratio between the smallest distance between observations not in the same cluster to the largest intra-cluster distance. It has a value between 0 and infinity and should be maximized.
#from all of this, it is clear that hierarchical clustering with k = 2 outperforms all the other clustering methods on every internal validation measure
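#the optimal scores above can be reproduced by calling the measures directly; the sketch below (not part of the original analysis) assumes clValid's defaults, e.g. average linkage for "hierarchical"
d <- dist(all_data)
cl2 <- cutree(hclust(d, method = "average"), k = 2)
connectivity(distance = d, clusters = cl2)  # should be minimized
dunn(distance = d, clusters = cl2)          # should be maximized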
Advanced Clustering
library("cluster")
#fuzzy clustering on all data
res.f <- fanny(all_data, k = 2, diss = FALSE, memb.exp = 1.2, metric = "euclidean",
               stand = FALSE, maxit = 500)
#fuzzy clustering on all data with increased fuzziness
res.f3 <- fanny(all_data, k = 2, diss = FALSE, memb.exp = 1.3, metric = "euclidean",
                stand = FALSE, maxit = 500)
#fuzzy clustering on top 100 genes
res.f2 <- fanny(top100, k = 2, diss = FALSE, memb.exp = 2, metric = "euclidean",
                stand = FALSE, maxit = 500)
#fuzzy clustering on top 100 genes with increased fuzziness
res.f4 <- fanny(top100, k = 2, diss = FALSE, memb.exp = 2.5, metric = "euclidean",
                stand = FALSE, maxit = 500)
fviz_cluster(res.f, ellipse.type = "norm", ellipse.level = 0.68)

fviz_cluster(res.f2, ellipse.type = "norm", ellipse.level = 0.68)

fviz_cluster(res.f3, ellipse.type = "norm", ellipse.level = 0.68)

fviz_cluster(res.f4, ellipse.type = "norm", ellipse.level = 0.68)

#silhouette plots
fviz_silhouette(res.f, label = TRUE)
## cluster size ave.sil.width
## 1 1 11 0.32
## 2 2 5 0.03

fviz_silhouette(res.f2, label = TRUE)
## cluster size ave.sil.width
## 1 1 10 0.36
## 2 2 6 0.39

fviz_silhouette(res.f3, label = TRUE)
## cluster size ave.sil.width
## 1 1 11 0.32
## 2 2 5 0.03

fviz_silhouette(res.f4, label = TRUE)
## cluster size ave.sil.width
## 1 1 9 0.39
## 2 2 7 0.29

#within each dataset, increasing the membership exponent changes the silhouette widths only modestly: the full-data widths are essentially unchanged, while the top-100 solution shifts one sample between clusters
#visualize the membership matrix with a corrplot, to see how strongly each individual belongs to each of the 2 clusters
#install.packages("corrplot")
library(corrplot)
corrplot(res.f$membership, is.corr = FALSE)

corrplot(res.f2$membership, is.corr = FALSE)

corrplot(res.f3$membership, is.corr = FALSE)

corrplot(res.f4$membership, is.corr = FALSE)
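#beyond the visual check, fanny reports Dunn's partition coefficient in the coeff component; its normalized version ranges from 0 (completely fuzzy) to 1 (crisp), giving a one-number summary of each solution's fuzziness (an addition to the original analysis)
res.f$coeff
res.f2$coeff
res.f3$coeff
res.f4$coeff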
