Libraries
library(tidyr)
library(readr)
library(ggplot2)
library(knitr)
library(readxl)
library(xlsx)
library(openxlsx)
library(reactable) # reactable(df)
library(dplyr)
## Knn imputation
library(caret)
library(RANN)
# Clustering
library(factoextra) #clustering visualization
library(cluster) #clustering algorithms
library(dendextend) #for comparing two dendrograms
library(corrplot) # correlation between dendrograms
library(tidyverse) # data manipulation
library(NbClust) #determine optimal no. of clusters [not working...]
Reading Data
FC_all_valid_patients_input_P1 = data.frame(read_xlsx("../data/clean-data/CriteriaBasedImputation/FC_all_valid_patients_input_P1.xlsx", sheet = "FC_all_valid_patients_input_P1" ))
Sat02_all_valid_patients_input_P1 = data.frame(read_xlsx("../data/clean-data/CriteriaBasedImputation/SatO2_all_valid_patients_input_P1.xlsx", sheet = "St2_all_valid_patients_input_P1" ))
## Patients with deterioration (DETERIORO) and without deterioration (NO_DETERIORO)
file_patient_name_NO_DETERIORO <- data.frame(read_xlsx("../data/info-patients/file_patient_name_NO_DETERIORO.xlsx"))
file_patient_name_NO_DETERIORO <- file_patient_name_NO_DETERIORO$x
file_patient_name_DETERIORO <- data.frame(read_xlsx("../data/info-patients/file_patient_name_DETERIORO.xlsx"))
file_patient_name_DETERIORO <- file_patient_name_DETERIORO$x
We want each variable to have a mean of 0 and a standard deviation of 1. Standardization consists of transforming the variables so that they have mean zero and standard deviation one, which makes SatO2 and FC comparable.
# Raw (unscaled) matrix, transposed so observations end up in rows
datos <- cbind(t(Sat02_all_valid_patients_input_P1), t(FC_all_valid_patients_input_P1))
# Standardize each column (mean 0, sd 1)
SatO2_norm = scale(Sat02_all_valid_patients_input_P1)
FC_norm = scale(FC_all_valid_patients_input_P1)
# Transpose so that rows are observations (patients) and columns are variables
SatO2_t <- t(SatO2_norm)
FC_t <- t(FC_norm)
# Combine the standardized SatO2 and FC series into a single feature matrix
datos_norm <- cbind(SatO2_t, FC_t)
Rows are observations (individuals) and columns are variables
The data have been standardized (i.e., scaled) to make variables comparable.
Any missing value in the data must be removed or estimated.
table(is.na(datos_norm))
##
## FALSE
## 195976
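There are no missing values left after the criteria-based imputation above. Had any remained, one option (a sketch only; datos_raw is a hypothetical matrix containing NAs) would be k-NN imputation with caret, which relies on RANN and is why both packages are loaded:
# Sketch: k-NN imputation with caret (uses RANN under the hood).
# Note that method = "knnImpute" also centers and scales the variables.
pre <- preProcess(as.data.frame(datos_raw), method = "knnImpute", k = 5)
datos_imputed <- predict(pre, as.data.frame(datos_raw))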
distance <- dist(datos_norm, method = "euclidean")
distance_matrix <- as.matrix(distance)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
We’ll use the dendextend package, which contains many functions for comparing two dendrograms. First, compute two hierarchical clusterings:
clust_1 <- hclust(distance, method = "complete")
clust_2 <- hclust(distance, method = "ward.D2")
Compute the two dendrograms:
dend1 <- as.dendrogram(clust_1)
dend2 <- as.dendrogram(clust_2)
dend_list <- dendlist(dend1, dend2) # list of dendrograms
The function tanglegram() plots two dendrograms, side by side, with their labels connected by lines. It can be used for visually comparing two methods of hierarchical clustering, as follows:
tanglegram(dend1, dend2)
# Create multiple dendrograms by chaining
dend1 <- datos_norm %>% dist %>% hclust("complete") %>% as.dendrogram
dend2 <- datos_norm %>% dist %>% hclust("single") %>% as.dendrogram
dend3 <- datos_norm %>% dist %>% hclust("ward.D2") %>% as.dendrogram
dend4 <- datos_norm %>% dist %>% hclust("centroid") %>% as.dendrogram
# Compute correlation matrix
dend_list <- dendlist("Complete" = dend1, "Single" = dend2,
"Ward.D2" = dend3, "Centroid" = dend4)
cors <- cor.dendlist(dend_list)
# Print correlation matrix
round(cors, 2)
##          Complete Single Ward.D2 Centroid
## Complete     1.00   0.52    0.41     0.34
## Single       0.52   1.00    0.20     0.62
## Ward.D2      0.41   0.20    1.00     0.05
## Centroid     0.34   0.62    0.05     1.00
corrplot(cors, method = c("number"),
type = c("lower"))
Alternatively, we can use the agnes() function. It behaves very similarly to hclust(); however, agnes() also returns the agglomerative coefficient, which measures the amount of clustering structure found (values closer to 1 suggest a strong clustering structure).
This lets us find which hierarchical clustering method identifies the strongest clustering structure. Here we see that Ward's method identifies the strongest structure of the four methods assessed.
# methods to assess
m <- c("average", "single", "complete", "ward")
names(m) <- c("average", "single", "complete", "ward.D2")
# function to compute the agglomerative coefficient
ac <- function(x) agnes(datos_norm, method = x)$ac
map_dbl(m, ac)
## average single complete ward.D2
## 0.1378161 0.1030981 0.2302779 0.4472303
# Note that agnes(*, method="ward") corresponds to hclust(*, "ward.D2").
# Compute with agnes
hc_a <- agnes(datos_norm, method = "ward")
# Agglomerative coefficient
hc_a$ac
## [1] 0.4472303
Clustering validation and evaluation strategies consist of measuring the goodness of clustering results. Before applying any clustering algorithm to a data set, the first thing to do is to assess the clustering tendency, that is, whether the data contain any inherent grouping structure.
If they do, we then need to determine how many clusters there are. Next, we can perform hierarchical clustering or partitioning clustering (with a pre-specified number of clusters). Finally, we evaluate the goodness of the clustering results.
To assess the clustering tendency, the Hopkins’ statistic and a visual approach can be used. This can be performed using the function get_clust_tendency() [factoextra package], which creates an ordered dissimilarity image (ODI).
Hopkins statistic: If the value of Hopkins statistic is close to 1 (far above 0.5), then we can conclude that the dataset is significantly clusterable.
Visual approach: The visual approach detects the clustering tendency by counting the number of square shaped dark (or colored) blocks along the diagonal in the ordered dissimilarity image.
gradient.color <- list(low = "steelblue", high = "white")
datos_norm %>%
get_clust_tendency(n = 50, gradient = gradient.color)
## $hopkins_stat
## [1] 0.6560631
##
## $plot
[ordered dissimilarity image (ODI) plot not shown]
# clust_1 <- hclust(distance, method = "ward.D2")
# plot(clust_1)
# Enhanced hierarchical clustering, cut in 2 groups
res.hc <- datos_norm %>%
eclust("hclust", k = 2, graph = FALSE, hc_method = "ward.D2")
# Visualize with factoextra
fviz_dend(res.hc, palette = "jco",
rect = TRUE, show_labels = FALSE)
Visualize the clustered data, with a rectangle drawn around each of the k = 2 groups in the dendrogram.
To evaluate the result, we can calculate the silhouette index.
# reactable(silhouette(cutree(clust_1, k = 2), distance))
library(knitr)
library(factoextra)
library(ggplot2)
fviz_silhouette(res.hc)
## cluster size ave.sil.width
## 1 1 50 0.01
## 2 2 18 0.05
silhouette(cutree(clust_1, k = 2), distance) is used to calculate the silhouette measure for the partition obtained by cutting the hierarchical tree into two groups with the cutree function, where distance is the dissimilarity (dist) object used in the clustering process. The silhouette measure assesses the quality of the grouping: it indicates how well each object lies within its own group compared with the other groups. A high silhouette value indicates a good grouping, while a low value indicates a poor one.
The silhouette width ranges between -1 and 1: values close to 1 indicate a good grouping, values close to -1 indicate a bad grouping, and a value of 0 indicates that the groups overlap.
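As a quick check (a sketch only, not part of the original output), the per-cluster average widths and the overall average silhouette width can be read directly off the silhouette object:
sil <- silhouette(cutree(clust_1, k = 2), distance)
summary(sil)$clus.avg.widths  # average silhouette width of each cluster
mean(sil[, "sil_width"])      # overall average silhouette width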
Using the function fviz_cluster(), we can also visualize the result in a scatter plot. Observations are represented by points in the plot, using principal components, and a frame is drawn around each cluster.
fviz_cluster(list(data = datos_norm, cluster = cutree(clust_1, k = 2)))
# k-means clustering, cut in 2 groups
res.hc <- datos_norm %>%
eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)
# k-means does not produce a dendrogram
fviz_silhouette(res.hc)
## cluster size ave.sil.width
## 1 1 37 0.03
## 2 2 31 0.03
fviz_cluster(list(data = datos_norm, cluster = res.hc$cluster))
k2 <- datos_norm %>%
eclust(FUNcluster = "kmeans", k = 2, graph = FALSE)
k3 <- datos_norm %>%
eclust(FUNcluster = "kmeans", k = 3, graph = FALSE)
k4 <- datos_norm %>%
eclust(FUNcluster = "kmeans", k = 4, graph = FALSE)
k5 <- datos_norm %>%
eclust(FUNcluster = "kmeans", k = 5, graph = FALSE)
# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = datos_norm) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = datos_norm) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = datos_norm) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = datos_norm) + ggtitle("k = 5")
library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)
Recall that the basic idea behind cluster partitioning methods, such as k-means clustering, is to define clusters such that the total intra-cluster variation (known as the total within-cluster variation or total within-cluster sum of squares) is minimized:
\[ \operatorname{minimize}\Bigg(\sum_{k=1}^{K} W(C_k)\Bigg) \]
Here \(C_k\) is the \(k^{th}\) cluster (\(k = 1, \dots, K\)) and \(W(C_k)\) is its within-cluster variation. The total within-cluster sum of squares (wss) measures the compactness of the clustering, and we want it to be as small as possible.
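For Euclidean distances, \(W(C_k)\) is usually taken to be the sum of squared distances from each observation to its cluster mean (this explicit form is the standard k-means definition, not spelled out above):
\[ W(C_k) = \sum_{x_i \in C_k} (x_i - \mu_k)^2 \]
where \(\mu_k\) is the mean of the observations assigned to \(C_k\). We can then use the following algorithm to choose the number of clusters: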
Compute the clustering algorithm (e.g., k-means clustering) for different values of \(k\), for instance by varying \(k\) from 1 to 10 clusters.
For each \(k\), calculate the total within-cluster sum of square (wss).
Plot the curve of wss according to the number of clusters \(k\).
The location of a bend (knee) in the plot is generally considered an indicator of the appropriate number of clusters.
We can implement this in R with the following code. The results suggest that 4 is the optimal number of clusters, as that is where the bend (knee or elbow) appears.
set.seed(123)
# function to compute total within-cluster sum of square
wss <- function(k) {
kmeans(datos_norm, k, nstart = 10 )$tot.withinss
}
# Compute and plot wss for k = 1 to k = 10
k.values <- 1:10
# extract wss for 1-10 clusters
wss_values <- map_dbl(k.values, wss)
plot(k.values, wss_values,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
set.seed(123)
fviz_nbclust(datos_norm, kmeans, method = "wss")
In short, the average silhouette approach measures the quality of a clustering: it determines how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The average silhouette method computes the average silhouette of observations for different values of \(k\); the optimal number of clusters is the one that maximizes the average silhouette over the range of candidate values of \(k\).
We can use the silhouette() function in the cluster package to compute the average silhouette width. The code below evaluates this approach for 2 to 10 clusters (silhouette widths are undefined for a single cluster). The results show that 2 clusters maximize the average silhouette value, with 4 clusters coming in as the second-best option.
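A minimal sketch of this manual computation (the helper avg_sil() is illustrative, not part of the original analysis); the fviz_nbclust() call below wraps the same idea:
set.seed(123)
# Sketch: average silhouette width of a k-means solution for a given k
avg_sil <- function(k) {
  km <- kmeans(datos_norm, centers = k, nstart = 25)
  ss <- silhouette(km$cluster, dist(datos_norm))
  mean(ss[, "sil_width"])
}
# silhouette widths need at least 2 clusters, so start at k = 2
map_dbl(2:10, avg_sil)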
fviz_nbclust(datos_norm, kmeans, method = "silhouette")
The gap statistic was published by R. Tibshirani, G. Walther, and T. Hastie (Stanford University, 2001). The approach can be applied to any clustering method (e.g. k-means clustering, hierarchical clustering). The gap statistic compares the total intra-cluster variation for different values of k with its expected value under a null reference distribution of the data (i.e. a distribution with no obvious clustering).
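For reference, for each \(k\) the statistic is defined as
\[ \mathrm{Gap}_n(k) = E^{*}_{n}\{\log(W_k)\} - \log(W_k) \]
where \(W_k\) is the total within-cluster variation for \(k\) clusters and the expectation \(E^{*}_{n}\) is taken under the reference (null) distribution, estimated in clusGap() from B Monte Carlo samples (B = 25 below). The estimated optimal number of clusters is the value of \(k\) that maximizes the gap statistic, according to the selection rule chosen (e.g. "firstmax" in the commented print() call below).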
# compute gap statistic
set.seed(123)
gap_stat <- clusGap(datos_norm, FUN = kmeans, nstart = 25,
K.max = 10, B = 25)
# Print the result
#print(gap_stat, method = "firstmax")
fviz_gap_stat(gap_stat)