Libraries

library(tidyr)
library(readr)
library(ggplot2)
library(knitr)
library(readxl)
library(xlsx)
library(openxlsx)
library(reactable) # reactable(df)

library(dplyr)

## Knn imputation
library(caret)
library(RANN)

# Clustering
library(factoextra)    # clustering visualization
library(cluster)       # clustering algorithms
library(dendextend)    # for comparing two dendrograms
library(corrplot)      # correlation between dendrograms
library(tidyverse)     # data manipulation
library(NbClust)       # determine optimal no. of clusters  [not working...]

Reading Data

FC_all_valid_patients_input_P1 = data.frame(read_xlsx("../data/clean-data/CriteriaBasedImputation/FC_all_valid_patients_input_P1.xlsx", sheet = "FC_all_valid_patients_input_P1" ))
Sat02_all_valid_patients_input_P1 = data.frame(read_xlsx("../data/clean-data/CriteriaBasedImputation/SatO2_all_valid_patients_input_P1.xlsx", sheet = "St2_all_valid_patients_input_P1" ))

## Deterioro and Not deterioro
file_patient_name_NO_DETERIORO <- data.frame(read_xlsx("../data/info-patients/file_patient_name_NO_DETERIORO.xlsx"))
file_patient_name_NO_DETERIORO <- file_patient_name_NO_DETERIORO$x
file_patient_name_DETERIORO <- data.frame(read_xlsx("../data/info-patients/file_patient_name_DETERIORO.xlsx"))
file_patient_name_DETERIORO <- file_patient_name_DETERIORO$x

Data standardization

I wanted each variable to have a mean of 0 and a standard deviation of 1. Standardization transforms the variables so that they have mean zero and standard deviation one, making them comparable.

datos <- cbind(t(Sat02_all_valid_patients_input_P1), t(FC_all_valid_patients_input_P1))

SatO2_norm = scale(Sat02_all_valid_patients_input_P1)
FC_norm = scale(FC_all_valid_patients_input_P1)

SatO2_t <- t(SatO2_norm)
FC_t <- t(FC_norm)

datos_norm <- cbind(SatO2_t, FC_t)

Hierarchical clustering

Before running hierarchical clustering, the data should be prepared so that:

  1. Rows are observations (individuals) and columns are variables.

  2. The data have been standardized (i.e., scaled) to make variables comparable.

  3. Any missing value in the data must be removed or estimated.

table(is.na(datos_norm))
## 
##  FALSE 
## 195976
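
No missing values are present here. If the check above had reported any, they would have to be removed (e.g. with na.omit()) or estimated; below is a hedged sketch of k-NN imputation with caret's preProcess (the reason caret and RANN are loaded above; pre_knn is an illustrative name):

# only runs if the data actually contain missing values
if (anyNA(datos_norm)) {
  pre_knn <- preProcess(as.data.frame(datos_norm), method = "knnImpute")  # caret + RANN
  datos_norm <- as.matrix(predict(pre_knn, as.data.frame(datos_norm)))
  # note: knnImpute also re-centres and re-scales each column
}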

Distance Matrix

distance <- dist(datos_norm, method = "euclidean")
distance_matrix <- as.matrix(distance)

Visualizing the Distance Matrix

fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

Comparing two dendrograms

We’ll use the dendextend package, which contains many functions for comparing two dendrograms. First, compute two hierarchical clusterings:

clust_1 <- hclust(distance, method = "complete")
clust_2 <- hclust(distance, method = "ward.D2")

Compute the two dendrograms

dend1 <- as.dendrogram(clust_1)
dend2 <- as.dendrogram(clust_2)

dend_list <- dendlist(dend1, dend2) # list of dendrograms

The function tanglegram() plots two dendrograms side by side, with their labels connected by lines. It can be used to visually compare two hierarchical clustering methods as follows:

tanglegram(dend1, dend2)
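
The quality of the alignment between the two trees can also be quantified. A minimal sketch using dendextend's entanglement() and untangle() (an addition to the original analysis); values close to 0 mean the two dendrograms agree well:

# Entanglement: 0 = dendrograms perfectly aligned, 1 = fully entangled
dend_list %>%
  untangle(method = "step1side") %>%  # rotate branches to improve the layout first
  entanglement() %>%
  round(2)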

Correlation matrix between a list of dendrograms

# Create multiple dendrograms by chaining
dend1 <- datos_norm %>% dist %>% hclust("complete") %>% as.dendrogram
dend2 <- datos_norm %>% dist %>% hclust("single") %>% as.dendrogram
dend3 <- datos_norm %>% dist %>% hclust("ward.D2") %>% as.dendrogram
dend4 <- datos_norm %>% dist %>% hclust("centroid") %>% as.dendrogram

# Compute correlation matrix
dend_list <- dendlist("Complete" = dend1, "Single" = dend2,
                      "Ward.D2" = dend3, "Centroid" = dend4)
cors <- cor.dendlist(dend_list)
# Print correlation matrix
round(cors, 2)
##          Complete Single Ward.D2 Centroid
## Complete     1.00   0.52    0.41     0.34
## Single       0.52   1.00    0.20     0.62
## Ward.D2      0.41   0.20    1.00     0.05
## Centroid     0.34   0.62    0.05     1.00
corrplot(cors, method = "number", type = "lower")

Alternative Approach to Hierarchical Clustering

Alternatively, we can use the agnes function. It behaves very similarly to hclust; however, agnes also returns the agglomerative coefficient, which measures the amount of clustering structure found (values closer to 1 suggest a strong clustering structure).

This lets us compare which hierarchical clustering method identifies the strongest clustering structure. Here we see that Ward’s method identifies the strongest clustering structure of the four methods assessed.

# methods to assess
m <- c("average", "single", "complete", "ward")
names(m) <- c("average", "single", "complete", "ward.D2")

# function to compute the agglomerative coefficient
ac <- function(x) { agnes(datos_norm, method = x)$ac }
map_dbl(m, ac)
##   average    single  complete   ward.D2 
## 0.1378161 0.1030981 0.2302779 0.4472303
# Note that agnes(*, method="ward") corresponds to hclust(*, "ward.D2").
# Compute with agnes
hc_a <- agnes(datos_norm, method = "ward")
# Agglomerative coefficient
hc_a$ac
## [1] 0.4472303
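
The agnes result can also be drawn as a dendrogram; a minimal sketch using pltree() from the cluster package (an addition, not part of the original output):

# dendrogram of the agnes (Ward) clustering
pltree(hc_a, cex = 0.6, hang = -1, main = "Dendrogram of agnes (ward)")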

Clustering Validation and Evaluation

Clustering validation and evaluation strategies consist of measuring the goodness of the clustering results. Before applying any clustering algorithm to a data set, the first thing to do is to assess the clustering tendency, that is, whether the data contain any inherent grouping structure.

If yes, the next question is how many clusters there are. Then you can perform hierarchical clustering or partitioning clustering (with a pre-specified number of clusters). Finally, evaluate the goodness of the clustering results.

Assessing clustering tendency

To assess the clustering tendency, the Hopkins’ statistic and a visual approach can be used. This can be performed using the function get_clust_tendency() [factoextra package], which creates an ordered dissimilarity image (ODI).

Hopkins statistic: If the value of the Hopkins statistic is close to 1 (far above 0.5), then we can conclude that the dataset is significantly clusterable.

Visual approach: The visual approach detects the clustering tendency by counting the number of square shaped dark (or colored) blocks along the diagonal in the ordered dissimilarity image.

gradient.color <- list(low = "steelblue", high = "white")
datos_norm %>%
  get_clust_tendency(n = 50, gradient = gradient.color)
## $hopkins_stat
## [1] 0.6560631
## 
## $plot

Determining the optimal number of clusters

Several approaches are applied below: the elbow method (total within-cluster sum of squares), the average silhouette method and the gap statistic.

Clustering Visualization

# clust_1 <- hclust(distance, method = "ward.D2")
# plot(clust_1)
# Enhanced hierarchical clustering, cut in 2 groups
res.hc <- datos_norm %>%
          eclust("hclust", k = 2, graph = FALSE, hc_method = "ward.D2")
# Visualize with factoextra
fviz_dend(res.hc, palette = "jco",
          rect = TRUE, show_labels = FALSE)

Visualize the clustered data with a rectangle border drawn around each of the k = 2 groups.

To evaluate the result, we can calculate the silhouette index.

# reactable(silhouette(cutree(clust_1, k = 2), distance))
library(knitr)
library(factoextra)
library(ggplot2)
fviz_silhouette(res.hc)
##   cluster size ave.sil.width
## 1       1   50          0.01
## 2       2   18          0.05

silhouette(cutree(clust_1, k = 2), distance) calculates the silhouette measure for the clusters obtained by cutting the tree into two groups with the cutree function, where distance is the distance matrix used in the clustering process. The silhouette measure describes the quality of the grouping: it indicates how well each object lies within its own group compared to the other groups. A high value indicates a good grouping, while a low value indicates a bad one.

The silhouette measurement can vary between -1 and 1, where values close to 1 indicate a good grouping, while values close to -1 indicate a bad grouping. A value of 0 indicates that the groups overlap each other.
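
As a complementary check, the silhouette object can be inspected directly; a short sketch using cluster::silhouette and base summary() (not part of the original output):

# per-observation silhouette widths for the 2-group cut
sil <- silhouette(cutree(clust_1, k = 2), distance)
summary(sil)      # average silhouette width per cluster and overall
head(sil[, 1:3])  # cluster, nearest neighbouring cluster and sil_width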

Using the function fviz_cluster(), we can also visualize the result in a scatter plot. Observations are represented by points in the plot, using principal components. A frame is drawn around each cluster.

fviz_cluster(list(data = datos_norm, cluster = cutree(clust_1, k = 2)))

K-means clustering

# K-means clustering with k = 2 via eclust
res.hc <- datos_norm %>%
          eclust("kmeans", k = 2, graph = FALSE)

# k-means does not have a dendrogram
fviz_silhouette(res.hc)
##   cluster size ave.sil.width
## 1       1   37          0.03
## 2       2   31          0.03

fviz_cluster(list(data = datos_norm, cluster = res.hc$cluster))

k2 <- datos_norm %>% eclust("kmeans", k = 2, graph = FALSE)
k3 <- datos_norm %>% eclust("kmeans", k = 3, graph = FALSE)
k4 <- datos_norm %>% eclust("kmeans", k = 4, graph = FALSE)
k5 <- datos_norm %>% eclust("kmeans", k = 5, graph = FALSE)

# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = datos_norm) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = datos_norm) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = datos_norm) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = datos_norm) + ggtitle("k = 5")

library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)

Recall that, the basic idea behind cluster partitioning methods, such as k-means clustering, is to define clusters such that the total intra-cluster variation (known as total within-cluster variation or total within-cluster sum of square) is minimized:

\[ \text{minimize}\Bigg(\sum_{k=1}^{K}W(C_k)\Bigg) \]

Where \(C_k\) is the \(k^{th}\) cluster and \(W(C_k)\) is the within-cluster variation. The total within-cluster sum of squares (wss) measures the compactness of the clustering, and we want it to be as small as possible. Thus, we can use the following algorithm to define the optimal number of clusters:

  1. Compute clustering algorithm (e.g., k-means clustering) for different values of \(k\). For instance, by varying \(k\) from 1 to 7 clusters.

  2. For each \(k\), calculate the total within-cluster sum of square (wss).

  3. Plot the curve of wss according to the number of clusters \(k\).

  4. The location of a bend (knee) in the plot is generally considered an indicator of the appropriate number of clusters.

We can implement this in R with the following code. The results suggest that 4 is the optimal number of clusters, as that is where the bend (knee or elbow) appears.

set.seed(123)

# function to compute total within-cluster sum of square 
wss <- function(k) {
  kmeans(datos_norm, k, nstart = 10 )$tot.withinss
}

# Compute and plot wss for k = 1 to k = 10
k.values <- 1:10

# extract wss for 1-10 clusters
wss_values <- map_dbl(k.values, wss)

plot(k.values, wss_values,
       type="b", pch = 19, frame = FALSE, 
       xlab="Number of clusters K",
       ylab="Total within-clusters sum of squares")

set.seed(123)

fviz_nbclust(datos_norm, kmeans, method = "wss")

Average Silhouette Method

In short, the average silhouette approach measures the quality of a clustering. That is, it determines how well each object lies within its cluster. A high average silhouette width indicates a good clustering. The average silhouette method computes the average silhouette of observations for different values of k. The optimal number of clusters k is the one that maximizes the average silhouette over a range of possible values for \(k\).

We can use the silhouette function in the cluster package to compute the average silhouette width. The code below applies this approach for up to 10 clusters. The results show that 2 clusters maximize the average silhouette value, with 4 clusters coming in as the second optimal number of clusters.
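
As a sketch of the computation behind this plot (assuming kmeans with nstart = 10; the helper avg_sil() and the object names are illustrative additions), the average silhouette width can also be computed manually for each k:

# average silhouette width for a k-means solution with k clusters
avg_sil <- function(k) {
  km.res <- kmeans(datos_norm, centers = k, nstart = 10)
  ss <- silhouette(km.res$cluster, dist(datos_norm))
  mean(ss[, 3])   # column 3 holds the silhouette width of each observation
}

# silhouette needs at least 2 clusters, so start at k = 2
k_sil <- 2:10
avg_sil_values <- map_dbl(k_sil, avg_sil)

plot(k_sil, avg_sil_values,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Average silhouette width")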

fviz_nbclust(datos_norm, kmeans, method = "silhouette")

Gap Statistic Method

The gap statistic was published by R. Tibshirani, G. Walther, and T. Hastie (Stanford University, 2001). The approach can be applied to any clustering method (e.g. k-means clustering, hierarchical clustering). The gap statistic compares the total intra-cluster variation for different values of k with their expected values under a null reference distribution of the data (i.e. a distribution with no obvious clustering).

# compute gap statistic
set.seed(123)
gap_stat <- clusGap(datos_norm, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 25)
# Print the result
#print(gap_stat, method = "firstmax")
fviz_gap_stat(gap_stat)
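
If a single recommended k is wanted rather than a plot, one option (a sketch using cluster::maxSE on the clusGap table; an addition to the original commented print() call) is:

# pick k from the gap statistic table with the "firstSEmax" rule
gap_tab <- gap_stat$Tab
maxSE(f = gap_tab[, "gap"], SE.f = gap_tab[, "SE.sim"], method = "firstSEmax")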