library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.3.3
## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
# Load the heart-failure excerpt. The file is read with header = FALSE, so
# the column names are assigned explicitly below.
df <- read.csv("~/Downloads/HeartFailure_excerpt.csv", header = FALSE)

# View() opens a GUI data viewer; only call it in an interactive session so
# the script can also be run non-interactively.
if (interactive()) {
  View(df)
}

column_names <- c(
  "Age", "Creatinine_Phosphokinase", "Ejection_Fraction",
  "Platelets", "Serum_Creatinine", "Serum_Sodium", "Time", "Death_Event"
)
# Fail early on an unexpected file layout: a column-count mismatch would
# otherwise mislabel columns or fail later with a less helpful message.
stopifnot(ncol(df) == length(column_names))
colnames(df) <- column_names

# Every column except the outcome (Death_Event) is a clustering feature.
# Selecting by name keeps this correct if columns are appended to df later.
df_features <- df[, setdiff(colnames(df), "Death_Event"), drop = FALSE]

# Standardize the data
df_scaled <- scale(df_features)

# K-Means Clustering
# Candidate numbers of clusters evaluated by both diagnostics below.
set.seed(42)
k_values <- 2:10

# Total within-cluster sum of squares for each candidate k.
# vapply() pins the result to one numeric value per k.
wss <- vapply(
  k_values,
  function(k) kmeans(df_scaled, centers = k, nstart = 10)$tot.withinss,
  numeric(1)
)

# Elbow plot (built with ggplot(); qplot() is deprecated since ggplot2 3.4.0)
elbow_plot <- ggplot(
  data.frame(k = k_values, wss = wss),
  aes(x = k, y = wss)
) +
  geom_line() +
  geom_point(size = 3) +
  ggtitle("Elbow Method for Optimal K in K-Means") +
  xlab("Number of Clusters (k)") +
  ylab("Within-Cluster Sum of Squares")
print(elbow_plot)

# Compute silhouette scores for different k
# The distance matrix does not depend on k, so it is computed once.
# kmeans() is deliberately re-run here (rather than reused from the elbow
# pass) so the sequence of random starts after set.seed(42) is unchanged.
sil_dist <- dist(df_scaled)
silhouette_scores <- vapply(
  k_values,
  function(k) {
    km <- kmeans(df_scaled, centers = k, nstart = 10)
    mean(silhouette(km$cluster, sil_dist)[, 3])
  },
  numeric(1)
)

# Plot Silhouette scores
silhouette_plot <- ggplot(
  data.frame(k = k_values, score = silhouette_scores),
  aes(x = k, y = score)
) +
  geom_line() +
  geom_point(size = 3) +
  ggtitle("Silhouette Scores for Different K in K-Means") +
  xlab("Number of Clusters (k)") +
  ylab("Silhouette Score")
print(silhouette_plot)

# Final K-Means fit. k = 3 is assumed to be the best value from the
# elbow / silhouette analysis.
k_best <- 3
km_final <- kmeans(x = df_scaled, centers = k_best, nstart = 10)
df[["Cluster_KMeans"]] <- km_final[["cluster"]]

# Mean silhouette width of the final K-Means partition
silhouette_kmeans <- mean(
  silhouette(km_final[["cluster"]], dist(df_scaled))[, "sil_width"]
)
print(paste("Silhouette Score for K-Means:", silhouette_kmeans))
## [1] "Silhouette Score for K-Means: 0.13209140483699"
# -------------------------
# Hierarchical Clustering
# -------------------------
# For each linkage method: build the tree, cut it into k_best clusters and
# record the mean silhouette width of the resulting partition.
# NOTE(review): per the stats::hclust documentation, "ward.D" implements
# Ward's criterion only when given squared distances; "ward.D2" is the
# variant meant for plain Euclidean distances like dist() returns here.
# Confirm which is intended -- switching would change the results below.
linkage_methods <- c("ward.D", "complete", "average", "single")

# The distance matrix is the same for every linkage method: compute it once.
hc_dist <- dist(df_scaled)

# Preallocate a named result vector instead of growing it with c().
silhouette_hierarchical <- setNames(
  numeric(length(linkage_methods)),
  linkage_methods
)

for (method in linkage_methods) {
  hc <- hclust(hc_dist, method = method)
  clusters <- cutree(hc, k = k_best)
  silhouette_hierarchical[[method]] <- mean(silhouette(clusters, hc_dist)[, 3])
}

print("Silhouette Scores for Hierarchical Clustering:")
## [1] "Silhouette Scores for Hierarchical Clustering:"
print(silhouette_hierarchical)
##    ward.D  complete   average    single 
## 0.1045150 0.4123292 0.4825934 0.5373909
# Dendrogram for single linkage, the method with the highest mean
# silhouette width (best internal score) among those compared.
hc_single <- hclust(d = dist(df_scaled), method = "single")
plot(
  x = hc_single,
  main = "Dendrogram for Single Linkage Clustering",
  xlab = "Data Points",
  ylab = "Height"
)

# External Evaluation - Adjusted Rand Index (ARI)
# Compares each clustering against the Death_Event column of df.
library(mclust)  # For ARI computation
## Warning: package 'mclust' was built under R version 4.3.2
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
ari_kmeans <- adjustedRandIndex(df$Death_Event, df$Cluster_KMeans)
print(paste("Adjusted Rand Index for K-Means:", ari_kmeans))
## [1] "Adjusted Rand Index for K-Means: 0.146584857214254"
# The distance matrix is identical for every linkage method: compute it once
# instead of once per method inside the closure.
ari_dist <- dist(df_scaled)

# vapply() pins the result to exactly one numeric ARI per linkage method.
ari_hierarchical <- vapply(
  linkage_methods,
  function(method) {
    hc <- hclust(ari_dist, method = method)
    clusters <- cutree(hc, k = k_best)
    adjustedRandIndex(df$Death_Event, clusters)
  },
  numeric(1)
)

names(ari_hierarchical) <- linkage_methods
print("Adjusted Rand Index for Hierarchical Clustering:")
## [1] "Adjusted Rand Index for Hierarchical Clustering:"
print(ari_hierarchical)
##     ward.D   complete    average     single 
## 0.06327714 0.03289532 0.01518362 0.01489878