This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Install required packages if not already installed.
# requireNamespace() only checks availability and does not attach the package;
# attaching is done by the library() calls later in the script.
if (!requireNamespace("cluster", quietly = TRUE)) {
  install.packages("cluster", dependencies = TRUE)
}
## Loading required package: cluster
# Install factoextra only when it is missing (availability check, no attach)
if (!requireNamespace("factoextra", quietly = TRUE)) {
  install.packages("factoextra", dependencies = TRUE)
}
## Loading required package: factoextra
## Warning: package 'factoextra' was built under R version 4.4.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Install dendextend only when it is missing (availability check, no attach)
if (!requireNamespace("dendextend", quietly = TRUE)) {
  install.packages("dendextend", dependencies = TRUE)
}
## Loading required package: dendextend
## Warning: package 'dendextend' was built under R version 4.4.3
##
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
# Attach the packages used by the analysis below.
# The order is kept as-is because later packages can mask earlier names.
library("cluster")
library("factoextra")
library("dendextend")
# Load dataset.
# The CSV location defaults to the path used originally; set the
# HEART_DATA_PATH environment variable to read the file from elsewhere.
heart_data_path <- Sys.getenv(
  "HEART_DATA_PATH",
  unset = "C:/Users/yeu3178/Downloads/HeartFailure_excerpt.csv"
)
# Fail early with the resolved path in the message if the file is missing
if (!file.exists(heart_data_path)) {
  stop("Heart failure dataset not found at: ", heart_data_path, call. = FALSE)
}
heart_data <- read.csv(heart_data_path)
# Column names to apply to the dataset, in column order
features_name <- c(
  "age", "creatinine", "ejection_fraction", "platelets",
  "serum_creatinine", "serum_sodium", "time", "death_event"
)
# The names are applied purely by position, so the data must have exactly one
# column per name; stop with a clear message instead of mislabelling columns.
if (ncol(heart_data) != length(features_name)) {
  stop(
    "Expected ", length(features_name), " columns in heart_data but found ",
    ncol(heart_data), "; column names were not assigned.",
    call. = FALSE
  )
}
# Assign correct column names to dataset
colnames(heart_data) <- features_name
# Keep only numeric feature columns for clustering.
# death_event is the outcome label that the clusters are compared against
# later in the script, so it must not be one of the clustering inputs: with
# the label in the feature matrix the clusters largely reproduce it and the
# comparison becomes circular. Outputs recorded in this document predate this
# change and need a re-knit to refresh.
label_column <- "death_event"
# vapply() guarantees a logical vector regardless of the input's shape
is_numeric_column <- vapply(heart_data, is.numeric, logical(1))
feature_columns <- setdiff(names(heart_data)[is_numeric_column], label_column)
# drop = FALSE keeps a data frame even if only one feature column remains
heart_data_numeric <- heart_data[, feature_columns, drop = FALSE]
# Centre every feature and rescale it to unit standard deviation so that no
# single variable dominates the distance computation
heart_scaled <- scale(heart_data_numeric, center = TRUE, scale = TRUE)
# Pairwise Euclidean distances between the standardised observations
dist_matrix <- dist(x = heart_scaled, method = "euclidean")
# 1. Find optimal K for K-Means ----
# k-means starts from random centres, so the RNG is seeded before each
# diagnostic to make the plots reproducible between runs. nstart = 25 is
# forwarded to kmeans() so the diagnostics use the same number of random
# starts as the model fitted afterwards.
set.seed(123)
fviz_nbclust(heart_scaled, kmeans, method = "wss", nstart = 25) +
  labs(title = "Elbow Method for Optimal K")
set.seed(123)
fviz_nbclust(heart_scaled, kmeans, method = "silhouette", nstart = 25) +
  labs(title = "Silhouette Method for Optimal K")
# Number of clusters, read off the elbow and silhouette plots (edit as needed)
best_k <- 2
# 2. Fit K-Means with the chosen K ----
# Fixed seed so the 25 random starts give the same partition on every run
set.seed(123)
kmeans_model <- kmeans(
  x = heart_scaled,
  centers = best_k,
  nstart = 25
)
# Number of observations assigned to each K-Means cluster
table(kmeans_model$cluster)
##
## 1 2
## 201 97
# Plot the K-Means partition of the scaled data
fviz_cluster(object = kmeans_model, data = heart_scaled)
# Silhouette widths for the K-Means assignment, based on the distance matrix
sil_km <- silhouette(x = kmeans_model$cluster, dist = dist_matrix)
# Silhouette plot; also prints the per-cluster summary
fviz_silhouette(sil.obj = sil_km)
## cluster size ave.sil.width
## 1 1 201 0.33
## 2 2 97 0.11
# 3. Hierarchical clustering ----
# Three agglomerative fits on the same distance matrix, one per linkage rule
hclust_avg <- hclust(d = dist_matrix, method = "average")
hclust_complete <- hclust(d = dist_matrix, method = "complete")
hclust_ward <- hclust(d = dist_matrix, method = "ward.D2")
# Dendrogram of the average-linkage tree, with boxes around the best_k clusters
plot(
  hclust_avg,
  main = "Hierarchical Clustering (Average Linkage)",
  xlab = "",
  sub = ""
)
rect.hclust(
  hclust_avg,
  k = best_k,
  border = c("blue", "green", "red")
)
# Same view for the Ward tree
plot(
  hclust_ward,
  main = "Hierarchical Clustering (Ward's Method)",
  xlab = "",
  sub = ""
)
rect.hclust(
  hclust_ward,
  k = best_k,
  border = c("blue", "green", "red")
)
# Assign each observation to one of best_k clusters from the average-linkage
# tree
hc_clusters <- cutree(tree = hclust_avg, k = best_k)
# Number of observations per hierarchical cluster
table(hc_clusters)
## hc_clusters
## 1 2
## 295 3
# 4. Cross-tabulate cluster assignments against the outcome label ----
# Runs only when the dataset carries a death_event column
if ("death_event" %in% names(heart_data)) {
  # Rows: hierarchical clusters; columns: death_event values
  print(table(hc_clusters, heart_data$death_event))
  # Rows: K-Means clusters; columns: death_event values
  print(table(kmeans_model$cluster, heart_data$death_event))
}
##
## hc_clusters 0 1
## 1 203 92
## 2 0 3
##
## 0 1
## 1 200 1
## 2 3 94
# Silhouette widths for the hierarchical assignment, using the same distance
# matrix as the K-Means silhouette
sil_hc <- silhouette(x = hc_clusters, dist = dist_matrix)
# Silhouette plot; also prints the per-cluster summary
fviz_silhouette(sil.obj = sil_hc)
## cluster size ave.sil.width
## 1 1 295 0.56
## 2 2 3 0.33
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.