R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Install required packages if not already installed ----
# requireNamespace() only tests availability; unlike require() it does not
# attach the package as a side effect. Attaching is left to the explicit
# library() calls below, which stop with an error if a package is still missing.
required_pkgs <- c("cluster", "factoextra", "dendextend")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  # A non-interactive knit cannot prompt for a CRAN mirror, so fall back to an
  # explicit one when none has been configured.
  cran_repos <- getOption("repos")
  if (is.null(cran_repos) || is.na(cran_repos["CRAN"]) ||
      identical(cran_repos[["CRAN"]], "@CRAN@")) {
    cran_repos["CRAN"] <- "https://cloud.r-project.org"
  }
  install.packages(missing_pkgs, dependencies = TRUE, repos = cran_repos)
}

# Load required libraries ----
library(cluster)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.4.3
## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree

# Load dataset ----
# NOTE(review): machine-specific absolute path; point it at your own copy of the
# file before knitting.
heart_data_path <- "C:/Users/yeu3178/Downloads/HeartFailure_excerpt.csv"
if (!file.exists(heart_data_path)) {
  stop("Dataset not found: ", heart_data_path, call. = FALSE)
}
heart_data <- read.csv(heart_data_path)

# Define proper column names
features_name <- c(
  "age", "creatinine", "ejection_fraction", "platelets",
  "serum_creatinine", "serum_sodium", "time", "death_event"
)

# Assign correct column names to dataset.
# Guard first: the names are positional, so a file with a different number of
# columns would otherwise fail with a less helpful error.
if (ncol(heart_data) != length(features_name)) {
  stop(
    "Expected ", length(features_name), " columns but found ",
    ncol(heart_data), " in ", heart_data_path,
    call. = FALSE
  )
}
colnames(heart_data) <- features_name

# Keep only numeric columns for clustering, excluding the outcome label.
# death_event is compared against the clusters later on; clustering on it as
# well would leak the label into the features and make that comparison circular.
# NOTE: the recorded outputs shown further down in this document were produced
# with death_event among the clustering features; re-knit to refresh them.
is_numeric_col <- vapply(heart_data, is.numeric, logical(1))
feature_cols <- setdiff(names(heart_data)[is_numeric_col], "death_event")
# drop = FALSE keeps a data frame even if only one feature column remains.
heart_data_numeric <- heart_data[, feature_cols, drop = FALSE]

# kmeans() cannot handle missing values, so fail early with a clear message.
if (anyNA(heart_data_numeric)) {
  stop("Clustering features contain missing values.", call. = FALSE)
}

# Standardize (scale) the data so every feature has mean 0 and unit variance
heart_scaled <- scale(heart_data_numeric)

# Compute Euclidean distance matrix between the standardized observations
dist_matrix <- dist(heart_scaled, method = "euclidean")

# 1. Find optimal K for K-Means ----
# kmeans() starts from randomly chosen centres, so seed the RNG before each
# diagnostic to keep the plots identical from knit to knit. Extra arguments to
# fviz_nbclust() are forwarded to kmeans(); nstart = 25 uses the same number of
# random restarts as the final k-means fit instead of a single start.
set.seed(123)
fviz_nbclust(heart_scaled, kmeans, method = "wss", nstart = 25) +
  labs(title = "Elbow Method for Optimal K")

set.seed(123)
fviz_nbclust(heart_scaled, kmeans, method = "silhouette", nstart = 25) +
  labs(title = "Silhouette Method for Optimal K")

# Choose best K based on the plots
best_k <- 2  # Update this based on the elbow & silhouette results

# 2. K-Means clustering with the chosen K ----
# Fixed seed so the 25 random restarts give the same partition on every knit.
set.seed(123)
kmeans_model <- kmeans(
  x = heart_scaled,
  centers = best_k,
  nstart = 25
)

# Number of observations assigned to each k-means cluster
table(kmeans_model[["cluster"]])
## 
##   1   2 
## 201  97
# Plot the k-means partition of the scaled data
fviz_cluster(object = kmeans_model, data = heart_scaled)

# Silhouette widths of the k-means partition on the Euclidean distances
sil_km <- silhouette(kmeans_model[["cluster"]], dist = dist_matrix)
fviz_silhouette(sil.obj = sil_km)
##   cluster size ave.sil.width
## 1       1  201          0.33
## 2       2   97          0.11

# 3. Hierarchical clustering ----
# Three agglomerative trees on the same distance matrix, one per linkage rule.
hclust_avg <- hclust(d = dist_matrix, method = "average")
hclust_complete <- hclust(d = dist_matrix, method = "complete")
hclust_ward <- hclust(d = dist_matrix, method = "ward.D2")

# Dendrogram for average linkage, with boxes around the best_k clusters
plot(
  x = hclust_avg,
  main = "Hierarchical Clustering (Average Linkage)",
  xlab = "",
  sub = ""
)
rect.hclust(
  tree = hclust_avg,
  k = best_k,
  border = c("blue", "green", "red")
)

# Dendrogram for Ward's method, with boxes around the best_k clusters
plot(
  x = hclust_ward,
  main = "Hierarchical Clustering (Ward's Method)",
  xlab = "",
  sub = ""
)
rect.hclust(
  tree = hclust_ward,
  k = best_k,
  border = c("blue", "green", "red")
)

# Cluster membership from cutting the average-linkage tree into best_k groups
hc_clusters <- cutree(tree = hclust_avg, k = best_k)
table(hc_clusters)  # cluster sizes
## hc_clusters
##   1   2 
## 295   3
# 4. Compare clusters with labels (if available) ----
# Cross-tabulate each partition against the recorded death_event outcome:
# hierarchical clusters first, then k-means clusters.
if ("death_event" %in% names(heart_data)) {
  print(table(hc_clusters, heart_data[["death_event"]]))
  print(table(kmeans_model[["cluster"]], heart_data[["death_event"]]))
}
##            
## hc_clusters   0   1
##           1 203  92
##           2   0   3
##    
##       0   1
##   1 200   1
##   2   3  94
# Silhouette widths of the hierarchical partition on the Euclidean distances
sil_hc <- silhouette(hc_clusters, dist = dist_matrix)
fviz_silhouette(sil.obj = sil_hc)
##   cluster size ave.sil.width
## 1       1  295          0.56
## 2       2    3          0.33

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.