Project2

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Install each required package only when it cannot already be loaded
if (!require("cluster")) {
  install.packages("cluster", dependencies = TRUE)
}

## Loading required package: cluster

# factoextra: installed on demand when it cannot be loaded
if (!require("factoextra")) {
  install.packages("factoextra", dependencies = TRUE)
}

## Loading required package: factoextra

## Warning: package 'factoextra' was built under R version 4.4.3

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# dendextend: installed on demand when it cannot be loaded
if (!require("dendextend")) {
  install.packages("dendextend", dependencies = TRUE)
}

## Loading required package: dendextend

## Warning: package 'dendextend' was built under R version 4.4.3

## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

# Load required libraries
library(cluster)
library(factoextra)
library(dendextend)

# Read the heart-failure excerpt from disk into a data frame.
# NOTE(review): read.csv() defaults to header = TRUE, yet the column names are
# overwritten further down; if the file has no header row, its first record is
# being consumed as the header - confirm against the CSV.
heart_data <- read.csv(
  file = "C:/Users/yeu3178/Downloads/HeartFailure_excerpt.csv"
)

# Descriptive names for the dataset columns, in file order
features_name <- c(
  "age",
  "creatinine",
  "ejection_fraction",
  "platelets",
  "serum_creatinine",
  "serum_sodium",
  "time",
  "death_event"
)

# Assign the descriptive column names to the dataset.
# Check the column count first: a names vector that is too long makes the
# assignment error, and one that is too short silently leaves NA column names.
stopifnot(ncol(heart_data) == length(features_name))
colnames(heart_data) <- features_name

# Keep only the numeric feature columns for clustering.
# death_event is the outcome label that the clusters are compared against
# later in this script, so it is excluded here: clustering on the label itself
# makes that comparison circular.
# vapply() guarantees a logical mask (sapply() can return a list for empty
# input), and drop = FALSE keeps a data frame even if one column remains.
# NOTE: output recorded in this document from earlier runs included
# death_event as a feature; re-knit to refresh it.
is_feature <- vapply(heart_data, is.numeric, logical(1)) &
  names(heart_data) != "death_event"
heart_data_numeric <- heart_data[, is_feature, drop = FALSE]

# Centre each column and divide it by its standard deviation
heart_scaled <- scale(x = heart_data_numeric, center = TRUE, scale = TRUE)

# Pairwise Euclidean distances between the standardised observations
dist_matrix <- dist(x = heart_scaled, method = "euclidean")

# 1. Find the optimal K for K-Means ----
# kmeans() starts from randomly chosen centres, so the RNG is seeded before
# each diagnostic to make the plots reproducible from one knit to the next.
# nstart is forwarded to kmeans() so the diagnostics use the same multi-start
# setting as the final model fitted further down.
set.seed(123)
fviz_nbclust(heart_scaled, kmeans, method = "wss", nstart = 25) +
  labs(title = "Elbow Method for Optimal K")

set.seed(123)
fviz_nbclust(heart_scaled, kmeans, method = "silhouette", nstart = 25) +
  labs(title = "Silhouette Method for Optimal K")

# Number of clusters used by the K-Means and hierarchical steps below.
# Revisit this value after inspecting the elbow and silhouette plots.
best_k <- 2

# 2. Apply K-Means clustering with the chosen K ----
# Seed first so the 25 random starts give the same partition on every run.
set.seed(123)
kmeans_model <- kmeans(x = heart_scaled, centers = best_k, nstart = 25)

# Number of observations assigned to each K-Means cluster
table(kmeans_model[["cluster"]])

## 
##   1   2 
## 201  97

# Plot the K-Means partition of the scaled data
fviz_cluster(object = kmeans_model, data = heart_scaled)

# Silhouette widths of the K-Means partition, computed on the Euclidean
# distance matrix, followed by the silhouette plot
sil_km <- silhouette(x = kmeans_model$cluster, dist = dist_matrix)
fviz_silhouette(sil.obj = sil_km)

##   cluster size ave.sil.width
## 1       1  201          0.33
## 2       2   97          0.11

# 3. Apply hierarchical clustering ----
# Build one tree per linkage criterion from the same distance matrix.
hclust_avg <- hclust(d = dist_matrix, method = "average")
hclust_complete <- hclust(d = dist_matrix, method = "complete")
hclust_ward <- hclust(d = dist_matrix, method = "ward.D2")

# Dendrogram for average linkage, with the best_k clusters boxed
plot(
  x = hclust_avg,
  main = "Hierarchical Clustering (Average Linkage)",
  xlab = "",
  sub = ""
)
rect.hclust(
  tree = hclust_avg,
  k = best_k,
  border = c("blue", "green", "red")
)

# Dendrogram for Ward's method, with the best_k clusters boxed
plot(
  x = hclust_ward,
  main = "Hierarchical Clustering (Ward's Method)",
  xlab = "",
  sub = ""
)
rect.hclust(
  tree = hclust_ward,
  k = best_k,
  border = c("blue", "green", "red")
)

# Assign every observation to one of best_k clusters by cutting the
# average-linkage tree, then show the resulting cluster sizes
hc_clusters <- cutree(tree = hclust_avg, k = best_k)
table(hc_clusters)

## hc_clusters
##   1   2 
## 295   3

# 4. Cross-tabulate cluster assignments against the outcome label ----
# Skipped when the data has no death_event column.
if (is.element("death_event", colnames(heart_data))) {
  # Hierarchical clusters (rows) versus death_event (columns)
  print(table(hc_clusters, heart_data$death_event))
  # K-Means clusters (rows) versus death_event (columns)
  print(table(kmeans_model$cluster, heart_data$death_event))
}

##            
## hc_clusters   0   1
##           1 203  92
##           2   0   3
##    
##       0   1
##   1 200   1
##   2   3  94

# Silhouette widths of the hierarchical partition, computed on the same
# distance matrix, followed by the silhouette plot
sil_hc <- silhouette(x = hc_clusters, dist = dist_matrix)
fviz_silhouette(sil.obj = sil_hc)

##   cluster size ave.sil.width
## 1       1  295          0.56
## 2       2    3          0.33

Project2

Chris Yeu

2025-03-18

R Markdown

Including Plots