R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3
# Load dataset
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists. Consider a project-relative path.
heart_data <- read.csv("C:/Users/otuata4438/Downloads/HeartFailure_excerpt.csv")

# Keep the outcome labels aside (for external evaluation of the clusters)
labels <- heart_data$DEATH_EVENT

# Remove labels for clustering: drop the outcome column so that it does not
# leak into the features the (unsupervised) clustering is computed on.
# NOTE: silhouette values recorded from runs made before this column was
# dropped included DEATH_EVENT as a feature and must be regenerated.
features <- heart_data[, names(heart_data) != "DEATH_EVENT", drop = FALSE]

# Scale the data (center each feature to mean 0 and scale to sd 1)
scaled_data <- scale(features)

# kmeans() chooses its starting centers at random, so fix the RNG seed to make
# the elbow curve, the cluster assignment and the silhouette reproducible
# from one knit to the next.
set.seed(123)

# K-Means Clustering - trying multiple cluster numbers
# For every candidate k, record the total within-cluster sum of squares.
k_values <- 1:10
wss <- map_dbl(k_values, \(k) {
  kmeans(scaled_data, centers = k, nstart = 25)$tot.withinss
})

# Plot WSS to find optimal k (look for the "elbow" in the curve)
plot(k_values, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters", ylab = "Total within-clusters sum of squares")

# Perform K-Means with optimal k (replace 3 with chosen number)
kmeans_result <- kmeans(scaled_data, centers = 3, nstart = 25)


# Internal evaluation for K-Means
# silhouette() returns one row per observation; the "sil_width" column
# (column 3) holds each observation's silhouette width.
silhouette_kmeans <- silhouette(kmeans_result$cluster, dist(scaled_data))
avg_silhouette_kmeans <- mean(silhouette_kmeans[, "sil_width"])

# External evaluation for K-Means


# Hierarchical Clustering - trying multiple linkage methods
linkage_methods <- c("ward.D2", "complete", "average", "single")

# The pairwise distances do not depend on the linkage method, so compute them
# once instead of once per method.
dist_matrix <- dist(scaled_data)

# One hclust tree per linkage method, in the same order as linkage_methods
hclust_results <- map(linkage_methods, \(method) {
  hclust(dist_matrix, method = method)
})

# Plot dendrograms
# walk2() is used because plotting is a pure side effect: unlike map2(), it
# returns its input invisibly, so no list of NULLs is printed to the output.
walk2(hclust_results, linkage_methods, \(hclust_result, method) {
  plot(hclust_result, main = paste("Dendrogram -", method), sub = "", xlab = "")
})
# Cut tree with optimal clusters (replace 3 with chosen number)
# Produces one vector of cluster assignments per tree in hclust_results.
cutree_results <- map(hclust_results, \(tree) cutree(tree, k = 3))

# Internal evaluation for Hierarchical
# The distance matrix is the same for every linkage method, so compute it once
# rather than inside the loop.
pairwise_dist <- dist(scaled_data)
silhouette_hierarchical <- map(cutree_results, \(clusters) {
  silhouette(clusters, pairwise_dist)
})

# Average silhouette width per linkage method; "sil_width" is column 3 of the
# matrix returned by silhouette().
avg_silhouette_hierarchical <- map_dbl(
  silhouette_hierarchical,
  \(sil) mean(sil[, "sil_width"])
)

# External evaluation for Hierarchical


# Report the internal-evaluation scores.
# Average silhouette width of the K-Means solution:
avg_silhouette_kmeans |> print()
## [1] 0.2610136
# Average silhouette width of each hierarchical solution, one value per
# entry of hclust_results (same order):
avg_silhouette_hierarchical |> print()
## [1] 0.2535210 0.4814053 0.4954127 0.5152873
# Display the K-Means clustering plot

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.