Project 2

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(cluster)
library(dendextend)

## Warning: package 'dendextend' was built under R version 4.4.3

## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Seed the random number generator.
set.seed(786)

# Absolute location of the CSV excerpt on the author's machine.
file_path <- "C:/Users/singleton1097/Downloads/HeartFailure_excerpt(in).csv"

# Abort immediately when the file is absent; otherwise read it.
if (!file.exists(file_path)) {
  stop("Error: File not found! Check the path.")
}
# header = FALSE: the first row is read as data, not as column names.
HeartFailure_excerpt <- read.csv(file_path, header = FALSE)
print("Dataset loaded successfully!")

## [1] "Dataset loaded successfully!"

# Labels for the eight columns, in file order.
feature_name <- c(
  "Age", "CPK_levels", "Ejection_Fraction", "Platelets",
  "Serum_Creatinine", "Serum_Sodium", "Time", "DEATH_EVENT"
)
names(HeartFailure_excerpt) <- feature_name

# Show each column's type and its first few values.
str(HeartFailure_excerpt)

## 'data.frame':    299 obs. of  8 variables:
##  $ Age              : num  75 55 65 50 65 90 75 60 65 80 ...
##  $ CPK_levels       : int  582 7861 146 111 160 47 246 315 157 123 ...
##  $ Ejection_Fraction: int  20 38 20 20 20 40 15 60 65 35 ...
##  $ Platelets        : num  265000 263358 162000 210000 327000 ...
##  $ Serum_Creatinine : num  1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
##  $ Serum_Sodium     : int  130 136 129 137 116 132 137 131 138 133 ...
##  $ Time             : int  4 6 7 7 8 8 10 10 10 10 ...
##  $ DEATH_EVENT      : int  1 1 1 1 1 1 1 1 1 1 ...

# Per-column summary statistics (min, quartiles, mean, max) of the raw data.
summary(HeartFailure_excerpt)

##       Age          CPK_levels     Ejection_Fraction   Platelets     
##  Min.   :40.00   Min.   :  23.0   Min.   :14.00     Min.   : 25100  
##  1st Qu.:51.00   1st Qu.: 116.5   1st Qu.:30.00     1st Qu.:212500  
##  Median :60.00   Median : 250.0   Median :38.00     Median :262000  
##  Mean   :60.83   Mean   : 581.8   Mean   :38.08     Mean   :263358  
##  3rd Qu.:70.00   3rd Qu.: 582.0   3rd Qu.:45.00     3rd Qu.:303500  
##  Max.   :95.00   Max.   :7861.0   Max.   :80.00     Max.   :850000  
##  Serum_Creatinine  Serum_Sodium        Time        DEATH_EVENT    
##  Min.   :0.500    Min.   :113.0   Min.   :  4.0   Min.   :0.0000  
##  1st Qu.:0.900    1st Qu.:134.0   1st Qu.: 73.0   1st Qu.:0.0000  
##  Median :1.100    Median :137.0   Median :115.0   Median :0.0000  
##  Mean   :1.394    Mean   :136.6   Mean   :130.3   Mean   :0.3211  
##  3rd Qu.:1.400    3rd Qu.:140.0   3rd Qu.:203.0   3rd Qu.:1.0000  
##  Max.   :9.400    Max.   :148.0   Max.   :285.0   Max.   :1.0000

# Drop incomplete rows, announcing it when any missing value is present.
if (anyNA(HeartFailure_excerpt)) {
  print("Warning: Missing values detected. Removing NA rows...")
  HeartFailure_excerpt <- na.omit(HeartFailure_excerpt)
}

# Set the outcome column aside so it takes no part in the clustering;
# it is kept as the reference labelling.
heart_labels <- HeartFailure_excerpt[["DEATH_EVENT"]]
HeartFailure_excerpt[["DEATH_EVENT"]] <- NULL

# Centre and scale every remaining column so that no single variable
# dominates the distance computation.
heart_failure_scaled <- as.data.frame(scale(HeartFailure_excerpt))

# Pairwise Euclidean distances between the scaled observations.
dist_mat <- dist(heart_failure_scaled, method = "euclidean")

# Linkage criteria to compare, and a (k x linkage) table of mean silhouette
# widths with rows labelled "2".."6".
methods <- c("complete", "single", "average")
sil_scores <- matrix(
  NA,
  nrow = 5, ncol = length(methods),
  dimnames = list(2:6, methods)
)

for (m in seq_along(methods)) {
  linkage <- methods[m]
  hclust_model <- hclust(dist_mat, method = linkage)
  plot(hclust_model, main = paste("Dendrogram -", linkage))

  # Cut the tree at each k and record the average silhouette width.
  for (k in 2:6) {
    clusters <- cutree(hclust_model, k)
    silhouette_score <- silhouette(clusters, dist_mat)
    sil_scores[as.character(k), linkage] <-
      mean(silhouette_score[, "sil_width"])
  }
}

# Report the mean silhouette width per linkage method (columns) and k (rows).
print("Silhouette Scores for Hierarchical Clustering:")

## [1] "Silhouette Scores for Hierarchical Clustering:"

print(sil_scores)

##     complete    single   average
## 2 0.58492314 0.5382345 0.5849231
## 3 0.41232918 0.5373909 0.4825934
## 4 0.39225866 0.5404165 0.4631045
## 5 0.35005824 0.4753990 0.4594738
## 6 0.09704012 0.3971030 0.3740178

# Pick the linkage whose silhouette width, averaged over every k tried,
# is the largest.
mean_sil_by_method <- colMeans(sil_scores, na.rm = TRUE)
best_method <- methods[which.max(mean_sil_by_method)]

# Refit with that linkage and outline the three-cluster cut on the dendrogram.
hclust_best <- hclust(dist_mat, method = best_method)
plot(hclust_best, main = paste("Best Dendrogram -", best_method))
rect.hclust(hclust_best, k = 3, border = 2:6)

# Cross-tabulate the three-cluster assignment against the recorded outcome.
# NOTE(review): k = 3 is fixed here rather than derived from sil_scores —
# confirm this is the intended number of clusters.
hc_clusters <- cutree(hclust_best, k = 3)
cm_hc <- table(hc_clusters, heart_labels)
print("Confusion Matrix for Hierarchical Clustering:")

## [1] "Confusion Matrix for Hierarchical Clustering:"

print(cm_hc)

##            heart_labels
## hc_clusters   0   1
##           1 203  94
##           2   0   1
##           3   0   1

# Misclassification rate of the clustering against the recorded outcome.
# Cluster ids are arbitrary and the table is 3 x 2 (not square), so the
# diagonal carries no meaning: diag() would only add cells (1,1) and (2,2).
# Instead, assign every cluster to its majority class and count the
# observations that fall outside that class.
error_hc <- 1 - sum(apply(cm_hc, 1, max)) / sum(cm_hc)
print(paste("Hierarchical Clustering Error:", round(error_hc, 4)))

## [1] "Hierarchical Clustering Error: 0.3144"

# Reseed before the randomly initialised k-means runs.
set.seed(1234)

# One mean silhouette width per candidate number of clusters.
k_range <- 2:6
kmeans_sil_scores <- numeric(length(k_range))

for (i in seq_along(k_range)) {
  k <- k_range[i]
  # 10 random starts per k, at most 200 iterations each.
  kmeans_model <- kmeans(
    heart_failure_scaled,
    centers = k, iter.max = 200, nstart = 10
  )
  kmeans_clusters <- kmeans_model$cluster
  sil_score <- silhouette(kmeans_clusters, dist_mat)
  kmeans_sil_scores[i] <- mean(sil_score[, "sil_width"])
}

# Pair each candidate k (2..6) with its mean silhouette width and print the table.
kmeans_results <- data.frame(Clusters=2:6, Silhouette_Score=kmeans_sil_scores)
print("Silhouette Scores for K-Means Clustering:")

## [1] "Silhouette Scores for K-Means Clustering:"

print(kmeans_results)

##   Clusters Silhouette_Score
## 1        2        0.1579632
## 2        3        0.1537191
## 3        4        0.1458186
## 4        5        0.1588330
## 5        6        0.1576545

# Final k-means fit with three centres (10 random starts, up to 200 iterations).
# NOTE(review): centers = 3 is fixed here rather than taken from
# kmeans_results — confirm this is the intended number of clusters.
best_kmeans_model <- kmeans(
  heart_failure_scaled,
  centers = 3, iter.max = 200, nstart = 10
)

# Cross-tabulate cluster membership (rows) against the recorded outcome (columns).
cm_kmeans <- table(best_kmeans_model[["cluster"]], heart_labels)
print("Confusion Matrix for K-Means Clustering:")

## [1] "Confusion Matrix for K-Means Clustering:"

print(cm_kmeans)

##    heart_labels
##       0   1
##   1  10   8
##   2  51  74
##   3 142  14

# Misclassification rate of the clustering against the recorded outcome.
# k-means cluster ids are arbitrary and the table is 3 x 2 (not square), so
# diag() would only add cells (1,1) and (2,2) and report a meaningless
# figure. Instead, assign every cluster to its majority class and count the
# observations that fall outside that class.
error_kmeans <- 1 - sum(apply(cm_kmeans, 1, max)) / sum(cm_kmeans)
print(paste("K-Means Clustering Error:", round(error_kmeans, 4)))

## [1] "K-Means Clustering Error: 0.2441"

# Scatter of the scaled Age and CPK_levels columns, coloured by the cluster
# each observation was assigned to in the final k-means fit.
ggplot(
  heart_failure_scaled,
  aes(
    x = Age,
    y = CPK_levels,
    color = factor(best_kmeans_model$cluster)
  )
) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "K-Means Clustering",
    x = "Age",
    y = "CPK Levels",
    color = "Cluster"
  )

Project 2

Augusta Singleton

2025-03-18

R Markdown

Including Plots