This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown, see <http://rmarkdown.rstudio.com>.
When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(cluster)
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.4.3
##
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Seed the random number generator.
set.seed(786)

# Location of the heart-failure CSV. The original hard-coded path remains the
# default, but it can be overridden without editing the script by setting the
# HEART_FAILURE_CSV environment variable.
file_path <- Sys.getenv("HEART_FAILURE_CSV", unset = "")
if (!nzchar(file_path)) {
  file_path <- "C:/Users/singleton1097/Downloads/HeartFailure_excerpt(in).csv"
}

# Fail fast and name the offending path when the file is missing. stop()
# already prefixes its message with "Error", so the text does not repeat it.
if (!file.exists(file_path)) {
  stop("File not found: ", file_path, ". Check the path.", call. = FALSE)
}

# header = FALSE: the first line of the file is read as data, not as names.
HeartFailure_excerpt <- read.csv(file_path, header = FALSE)
print("Dataset loaded successfully!")
## [1] "Dataset loaded successfully!"
# Attach descriptive column names to the data frame, then show its structure
# and a per-column numeric summary.
feature_name <- c(
  "Age", "CPK_levels", "Ejection_Fraction", "Platelets",
  "Serum_Creatinine", "Serum_Sodium", "Time", "DEATH_EVENT"
)
names(HeartFailure_excerpt) <- feature_name

# Compact overview: dimensions, column types and the first few values.
str(HeartFailure_excerpt)
## 'data.frame': 299 obs. of 8 variables:
## $ Age : num 75 55 65 50 65 90 75 60 65 80 ...
## $ CPK_levels : int 582 7861 146 111 160 47 246 315 157 123 ...
## $ Ejection_Fraction: int 20 38 20 20 20 40 15 60 65 35 ...
## $ Platelets : num 265000 263358 162000 210000 327000 ...
## $ Serum_Creatinine : num 1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
## $ Serum_Sodium : int 130 136 129 137 116 132 137 131 138 133 ...
## $ Time : int 4 6 7 7 8 8 10 10 10 10 ...
## $ DEATH_EVENT : int 1 1 1 1 1 1 1 1 1 1 ...

# Minimum, quartiles, mean and maximum of every column.
summary(HeartFailure_excerpt)
## Age CPK_levels Ejection_Fraction Platelets
## Min. :40.00 Min. : 23.0 Min. :14.00 Min. : 25100
## 1st Qu.:51.00 1st Qu.: 116.5 1st Qu.:30.00 1st Qu.:212500
## Median :60.00 Median : 250.0 Median :38.00 Median :262000
## Mean :60.83 Mean : 581.8 Mean :38.08 Mean :263358
## 3rd Qu.:70.00 3rd Qu.: 582.0 3rd Qu.:45.00 3rd Qu.:303500
## Max. :95.00 Max. :7861.0 Max. :80.00 Max. :850000
## Serum_Creatinine Serum_Sodium Time DEATH_EVENT
## Min. :0.500 Min. :113.0 Min. : 4.0 Min. :0.0000
## 1st Qu.:0.900 1st Qu.:134.0 1st Qu.: 73.0 1st Qu.:0.0000
## Median :1.100 Median :137.0 Median :115.0 Median :0.0000
## Mean :1.394 Mean :136.6 Mean :130.3 Mean :0.3211
## 3rd Qu.:1.400 3rd Qu.:140.0 3rd Qu.:203.0 3rd Qu.:1.0000
## Max. :9.400 Max. :148.0 Max. :285.0 Max. :1.0000
# Drop incomplete rows (announcing it when that happens) before any modelling.
if (anyNA(HeartFailure_excerpt)) {
  print("Warning: Missing values detected. Removing NA rows...")
  HeartFailure_excerpt <- na.omit(HeartFailure_excerpt)
}

# Set the outcome column aside as reference labels and remove it from the
# data frame so it does not take part in the clustering.
heart_labels <- HeartFailure_excerpt[["DEATH_EVENT"]]
HeartFailure_excerpt[["DEATH_EVENT"]] <- NULL

# Centre and scale every remaining column, then compute the pairwise Euclidean
# distances between rows of the scaled data.
heart_failure_scaled <- as.data.frame(scale(HeartFailure_excerpt))
dist_mat <- dist(heart_failure_scaled, method = "euclidean")
# Compare agglomerative linkage methods: for each one, draw the dendrogram and
# record the mean silhouette width of the tree cut into 2 to 6 clusters.
# Rows of sil_scores are labelled by cluster count, columns by linkage method.
methods <- c("complete", "single", "average")
sil_scores <- matrix(
  NA,
  nrow = 5,
  ncol = length(methods),
  dimnames = list(2:6, methods)
)

for (linkage in methods) {
  hclust_model <- hclust(dist_mat, method = linkage)
  plot(hclust_model, main = paste("Dendrogram -", linkage))

  # Column 3 of a silhouette object is the per-observation silhouette width.
  sil_scores[, linkage] <- vapply(
    2:6,
    function(k) mean(silhouette(cutree(hclust_model, k), dist_mat)[, 3]),
    numeric(1)
  )
}

print("Silhouette Scores for Hierarchical Clustering:")
## [1] "Silhouette Scores for Hierarchical Clustering:"
print(sil_scores)
## complete single average
## 2 0.58492314 0.5382345 0.5849231
## 3 0.41232918 0.5373909 0.4825934
## 4 0.39225866 0.5404165 0.4631045
## 5 0.35005824 0.4753990 0.4594738
## 6 0.09704012 0.3971030 0.3740178
# Pick the linkage method whose silhouette score, averaged over all tested
# cluster counts, is highest.
mean_sil_by_method <- colMeans(sil_scores, na.rm = TRUE)
best_method <- methods[which.max(mean_sil_by_method)]

# Refit with the chosen linkage, draw the tree and outline a 3-cluster cut.
hclust_best <- hclust(dist_mat, method = best_method)
best_title <- paste("Best Dendrogram -", best_method)
plot(hclust_best, main = best_title)
rect.hclust(hclust_best, k = 3, border = 2:6)

# Cluster membership of every observation for the same 3-cluster cut.
hc_clusters <- cutree(hclust_best, k = 3)
# Cross-tabulate hierarchical cluster membership (rows) against the reference
# labels (columns).
cm_hc <- table(hc_clusters, heart_labels)
print("Confusion Matrix for Hierarchical Clustering:")
## [1] "Confusion Matrix for Hierarchical Clustering:"
print(cm_hc)
## heart_labels
## hc_clusters 0 1
## 1 203 94
## 2 0 1
## 3 0 1

# Cluster ids are arbitrary and the table is not square (clusters x classes),
# so diag() would pair unrelated cells. Instead, label each cluster with its
# majority class and count everything outside that class as an error.
majority_hits_hc <- sum(apply(cm_hc, 1, max))
error_hc <- 1 - majority_hits_hc / sum(cm_hc)
print(paste("Hierarchical Clustering Error:", round(error_hc, 4)))
# Output below recomputed from the table above: 94 / 299.
## [1] "Hierarchical Clustering Error: 0.3144"
# Reseed so the k-means random starts are reproducible.
set.seed(1234)

# Mean silhouette width of a k-means fit for each cluster count from 2 to 6.
# The fits run in increasing k, one after another, on the same RNG stream.
kmeans_sil_scores <- vapply(
  2:6,
  function(k) {
    fit <- kmeans(heart_failure_scaled, centers = k, iter.max = 200, nstart = 10)
    # Column 3 of a silhouette object is the per-observation silhouette width.
    mean(silhouette(fit$cluster, dist_mat)[, 3])
  },
  numeric(1)
)

kmeans_results <- data.frame(
  Clusters = 2:6,
  Silhouette_Score = kmeans_sil_scores
)
print("Silhouette Scores for K-Means Clustering:")
## [1] "Silhouette Scores for K-Means Clustering:"
print(kmeans_results)
## Clusters Silhouette_Score
## 1 2 0.1579632
## 2 3 0.1537191
## 3 4 0.1458186
## 4 5 0.1588330
## 5 6 0.1576545
# Final k-means fit with 3 centres, cross-tabulated against the reference
# labels (rows: cluster id, columns: label).
best_kmeans_model <- kmeans(
  heart_failure_scaled,
  centers = 3,
  iter.max = 200,
  nstart = 10
)
cm_kmeans <- table(best_kmeans_model$cluster, heart_labels)
print("Confusion Matrix for K-Means Clustering:")
## [1] "Confusion Matrix for K-Means Clustering:"
print(cm_kmeans)
## heart_labels
## 0 1
## 1 10 8
## 2 51 74
## 3 142 14

# k-means cluster ids are arbitrary and the table is not square (clusters x
# classes), so diag() would pair unrelated cells. Instead, label each cluster
# with its majority class and count everything outside it as an error.
majority_hits_kmeans <- sum(apply(cm_kmeans, 1, max))
error_kmeans <- 1 - majority_hits_kmeans / sum(cm_kmeans)
print(paste("K-Means Clustering Error:", round(error_kmeans, 4)))
# Output below recomputed from the table above: (8 + 51 + 14) / 299.
## [1] "K-Means Clustering Error: 0.2441"
# Scatter plot of the Age and CPK_levels columns of heart_failure_scaled,
# with points coloured by their k-means cluster.
ggplot(heart_failure_scaled) +
  geom_point(
    aes(x = Age, y = CPK_levels, colour = factor(best_kmeans_model$cluster))
  ) +
  theme_minimal() +
  labs(
    title = "K-Means Clustering",
    x = "Age",
    y = "CPK Levels",
    colour = "Cluster"
  )
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.