library(dplyr)

# Load the heart-failure excerpt. The last statement of this script writes df
# (with the cluster columns added) back to this same path, so on a re-run the
# file already contains Cluster_Hierarchical and Cluster_KMeans.
df_original <- read.csv("C:/Users/kayla/OneDrive/Documents/TSU/Adv_Topics_CS497/Projects/HeartFailure_excerpt.csv", header=TRUE)

# Rename the outcome column to DEATH_EVENT when the file calls it "Label".
if ("Label" %in% colnames(df_original)) {
    colnames(df_original)[colnames(df_original) == "Label"] <- "DEATH_EVENT"
}

# Drop cluster labels left over from a previous run. any_of() ignores names
# that are not present, so this also works on a file that has never been
# clustered; select(-Cluster_Hierarchical, ...) would stop with an error there.
df <- df_original %>%
    select(-any_of(c("Cluster_Hierarchical", "Cluster_KMeans")))

# Canonical column names, listed in the order the columns are expected to
# appear in df.
expected_columns <- c(
    "Age", "CPK_Levels", "Ejection_Fraction", "Platelets",
    "Serum_Creatinine", "Serum_Sodium", "Time", "DEATH_EVENT"
)

# Names are assigned by position, so only do it when the column count lines
# up; otherwise leave df untouched and show what it currently contains.
if (ncol(df) != length(expected_columns)) {
    print("Warning: Column count mismatch. Here are the current column names:")
    print(colnames(df))
} else {
    colnames(df) <- expected_columns
}

print("Final Column Names in df (after removing clustering labels):")
## [1] "Final Column Names in df (after removing clustering labels):"
print(colnames(df))
## [1] "Age"               "CPK_Levels"        "Ejection_Fraction"
## [4] "Platelets"         "Serum_Creatinine"  "Serum_Sodium"     
## [7] "Time"              "DEATH_EVENT"
# The outcome column must exist before it can be separated from the features.
if (!("DEATH_EVENT" %in% colnames(df))) {
    stop("Error: Column 'DEATH_EVENT' not found in the dataset!")
}

# X holds every column except the outcome; X_scaled is X passed through
# scale() with its default arguments.
X <- df %>% select(-DEATH_EVENT)
X_scaled <- scale(X)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown, see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(cluster)
library(dplyr) 
library(ggplot2)

# Choose a CRAN mirror non-interactively for the install.packages() calls
# further down. The mirror is pinned by URL instead of by its position in the
# mirror list (previously ind=66): that list changes as mirrors are added or
# retired, so a fixed index can silently point somewhere else or at nothing.
# Only the "CRAN" entry of the repos option is replaced; other entries stay.
local({
    repos <- getOption("repos")
    repos["CRAN"] <- "https://cloud.r-project.org"
    options(repos = repos)
})
# print("Checking if X exists before scaling:")
# print("Checking if X is selected correctly:")

# Agglomerative clustering of the scaled rows using the "ward.D2" linkage.
# The call is kept as hclust(dist(X_scaled), ...) because print(hc) and the
# dendrogram annotations display the call expression.
hc <- hclust(d = dist(X_scaled), method = "ward.D2")
print(hc)
## 
## Call:
## hclust(d = dist(X_scaled), method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 299
plot(x = hc, main = "Hierarchical Clustering Dendrogram")

# Cut the tree into three groups and store the labels next to the data.
df[["Cluster_Hierarchical"]] <- cutree(tree = hc, k = 3)

print(dim(X))
## [1] 299   7
X_scaled <- scale(X)

# kmeans() picks random starting centres, so the generator has to be seeded
# BEFORE the first call for the chosen k and the cluster labels to be
# repeatable from run to run.
set.seed(123)

# Candidate cluster counts, and the pairwise distances used by every
# silhouette computation (they do not depend on k, so compute them once).
k_values <- 2:10
scaled_dist <- dist(X_scaled)

# Mean silhouette width (third column of the silhouette matrix) per candidate
# k. vapply() returns a numeric vector aligned with k_values without growing
# it element by element.
silhouette_scores <- vapply(k_values, function(k) {
    km <- kmeans(X_scaled, centers=k, nstart=25)
    sil <- silhouette(km$cluster, scaled_dist)
    mean(sil[, 3])
}, numeric(1))

# Look the winner up in k_values rather than relying on a "+ 1" offset that
# is only right while the candidates start at 2.
best_k <- k_values[which.max(silhouette_scores)]
kmeans_result <- kmeans(X_scaled, centers=best_k, nstart=25)
df$Cluster_KMeans <- kmeans_result$cluster

# Both clustering columns must be present before the feature table is rebuilt.
# all(... %in% ...) yields a single TRUE/FALSE, which is what `if` expects;
# the element-wise `&` operator is meant for vector masks, not conditions.
if (all(c("Cluster_Hierarchical", "Cluster_KMeans") %in% colnames(df))) {
    # Features only: drop the outcome and both sets of cluster labels.
    X <- df %>% select(-DEATH_EVENT, -Cluster_Hierarchical, -Cluster_KMeans)
} else {
    stop("Error: Clustering columns are missing from df!")
}

# Install 'cluster' only when it is missing. The package is already attached
# by library(cluster) earlier in this script, so an unconditional install
# would re-download (and try to overwrite) a package that is in use on every
# run.
if (!requireNamespace("cluster", quietly = TRUE)) {
    install.packages("cluster", dependencies=TRUE)
}
# NOTE: the kmeans() calls earlier in this script have already run by this
# point, so this seed cannot influence their results.
set.seed(123)
print("Checking if X_scaled exists:")
## [1] "Checking if X_scaled exists:"
print(dim(X_scaled)) 
## [1] 299   7
print("Checking if K-Means clustering worked:")
## [1] "Checking if K-Means clustering worked:"
# Cluster sizes of the final k-means fit.
print(table(kmeans_result$cluster))  
## 
##  1  2  3  4  5  6  7  8 
##  6 41 80 38  5 28 24 77

Including Plots

You can also embed plots, for example:

library(cluster)

# Number of groups the dendrogram is cut into.
best_hierarchical_clusters <- 3

# Rebuild the "ward.D2" tree on the scaled features and draw it with cex = 0.6
# and with the x-axis label and subtitle blanked out.
hc <- hclust(d = dist(X_scaled), method = "ward.D2")
plot(x = hc, main = "Hierarchical Clustering Dendrogram", xlab = "", sub = "", cex = 0.6)

# Store the resulting group label for every row.
df[["Cluster_Hierarchical"]] <- cutree(tree = hc, k = best_hierarchical_clusters)


# Re-read the CSV so that df_original reflects what is currently on disk.
df_original <- read.csv(
    file = "C:/Users/kayla/OneDrive/Documents/TSU/Adv_Topics_CS497/Projects/HeartFailure_excerpt.csv",
    header = TRUE
)

print("Checking column names before confusion matrix:")
## [1] "Checking column names before confusion matrix:"
print(colnames(df_original))
##  [1] "Age"                  "CPK_Levels"           "Ejection_Fraction"   
##  [4] "Platelets"            "Serum_Creatinine"     "Serum_Sodium"        
##  [7] "Time"                 "DEATH_EVENT"          "Cluster_Hierarchical"
## [10] "Cluster_KMeans"
# Rename the outcome column to DEATH_EVENT when the file calls it "Label".
if ("Label" %in% names(df_original)) {
    names(df_original)[names(df_original) == "Label"] <- "DEATH_EVENT"
}
print("First few rows of df:")
## [1] "First few rows of df:"
print(head(df))
##   Age CPK_Levels Ejection_Fraction Platelets Serum_Creatinine Serum_Sodium Time
## 1  75        582                20    265000              1.9          130    4
## 2  55       7861                38    263358              1.1          136    6
## 3  65        146                20    162000              1.3          129    7
## 4  50        111                20    210000              1.9          137    7
## 5  65        160                20    327000              2.7          116    8
## 6  90         47                40    204000              2.1          132    8
##   DEATH_EVENT Cluster_Hierarchical Cluster_KMeans
## 1           1                    1              7
## 2           1                    2              1
## 3           1                    1              7
## 4           1                    3              3
## 5           1                    1              7
## 6           1                    1              4
# Cross-tabulate each set of cluster labels (from df) against the DEATH_EVENT
# column of df_original. Nothing is printed when that column is absent.
if ("DEATH_EVENT" %in% colnames(df_original)) {
    print(paste("Length of df$Cluster_KMeans:", length(df$Cluster_KMeans)))
    print(paste("Length of df_original$DEATH_EVENT:", length(df_original$DEATH_EVENT)))

    # K-Means labels vs. outcome; refuse to tabulate vectors of unequal length.
    if (length(df$Cluster_KMeans) != length(df_original$DEATH_EVENT)) {
        stop("Error: Cluster assignments and DEATH_EVENT do not have the same length!")
    }
    print("Confusion Matrix for K-Means Clustering:")
    print(table(df$Cluster_KMeans, df_original$DEATH_EVENT))

    # Hierarchical labels vs. outcome. The heading is printed before the
    # length check, so it appears even if the check then stops the script.
    print("Confusion Matrix for Hierarchical Clustering:")
    if (length(df$Cluster_Hierarchical) != length(df_original$DEATH_EVENT)) {
        stop("Error: Cluster assignments and DEATH_EVENT do not have the same length!")
    }
    print(table(df$Cluster_Hierarchical, df_original$DEATH_EVENT))
}
## [1] "Length of df$Cluster_KMeans: 299"
## [1] "Length of df_original$DEATH_EVENT: 299"
## [1] "Confusion Matrix for K-Means Clustering:"
##    
##      0  1
##   1  3  3
##   2 35  6
##   3 50 30
##   4 16 22
##   5  1  4
##   6 23  5
##   7  6 18
##   8 69  8
## [1] "Confusion Matrix for Hierarchical Clustering:"
##    
##       0   1
##   1  20  44
##   2   3   3
##   3 180  49
# Both silhouette computations use the same pairwise distances, so build the
# distance object once instead of calling dist() twice.
scaled_dist <- dist(X_scaled)

# silhouette() yields one row per observation; column 3 holds the silhouette
# width of that observation.
sil_hc <- silhouette(df$Cluster_Hierarchical, scaled_dist)

# plot() on a plain numeric vector puts the element position on the x-axis,
# i.e. the observation (row) index, not a cluster id, so label it accordingly.
plot(sil_hc[, 3], main="Silhouette Plot for Hierarchical Clustering",
     xlab="Observation Index", ylab="Silhouette Width", col="palevioletred1", pch=19)

sil_kmeans <- silhouette(df$Cluster_KMeans, scaled_dist)

plot(sil_kmeans[, 3], main="Silhouette Plot for K-Means Clustering",
     xlab="Observation Index", ylab="Silhouette Width", col="#40E0D0", pch=19)

library(ggplot2)
# Install ggplot2 only when it is not available. It is already attached by
# library(ggplot2) on the preceding line, so an unconditional install would
# re-download (and try to overwrite) a loaded package on every run.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
    install.packages("ggplot2", dependencies=TRUE)
}

# Load the dataset into its own object, failing with a clear message when the
# file is not where it is expected.
file_path <- "C:/Users/kayla/OneDrive/Documents/TSU/Adv_Topics_CS497/Projects/HeartFailure_excerpt.csv"
if (file.exists(file_path)) {
    HeartFailure_excerpt <- read.csv(file_path, header=TRUE)
    print("Dataset loaded successfully")
} else {
    stop("Error: File not found! Check the path.")
}
## [1] "Dataset loaded successfully"
print(colnames(HeartFailure_excerpt))
##  [1] "Age"                  "CPK_Levels"           "Ejection_Fraction"   
##  [4] "Platelets"            "Serum_Creatinine"     "Serum_Sodium"        
##  [7] "Time"                 "DEATH_EVENT"          "Cluster_Hierarchical"
## [10] "Cluster_KMeans"
# One row per candidate k (2 to 10) with its mean silhouette score.
silhouette_df <- data.frame(
  k = 2:10,
  Silhouette = silhouette_scores
)

# Score against k: pink points joined by a cyan line.
silhouette_plot <- ggplot(data = silhouette_df, mapping = aes(x = k, y = Silhouette)) +
  geom_point(size = 3, color = "#FF34B3") +
  geom_line(color = "#00E5EE") +
  labs(
    x = "Number of Clusters (k)",
    y = "Silhouette Score",
    title = "Silhouette Score for K-Means Clustering"
  ) +
  theme_minimal()
print(silhouette_plot)

# Store the k-means labels as a factor so ggplot gives each cluster a discrete
# colour instead of a continuous gradient.
df[["Cluster_KMeans"]] <- as.factor(kmeans_result$cluster)

# Age against CPK level, coloured by k-means cluster.
kmeans_plot <- ggplot(data = df, mapping = aes(x = Age, y = CPK_Levels, color = Cluster_KMeans)) +
  geom_point(size = 3) +
  labs(x = "Age", y = "CPK_Levels", title = "K-Means Clustering Visualization") +
  theme_minimal()
print(kmeans_plot)

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Save df, including the cluster columns, without row names.
# NOTE: this is the same path the data is read from at the top of the script,
# so the input file is overwritten in place.
write.csv(
    x = df,
    file = "C:/Users/kayla/OneDrive/Documents/TSU/Adv_Topics_CS497/Projects/HeartFailure_excerpt.csv",
    row.names = FALSE
)