library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Read the patient health records from the CSV in the working directory and
# preview the first six rows to check how the columns were parsed.
df <- read.csv(file = "health_data400.csv", header = TRUE)
head(df, n = 6L)
## patient_id age sex ethnicity bmi smoker diabetes hypertension cholesterol
## 1 1 66 Male Hispanic 26.6 No Yes No 226
## 2 2 82 Female Hispanic 27.6 Yes No No 244
## 3 3 42 Male Black 30.7 Yes No Yes 286
## 4 4 35 Male Hispanic 24.8 Yes No No 167
## 5 5 66 Male White 24.5 No No No 203
## 6 6 64 Female White 22.7 Yes Yes Yes 237
## glucose visit_date diagnosis
## 1 112 2022-01-22 Hypertension
## 2 100 2022-08-14 Hypertension
## 3 102 2022-02-06 Back pain
## 4 95 2022-10-05 Back pain
## 5 68 2022-02-20 Back pain
## 6 108 2022-03-23 Asthma
# Cluster patients on four numeric health measures with k-means (k = 3) and
# summarise each cluster by the mean of those measures.

# Variables used both for clustering and for the per-cluster summary; keeping
# them in one vector guarantees the two steps cannot drift apart.
cluster_vars <- c("age", "bmi", "cholesterol", "glucose")

clustering_data <- df %>%
  select(all_of(cluster_vars))

# Standardise each variable so that those measured on larger scales
# (e.g. cholesterol) do not dominate the distance calculation.
scaled_data <- scale(clustering_data)

# k-means starts from randomly chosen centres, so the cluster numbering can
# change from run to run. Fix the seed so that the cluster labels referred to
# in the written interpretation stay stable across re-runs.
set.seed(123)
kmeans_result <- kmeans(scaled_data, centers = 3, nstart = 25)

# Visualise the cluster assignment.
fviz_cluster(
  kmeans_result,
  data = scaled_data,
  ellipse.type = "convex",
  palette = "jco",
  ggtheme = theme_minimal()
)

# Attach each patient's cluster label to the original (unscaled) data.
clustered_df <- cbind(df, cluster = kmeans_result$cluster)

# Mean of every clustering variable within each cluster, on the original scale.
mean_cluster_data <- aggregate(
  clustered_df[, cluster_vars],
  by = list(clustered_df$cluster),
  FUN = mean
)
mean_cluster_data
## Group.1 age bmi cholesterol glucose
## 1 1 43.06000 30.92700 224.2300 118.2200
## 2 2 72.97727 24.65606 214.7652 103.0833
## 3 3 42.31548 26.50417 172.2976 89.3750
# Comments on the mean cluster data
# Note: cluster numbers are arbitrary k-means labels; the comments below refer to the table printed above.
# Cluster 1: This cluster represents a younger patient group with a mean age of approximately 43 and the highest mean BMI (30.9), cholesterol (224) and glucose (118).
# Cluster 2: This cluster represents the oldest patient group with a mean age of approximately 73, the lowest mean BMI (24.7) and intermediate cholesterol (215) and glucose (103).
# Cluster 3: This cluster also represents a younger patient group, with a mean age of approximately 42, the lowest mean cholesterol (172) and glucose (89), and a mean BMI of 26.5.
# visit_date does not appear in the table (there is no NA column for it) because only age, bmi, cholesterol and glucose were used in the clustering and aggregated.
```