# Load necessary libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
# 1. Load and Clean Data
# The survey responses are read from 'survey.csv' in the working directory.
survey_columns <- c(
  "treatment", "family_history", "care_options", "seek_help",
  "obs_consequence"
)

# Recode one column of text answers to a numeric score:
# affirmative answers -> 1, negative answers -> 0, and anything else
# (e.g. "Don't know", "Not sure", or a missing value) -> 0.5.
encode_response <- function(answer) {
  case_when(
    answer %in% c("Yes", "Often", "Sometimes") ~ 1,
    answer %in% c("No", "Never", "Rarely") ~ 0,
    TRUE ~ 0.5
  )
}

data <- read.csv("survey.csv", stringsAsFactors = FALSE)

# Keep only the columns used for clustering and recode every one of them.
cluster_data <- data %>%
  select(all_of(survey_columns)) %>%
  mutate(across(everything(), encode_response))
# 2. Perform K-Means Clustering
# k = 2 is used with the aim of separating high-risk vs low-risk profiles.
# Features are picked by name rather than with `[, -5]`: the positional form
# depends on column order and, once the `cluster` column has been appended
# below, would feed earlier cluster labels back in if this step is re-run.
feature_cols <- c("treatment", "family_history", "care_options", "seek_help")

set.seed(123) # Fix the random starts so the result is reproducible
kmeans_result <- kmeans(
  cluster_data[, feature_cols],
  centers = 2,
  nstart = 25
)

# Attach each row's cluster assignment to the encoded data
cluster_data$cluster <- kmeans_result$cluster
# 3. Summarise encoded responses by 'obs_consequence'
# For each obs_consequence value: the mean of every feature column plus the
# number of rows in that group.
summary_table <- cluster_data %>%
  group_by(obs_consequence) %>%
  summarise(
    across(
      c(treatment, family_history, care_options, seek_help),
      mean,
      .names = "{.col}_mean"
    ),
    count = n()
  )
print(summary_table)
## # A tibble: 2 × 6
## obs_consequence treatment_mean family_history_mean care_options_mean
## <dbl> <dbl> <dbl> <dbl>
## 1 0 0.473 0.366 0.464
## 2 1 0.696 0.538 0.554
## # ℹ 2 more variables: seek_help_mean <dbl>, count <int>
# 4. Visualization
# clusplot() draws the clustering on a bivariate (principal-component) plot.
# Plot exactly the features that were clustered: by this point `cluster_data`
# also holds the `cluster` column, so `cluster_data[, -5]` would pass the
# cluster labels themselves in as a plotted variable.
feature_cols <- c("treatment", "family_history", "care_options", "seek_help")
clusplot(
  cluster_data[, feature_cols],
  kmeans_result$cluster,
  color = TRUE, shade = TRUE,
  labels = 2, lines = 0,
  main = "Mental Health Employee Clusters"
)
