Q2.knit

# Load necessary libraries
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.3

## Warning: package 'lubridate' was built under R version 4.5.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(cluster)

## Warning: package 'cluster' was built under R version 4.5.3

# 1. Load and Clean Data
# The survey responses are read from 'survey.csv' in the working directory;
# text columns are kept as character vectors.
data <- read.csv("survey.csv", stringsAsFactors = FALSE)

# Keep the five survey items used in this analysis (treatment, family_history,
# care_options, seek_help, obs_consequence) and recode every answer onto a
# 0 / 0.5 / 1 scale:
#   1   - "Yes", "Often", "Sometimes"
#   0   - "No", "Never", "Rarely"
#   0.5 - anything else, e.g. "Don't know" or "Not sure" (missing values and
#         unexpected spellings also fall into this bucket)
cluster_data <- mutate(
  select(
    data,
    treatment, family_history, care_options, seek_help, obs_consequence
  ),
  across(everything(), function(answer) {
    case_when(
      answer %in% c("Yes", "Often", "Sometimes") ~ 1,
      answer %in% c("No", "Never", "Rarely") ~ 0,
      TRUE ~ 0.5
    )
  })
)

# 2. Perform K-Means Clustering
# Fix the RNG seed so the random starting centres (and the result) repeat.
set.seed(123)

# Cluster on the four predictor items only; obs_consequence is held out of the
# fit. k = 2 is intended to split respondents into high-risk vs low-risk
# profiles, and nstart = 25 tries 25 random initialisations.
kmeans_result <- cluster_data %>%
  select(-obs_consequence) %>%
  kmeans(centers = 2, nstart = 25)

# Attach each respondent's cluster label as a new `cluster` column.
cluster_data[["cluster"]] <- kmeans_result[["cluster"]]

# 3. Compare Response Means by 'obs_consequence'
# For each obs_consequence level, report the mean of the four recoded items
# and the number of respondents. Note that these are group means, not the
# k-means centroids: the `cluster` column is not used here.
summary_table <- cluster_data %>%
  group_by(obs_consequence) %>%
  summarize(
    across(
      c(treatment, family_history, care_options, seek_help),
      mean,
      .names = "{.col}_mean"
    ),
    count = n()
  )

print(summary_table)

## # A tibble: 2 × 6
##   obs_consequence treatment_mean family_history_mean care_options_mean
##             <dbl>          <dbl>               <dbl>             <dbl>
## 1               0          0.473               0.366             0.464
## 2               1          0.696               0.538             0.554
## # ℹ 2 more variables: seek_help_mean <dbl>, count <int>

# 4. Visualization
# Bivariate cluster plot (cluster::clusplot): observations are projected onto
# two principal components and an ellipse is drawn around each cluster.
#
# The four clustering features are selected by name. A positional `[, -5]`
# would only drop obs_consequence at this point and would leave the appended
# `cluster` label column in the plotted data, so the cluster assignments
# themselves would be treated as a feature and distort the projection.
clusplot(
  cluster_data[, c("treatment", "family_history", "care_options", "seek_help")],
  kmeans_result$cluster,
  color = TRUE, shade = TRUE,
  labels = 2, lines = 0,
  main = "Mental Health Employee Clusters"
)