# =============================================
# INITIAL CLEANING AS PER part a and b
# =============================================
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
# Load the raw survey file into the exact variable name `Mental_Health_Survey`
Mental_Health_Survey <- read.csv("survey.csv", stringsAsFactors = FALSE)

# Keep only rows that are complete on the analysis columns listed below.
# `state` and `comments` are not in the list, so missing values there are kept.
df_clean <- Mental_Health_Survey %>%
  drop_na(
    Age, Gender, Country, self_employed,
    family_history, treatment, work_interfere,
    remote_work, tech_company, benefits,
    care_options, wellness_program, seek_help,
    leave, phys_health_consequence, coworkers,
    obs_consequence
  )
# Further cleaning: restrict Age to a plausible range, standardise Gender,
# derive numeric indicator columns, and convert selected columns to factors.
df <- df_clean %>%
  mutate(Age = as.numeric(Age)) %>%
  filter(Age >= 18 & Age <= 75) %>% # Remove unrealistic ages
  # Standardise Gender to "Male" / "Female" / "Other".
  # Keywords are matched as whole words (\\b on both sides): "male" is a
  # substring of "female" and "man" of "woman", so an unanchored male pattern
  # tested first would label every female respondent as "Male".
  mutate(Gender = tolower(trimws(Gender))) %>%
  mutate(Gender = case_when(
    grepl("\\b(female|woman|f)\\b", Gender) ~ "Female",
    grepl("\\b(male|man|m)\\b", Gender) ~ "Male",
    TRUE ~ "Other"
  )) %>%
  # Numeric encodings: 1/0 indicators for the Yes/No items and a 0-3 score
  # for work_interfere (any other value becomes NA).
  mutate(
    treatment_num = ifelse(treatment == "Yes", 1, 0),
    self_employed_num = ifelse(self_employed == "Yes", 1, 0),
    tech_company_num = ifelse(tech_company == "Yes", 1, 0),
    remote_work_num = ifelse(remote_work == "Yes", 1, 0),
    work_interfere_num = case_when(
      work_interfere == "Never" ~ 0,
      work_interfere == "Rarely" ~ 1,
      work_interfere == "Sometimes" ~ 2,
      work_interfere == "Often" ~ 3,
      TRUE ~ NA_real_
    )
  ) %>%
  # Factor versions used for grouping and plot fills
  mutate(
    treatment = factor(treatment),
    Gender = factor(Gender),
    leave = factor(leave)
  )

# Report how many rows survive cleaning
cat("Final cleaned dataset rows:", nrow(df), "\n")
## Final cleaned dataset rows: 971
#Which state in the United States seems to have employees with the most diagnosed cases of depression? Which state has the least?
# Q1: per-state treatment rate (in percent) for US respondents with a
# non-empty state, limited to states with at least 10 respondents and
# sorted from the highest rate to the lowest.
us_states <- df %>%
  filter(Country == "United States", state != "") %>%
  group_by(state) %>%
  summarise(
    n = n(),
    treatment_rate = round(100 * mean(treatment == "Yes"), 1),
    .groups = "drop"
  ) %>%
  filter(n >= 10) %>%
  arrange(desc(treatment_rate))
# First five rows of the rate-sorted table = highest treatment rates
cat("=== States with HIGHEST diagnosed depression (treatment) ===\n")
## === States with HIGHEST diagnosed depression (treatment) ===
us_states %>%
  head(5) %>%
  print()
## # A tibble: 5 × 3
## state n treatment_rate
## <chr> <int> <dbl>
## 1 WI 11 81.8
## 2 MN 16 75
## 3 OH 24 75
## 4 OR 23 73.9
## 5 CA 116 73.3
# Last five rows of the rate-sorted table = lowest treatment rates
cat("\n=== States with LOWEST diagnosed depression (treatment) ===\n")
##
## === States with LOWEST diagnosed depression (treatment) ===
us_states %>%
  tail(5) %>%
  print()
## # A tibble: 5 × 3
## state n treatment_rate
## <chr> <int> <dbl>
## 1 TN 29 58.6
## 2 GA 11 54.5
## 3 PA 26 53.8
## 4 MI 17 52.9
## 5 VA 13 46.2
# Horizontal bar chart of treatment rate per state, ordered by rate.
# Only states with at least 12 respondents are drawn.
us_states %>%
  filter(n >= 12) %>%
  ggplot(aes(x = reorder(state, treatment_rate), y = treatment_rate)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Treatment Rate by US State (Proxy for Depression)",
    subtitle = "n ≥ 12 respondents per state",
    x = "State",
    y = "Treatment Rate (%)"
  ) +
  theme_minimal()
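Highest: Wisconsin, Minnesota, Ohio, Oregon, California. Lowest: Virginia, Michigan, Pennsylvania, Georgia, Tennessee. These rankings use the treatment rate as a proxy for diagnosed depression and include only states with at least 10 respondents.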
#What is the relationship among Self-employment, working for a tech company, ability to get a leave and how does it relate to depression for the employee? Also, how do physical health consequences play a role in incidence of depression? You might use a combination of statistics and visualizations to answer this question.
# Summary statistics: share of respondents in treatment (rate) and group
# size (n) for each level of the four predictors.
cat("Treatment Rate by Self-Employed:\n")
## Treatment Rate by Self-Employed:
df %>%
  group_by(self_employed) %>%
  summarise(rate = mean(treatment == "Yes"), n = n())
## # A tibble: 2 × 3
## self_employed rate n
## <chr> <dbl> <int>
## 1 No 0.641 850
## 2 Yes 0.612 121
cat("\nTreatment Rate by Tech Company:\n")
##
## Treatment Rate by Tech Company:
df %>%
  group_by(tech_company) %>%
  summarise(rate = mean(treatment == "Yes"), n = n())
## # A tibble: 2 × 3
## tech_company rate n
## <chr> <dbl> <int>
## 1 No 0.684 177
## 2 Yes 0.627 794
cat("\nTreatment Rate by Leave Difficulty:\n")
##
## Treatment Rate by Leave Difficulty:
df %>%
  group_by(leave) %>%
  summarise(rate = mean(treatment == "Yes"), n = n())
## # A tibble: 5 × 3
## leave rate n
## <fct> <dbl> <int>
## 1 Don't know 0.602 412
## 2 Somewhat difficult 0.729 107
## 3 Somewhat easy 0.610 210
## 4 Very difficult 0.730 89
## 5 Very easy 0.654 153
cat("\nTreatment Rate by Physical Health Consequence:\n")
##
## Treatment Rate by Physical Health Consequence:
df %>%
  group_by(phys_health_consequence) %>%
  summarise(rate = mean(treatment == "Yes"), n = n())
## # A tibble: 3 × 3
## phys_health_consequence rate n
## <chr> <dbl> <int>
## 1 Maybe 0.652 221
## 2 No 0.637 697
## 3 Yes 0.585 53
# Visualizations: 100%-stacked bars showing the share of respondents in
# treatment within each category. Because position = "fill" rescales every
# bar to 1, the y-axis is labelled "Proportion" on all four plots (the
# default label would read "count").
p1 <- ggplot(df, aes(x = self_employed, fill = treatment)) +
  geom_bar(position = "fill") +
  labs(title = "Depression Treatment by Self-Employment", y = "Proportion")

p2 <- ggplot(df, aes(x = tech_company, fill = treatment)) +
  geom_bar(position = "fill") +
  labs(title = "Depression Treatment by Tech Company", y = "Proportion")

# Order the leave categories from easiest to hardest instead of
# alphabetically so any trend is readable; "Don't know" is placed last.
p3 <- ggplot(df, aes(x = leave, fill = treatment)) +
  geom_bar(position = "fill") +
  scale_x_discrete(limits = c(
    "Very easy", "Somewhat easy", "Somewhat difficult", "Very difficult",
    "Don't know"
  )) +
  labs(
    title = "Effect of Leave Difficulty on Depression Treatment",
    y = "Proportion"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p4 <- ggplot(df, aes(x = phys_health_consequence, fill = treatment)) +
  geom_bar(position = "fill") +
  labs(
    title = "Physical Health Consequences vs Depression Treatment",
    y = "Proportion"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p1)
print(p2)
print(p3)
print(p4)
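Self-employed respondents have a slightly lower treatment rate (61.2% vs 64.1%). Tech-company employees also have a lower treatment rate than non-tech employees (62.7% vs 68.4%). Respondents who find taking leave somewhat or very difficult have the highest treatment rates (about 73%, versus 60–65% for the other groups). Physical health consequences show only a weak relationship with treatment (58.5% for Yes, 63.7% for No, 65.2% for Maybe), and the Yes group is small (n = 53).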
# Summary: share of respondents in treatment (rate) and group size (n) by
# work interference, remote work, and the coworkers item.
cat("Treatment Rate by Work Interference:\n")
## Treatment Rate by Work Interference:
df %>%
  group_by(work_interfere) %>%
  summarise(rate = mean(treatment == "Yes"), n = n())
## # A tibble: 4 × 3
## work_interfere rate n
## <chr> <dbl> <int>
## 1 Never 0.146 206
## 2 Often 0.848 138
## 3 Rarely 0.712 170
## 4 Sometimes 0.768 457
cat("\nTreatment Rate by Remote Work:\n")
##
## Treatment Rate by Remote Work:
df %>%
  group_by(remote_work) %>%
  summarise(rate = mean(treatment == "Yes"), n = n())
## # A tibble: 2 × 3
## remote_work rate n
## <chr> <dbl> <int>
## 1 No 0.631 677
## 2 Yes 0.653 294
cat("\nTreatment Rate by Coworkers:\n")
##
## Treatment Rate by Coworkers:
df %>%
  group_by(coworkers) %>%
  summarise(rate = mean(treatment == "Yes"), n = n())
## # A tibble: 3 × 3
## coworkers rate n
## <chr> <dbl> <int>
## 1 No 0.577 201
## 2 Some of them 0.636 601
## 3 Yes 0.716 169
# Visualizations: 100%-stacked bars of treatment share. position = "fill"
# rescales every bar to 1, so the y-axis is labelled "Proportion".

# work_interfere is a character column and would be drawn alphabetically
# (Never, Often, Rarely, Sometimes); fix the axis to the ordinal order.
# One panel per remote_work value.
p5 <- ggplot(df, aes(x = work_interfere, fill = treatment)) +
  geom_bar(position = "fill") +
  scale_x_discrete(limits = c("Never", "Rarely", "Sometimes", "Often")) +
  facet_wrap(~ remote_work) +
  labs(
    title = "Work Interference vs Treatment by Remote Work",
    y = "Proportion"
  )

p6 <- ggplot(df, aes(x = coworkers, fill = treatment)) +
  geom_bar(position = "fill") +
  labs(title = "Role of Coworkers in Depression Treatment", y = "Proportion")

print(p5)
print(p6)
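Work interference is the strongest factor linked to depression treatment: treatment rates are 71–85% for Rarely/Sometimes/Often versus 14.6% for Never. Remote workers have a slightly higher treatment rate than non-remote workers (65.3% vs 63.1%). Respondents answering Yes on the coworkers item have the highest treatment rate (71.6%) and those answering No the lowest (57.7%), so the data do not show that poor coworker support goes with a higher likelihood of treatment.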