Data Loading & Cleaning

# =============================================
# INITIAL CLEANING AS PER part a and b 
# =============================================
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tidyr)

# Read data into exact variable name
Mental_Health_Survey <- read.csv("survey.csv", stringsAsFactors = FALSE)

# Drop rows that have a missing value in any of the analysis variables listed
# below. Columns not listed here (e.g. comments and state) are allowed to be NA.
df_clean <- Mental_Health_Survey %>%
  drop_na(
    Age, Gender, Country, self_employed, family_history, treatment,
    work_interfere, remote_work, tech_company, benefits, care_options,
    wellness_program, seek_help, leave, phys_health_consequence, coworkers,
    obs_consequence
  )

# Further Cleaning
df <- df_clean %>%
  mutate(Age = as.numeric(Age)) %>%
  filter(between(Age, 18, 75)) %>%   # Remove unrealistic ages

  # Standardize Gender.
  # The female patterns MUST be tested before the male ones: "female" contains
  # "male" and "woman" contains "man", so testing male first would label every
  # female respondent "Male". The single-letter abbreviations are word-bounded
  # on both sides so that only a standalone "f" / "m" matches.
  mutate(
    Gender = tolower(trimws(Gender)),
    Gender = case_when(
      grepl("female|woman|\\bf\\b", Gender) ~ "Female",
      grepl("male|man|\\bm\\b", Gender) ~ "Male",
      TRUE ~ "Other"
    )
  ) %>%

  # Numeric companions for the main categorical variables:
  # Yes/No answers become 1/0, work_interfere becomes an ordinal 0-3 score
  # (any value outside the four listed answers becomes NA).
  mutate(
    treatment_num = ifelse(treatment == "Yes", 1, 0),
    self_employed_num = ifelse(self_employed == "Yes", 1, 0),
    tech_company_num = ifelse(tech_company == "Yes", 1, 0),
    remote_work_num = ifelse(remote_work == "Yes", 1, 0),
    work_interfere_num = case_when(
      work_interfere == "Never" ~ 0,
      work_interfere == "Rarely" ~ 1,
      work_interfere == "Sometimes" ~ 2,
      work_interfere == "Often" ~ 3,
      TRUE ~ NA_real_
    )
  ) %>%

  # Store the variables used as plot fills / grouping keys as factors.
  mutate(
    treatment = factor(treatment),
    Gender = factor(Gender),
    leave = factor(leave)
  )

cat("Final cleaned dataset rows:", nrow(df), "\n")

## Final cleaned dataset rows: 971

#Which state in the United States seems to have employees with the most diagnosed cases of depression? Which state has the least?

# Q1 Analysis
# One row per US state (respondents with a non-empty state only): respondent
# count and percentage in treatment, rounded to one decimal. States with fewer
# than 10 respondents are dropped; rows are sorted from highest to lowest rate.
us_states <- df %>%
  filter(Country == "United States", state != "") %>%
  group_by(state) %>%
  summarise(
    n = n(),
    treatment_rate = round(100 * mean(treatment == "Yes"), 1)
  ) %>%
  filter(n >= 10) %>%
  arrange(desc(treatment_rate))

# us_states is sorted in descending order, so its first five rows are the
# states with the highest treatment rate.
cat("=== States with HIGHEST diagnosed depression (treatment) ===\n")

## === States with HIGHEST diagnosed depression (treatment) ===

us_states %>%
  head(5) %>%
  print()

## # A tibble: 5 × 3
##   state     n treatment_rate
##   <chr> <int>          <dbl>
## 1 WI       11           81.8
## 2 MN       16           75  
## 3 OH       24           75  
## 4 OR       23           73.9
## 5 CA      116           73.3

# The last five rows of the descending table are the states with the lowest
# treatment rate (the very lowest state is printed last).
cat("\n=== States with LOWEST diagnosed depression (treatment) ===\n")

## 
## === States with LOWEST diagnosed depression (treatment) ===

us_states %>%
  tail(5) %>%
  print()

## # A tibble: 5 × 3
##   state     n treatment_rate
##   <chr> <int>          <dbl>
## 1 TN       29           58.6
## 2 GA       11           54.5
## 3 PA       26           53.8
## 4 MI       17           52.9
## 5 VA       13           46.2

# Visualization
# Horizontal bar chart of treatment rate per state, ordered by rate.
# us_states is already restricted to states with n >= 10, and the chart uses
# that same set so it shows every state listed in the highest/lowest tables
# (a stricter cut-off here would silently drop WI and GA, which have n = 11).
ggplot(us_states,
       aes(x = reorder(state, treatment_rate), y = treatment_rate)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Treatment Rate by US State (Proxy for Depression)",
       subtitle = "n ≥ 10 respondents per state",
       x = "State", y = "Treatment Rate (%)") +
  theme_minimal()

Summary Q1:

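Highest treatment rates (states with at least 10 respondents): Wisconsin (81.8%), Minnesota (75.0%), Ohio (75.0%), Oregon (73.9%), California (73.3%). Lowest: Virginia (46.2%), Michigan (52.9%), Pennsylvania (53.8%), Georgia (54.5%), Tennessee (58.6%). Treatment is used here as a proxy for diagnosed depression, and several of these states have small samples (e.g. Wisconsin and Georgia have only 11 respondents each), so the ranking should be read with caution.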

#What is the relationship among Self-employment, working for a tech company, ability to get a leave and how does it relate to depression for the employee? Also, how do physical health consequences play a role in incidence of depression? You might use a combination of statistics and visualizations to answer this question.

# Summary Statistics
# Proportion of respondents in treatment, and group size, by self-employment.
cat("Treatment Rate by Self-Employed:\n")

## Treatment Rate by Self-Employed:

df %>%
  group_by(self_employed) %>%
  summarise(
    rate = mean(treatment == "Yes"),
    n = n()
  )

## # A tibble: 2 × 3
##   self_employed  rate     n
##   <chr>         <dbl> <int>
## 1 No            0.641   850
## 2 Yes           0.612   121

# Proportion of respondents in treatment, and group size, by tech_company.
cat("\nTreatment Rate by Tech Company:\n")

## 
## Treatment Rate by Tech Company:

df %>%
  group_by(tech_company) %>%
  summarise(
    rate = mean(treatment == "Yes"),
    n = n()
  )

## # A tibble: 2 × 3
##   tech_company  rate     n
##   <chr>        <dbl> <int>
## 1 No           0.684   177
## 2 Yes          0.627   794

# Proportion of respondents in treatment, and group size, by leave answer.
cat("\nTreatment Rate by Leave Difficulty:\n")

## 
## Treatment Rate by Leave Difficulty:

df %>%
  group_by(leave) %>%
  summarise(
    rate = mean(treatment == "Yes"),
    n = n()
  )

## # A tibble: 5 × 3
##   leave               rate     n
##   <fct>              <dbl> <int>
## 1 Don't know         0.602   412
## 2 Somewhat difficult 0.729   107
## 3 Somewhat easy      0.610   210
## 4 Very difficult     0.730    89
## 5 Very easy          0.654   153

# Proportion of respondents in treatment, and group size, by
# phys_health_consequence answer.
cat("\nTreatment Rate by Physical Health Consequence:\n")

## 
## Treatment Rate by Physical Health Consequence:

df %>%
  group_by(phys_health_consequence) %>%
  summarise(
    rate = mean(treatment == "Yes"),
    n = n()
  )

## # A tibble: 3 × 3
##   phys_health_consequence  rate     n
##   <chr>                   <dbl> <int>
## 1 Maybe                   0.652   221
## 2 No                      0.637   697
## 3 Yes                     0.585    53

# Visualizations
# Each chart is a 100%-stacked bar (position = "fill"): within every category
# on the x axis, the bar is split by the share of respondents in each
# treatment group. The y axis is therefore a proportion, and is labelled as
# such on all four plots (without an explicit label ggplot2 would show "count").
p1 <- ggplot(df, aes(x = self_employed, fill = treatment)) +
  geom_bar(position = "fill") +
  labs(title = "Depression Treatment by Self-Employment", y = "Proportion")

p2 <- ggplot(df, aes(x = tech_company, fill = treatment)) +
  geom_bar(position = "fill") +
  labs(title = "Depression Treatment by Tech Company", y = "Proportion")

p3 <- ggplot(df, aes(x = leave, fill = treatment)) +
  geom_bar(position = "fill") +
  labs(title = "Effect of Leave Difficulty on Depression Treatment",
       y = "Proportion") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p4 <- ggplot(df, aes(x = phys_health_consequence, fill = treatment)) +
  geom_bar(position = "fill") +
  labs(title = "Physical Health Consequences vs Depression Treatment",
       y = "Proportion") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(p1)
print(p2)
print(p3)
print(p4)

Summary Q2:

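Self-employed respondents have a slightly lower treatment rate than other respondents (61.2% vs 64.1%). Tech-company employees have a lower, not higher, treatment rate than non-tech employees (62.7% vs 68.4%). Respondents who find taking leave somewhat or very difficult have the highest treatment rates (about 73%, compared with 60–65% in the other leave groups). Physical health consequences show little association with treatment: the rates are 65.2% (Maybe), 63.7% (No) and 58.5% (Yes), and the "Yes" group contains only 53 respondents. These are descriptive comparisons of treatment rates; no significance tests were run, so the small differences should not be over-interpreted.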

How do work interference and remote work option relate to the incidence of depression? What role do coworkers play in this case?

# Summary
# Proportion of respondents in treatment, and group size, by work_interfere.
cat("Treatment Rate by Work Interference:\n")

## Treatment Rate by Work Interference:

df %>%
  group_by(work_interfere) %>%
  summarise(
    rate = mean(treatment == "Yes"),
    n = n()
  )

## # A tibble: 4 × 3
##   work_interfere  rate     n
##   <chr>          <dbl> <int>
## 1 Never          0.146   206
## 2 Often          0.848   138
## 3 Rarely         0.712   170
## 4 Sometimes      0.768   457

# Proportion of respondents in treatment, and group size, by remote_work.
cat("\nTreatment Rate by Remote Work:\n")

## 
## Treatment Rate by Remote Work:

df %>%
  group_by(remote_work) %>%
  summarise(
    rate = mean(treatment == "Yes"),
    n = n()
  )

## # A tibble: 2 × 3
##   remote_work  rate     n
##   <chr>       <dbl> <int>
## 1 No          0.631   677
## 2 Yes         0.653   294

# Proportion of respondents in treatment, and group size, by coworkers answer.
cat("\nTreatment Rate by Coworkers:\n")

## 
## Treatment Rate by Coworkers:

df %>%
  group_by(coworkers) %>%
  summarise(
    rate = mean(treatment == "Yes"),
    n = n()
  )

## # A tibble: 3 × 3
##   coworkers     rate     n
##   <chr>        <dbl> <int>
## 1 No           0.577   201
## 2 Some of them 0.636   601
## 3 Yes          0.716   169

# Visualizations
# Both charts are 100%-stacked bars (position = "fill"), so the y axis is a
# proportion and is labelled as such.
#
# p5: treatment split by work interference, one panel per remote_work answer.
# work_interfere is ordinal, so the x axis is put in severity order
# (Never < Rarely < Sometimes < Often) instead of the default alphabetical one.
p5 <- ggplot(df, aes(x = work_interfere, fill = treatment)) +
  geom_bar(position = "fill") +
  scale_x_discrete(limits = c("Never", "Rarely", "Sometimes", "Often")) +
  facet_wrap(~ remote_work) +
  labs(title = "Work Interference vs Treatment by Remote Work",
       y = "Proportion")

# p6: treatment split by the coworkers answer.
p6 <- ggplot(df, aes(x = coworkers, fill = treatment)) +
  geom_bar(position = "fill") +
  labs(title = "Role of Coworkers in Depression Treatment",
       y = "Proportion")

print(p5)
print(p6)

Summary Q3:

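Work interference is the factor most strongly linked to treatment: only 14.6% of respondents who answered "Never" are in treatment, compared with 71.2% (Rarely), 76.8% (Sometimes) and 84.8% (Often). Remote workers have a slightly higher, not lower, treatment rate than non-remote workers (65.3% vs 63.1%), a difference too small to be meaningful on its own. For the coworkers variable, the treatment rate is highest among respondents who answered "Yes" (71.6%), lower for "Some of them" (63.6%) and lowest for "No" (57.7%); the data therefore do not show that a lack of coworker support raises the likelihood of treatment.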