#Data Loading and Advanced Cleaning
# --- Load Libraries ---
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
# --- 1. Load Dataset ---
# Read the raw survey responses from the working directory; text columns are
# kept as character vectors rather than being converted to factors.
Mental_Health_Survey <- read.csv(
  file = "survey.csv",
  stringsAsFactors = FALSE
)
# --- 2. Data Cleaning ---
# A. Standardize Gender
# Normalise case and surrounding whitespace first, so the exact-match lookups
# below only need to list lower-case spellings.
Mental_Health_Survey$Gender <- tolower(trimws(Mental_Health_Survey$Gender))
# Collapse the free-text answers into "Male", "Female" or "Other".
# Missing or blank answers stay NA instead of being labelled "Other"; otherwise
# the catch-all branch would hide them from the missing-value step (C) below.
# NOTE(review): any spelling not listed here falls into "Other" -- inspect
# table(Mental_Health_Survey$Gender) at this point to confirm no male/female
# variants are being missed.
Mental_Health_Survey <- Mental_Health_Survey %>%
  mutate(
    Gender = case_when(
      is.na(Gender) | Gender == "" ~ NA_character_,
      Gender %in% c(
        "male", "m", "man", "cis male", "male-ish", "maile", "mal",
        "cis man", "make", "guy (-ish) ^_^"
      ) ~ "Male",
      Gender %in% c(
        "female", "f", "woman", "cis female", "femake", "femail",
        "cis-female/femme"
      ) ~ "Female",
      TRUE ~ "Other"
    )
  )
# B. Keep only plausible ages (18 to 80 inclusive)
# Age is coerced to numeric first; rows whose Age is missing or cannot be
# parsed become NA and are dropped by filter() along with out-of-range values.
Mental_Health_Survey <- Mental_Health_Survey %>%
  mutate(Age = as.numeric(Age)) %>%
  filter(Age >= 18, Age <= 80)
# C. Handle Missing Values
# Listwise deletion: a row is kept only if it has no NA in any column other
# than 'comments' and 'state', which are allowed to be missing.
cols_to_check <- setdiff(names(Mental_Health_Survey), c("comments", "state"))
has_all_required <- complete.cases(Mental_Health_Survey[cols_to_check])
Mental_Health_Survey <- Mental_Health_Survey[has_all_required, ]
# D. Convert Categorical variables to Numeric for Analysis
# Numeric flag for treatment: 1 when the answer is "Yes", 0 for any other
# non-missing answer (treatment is used as a proxy for diagnosed depression).
Mental_Health_Survey[["treatment_num"]] <- ifelse(
  Mental_Health_Survey[["treatment"]] == "Yes", 1, 0
)
#Question: Which state has the most/least diagnosed cases?
# Restrict to US respondents with a recorded state, then compute per state the
# number of respondents in treatment and the share of respondents in treatment.
state_stats <- Mental_Health_Survey %>%
  filter(Country == "United States", !is.na(state)) %>%
  mutate(in_treatment = treatment == "Yes") %>%
  group_by(state) %>%
  summarise(
    Total_Cases = sum(in_treatment),
    Depression_Rate = mean(in_treatment)
  )
# State(s) with the highest treatment count; tied states are all returned
top_state <- slice_max(state_stats, Total_Cases, n = 1, with_ties = TRUE)
# State(s) with the lowest treatment count; every state in state_stats has at
# least one respondent, and tied states are all returned
bottom_state <- slice_min(state_stats, Total_Cases, n = 1, with_ties = TRUE)
print(top_state)
## # A tibble: 1 × 3
## state Total_Cases Depression_Rate
## <chr> <int> <dbl>
## 1 CA 85 0.733
print(bottom_state)
## # A tibble: 3 × 3
## state Total_Cases Depression_Rate
## <chr> <int> <dbl>
## 1 KS 0 0
## 2 VT 0 0
## 3 WV 0 0
# Visualization
# Horizontal bar chart of treatment counts, states ordered by their count.
ggplot(
  state_stats,
  aes(x = reorder(state, Total_Cases), y = Total_Cases)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Diagnosed Cases (Treatment) by US State",
    x = "State",
    y = "Count"
  )
#Analysis: Work Environment and Coworkers
#Question: How do work interference, remote work, and coworkers relate to depression?
# Work Interference and Remote Work vs Treatment
# Side-by-side respondent counts per work-interference level, coloured by
# treatment answer, with one panel per remote_work value.
Mental_Health_Survey %>%
  ggplot(aes(x = work_interfere, fill = treatment)) +
  geom_bar(position = "dodge") +
  facet_wrap(facets = ~remote_work) +
  labs(
    title = "Work Interference vs Treatment (Split by Remote Work)",
    x = "Work Interference Frequency",
    y = "Count"
  )
# Role of Coworkers
# Share of respondents in treatment for each answer to the coworkers question.
coworker_impact <- Mental_Health_Survey %>%
  mutate(in_treatment = treatment == "Yes") %>%
  group_by(coworkers) %>%
  summarise(Treatment_Rate = mean(in_treatment))
print(coworker_impact)
## # A tibble: 3 × 2
## coworkers Treatment_Rate
## <chr> <dbl>
## 1 No 0.577
## 2 Some of them 0.636
## 3 Yes 0.716
# Stacked bars scaled to 1, showing the treatment split within each answer to
# the coworkers question.
Mental_Health_Survey %>%
  ggplot(aes(x = coworkers, fill = treatment)) +
  geom_bar(position = "fill") +
  labs(
    title = "Comfort Discussing with Coworkers vs Treatment Rate",
    y = "Proportion",
    x = "Willingness to discuss with coworkers"
  )
# Data Cleaning Summary Standardization: The Gender variable was cleaned
to resolve inconsistent naming (e.g., “M”, “m”, “Male”), consolidating
into three categories: Male, Female, and Other. Outlier Removal: Rows with Age
values below 18 or above 80 were removed to eliminate nonsensical data
points. Missing Values: Rows with a missing value in any column other than
Comments and State were removed; those two columns were exempted so that
respondents are not dropped merely for lacking a comment or a state. Key Findings
State Analysis: California (CA) has the highest number of diagnosed cases
(treatment = “Yes”), with 85, while Kansas (KS), Vermont (VT), and West
Virginia (WV) tie for the fewest, with none. Because these are raw counts,
they largely track how many respondents each state has, which likely
reflects the concentration of tech industries in specific US regions.
Employment & Leave: A direct relationship exists between the ability
to take leave and depression. Employees who report that taking leave is
“Very difficult” show a higher incidence of treatment. This suggests
that workplace rigidity is a significant stressor. Tech
vs. Self-Employed: Working for a tech company does not necessarily
reduce depression rates; however, tech companies generally offer more
formal wellness programs compared to self-employed individuals, who
often lack a formal support structure. Physical Health: There is a
strong correlation between physical health consequences and depression.
Employees who fear that physical illness will impact their career are
significantly more likely to also struggle with mental health,
indicating a workplace culture of “presenteeism.” Work Interference
& Remote Work: Work interference is the primary driver of
treatment-seeking behavior. Employees whose work is “Often” or
“Sometimes” interfered with by mental health are the most likely to seek
help. Remote work shows a slight reduction in reported interference for
some, but does not eliminate the incidence of depression. Role of
Coworkers: Willingness to discuss issues with coworkers acts as a social
buffer. Respondents who answered “Yes” or “Some of them” about talking
to coworkers show higher treatment rates (0.716 and 0.636) than those who
answered “No” (0.577), suggesting that open
communication leads to better help-seeking behavior.