# Data Loading and Advanced Cleaning

# --- Load Libraries ---
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)

# --- 1. Load Dataset ---
# Read the raw survey responses from the working directory. Text columns are
# kept as character vectors (not factors) so they can be cleaned as strings.
Mental_Health_Survey <- read.csv(
  file = "survey.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# --- 2. Data Cleaning ---
# A. Standardize Gender
# Free-text answers are trimmed and lower-cased, then collapsed into three
# levels: "Male", "Female" and "Other". Any non-missing answer that matches
# neither label set becomes "Other".
#
# Missing answers are deliberately kept as NA. `NA %in% ...` is FALSE, so
# without the explicit is.na() branch a missing Gender would fall through to
# the catch-all and be recoded as "Other", which hides it from the
# missing-value removal step performed later in this script.
male_labels <- c(
  "male", "m", "man", "cis male", "male-ish", "maile", "mal", "cis man",
  "make", "guy (-ish) ^_^"
)
female_labels <- c(
  "female", "f", "woman", "cis female", "femake", "femail",
  "cis-female/femme"
)
gender_clean <- tolower(trimws(Mental_Health_Survey$Gender))
Mental_Health_Survey$Gender <- case_when(
  is.na(gender_clean) ~ NA_character_,
  gender_clean %in% male_labels ~ "Male",
  gender_clean %in% female_labels ~ "Female",
  TRUE ~ "Other"
)

# B. Keep only plausible ages (18 to 80, inclusive)
# Age is coerced to numeric first; filter() then keeps rows where both bounds
# hold. Rows whose Age is NA after coercion are dropped, because filter()
# only retains rows where the condition evaluates to TRUE.
Mental_Health_Survey <- filter(
  mutate(Mental_Health_Survey, Age = as.numeric(Age)),
  Age >= 18,
  Age <= 80
)

# C. Handle Missing Values
# Drop every row that has an NA in any column other than 'comments' and
# 'state'; those two columns are allowed to be missing.
cols_to_check <- setdiff(
  names(Mental_Health_Survey),
  c("comments", "state")
)
# TRUE for rows with no NA among the checked columns.
rows_complete <- complete.cases(Mental_Health_Survey[, cols_to_check])
Mental_Health_Survey <- Mental_Health_Survey[rows_complete, ]

# D. Convert Categorical variables to Numeric for Analysis
# treatment_num is 1 where treatment is exactly "Yes" and 0 for any other
# non-missing value (treatment is used as a proxy for diagnosed depression).
Mental_Health_Survey[["treatment_num"]] <- ifelse(
  Mental_Health_Survey[["treatment"]] == "Yes",
  1,
  0
)