# =============================================
# ESSAY QUESTION 4: CLASSIFICATION WITH VISUALS
# =============================================

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## Warning: package 'caret' was built under R version 4.5.3

## Loading required package: ggplot2

## Loading required package: lattice

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.5.3

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

library(ggplot2)

# Load and Prepare Data
# Read the raw survey, keep respondents with a recorded treatment status, and
# restrict to ages 18-75 (inclusive) after coercing Age to numeric.
Mental_Health_Survey <- read.csv(file = "survey.csv", stringsAsFactors = FALSE)

df <- Mental_Health_Survey %>%
  filter(!is.na(treatment)) %>%
  mutate(Age = as.numeric(Age)) %>%
  filter(Age >= 18, Age <= 75)

# Seven candidate predictors used by every model below
features <- c(
  "work_interfere",
  "leave",
  "phys_health_consequence",
  "coworkers",
  "family_history",
  "remote_work",
  "self_employed"
)

# Modelling frame: predictors plus the outcome, complete cases only, with the
# outcome stored as a factor so caret treats the task as classification.
df_model <- df %>%
  select(all_of(c(features, "treatment"))) %>%
  na.omit() %>%
  mutate(treatment = factor(treatment))

# Train-Test Split
# 70/30 partition on the outcome; the seed is set once, immediately before the
# partition is drawn.
set.seed(123)
trainIndex <- createDataPartition(
  y = df_model$treatment,
  p = 0.7,
  list = FALSE
)
train <- df_model[trainIndex, ]
test  <- df_model[-trainIndex, ]

# Resampling scheme shared by all three models: 5-fold cross-validation
ctrl <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE
)

# Train 3 Models
# Fitted in this order: logistic regression, random forest, decision tree.
log_model <- train(
  treatment ~ .,
  data = train,
  method = "glm",
  family = "binomial",
  trControl = ctrl
)
rf_model <- train(
  treatment ~ .,
  data = train,
  method = "rf",
  trControl = ctrl
)
tree_model <- train(
  treatment ~ .,
  data = train,
  method = "rpart",
  trControl = ctrl
)

# Predictions
# Class predictions for the held-out rows, one vector per model
pred_log  <- predict(log_model, newdata = test)
pred_rf   <- predict(rf_model, newdata = test)
pred_tree <- predict(tree_model, newdata = test)

# Results
# Hold-out evaluation of each model against the true test labels.
# `positive = "Yes"` makes sensitivity, specificity and the predictive values
# describe the class being predicted (treatment = Yes); without it caret uses
# the first factor level ("No") as the positive class. With this setting,
# Sensitivity is recall and Pos Pred Value is precision for treatment = Yes.
cat("=== Logistic Regression ===\n")

## === Logistic Regression ===

print(confusionMatrix(pred_log, test$treatment, positive = "Yes"))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No   48   8
##        Yes  57 177
##                                           
##                Accuracy : 0.7759          
##                  95% CI : (0.7234, 0.8225)
##     No Information Rate : 0.6379          
##     P-Value [Acc > NIR] : 2.872e-07       
##                                           
##                   Kappa : 0.4603          
##                                           
##  Mcnemar's Test P-Value : 2.622e-09       
##                                           
##             Sensitivity : 0.9568          
##             Specificity : 0.4571          
##          Pos Pred Value : 0.7564          
##          Neg Pred Value : 0.8571          
##              Prevalence : 0.6379          
##          Detection Rate : 0.6103          
##    Detection Prevalence : 0.8069          
##       Balanced Accuracy : 0.7069          
##                                           
##        'Positive' Class : Yes             
##

cat("\n=== Random Forest ===\n")

## 
## === Random Forest ===

print(confusionMatrix(pred_rf, test$treatment, positive = "Yes"))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No   39   8
##        Yes  66 177
##                                          
##                Accuracy : 0.7448         
##                  95% CI : (0.6906, 0.794)
##     No Information Rate : 0.6379         
##     P-Value [Acc > NIR] : 6.685e-05      
##                                          
##                   Kappa : 0.3727         
##                                          
##  Mcnemar's Test P-Value : 3.446e-11      
##                                          
##             Sensitivity : 0.9568         
##             Specificity : 0.3714         
##          Pos Pred Value : 0.7284         
##          Neg Pred Value : 0.8298         
##              Prevalence : 0.6379         
##          Detection Rate : 0.6103         
##    Detection Prevalence : 0.8379         
##       Balanced Accuracy : 0.6641         
##                                          
##        'Positive' Class : Yes            
##

cat("\n=== Decision Tree ===\n")

## 
## === Decision Tree ===

print(confusionMatrix(pred_tree, test$treatment, positive = "Yes"))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No   51  24
##        Yes  54 161
##                                           
##                Accuracy : 0.731           
##                  95% CI : (0.6761, 0.7812)
##     No Information Rate : 0.6379          
##     P-Value [Acc > NIR] : 0.0004765       
##                                           
##                   Kappa : 0.3794          
##                                           
##  Mcnemar's Test P-Value : 0.0010249       
##                                           
##             Sensitivity : 0.8703          
##             Specificity : 0.4857          
##          Pos Pred Value : 0.7488          
##          Neg Pred Value : 0.6800          
##              Prevalence : 0.6379          
##          Detection Rate : 0.5552          
##    Detection Prevalence : 0.7414          
##       Balanced Accuracy : 0.6780          
##                                           
##        'Positive' Class : Yes             
##

# Visualization: Variable Importance (Random Forest)
# Horizontal bar chart of caret's importance scores for the random forest,
# with predictors ordered by score.
ggplot(
  data = tibble::rownames_to_column(
    as.data.frame(varImp(rf_model)$importance),
    var = "Variable"
  ),
  mapping = aes(x = reorder(Variable, Overall), y = Overall)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Variable Importance - Random Forest",
    x = "Variable",
    y = "Importance Score"
  ) +
  theme_minimal()

Summary:

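Based on previous questions, I selected seven variables to predict incidence of depression (treatment = Yes): five key predictors (work_interfere, leave, phys_health_consequence, coworkers, and family_history) plus remote_work and self_employed. I applied three classification techniques with 5-fold cross validation on a 70% training split and evaluated them on the remaining 30%: Logistic Regression, Random Forest, and Decision Tree. Logistic Regression performed the best on the test set, with the highest accuracy (77.6%, versus 74.5% for Random Forest and 73.1% for the Decision Tree) and the highest precision for treatment = Yes (75.6%); its recall for treatment = Yes (95.7%) was tied with Random Forest. All three models were much weaker at identifying treatment = No respondents, correctly classifying only 37-49% of them. According to the Random Forest variable importance plot, the top predictors were work_interfere, leave difficulty, and physical health consequences. This suggests that workplace factors like how much mental health interferes with work and support systems are the strongest predictors of depression incidence.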