# =============================================
# ESSAY QUESTION 4: CLASSIFICATION WITH VISUALS
# =============================================
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: ggplot2
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
# Load and Prepare Data
Mental_Health_Survey <- read.csv("survey.csv", stringsAsFactors = FALSE)
df <- Mental_Health_Survey %>%
  filter(!is.na(treatment)) %>%
  mutate(Age = as.numeric(Age)) %>%
  filter(Age >= 18 & Age <= 75)
# Select seven key predictor variables
features <- c("work_interfere", "leave", "phys_health_consequence",
              "coworkers", "family_history", "remote_work", "self_employed")
df_model <- df %>%
  select(all_of(features), treatment) %>%
  na.omit() %>%
  mutate(treatment = factor(treatment))
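# Optional check (a sketch, not part of the original output): class balance of
# the outcome, which sets the baseline No Information Rate reported below
table(df_model$treatment)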
# Train-Test Split
set.seed(123)
trainIndex <- createDataPartition(df_model$treatment, p = 0.7, list = FALSE)
train <- df_model[trainIndex, ]
test <- df_model[-trainIndex, ]
ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE)
# Train 3 Models
log_model <- train(treatment ~ ., data = train, method = "glm", family = "binomial", trControl = ctrl)
rf_model <- train(treatment ~ ., data = train, method = "rf", trControl = ctrl)
tree_model <- train(treatment ~ ., data = train, method = "rpart", trControl = ctrl)
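# Optional (a sketch, not part of the original run): compare the cross-validated
# performance of the three fitted models directly with caret's resamples()
cv_results <- resamples(list(Logistic = log_model, RF = rf_model, Tree = tree_model))
summary(cv_results)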
# Predictions
pred_log <- predict(log_model, test)
pred_rf <- predict(rf_model, test)
pred_tree <- predict(tree_model, test)
# Results
cat("=== Logistic Regression ===\n")
## === Logistic Regression ===
print(confusionMatrix(pred_log, test$treatment))
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 48 8
## Yes 57 177
##
## Accuracy : 0.7759
## 95% CI : (0.7234, 0.8225)
## No Information Rate : 0.6379
## P-Value [Acc > NIR] : 2.872e-07
##
## Kappa : 0.4603
##
## Mcnemar's Test P-Value : 2.622e-09
##
## Sensitivity : 0.4571
## Specificity : 0.9568
## Pos Pred Value : 0.8571
## Neg Pred Value : 0.7564
## Prevalence : 0.3621
## Detection Rate : 0.1655
## Detection Prevalence : 0.1931
## Balanced Accuracy : 0.7069
##
## 'Positive' Class : No
##
cat("\n=== Random Forest ===\n")
##
## === Random Forest ===
print(confusionMatrix(pred_rf, test$treatment))
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 39 8
## Yes 66 177
##
## Accuracy : 0.7448
## 95% CI : (0.6906, 0.794)
## No Information Rate : 0.6379
## P-Value [Acc > NIR] : 6.685e-05
##
## Kappa : 0.3727
##
## Mcnemar's Test P-Value : 3.446e-11
##
## Sensitivity : 0.3714
## Specificity : 0.9568
## Pos Pred Value : 0.8298
## Neg Pred Value : 0.7284
## Prevalence : 0.3621
## Detection Rate : 0.1345
## Detection Prevalence : 0.1621
## Balanced Accuracy : 0.6641
##
## 'Positive' Class : No
##
cat("\n=== Decision Tree ===\n")
##
## === Decision Tree ===
print(confusionMatrix(pred_tree, test$treatment))
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 51 24
## Yes 54 161
##
## Accuracy : 0.731
## 95% CI : (0.6761, 0.7812)
## No Information Rate : 0.6379
## P-Value [Acc > NIR] : 0.0004765
##
## Kappa : 0.3794
##
## Mcnemar's Test P-Value : 0.0010249
##
## Sensitivity : 0.4857
## Specificity : 0.8703
## Pos Pred Value : 0.6800
## Neg Pred Value : 0.7488
## Prevalence : 0.3621
## Detection Rate : 0.1759
## Detection Prevalence : 0.2586
## Balanced Accuracy : 0.6780
##
## 'Positive' Class : No
##
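# Optional summary (a sketch; values come from the confusion matrices above):
# collect test-set Accuracy and Kappa for all three models in one table
results_tbl <- data.frame(
  Model = c("Logistic Regression", "Random Forest", "Decision Tree"),
  Accuracy = c(confusionMatrix(pred_log, test$treatment)$overall["Accuracy"],
               confusionMatrix(pred_rf, test$treatment)$overall["Accuracy"],
               confusionMatrix(pred_tree, test$treatment)$overall["Accuracy"]),
  Kappa = c(confusionMatrix(pred_log, test$treatment)$overall["Kappa"],
            confusionMatrix(pred_rf, test$treatment)$overall["Kappa"],
            confusionMatrix(pred_tree, test$treatment)$overall["Kappa"])
)
print(results_tbl)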
# Visualization: Variable Importance (Random Forest)
varImp(rf_model)$importance %>%
  as.data.frame() %>%
  tibble::rownames_to_column("Variable") %>%
  ggplot(aes(x = reorder(Variable, Overall), y = Overall)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Variable Importance - Random Forest",
       x = "Variable", y = "Importance Score") +
  theme_minimal()
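# Optional ROC/AUC comparison (a sketch; assumes the pROC package is installed).
# classProbs = TRUE in trainControl() makes class probabilities available here.
library(pROC)
prob_log <- predict(log_model, test, type = "prob")[, "Yes"]
prob_rf  <- predict(rf_model, test, type = "prob")[, "Yes"]
roc_log <- roc(test$treatment, prob_log, levels = c("No", "Yes"))
roc_rf  <- roc(test$treatment, prob_rf, levels = c("No", "Yes"))
cat("AUC, Logistic Regression:", as.numeric(auc(roc_log)), "\n")
cat("AUC, Random Forest:", as.numeric(auc(roc_rf)), "\n")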
Based on previous questions, I selected seven key predictors of whether a respondent sought mental health treatment (treatment = Yes): work_interfere, leave, phys_health_consequence, coworkers, family_history, remote_work, and self_employed. I applied three classification techniques with 5-fold cross-validation: Logistic Regression, Random Forest, and a Decision Tree. On the test set, Logistic Regression performed best, with the highest accuracy (77.6%), Kappa (0.46), and positive predictive value (0.86); Random Forest and the Decision Tree followed at 74.5% and 73.1% accuracy, respectively. The random forest variable-importance plot identifies work_interfere, leave difficulty, and physical health consequences as the top predictors. This confirms that workplace factors, such as how much a mental health condition interferes with work and the support systems available, are the strongest predictors of treatment-seeking.