Part 2: Mental Health Data Classification using Supervised Learning (SUSENAS DATA)

Background

This is report part 2. In this report, we employed supervised learning and logistic regression to evaluate features that significant to classify people who prone to emotional issue. The exploration of SUSENAS data is written in previous report:

Load library used in this report

library (tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(foreign)
library(survey)

## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: survival
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart

library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.4.1

## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:gridExtra':
## 
##     combine
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

library(ROSE)

## Loaded ROSE 0.0-4

Classification using Supervised learning

random forest

df_filtered <- read.csv("data for supervised learning.csv")


# Ensure target is a factor
df_filtered$emotional_issue <- as.factor(df_filtered$emotional_issue)

# Select relevant predictors
features <- df_filtered %>%
  select(emotional_issue,
         edu_level,
         education,
         klas_desakota,
         marital_status,
         business_sector_class,
         work_status_cat,
         who_breadwinner,
         food_1)

# Select relevant predictors
features <- features %>%
  mutate(emotional_issue = as.factor(emotional_issue),
         edu_level = as.factor(edu_level),
         klas_desakota = as.factor(klas_desakota),
         marital_status = as.factor(marital_status),
         business_sector_class = as.factor(business_sector_class),
         work_status_cat = as.factor(work_status_cat),
         who_breadwinner = as.factor(who_breadwinner),
         food_1 = as.factor(food_1))

table(features$emotional_issue)

## 
##      0      1 
## 524147   3190

set.seed(123)

# Balance the classes
balanced_data <- ROSE(emotional_issue ~ ., data = features, seed = 123)$data

# Check new class distribution
table(balanced_data$emotional_issue)

## 
##      0      1 
## 263936 263401

balanced_data_RF <- balanced_data%>%
  select(!education)

library(caret)

## Warning: package 'caret' was built under R version 4.4.1

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

## The following object is masked from 'package:purrr':
## 
##     lift

trainIndex <- createDataPartition(balanced_data_RF$emotional_issue, p = .8, list = FALSE)
train <- balanced_data_RF[trainIndex, ]
test  <- balanced_data_RF[-trainIndex, ]

rf_model <- randomForest(
  emotional_issue ~ .,
  data = train,
  ntree = 200,
  importance = TRUE
)

varImpPlot(rf_model,
           main = "Feature Importance (Balanced with ROSE)",
           pch = 16,
           col = "darkgreen")

library(rpart)
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# prediction and probability
y_true <- test$emotional_issue
y_pred <- predict(rf_model, test, type = "class")
y_prob <- predict(rf_model, test, type = "prob")[,2]  # probabilitas kelas 1

# Confusion Matrix 
conf_mat <- confusionMatrix(as.factor(y_pred), as.factor(y_true), positive = "1")
print(conf_mat)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 36831 16551
##          1 15956 36129
##                                          
##                Accuracy : 0.6918         
##                  95% CI : (0.689, 0.6946)
##     No Information Rate : 0.5005         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.3836         
##                                          
##  Mcnemar's Test P-Value : 0.0009857      
##                                          
##             Sensitivity : 0.6858         
##             Specificity : 0.6977         
##          Pos Pred Value : 0.6937         
##          Neg Pred Value : 0.6900         
##              Prevalence : 0.4995         
##          Detection Rate : 0.3426         
##    Detection Prevalence : 0.4939         
##       Balanced Accuracy : 0.6918         
##                                          
##        'Positive' Class : 1              
##

# Extract the matrix
accuracy <- conf_mat$overall["Accuracy"]
precision <- conf_mat$byClass["Precision"]
recall <- conf_mat$byClass["Recall"]
f1 <- conf_mat$byClass["F1"]

# AUC
roc_obj <- roc(y_true, y_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

auc_value <- auc(roc_obj)

cat("Accuracy :", round(accuracy, 3), "\n")

## Accuracy : 0.692

cat("Precision:", round(precision, 3), "\n")

## Precision: 0.694

cat("Recall   :", round(recall, 3), "\n")

## Recall   : 0.686

cat("F1 Score :", round(f1, 3), "\n")

## F1 Score : 0.69

cat("AUC      :", round(auc_value, 3), "\n")

## AUC      : 0.74

b) decision tree

set.seed(123)

tree_model <- rpart(
  emotional_issue ~ .,
  data = train,
  method = "class",
  control = rpart.control(cp = 0.01)  # Anda bisa tuning cp (complexity parameter)
)



library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.4.1

rpart.plot(tree_model,
           type = 2,          # label di cabang
           extra = 104,       # tampilkan prob dan nilai prediksi
           fallen.leaves = TRUE,
           main = "Decision Tree - Emotional Issue Prediction")

# prediction and probability
y_true <- test$emotional_issue
y_pred <- predict(tree_model, test, type = "class")
y_prob <- predict(tree_model, test, type = "prob")[,2]  # probabilitas kelas 1

# Confusion Matrix
conf_mat <- confusionMatrix(as.factor(y_pred), as.factor(y_true), positive = "1")
print(conf_mat)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 34030 16704
##          1 18757 35976
##                                           
##                Accuracy : 0.6638          
##                  95% CI : (0.6609, 0.6666)
##     No Information Rate : 0.5005          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3276          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6829          
##             Specificity : 0.6447          
##          Pos Pred Value : 0.6573          
##          Neg Pred Value : 0.6708          
##              Prevalence : 0.4995          
##          Detection Rate : 0.3411          
##    Detection Prevalence : 0.5190          
##       Balanced Accuracy : 0.6638          
##                                           
##        'Positive' Class : 1               
##

# Extract the matrix
accuracy <- conf_mat$overall["Accuracy"]
precision <- conf_mat$byClass["Precision"]
recall <- conf_mat$byClass["Recall"]
f1 <- conf_mat$byClass["F1"]

# AUC
roc_obj <- roc(y_true, y_prob)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

auc_value <- auc(roc_obj)

cat("Accuracy :", round(accuracy, 3), "\n")

## Accuracy : 0.664

cat("Precision:", round(precision, 3), "\n")

## Precision: 0.657

cat("Recall   :", round(recall, 3), "\n")

## Recall   : 0.683

cat("F1 Score :", round(f1, 3), "\n")

## F1 Score : 0.67

cat("AUC      :", round(auc_value, 3), "\n")

## AUC      : 0.692

d) logistic regression

# employ logistic regression model

balanced_data$klas_desakota <- relevel(factor(balanced_data$klas_desakota), ref = "rural")
balanced_data$marital_status <- relevel(factor(balanced_data$marital_status), ref = "not married")
balanced_data$food_1 <- relevel(factor(balanced_data$food_1), ref = "not worry about the food")
balanced_data$business_sector_class <- relevel(factor(balanced_data$business_sector_class), ref = "agriculture and mining")
balanced_data$who_breadwinner <- relevel(factor(balanced_data$who_breadwinner), ref = "breadwinner is head of hh")
balanced_data$work_status_cat <- relevel(factor(balanced_data$work_status_cat), ref = "employee")


model_log <- glm(emotional_issue ~ klas_desakota + marital_status + 
                   education + food_1 + 
                   business_sector_class + who_breadwinner + work_status_cat,
                 data = balanced_data, family = binomial)


summary(model_log)

## 
## Call:
## glm(formula = emotional_issue ~ klas_desakota + marital_status + 
##     education + food_1 + business_sector_class + who_breadwinner + 
##     work_status_cat, family = binomial, data = balanced_data)
## 
## Coefficients:
##                                              Estimate Std. Error z value
## (Intercept)                                 0.9475764  0.0206818   45.82
## klas_desakotaurban                          0.0564941  0.0066464    8.50
## marital_statusdivorced                      0.9284584  0.0174064   53.34
## marital_statusmarried                      -0.0728071  0.0077623   -9.38
## marital_statuswidowed                       0.7335387  0.0118951   61.67
## education                                  -0.0733801  0.0005485 -133.78
## food_1worry about the food                  0.7088507  0.0070320  100.80
## business_sector_classmanufactures          -0.2097425  0.0138851  -15.11
## business_sector_classservices              -0.3832188  0.0084232  -45.50
## who_breadwinnerbreadwinner is member of hh -1.2202903  0.0169069  -72.18
## who_breadwinnerother                       -0.9708390  0.0160418  -60.52
## work_status_catemployer                     0.5497438  0.0097040   56.65
## work_status_catfreelance                    0.1781487  0.0155978   11.42
## work_status_catunpaid                       0.3465021  0.0138976   24.93
##                                            Pr(>|z|)    
## (Intercept)                                  <2e-16 ***
## klas_desakotaurban                           <2e-16 ***
## marital_statusdivorced                       <2e-16 ***
## marital_statusmarried                        <2e-16 ***
## marital_statuswidowed                        <2e-16 ***
## education                                    <2e-16 ***
## food_1worry about the food                   <2e-16 ***
## business_sector_classmanufactures            <2e-16 ***
## business_sector_classservices                <2e-16 ***
## who_breadwinnerbreadwinner is member of hh   <2e-16 ***
## who_breadwinnerother                         <2e-16 ***
## work_status_catemployer                      <2e-16 ***
## work_status_catfreelance                     <2e-16 ***
## work_status_catunpaid                        <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 731044  on 527336  degrees of freedom
## Residual deviance: 640839  on 527323  degrees of freedom
## AIC: 640867
## 
## Number of Fisher Scoring iterations: 4

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

vif_values <- vif(model_log)
vif_values

##                           GVIF Df GVIF^(1/(2*Df))
## klas_desakota         1.134717  1        1.065231
## marital_status        1.296932  3        1.044286
## education             1.168630  1        1.081032
## food_1                1.013975  1        1.006963
## business_sector_class 1.377008  2        1.083263
## who_breadwinner       1.246825  2        1.056699
## work_status_cat       1.402500  3        1.057995

# Extract and convert the coeff to AOR
exp_coef <- exp(coef(model_log))  # AOR
conf_int <- exp(confint(model_log))  # 95% CI dari AOR

## Waiting for profiling to be done...

# create AOR table
aor_table <- cbind(
  AOR = round(exp_coef, 3),
  `2.5%` = round(conf_int[, 1], 3),
  `97.5%` = round(conf_int[, 2], 3)
)

print(aor_table)

##                                              AOR  2.5% 97.5%
## (Intercept)                                2.579 2.477 2.686
## klas_desakotaurban                         1.058 1.044 1.072
## marital_statusdivorced                     2.531 2.446 2.619
## marital_statusmarried                      0.930 0.916 0.944
## marital_statuswidowed                      2.082 2.034 2.132
## education                                  0.929 0.928 0.930
## food_1worry about the food                 2.032 2.004 2.060
## business_sector_classmanufactures          0.811 0.789 0.833
## business_sector_classservices              0.682 0.671 0.693
## who_breadwinnerbreadwinner is member of hh 0.295 0.286 0.305
## who_breadwinnerother                       0.379 0.367 0.391
## work_status_catemployer                    1.733 1.700 1.766
## work_status_catfreelance                   1.195 1.159 1.232
## work_status_catunpaid                      1.414 1.376 1.453

Part 2: Mental Health Data Classification using Supervised Learning (SUSENAS DATA)

Dhiar Larasati

2025-05-11

Background