This is report part 2. In this report, we employed supervised learning and logistic regression to evaluate features that significant to classify people who prone to emotional issue. The exploration of SUSENAS data is written in previous report:
library (tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(foreign)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: survival
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.1
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:gridExtra':
##
## combine
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(ROSE)
## Loaded ROSE 0.0-4
df_filtered <- read.csv("data for supervised learning.csv")
# Ensure target is a factor
df_filtered$emotional_issue <- as.factor(df_filtered$emotional_issue)
# Select relevant predictors
features <- df_filtered %>%
select(emotional_issue,
edu_level,
education,
klas_desakota,
marital_status,
business_sector_class,
work_status_cat,
who_breadwinner,
food_1)
# Select relevant predictors
features <- features %>%
mutate(emotional_issue = as.factor(emotional_issue),
edu_level = as.factor(edu_level),
klas_desakota = as.factor(klas_desakota),
marital_status = as.factor(marital_status),
business_sector_class = as.factor(business_sector_class),
work_status_cat = as.factor(work_status_cat),
who_breadwinner = as.factor(who_breadwinner),
food_1 = as.factor(food_1))
table(features$emotional_issue)
##
## 0 1
## 524147 3190
set.seed(123)
# Balance the classes
balanced_data <- ROSE(emotional_issue ~ ., data = features, seed = 123)$data
# Check new class distribution
table(balanced_data$emotional_issue)
##
## 0 1
## 263936 263401
balanced_data_RF <- balanced_data%>%
select(!education)
library(caret)
## Warning: package 'caret' was built under R version 4.4.1
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
trainIndex <- createDataPartition(balanced_data_RF$emotional_issue, p = .8, list = FALSE)
train <- balanced_data_RF[trainIndex, ]
test <- balanced_data_RF[-trainIndex, ]
rf_model <- randomForest(
emotional_issue ~ .,
data = train,
ntree = 200,
importance = TRUE
)
varImpPlot(rf_model,
main = "Feature Importance (Balanced with ROSE)",
pch = 16,
col = "darkgreen")
library(rpart)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# prediction and probability
y_true <- test$emotional_issue
y_pred <- predict(rf_model, test, type = "class")
y_prob <- predict(rf_model, test, type = "prob")[,2] # probabilitas kelas 1
# Confusion Matrix
conf_mat <- confusionMatrix(as.factor(y_pred), as.factor(y_true), positive = "1")
print(conf_mat)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 36831 16551
## 1 15956 36129
##
## Accuracy : 0.6918
## 95% CI : (0.689, 0.6946)
## No Information Rate : 0.5005
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3836
##
## Mcnemar's Test P-Value : 0.0009857
##
## Sensitivity : 0.6858
## Specificity : 0.6977
## Pos Pred Value : 0.6937
## Neg Pred Value : 0.6900
## Prevalence : 0.4995
## Detection Rate : 0.3426
## Detection Prevalence : 0.4939
## Balanced Accuracy : 0.6918
##
## 'Positive' Class : 1
##
# Extract the matrix
accuracy <- conf_mat$overall["Accuracy"]
precision <- conf_mat$byClass["Precision"]
recall <- conf_mat$byClass["Recall"]
f1 <- conf_mat$byClass["F1"]
# AUC
roc_obj <- roc(y_true, y_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_value <- auc(roc_obj)
cat("Accuracy :", round(accuracy, 3), "\n")
## Accuracy : 0.692
cat("Precision:", round(precision, 3), "\n")
## Precision: 0.694
cat("Recall :", round(recall, 3), "\n")
## Recall : 0.686
cat("F1 Score :", round(f1, 3), "\n")
## F1 Score : 0.69
cat("AUC :", round(auc_value, 3), "\n")
## AUC : 0.74
b) decision tree
set.seed(123)
tree_model <- rpart(
emotional_issue ~ .,
data = train,
method = "class",
control = rpart.control(cp = 0.01) # Anda bisa tuning cp (complexity parameter)
)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.1
rpart.plot(tree_model,
type = 2, # label di cabang
extra = 104, # tampilkan prob dan nilai prediksi
fallen.leaves = TRUE,
main = "Decision Tree - Emotional Issue Prediction")
# prediction and probability
y_true <- test$emotional_issue
y_pred <- predict(tree_model, test, type = "class")
y_prob <- predict(tree_model, test, type = "prob")[,2] # probabilitas kelas 1
# Confusion Matrix
conf_mat <- confusionMatrix(as.factor(y_pred), as.factor(y_true), positive = "1")
print(conf_mat)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 34030 16704
## 1 18757 35976
##
## Accuracy : 0.6638
## 95% CI : (0.6609, 0.6666)
## No Information Rate : 0.5005
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3276
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6829
## Specificity : 0.6447
## Pos Pred Value : 0.6573
## Neg Pred Value : 0.6708
## Prevalence : 0.4995
## Detection Rate : 0.3411
## Detection Prevalence : 0.5190
## Balanced Accuracy : 0.6638
##
## 'Positive' Class : 1
##
# Extract the matrix
accuracy <- conf_mat$overall["Accuracy"]
precision <- conf_mat$byClass["Precision"]
recall <- conf_mat$byClass["Recall"]
f1 <- conf_mat$byClass["F1"]
# AUC
roc_obj <- roc(y_true, y_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_value <- auc(roc_obj)
cat("Accuracy :", round(accuracy, 3), "\n")
## Accuracy : 0.664
cat("Precision:", round(precision, 3), "\n")
## Precision: 0.657
cat("Recall :", round(recall, 3), "\n")
## Recall : 0.683
cat("F1 Score :", round(f1, 3), "\n")
## F1 Score : 0.67
cat("AUC :", round(auc_value, 3), "\n")
## AUC : 0.692
d) logistic regression
# employ logistic regression model
balanced_data$klas_desakota <- relevel(factor(balanced_data$klas_desakota), ref = "rural")
balanced_data$marital_status <- relevel(factor(balanced_data$marital_status), ref = "not married")
balanced_data$food_1 <- relevel(factor(balanced_data$food_1), ref = "not worry about the food")
balanced_data$business_sector_class <- relevel(factor(balanced_data$business_sector_class), ref = "agriculture and mining")
balanced_data$who_breadwinner <- relevel(factor(balanced_data$who_breadwinner), ref = "breadwinner is head of hh")
balanced_data$work_status_cat <- relevel(factor(balanced_data$work_status_cat), ref = "employee")
model_log <- glm(emotional_issue ~ klas_desakota + marital_status +
education + food_1 +
business_sector_class + who_breadwinner + work_status_cat,
data = balanced_data, family = binomial)
summary(model_log)
##
## Call:
## glm(formula = emotional_issue ~ klas_desakota + marital_status +
## education + food_1 + business_sector_class + who_breadwinner +
## work_status_cat, family = binomial, data = balanced_data)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 0.9475764 0.0206818 45.82
## klas_desakotaurban 0.0564941 0.0066464 8.50
## marital_statusdivorced 0.9284584 0.0174064 53.34
## marital_statusmarried -0.0728071 0.0077623 -9.38
## marital_statuswidowed 0.7335387 0.0118951 61.67
## education -0.0733801 0.0005485 -133.78
## food_1worry about the food 0.7088507 0.0070320 100.80
## business_sector_classmanufactures -0.2097425 0.0138851 -15.11
## business_sector_classservices -0.3832188 0.0084232 -45.50
## who_breadwinnerbreadwinner is member of hh -1.2202903 0.0169069 -72.18
## who_breadwinnerother -0.9708390 0.0160418 -60.52
## work_status_catemployer 0.5497438 0.0097040 56.65
## work_status_catfreelance 0.1781487 0.0155978 11.42
## work_status_catunpaid 0.3465021 0.0138976 24.93
## Pr(>|z|)
## (Intercept) <2e-16 ***
## klas_desakotaurban <2e-16 ***
## marital_statusdivorced <2e-16 ***
## marital_statusmarried <2e-16 ***
## marital_statuswidowed <2e-16 ***
## education <2e-16 ***
## food_1worry about the food <2e-16 ***
## business_sector_classmanufactures <2e-16 ***
## business_sector_classservices <2e-16 ***
## who_breadwinnerbreadwinner is member of hh <2e-16 ***
## who_breadwinnerother <2e-16 ***
## work_status_catemployer <2e-16 ***
## work_status_catfreelance <2e-16 ***
## work_status_catunpaid <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 731044 on 527336 degrees of freedom
## Residual deviance: 640839 on 527323 degrees of freedom
## AIC: 640867
##
## Number of Fisher Scoring iterations: 4
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif_values <- vif(model_log)
vif_values
## GVIF Df GVIF^(1/(2*Df))
## klas_desakota 1.134717 1 1.065231
## marital_status 1.296932 3 1.044286
## education 1.168630 1 1.081032
## food_1 1.013975 1 1.006963
## business_sector_class 1.377008 2 1.083263
## who_breadwinner 1.246825 2 1.056699
## work_status_cat 1.402500 3 1.057995
# Extract and convert the coeff to AOR
exp_coef <- exp(coef(model_log)) # AOR
conf_int <- exp(confint(model_log)) # 95% CI dari AOR
## Waiting for profiling to be done...
# create AOR table
aor_table <- cbind(
AOR = round(exp_coef, 3),
`2.5%` = round(conf_int[, 1], 3),
`97.5%` = round(conf_int[, 2], 3)
)
print(aor_table)
## AOR 2.5% 97.5%
## (Intercept) 2.579 2.477 2.686
## klas_desakotaurban 1.058 1.044 1.072
## marital_statusdivorced 2.531 2.446 2.619
## marital_statusmarried 0.930 0.916 0.944
## marital_statuswidowed 2.082 2.034 2.132
## education 0.929 0.928 0.930
## food_1worry about the food 2.032 2.004 2.060
## business_sector_classmanufactures 0.811 0.789 0.833
## business_sector_classservices 0.682 0.671 0.693
## who_breadwinnerbreadwinner is member of hh 0.295 0.286 0.305
## who_breadwinnerother 0.379 0.367 0.391
## work_status_catemployer 1.733 1.700 1.766
## work_status_catfreelance 1.195 1.159 1.232
## work_status_catunpaid 1.414 1.376 1.453