options(repos = c(CRAN = "https://cloud.r-project.org"))

# Install readxl only when it is not already available, so re-running the
# script does not re-download and reinstall the package every time.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
## Installing package into 'C:/Users/Acer/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'readxl' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Acer\AppData\Local\Temp\RtmpCGCAkT\downloaded_packages
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
# Read the prepared workbook into `data` and preview its first six rows.
# NOTE(review): the path is an absolute location on one machine; the script
# will fail elsewhere unless the file exists at exactly this path.
data <- readxl::read_excel(
  path = "D:/2025/JSTAR/DATA/data diolah-final.xlsx"
)
head(data, n = 6L)
## # A tibble: 6 × 9
##    NEET Jenis_Kelamin Status_Kawin Disabilitas Migrasi Pendidikan
##   <dbl>         <dbl>        <dbl>       <dbl>   <dbl>      <dbl>
## 1     0             1            1           1       0          0
## 2     0             0            1           1       0          0
## 3     0             0            1           1       0          0
## 4     0             0            1           1       0          0
## 5     0             0            1           1       0          0
## 6     0             0            1           1       0          0
## # ℹ 3 more variables: Wilayah_Tempat_Tinggal <dbl>,
## #   Jumlah_Anggota_Rumah_Tangga <dbl>, Kelompok_Umur <dbl>
# Install only the packages that are missing, so re-running the script does
# not reinstall everything (and does not try to overwrite packages in use).
required_pkgs <- c(
  "tidyverse",
  "caret",
  "car",
  "pscl",
  "lmtest",
  "MASS",
  "sjPlot"
)
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
## Installing packages into 'C:/Users/Acer/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## package 'caret' successfully unpacked and MD5 sums checked
## package 'car' successfully unpacked and MD5 sums checked
## package 'pscl' successfully unpacked and MD5 sums checked
## package 'lmtest' successfully unpacked and MD5 sums checked
## package 'MASS' successfully unpacked and MD5 sums checked
## package 'sjPlot' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Acer\AppData\Local\Temp\RtmpCGCAkT\downloaded_packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(pscl)
## Warning: package 'pscl' was built under R version 4.5.2
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.2
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
## Warning: package 'MASS' was built under R version 4.5.2
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(sjPlot)
## Warning: package 'sjPlot' was built under R version 4.5.2
## 
## Attaching package: 'sjPlot'
## 
## The following object is masked from 'package:ggplot2':
## 
##     set_theme
# Treat the response and every 0/1-coded predictor as a factor.
# Jumlah_Anggota_Rumah_Tangga is intentionally left numeric.
data <- dplyr::mutate(
  data,
  dplyr::across(
    dplyr::all_of(c(
      "NEET",
      "Jenis_Kelamin",
      "Status_Kawin",
      "Disabilitas",
      "Migrasi",
      "Pendidikan",
      "Wilayah_Tempat_Tinggal",
      "Kelompok_Umur"
    )),
    as.factor
  )
)
# Main-effects binary logistic regression of NEET on all eight predictors.
# NEET is a factor, so glm() models the probability of its second level ("1")
# with the first level ("0") as the reference.
model <- glm(
  formula = NEET ~
    Jenis_Kelamin +
    Status_Kawin +
    Disabilitas +
    Migrasi +
    Pendidikan +
    Wilayah_Tempat_Tinggal +
    Jumlah_Anggota_Rumah_Tangga +
    Kelompok_Umur,
  family = binomial(link = "logit"),
  data = data
)

# Coefficient table (log-odds scale), deviances and AIC.
summary(model)
## 
## Call:
## glm(formula = NEET ~ Jenis_Kelamin + Status_Kawin + Disabilitas + 
##     Migrasi + Pendidikan + Wilayah_Tempat_Tinggal + Jumlah_Anggota_Rumah_Tangga + 
##     Kelompok_Umur, family = binomial(link = "logit"), data = data)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  0.13933    0.23483   0.593   0.5530    
## Jenis_Kelamin1               0.20247    0.07309   2.770   0.0056 ** 
## Status_Kawin1               -0.68709    0.11434  -6.009 1.86e-09 ***
## Disabilitas1                -2.41201    0.16985 -14.201  < 2e-16 ***
## Migrasi1                    -0.37160    0.15962  -2.328   0.0199 *  
## Pendidikan1                  1.29404    0.08692  14.889  < 2e-16 ***
## Wilayah_Tempat_Tinggal1     -0.09089    0.08758  -1.038   0.2993    
## Jumlah_Anggota_Rumah_Tangga  0.22825    0.13481   1.693   0.0904 .  
## Kelompok_Umur1               0.18940    0.08684   2.181   0.0292 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5796.4  on 7119  degrees of freedom
## Residual deviance: 5213.3  on 7111  degrees of freedom
## AIC: 5231.3
## 
## Number of Fisher Scoring iterations: 5

Uji Simultan

# Simultaneous (overall) test: compare the intercept-only model against the
# full model with a likelihood-ratio chi-square test on the deviance change.
model_null <- glm(
  formula = NEET ~ 1,
  family = binomial,
  data = data
)
anova(model_null, model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: NEET ~ 1
## Model 2: NEET ~ Jenis_Kelamin + Status_Kawin + Disabilitas + Migrasi + 
##     Pendidikan + Wilayah_Tempat_Tinggal + Jumlah_Anggota_Rumah_Tangga + 
##     Kelompok_Umur
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      7119     5796.4                          
## 2      7111     5213.3  8   583.06 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Odds Ratio + Confidence Interval

# Odds ratios with 95% confidence limits: collect the log-odds coefficients
# and their confint() bounds, then exponentiate the whole table at once.
local({
  log_odds <- coef(model)
  log_odds_bounds <- confint(model)
  exp(cbind(OR = log_odds, log_odds_bounds))
})
## Waiting for profiling to be done...
##                                     OR      2.5 %    97.5 %
## (Intercept)                 1.14950798 0.72439451 1.8196969
## Jenis_Kelamin1              1.22441949 1.06106178 1.4131869
## Status_Kawin1               0.50303772 0.40258382 0.6303802
## Disabilitas1                0.08963454 0.06417692 0.1250021
## Migrasi1                    0.68963248 0.50021052 0.9360979
## Pendidikan1                 3.64750106 3.07777517 4.3274845
## Wilayah_Tempat_Tinggal1     0.91311388 0.76789465 1.0825597
## Jumlah_Anggota_Rumah_Tangga 1.25639343 0.96932156 1.6450815
## Kelompok_Umur1              1.20852469 1.01937090 1.4328348
# Render the fitted model as a formatted sjPlot regression table; the rendered
# table reports odds ratios, confidence intervals and p-values.
tab_model(model)
  NEET
Predictors Odds Ratios CI p
(Intercept) 1.15 0.72 – 1.82 0.553
Jenis Kelamin [1] 1.22 1.06 – 1.41 0.006
Status Kawin [1] 0.50 0.40 – 0.63 <0.001
Disabilitas [1] 0.09 0.06 – 0.13 <0.001
Migrasi [1] 0.69 0.50 – 0.94 0.020
Pendidikan [1] 3.65 3.08 – 4.33 <0.001
Wilayah Tempat Tinggal
[1]
0.91 0.77 – 1.08 0.299
Jumlah Anggota Rumah
Tangga
1.26 0.97 – 1.65 0.090
Kelompok Umur [1] 1.21 1.02 – 1.43 0.029
Observations 7120
R2 Tjur 0.089

Hosmer–Lemeshow Test

# Install ResourceSelection (with its dependencies) only when it is missing,
# so re-running the script does not reinstall it every time.
if (!requireNamespace("ResourceSelection", quietly = TRUE)) {
  install.packages("ResourceSelection", dependencies = TRUE)
}
## Installing package into 'C:/Users/Acer/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'ResourceSelection' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Acer\AppData\Local\Temp\RtmpCGCAkT\downloaded_packages
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.5.2
## ResourceSelection 0.3-6   2023-06-27
# Hosmer-Lemeshow goodness-of-fit test for the original model.
# Use the 0/1 response stored in the fit (model$y) instead of converting
# data$NEET in place: it is always the same length and row order as
# fitted(model), even if glm() dropped rows with missing values, and `data`
# keeps NEET as a factor for the later steps.
hoslem.test(model$y, fitted(model), g = 10)
## Warning in hoslem.test(data$NEET, fitted(model), g = 10): The data did not
## allow for the requested number of bins.
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  data$NEET, fitted(model)
## X-squared = 65.982, df = 6, p-value = 2.718e-12

SMOTE

# Install smotefamily only when it is missing, so re-running the script does
# not reinstall it every time.
if (!requireNamespace("smotefamily", quietly = TRUE)) {
  install.packages("smotefamily")
}
## Installing package into 'C:/Users/Acer/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'smotefamily' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Acer\AppData\Local\Temp\RtmpCGCAkT\downloaded_packages
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.5.2
data$NEET <- as.factor(data$NEET)

# SMOTE picks neighbours and interpolation points at random; fix the seed so
# the oversampled data set (and every model fitted on it) is reproducible.
set.seed(123)

# Build the model frame once and take BOTH the response and the design matrix
# from it. model.matrix() silently drops rows with missing values, so pairing
# it with the raw data$NEET could produce vectors of different lengths.
smote_frame <- model.frame(NEET ~ ., data = data)
data_num <- model.matrix(
  terms(smote_frame),
  smote_frame
)[, -1, drop = FALSE]   # drop the intercept column
data_for_smote <- data.frame(
  NEET = model.response(smote_frame),
  data_num
)

# NOTE(review): SMOTE interpolates between neighbouring rows, so synthetic
# rows can carry fractional values in the 0/1 dummy columns -- confirm this is
# acceptable for the categorical predictors.
smote_res <- SMOTE(
  X = data_for_smote[, -1],
  target = data_for_smote$NEET,
  K = 5
)

# Extract the oversampled data set returned by SMOTE() and inspect its
# structure; the class label comes back in a column named `class`.
data_smote <- smote_res$data
str(data_smote)
## 'data.frame':    12145 obs. of  9 variables:
##  $ Jenis_Kelamin1             : num  0 1 1 0 1 0 0 1 0 1 ...
##  $ Status_Kawin1              : num  1 1 1 1 0 1 1 1 1 1 ...
##  $ Disabilitas1               : num  1 0 1 1 1 1 0 1 1 1 ...
##  $ Migrasi1                   : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ Pendidikan1                : num  1 0 0 0 1 1 1 1 0 1 ...
##  $ Wilayah_Tempat_Tinggal1    : num  1 0 0 0 0 1 0 0 0 0 ...
##  $ Jumlah_Anggota_Rumah_Tangga: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Kelompok_Umur1             : num  1 0 0 1 1 1 1 1 0 1 ...
##  $ class                      : chr  "1" "1" "1" "1" ...
# Class counts after oversampling.
table(data_smote$class)
## 
##    0    1 
## 6115 6030
# SMOTE() returns `class` as character; convert it to a factor so it can be
# used as the binomial response, then re-check the class counts.
data_smote$class <- as.factor(data_smote$class)
table(data_smote$class)
## 
##    0    1 
## 6115 6030
# Confirm that `class` is now a two-level factor.
str(data_smote$class)
##  Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Logistic regression refitted on the SMOTE-balanced data, using every
# remaining column of data_smote as a predictor of `class`.
# NOTE(review): glm() treats synthetic rows as independent observations, so
# standard errors and p-values from this fit are likely optimistic.
model_smote <- glm(
  formula = class ~ .,
  family = binomial,
  data = data_smote
)

summary(model_smote)
## 
## Call:
## glm(formula = class ~ ., family = binomial, data = data_smote)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  2.01123    0.15956  12.605  < 2e-16 ***
## Jenis_Kelamin1               0.13362    0.04118   3.245  0.00117 ** 
## Status_Kawin1               -0.82495    0.07135 -11.561  < 2e-16 ***
## Disabilitas1                -2.53995    0.12544 -20.248  < 2e-16 ***
## Migrasi1                    -0.48682    0.09166  -5.311 1.09e-07 ***
## Pendidikan1                  1.35125    0.04471  30.223  < 2e-16 ***
## Wilayah_Tempat_Tinggal1     -0.11743    0.04910  -2.392  0.01677 *  
## Jumlah_Anggota_Rumah_Tangga  0.33023    0.07825   4.220 2.44e-05 ***
## Kelompok_Umur1               0.37642    0.04575   8.228  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16836  on 12144  degrees of freedom
## Residual deviance: 14706  on 12136  degrees of freedom
## AIC: 14724
## 
## Number of Fisher Scoring iterations: 4

Uji Simultan

# Simultaneous test for the SMOTE model: intercept-only fit versus the full
# fit, compared with a likelihood-ratio chi-square test.
# NOTE(review): this reuses the name `model_null`, overwriting the null model
# fitted earlier on the original data.
model_null <- glm(
  formula = class ~ 1,
  family = binomial,
  data = data_smote
)

anova(model_null, model_smote, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: class ~ 1
## Model 2: class ~ Jenis_Kelamin1 + Status_Kawin1 + Disabilitas1 + Migrasi1 + 
##     Pendidikan1 + Wilayah_Tempat_Tinggal1 + Jumlah_Anggota_Rumah_Tangga + 
##     Kelompok_Umur1
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     12144      16836                          
## 2     12136      14706  8   2130.4 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Uji Goodness of Fit (Hosmer–Lemeshow)

# Install ResourceSelection only when it is missing; calling install.packages()
# on a package that is already attached only produces an "in use" warning.
if (!requireNamespace("ResourceSelection", quietly = TRUE)) {
  install.packages("ResourceSelection")
}
## Warning: package 'ResourceSelection' is in use and will not be installed
library(ResourceSelection)
library(ResourceSelection)

# Hosmer-Lemeshow goodness-of-fit test for the SMOTE model. model_smote$y is
# the 0/1 response stored in the fit, so it is aligned with fitted(model_smote).
hoslem.test(model_smote$y, fitted(model_smote), g = 10)

# Residual deviance and its degrees of freedom, kept as a descriptive summary.
# Caution: with ungrouped 0/1 responses the residual deviance does not follow
# a chi-square distribution, so `p` below is NOT a valid goodness-of-fit
# p-value; rely on the Hosmer-Lemeshow test above instead.
with(model_smote, cbind(res.deviance = deviance, df = df.residual,
                        p = pchisq(deviance, df.residual, lower.tail = FALSE)))
##      res.deviance    df           p
## [1,]     14705.57 12136 3.28094e-54
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.2
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(pROC)

ROC Curve & AUC

library(pROC)

# In-sample predicted probabilities from the SMOTE model.
pred <- fitted(model_smote)

# Guard against missing predictions before building the ROC curve.
valid_index <- !is.na(pred)

# Take the response from the fit (model_smote$y, coded 0/1) so it always has
# the same length and order as `pred`. State the control/case levels and the
# direction explicitly instead of letting roc() auto-detect them:
# auto-detection can silently flip the direction and overstate the AUC.
# NOTE(review): this ROC is computed on the same (oversampled) data the model
# was fitted on, so the AUC is likely optimistic.
roc_obj <- roc(
  response = model_smote$y[valid_index],
  predictor = pred[valid_index],
  levels = c(0, 1),
  direction = "<"
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Area under the ROC curve for the SMOTE model.
auc(roc_obj)
## Area under the curve: 0.7269
# Draw the ROC curve.
plot(roc_obj)

# Re-check the class counts of the oversampled data (same call as earlier).
table(data_smote$class)
## 
##    0    1 
## 6115 6030
# Re-check that `class` is a two-level factor (same call as earlier).
str(data_smote$class)
##  Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Odds ratios: exponentiate the log-odds coefficients of the SMOTE model.
exp(coef(model_smote))
##                 (Intercept)              Jenis_Kelamin1 
##                  7.47248909                  1.14295428 
##               Status_Kawin1                Disabilitas1 
##                  0.43825700                  0.07887014 
##                    Migrasi1                 Pendidikan1 
##                  0.61457990                  3.86226328 
##     Wilayah_Tempat_Tinggal1 Jumlah_Anggota_Rumah_Tangga 
##                  0.88919994                  1.39128674 
##              Kelompok_Umur1 
##                  1.45706396
# Odds ratios with 95% confidence intervals for the SMOTE model: gather the
# log-odds coefficients and their confint() bounds, then exponentiate.
local({
  log_odds <- coef(model_smote)
  log_odds_bounds <- confint(model_smote)
  exp(cbind(OR = log_odds, log_odds_bounds))
})
## Waiting for profiling to be done...
##                                     OR     2.5 %     97.5 %
## (Intercept)                 7.47248909 5.4868450 10.2601139
## Jenis_Kelamin1              1.14295428 1.0543292  1.2390199
## Status_Kawin1               0.43825700 0.3808223  0.5037496
## Disabilitas1                0.07887014 0.0612730  0.1002533
## Migrasi1                    0.61457990 0.5135711  0.7356849
## Pendidikan1                 3.86226328 3.5389461  4.2168845
## Wilayah_Tempat_Tinggal1     0.88919994 0.8075594  0.9789670
## Jumlah_Anggota_Rumah_Tangga 1.39128674 1.1936396  1.6222568
## Kelompok_Umur1              1.45706396 1.3320256  1.5936671