# Pin the CRAN mirror so package installation also works non-interactively.
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Install readxl only when it is missing; an unconditional install.packages()
# re-downloads the package on every run of the script.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
## Installing package into 'C:/Users/Acer/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'readxl' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Acer\AppData\Local\Temp\RtmpCGCAkT\downloaded_packages
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
# Import the prepared Excel workbook from its fixed local path; read_excel()
# returns the sheet as a tibble.
data <- read_excel(path = "D:/2025/JSTAR/DATA/data diolah-final.xlsx")
# Show the first six rows as a quick check of the import.
head(data, n = 6L)
## # A tibble: 6 × 9
## NEET Jenis_Kelamin Status_Kawin Disabilitas Migrasi Pendidikan
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 1 1 1 0 0
## 2 0 0 1 1 0 0
## 3 0 0 1 1 0 0
## 4 0 0 1 1 0 0
## 5 0 0 1 1 0 0
## 6 0 0 1 1 0 0
## # ℹ 3 more variables: Wilayah_Tempat_Tinggal <dbl>,
## # Jumlah_Anggota_Rumah_Tangga <dbl>, Kelompok_Umur <dbl>
# Packages needed by the analysis below. Only the ones that are not yet
# installed are fetched, so re-running the script does not reinstall all of
# them (and does not try to overwrite packages that are currently loaded).
required_pkgs <- c(
  "tidyverse",
  "caret",
  "car",
  "pscl",
  "lmtest",
  "MASS",
  "sjPlot"
)
is_installed <- vapply(
  required_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}
## Installing packages into 'C:/Users/Acer/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## package 'caret' successfully unpacked and MD5 sums checked
## package 'car' successfully unpacked and MD5 sums checked
## package 'pscl' successfully unpacked and MD5 sums checked
## package 'lmtest' successfully unpacked and MD5 sums checked
## package 'MASS' successfully unpacked and MD5 sums checked
## package 'sjPlot' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Acer\AppData\Local\Temp\RtmpCGCAkT\downloaded_packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(car)
## Warning: package 'car' was built under R version 4.5.2
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(pscl)
## Warning: package 'pscl' was built under R version 4.5.2
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.2
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
## Warning: package 'MASS' was built under R version 4.5.2
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(sjPlot)
## Warning: package 'sjPlot' was built under R version 4.5.2
##
## Attaching package: 'sjPlot'
##
## The following object is masked from 'package:ggplot2':
##
## set_theme
# Convert the coded categorical columns (outcome included) to factors in one
# pass. Jumlah_Anggota_Rumah_Tangga is not converted here and stays numeric.
data <- dplyr::mutate(
  data,
  dplyr::across(
    c(
      NEET, Jenis_Kelamin, Status_Kawin, Disabilitas, Migrasi,
      Pendidikan, Wilayah_Tempat_Tinggal, Kelompok_Umur
    ),
    as.factor
  )
)
# Binary logistic regression of NEET status on all eight predictors, fitted
# to the original (not resampled) data.
model <- glm(
  formula = NEET ~ Jenis_Kelamin + Status_Kawin + Disabilitas + Migrasi +
    Pendidikan + Wilayah_Tempat_Tinggal + Jumlah_Anggota_Rumah_Tangga +
    Kelompok_Umur,
  family = binomial(link = "logit"),
  data = data
)
# Coefficient table (log-odds scale), deviances and AIC.
summary(model)
##
## Call:
## glm(formula = NEET ~ Jenis_Kelamin + Status_Kawin + Disabilitas +
## Migrasi + Pendidikan + Wilayah_Tempat_Tinggal + Jumlah_Anggota_Rumah_Tangga +
## Kelompok_Umur, family = binomial(link = "logit"), data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.13933 0.23483 0.593 0.5530
## Jenis_Kelamin1 0.20247 0.07309 2.770 0.0056 **
## Status_Kawin1 -0.68709 0.11434 -6.009 1.86e-09 ***
## Disabilitas1 -2.41201 0.16985 -14.201 < 2e-16 ***
## Migrasi1 -0.37160 0.15962 -2.328 0.0199 *
## Pendidikan1 1.29404 0.08692 14.889 < 2e-16 ***
## Wilayah_Tempat_Tinggal1 -0.09089 0.08758 -1.038 0.2993
## Jumlah_Anggota_Rumah_Tangga 0.22825 0.13481 1.693 0.0904 .
## Kelompok_Umur1 0.18940 0.08684 2.181 0.0292 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5796.4 on 7119 degrees of freedom
## Residual deviance: 5213.3 on 7111 degrees of freedom
## AIC: 5231.3
##
## Number of Fisher Scoring iterations: 5
Uji Simultan
# Simultaneous (overall) test: compare the intercept-only model with the full
# model through a chi-square test on the change in deviance.
model_null <- glm(formula = NEET ~ 1, family = binomial, data = data)
anova(model_null, model, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: NEET ~ 1
## Model 2: NEET ~ Jenis_Kelamin + Status_Kawin + Disabilitas + Migrasi +
## Pendidikan + Wilayah_Tempat_Tinggal + Jumlah_Anggota_Rumah_Tangga +
## Kelompok_Umur
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7119 5796.4
## 2 7111 5213.3 8 583.06 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Odds Ratio + Confidence Interval
# Exponentiate the coefficients and their 95% confidence limits so they read
# as odds ratios instead of log-odds.
exp(cbind(
  OR = coef(model),
  confint(model, level = 0.95)
))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 1.14950798 0.72439451 1.8196969
## Jenis_Kelamin1 1.22441949 1.06106178 1.4131869
## Status_Kawin1 0.50303772 0.40258382 0.6303802
## Disabilitas1 0.08963454 0.06417692 0.1250021
## Migrasi1 0.68963248 0.50021052 0.9360979
## Pendidikan1 3.64750106 3.07777517 4.3274845
## Wilayah_Tempat_Tinggal1 0.91311388 0.76789465 1.0825597
## Jumlah_Anggota_Rumah_Tangga 1.25639343 0.96932156 1.6450815
## Kelompok_Umur1 1.20852469 1.01937090 1.4328348
# Render the fitted model as an sjPlot summary table (odds ratios, confidence
# intervals, p-values, number of observations and Tjur's R2).
tab_model(model)
| NEET | | | |
|---|---|---|---|
| Predictors | Odds Ratios | CI | p |
| (Intercept) | 1.15 | 0.72 – 1.82 | 0.553 |
| Jenis Kelamin [1] | 1.22 | 1.06 – 1.41 | 0.006 |
| Status Kawin [1] | 0.50 | 0.40 – 0.63 | <0.001 |
| Disabilitas [1] | 0.09 | 0.06 – 0.13 | <0.001 |
| Migrasi [1] | 0.69 | 0.50 – 0.94 | 0.020 |
| Pendidikan [1] | 3.65 | 3.08 – 4.33 | <0.001 |
| Wilayah Tempat Tinggal [1] | 0.91 | 0.77 – 1.08 | 0.299 |
| Jumlah Anggota Rumah Tangga | 1.26 | 0.97 – 1.65 | 0.090 |
| Kelompok Umur [1] | 1.21 | 1.02 – 1.43 | 0.029 |
| Observations | 7120 | | |
| R² Tjur | 0.089 | | |
Hosmer–Lemeshow Test
# Install ResourceSelection (with its optional dependencies) only when it is
# not already available, instead of reinstalling it on every run.
if (!requireNamespace("ResourceSelection", quietly = TRUE)) {
  install.packages("ResourceSelection", dependencies = TRUE)
}
## Installing package into 'C:/Users/Acer/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'ResourceSelection' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Acer\AppData\Local\Temp\RtmpCGCAkT\downloaded_packages
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.5.2
## ResourceSelection 0.3-6 2023-06-27
# The observed outcome is passed to hoslem.test() as numeric 0/1, so the
# factor labels are turned back into numbers first (via character, so that
# the labels and not the internal level codes are used).
data$NEET <- as.numeric(as.character(data$NEET))
# Compare observed outcomes with fitted probabilities in (up to) 10 bins.
hoslem.test(x = data$NEET, y = fitted(model), g = 10)
## Warning in hoslem.test(data$NEET, fitted(model), g = 10): The data did not
## allow for the requested number of bins.
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: data$NEET, fitted(model)
## X-squared = 65.982, df = 6, p-value = 2.718e-12
SMOTE
# Install smotefamily only when it is missing, so the script does not
# re-download it on every run.
if (!requireNamespace("smotefamily", quietly = TRUE)) {
  install.packages("smotefamily")
}
## Installing package into 'C:/Users/Acer/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'smotefamily' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Acer\AppData\Local\Temp\RtmpCGCAkT\downloaded_packages
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.5.2
# The outcome was made numeric for the Hosmer-Lemeshow test; make it a factor
# again before building the SMOTE input.
data$NEET <- as.factor(data$NEET)
# Expand the predictors into a numeric design matrix (one dummy column per
# factor) and drop its first column, the intercept.
data_num <- model.matrix(object = NEET ~ ., data = data)[, -1L]
# Outcome first, numeric predictors after it.
data_for_smote <- data.frame(NEET = data$NEET, data_num)
library(smotefamily)
# Fix the RNG state before oversampling: SMOTE() generates synthetic rows at
# random, so without a seed the resampled data, and every estimate computed
# from it below, differs from run to run.
set.seed(123)
# Oversample the rarer NEET class using 5 nearest neighbours. Column 1 of
# data_for_smote is the outcome, so it is excluded from the feature matrix X.
# NOTE(review): SMOTE interpolates between neighbouring rows, so the dummy
# columns of synthetic rows may presumably take values other than 0/1 --
# confirm that this is acceptable for the model fitted afterwards.
smote_res <- SMOTE(
  X = data_for_smote[, -1],
  target = data_for_smote$NEET,
  K = 5
)
# Combined original + synthetic rows; the outcome is returned in `class`.
data_smote <- smote_res$data
str(data_smote)
## 'data.frame': 12145 obs. of 9 variables:
## $ Jenis_Kelamin1 : num 0 1 1 0 1 0 0 1 0 1 ...
## $ Status_Kawin1 : num 1 1 1 1 0 1 1 1 1 1 ...
## $ Disabilitas1 : num 1 0 1 1 1 1 0 1 1 1 ...
## $ Migrasi1 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ Pendidikan1 : num 1 0 0 0 1 1 1 1 0 1 ...
## $ Wilayah_Tempat_Tinggal1 : num 1 0 0 0 0 1 0 0 0 0 ...
## $ Jumlah_Anggota_Rumah_Tangga: num 1 1 1 1 1 1 1 1 1 1 ...
## $ Kelompok_Umur1 : num 1 0 0 1 1 1 1 1 0 1 ...
## $ class : chr "1" "1" "1" "1" ...
# Class counts after oversampling, to check how balanced the outcome is now.
table(data_smote$class)
##
## 0 1
## 6115 6030
# SMOTE returns the outcome as character; glm() needs it as a factor.
data_smote[["class"]] <- as.factor(data_smote[["class"]])
# Counts per class after the conversion (should match the counts before it).
table(data_smote[["class"]])
##
## 0 1
## 6115 6030
# Confirm that the outcome is now a two-level factor.
str(data_smote$class)
## Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Logistic regression refitted on the SMOTE-balanced data; `.` selects every
# remaining column of data_smote as a predictor.
model_smote <- glm(
  formula = class ~ .,
  family = binomial,
  data = data_smote
)
summary(model_smote)
##
## Call:
## glm(formula = class ~ ., family = binomial, data = data_smote)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.01123 0.15956 12.605 < 2e-16 ***
## Jenis_Kelamin1 0.13362 0.04118 3.245 0.00117 **
## Status_Kawin1 -0.82495 0.07135 -11.561 < 2e-16 ***
## Disabilitas1 -2.53995 0.12544 -20.248 < 2e-16 ***
## Migrasi1 -0.48682 0.09166 -5.311 1.09e-07 ***
## Pendidikan1 1.35125 0.04471 30.223 < 2e-16 ***
## Wilayah_Tempat_Tinggal1 -0.11743 0.04910 -2.392 0.01677 *
## Jumlah_Anggota_Rumah_Tangga 0.33023 0.07825 4.220 2.44e-05 ***
## Kelompok_Umur1 0.37642 0.04575 8.228 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16836 on 12144 degrees of freedom
## Residual deviance: 14706 on 12136 degrees of freedom
## AIC: 14724
##
## Number of Fisher Scoring iterations: 4
Uji Simultan
# Simultaneous test on the balanced data: intercept-only model versus the
# full SMOTE model. This reassigns `model_null`, replacing the null model
# fitted earlier on the original data.
model_null <- glm(
  formula = class ~ 1,
  family = binomial,
  data = data_smote
)
anova(model_null, model_smote, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: class ~ 1
## Model 2: class ~ Jenis_Kelamin1 + Status_Kawin1 + Disabilitas1 + Migrasi1 +
## Pendidikan1 + Wilayah_Tempat_Tinggal1 + Jumlah_Anggota_Rumah_Tangga +
## Kelompok_Umur1
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 12144 16836
## 2 12136 14706 8 2130.4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Uji Goodness of Fit (Hosmer–Lemeshow)
# Install ResourceSelection only if it is absent: calling install.packages()
# on a package that is already in use just emits a warning and installs
# nothing.
if (!requireNamespace("ResourceSelection", quietly = TRUE)) {
  install.packages("ResourceSelection")
}
library(ResourceSelection)
# Residual-deviance chi-square check of the SMOTE model: the residual
# deviance is compared with a chi-square distribution on the residual
# degrees of freedom.
# NOTE(review): the section is labelled Hosmer-Lemeshow, but this is a
# deviance test and not the Hosmer-Lemeshow statistic. If that test is what
# is intended, hoslem.test(model_smote$y, fitted(model_smote), g = 10) would
# compute it -- confirm which one should be reported.
cbind(
  res.deviance = deviance(model_smote),
  df = df.residual(model_smote),
  p = pchisq(
    deviance(model_smote),
    df.residual(model_smote),
    lower.tail = FALSE
  )
)
## res.deviance df p
## [1,] 14705.57 12136 3.28094e-54
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(pROC)
ROC Curve & AUC
library(pROC)
# Fitted probabilities of the SMOTE model for the rows it was trained on.
# NOTE(review): the ROC below is therefore computed in-sample, on data that
# includes the oversampled rows -- confirm this is the intended evaluation.
pred <- fitted(model_smote)
# Guard against missing predictions before building the curve.
valid_index <- !is.na(pred)
# roc() takes the response first and the predictor second.
roc_obj <- roc(data_smote$class[valid_index], pred[valid_index])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Area under the ROC curve built above.
auc(roc_obj)
## Area under the curve: 0.7269
# Draw the ROC curve.
plot(roc_obj)
# Class counts of the balanced outcome (same check as after the SMOTE step).
table(data_smote$class)
##
## 0 1
## 6115 6030
# Re-check that the outcome is still a two-level factor.
str(data_smote$class)
## Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Odds ratios of the SMOTE model: exponentiated log-odds coefficients.
model_smote |>
  coef() |>
  exp()
## (Intercept) Jenis_Kelamin1
## 7.47248909 1.14295428
## Status_Kawin1 Disabilitas1
## 0.43825700 0.07887014
## Migrasi1 Pendidikan1
## 0.61457990 3.86226328
## Wilayah_Tempat_Tinggal1 Jumlah_Anggota_Rumah_Tangga
## 0.88919994 1.39128674
## Kelompok_Umur1
## 1.45706396
# Odds ratios of the SMOTE model together with their 95% confidence interval
# (coefficients and interval limits are exponentiated together).
exp(cbind(
  OR = coef(model_smote),
  confint(model_smote, level = 0.95)
))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 7.47248909 5.4868450 10.2601139
## Jenis_Kelamin1 1.14295428 1.0543292 1.2390199
## Status_Kawin1 0.43825700 0.3808223 0.5037496
## Disabilitas1 0.07887014 0.0612730 0.1002533
## Migrasi1 0.61457990 0.5135711 0.7356849
## Pendidikan1 3.86226328 3.5389461 4.2168845
## Wilayah_Tempat_Tinggal1 0.88919994 0.8075594 0.9789670
## Jumlah_Anggota_Rumah_Tangga 1.39128674 1.1936396 1.6222568
## Kelompok_Umur1 1.45706396 1.3320256 1.5936671