Load package
library(tidyverse)
library(caret)
library(broom)
library(pROC)
library(ResourceSelection)
1. Load data CPNS 2024
cpns <- read.csv("C:/Users/priya/Downloads/sscasn_formasi_cpns_2024.csv")
glimpse(cpns)
## Rows: 653,621
## Columns: 14
## $ formation_id <chr> "8a01869291548667019154b712642868", "8a018692915486…
## $ agency <chr> "Badan Kependudukan dan Keluarga Berencana Nasional…
## $ admission_path <chr> "CPNS", "CPNS", "CPNS", "CPNS", "CPNS", "CPNS", "CP…
## $ fomation_category <chr> "UMUM", "UMUM", "UMUM", "UMUM", "UMUM", "UMUM", "UM…
## $ position <chr> "PENYULUH KELUARGA BERENCANA AHLI PERTAMA", "PENYUL…
## $ location <chr> "Perwakilan BKKBN Provinsi Jawa Timur", "Perwakilan…
## $ num_of_formation <int> 13, 2, 16, 7, 1, 9, 7, 4, 1, 3, 21, 4, 2, 4, 19, 13…
## $ allow_disability <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ minimum_salary_idr <int> 2785700, 2785700, 2785700, 2785700, 2785700, 278570…
## $ maximum_salary_idr <int> 9024448, 9024448, 9024448, 9024448, 9024448, 902444…
## $ education_code <int> 5106191, 5106191, 5106191, 5106191, 5106191, 510619…
## $ education_major <chr> "S-1 ILMU GIZI MASYARAKAT", "S-1 ILMU GIZI MASYARAK…
## $ education_level_id <int> 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,…
## $ education_level <chr> "S-1/Sarjana", "S-1/Sarjana", "S-1/Sarjana", "S-1/S…
# Ubah allow_disability jadi faktor biner: 0 = Tidak, 1 = Ya
cpns$allow_disability <- factor(
cpns$allow_disability,
levels = c(0, 1),
labels = c("Tidak", "Ya")
)
2. Pilih variabel yang digunakan dalam model
data_final <- cpns %>%
select(
allow_disability, # Y
fomation_category, # X1 (kategori formasi)
num_of_formation, # X2 (jumlah formasi)
minimum_salary_idr, # X3 (gaji minimum)
education_level # X4 (tingkat pendidikan)
) %>%
drop_na()
3. Split data: 80% train, 20% test
set.seed(123)
idx_train <- createDataPartition(
data_final$allow_disability,
p = 0.8,
list = FALSE
)
train <- data_final[idx_train, ]
test <- data_final[-idx_train, ]
# Cek proporsi kelas
prop.table(table(train$allow_disability))
##
## Tidak Ya
## 0.3026848 0.6973152
prop.table(table(test$allow_disability))
##
## Tidak Ya
## 0.3026835 0.6973165
4. Model Regresi Logistik Biner
model_logit <- glm(
allow_disability ~
fomation_category +
education_level +
num_of_formation * minimum_salary_idr,
data = train,
family = binomial
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_logit)
##
## Call:
## glm(formula = allow_disability ~ fomation_category + education_level +
## num_of_formation * minimum_salary_idr, family = binomial,
## data = train)
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 3.330e+00 6.510e-01
## fomation_categoryLULUSAN TERBAIK -2.439e+00 5.076e-01
## fomation_categoryORANG ASLI PAPUA (OAP) -4.078e+00 5.071e-01
## fomation_categoryPENYANDANG DISABILITAS 1.333e+01 1.598e+01
## fomation_categoryPUTRA/PUTRI DAERAH TERTINGGAL -7.558e-01 5.214e-01
## fomation_categoryPUTRA/PUTRI KALIMANTAN -2.403e+00 5.075e-01
## fomation_categoryPUTRA/PUTRI PAPUA DAN PAPUA BARAT -2.578e+00 5.080e-01
## fomation_categoryUMUM -2.227e+00 5.070e-01
## fomation_categoryUMUM (NON OAP) -4.220e+00 5.073e-01
## education_levelDiploma III/Sarjana Muda -3.301e-01 4.083e-01
## education_levelDiploma IV -7.040e-02 4.083e-01
## education_levelS-1/Sarjana -6.510e-02 4.083e-01
## education_levelS-2 2.138e-01 4.085e-01
## education_levelS-3/Doktor 1.489e+00 4.136e-01
## education_levelSLTA -1.028e+00 4.106e-01
## education_levelSLTA Keguruan 1.034e+00 6.656e-01
## education_levelSLTP 6.392e-01 1.155e+00
## education_levelSMK/SLTA Kejuruan -1.002e+00 4.103e-01
## num_of_formation 1.970e-02 1.059e-03
## minimum_salary_idr -1.003e-08 1.509e-09
## num_of_formation:minimum_salary_idr -2.258e-09 1.487e-10
## z value Pr(>|z|)
## (Intercept) 5.116 3.12e-07 ***
## fomation_categoryLULUSAN TERBAIK -4.805 1.55e-06 ***
## fomation_categoryORANG ASLI PAPUA (OAP) -8.042 8.82e-16 ***
## fomation_categoryPENYANDANG DISABILITAS 0.834 0.404286
## fomation_categoryPUTRA/PUTRI DAERAH TERTINGGAL -1.450 0.147140
## fomation_categoryPUTRA/PUTRI KALIMANTAN -4.735 2.19e-06 ***
## fomation_categoryPUTRA/PUTRI PAPUA DAN PAPUA BARAT -5.075 3.87e-07 ***
## fomation_categoryUMUM -4.392 1.12e-05 ***
## fomation_categoryUMUM (NON OAP) -8.319 < 2e-16 ***
## education_levelDiploma III/Sarjana Muda -0.809 0.418796
## education_levelDiploma IV -0.172 0.863113
## education_levelS-1/Sarjana -0.159 0.873326
## education_levelS-2 0.523 0.600730
## education_levelS-3/Doktor 3.602 0.000316 ***
## education_levelSLTA -2.504 0.012273 *
## education_levelSLTA Keguruan 1.553 0.120415
## education_levelSLTP 0.553 0.579946
## education_levelSMK/SLTA Kejuruan -2.442 0.014588 *
## num_of_formation 18.614 < 2e-16 ***
## minimum_salary_idr -6.649 2.96e-11 ***
## num_of_formation:minimum_salary_idr -15.188 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 641199 on 522896 degrees of freedom
## Residual deviance: 583633 on 522876 degrees of freedom
## AIC: 583675
##
## Number of Fisher Scoring iterations: 15
5. Odds Ratio (OR) + Confidence Interval
or_table <- tidy(model_logit) %>%
mutate(
OR = exp(estimate),
CI_lower = exp(estimate - 1.96 * std.error),
CI_upper = exp(estimate + 1.96 * std.error)
)
or_table
## # A tibble: 21 × 8
## term estimate std.error statistic p.value OR CI_lower CI_upper
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.33 0.651 5.12 3.12e- 7 2.79e+1 7.80e+0 1.00e+ 2
## 2 fomation_cat… -2.44 0.508 -4.80 1.55e- 6 8.73e-2 3.23e-2 2.36e- 1
## 3 fomation_cat… -4.08 0.507 -8.04 8.82e-16 1.69e-2 6.27e-3 4.58e- 2
## 4 fomation_cat… 13.3 16.0 0.834 4.04e- 1 6.15e+5 1.53e-8 2.47e+19
## 5 fomation_cat… -0.756 0.521 -1.45 1.47e- 1 4.70e-1 1.69e-1 1.30e+ 0
## 6 fomation_cat… -2.40 0.508 -4.74 2.19e- 6 9.04e-2 3.34e-2 2.45e- 1
## 7 fomation_cat… -2.58 0.508 -5.08 3.87e- 7 7.59e-2 2.80e-2 2.05e- 1
## 8 fomation_cat… -2.23 0.507 -4.39 1.12e- 5 1.08e-1 3.99e-2 2.91e- 1
## 9 fomation_cat… -4.22 0.507 -8.32 8.87e-17 1.47e-2 5.44e-3 3.97e- 2
## 10 education_le… -0.330 0.408 -0.809 4.19e- 1 7.19e-1 3.23e-1 1.60e+ 0
## # ℹ 11 more rows
6. Uji Goodness of Fit (Hosmer–Lemeshow)
prob_train <- predict(model_logit, type = "response")
y_train_num <- ifelse(train$allow_disability == "Ya", 1, 0)
hl <- hoslem.test(y_train_num, prob_train, g = 10)
hl
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: y_train_num, prob_train
## X-squared = 13324, df = 8, p-value < 2.2e-16
7. Evaluasi Model di Data Test:
Confusion Matrix
prob_test <- predict(model_logit, newdata = test, type = "response")
pred_class <- ifelse(prob_test >= 0.5, "Ya", "Tidak") %>%
factor(levels = c("Tidak", "Ya"))
cm <- confusionMatrix(
data = pred_class,
reference = test$allow_disability,
positive = "Ya"
)
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak Ya
## Tidak 8772 3372
## Ya 30796 87784
##
## Accuracy : 0.7386
## 95% CI : (0.7362, 0.741)
## No Information Rate : 0.6973
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2298
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9630
## Specificity : 0.2217
## Pos Pred Value : 0.7403
## Neg Pred Value : 0.7223
## Prevalence : 0.6973
## Detection Rate : 0.6715
## Detection Prevalence : 0.9071
## Balanced Accuracy : 0.5924
##
## 'Positive' Class : Ya
##
ROC Curve & AUC
roc_obj <- roc(
response = test$allow_disability,
predictor = prob_test,
levels = c("Tidak", "Ya"),
direction = "<"
)
auc(roc_obj)
## Area under the curve: 0.6602
plot(roc_obj, main = "ROC Curve Model Regresi Logistik Formasi CPNS 2024")
