Load package

library(tidyverse)
library(caret)
library(broom)
library(pROC)
library(ResourceSelection)

1. Load data CPNS 2024

cpns <- read.csv("C:/Users/priya/Downloads/sscasn_formasi_cpns_2024.csv")
glimpse(cpns)
## Rows: 653,621
## Columns: 14
## $ formation_id       <chr> "8a01869291548667019154b712642868", "8a018692915486…
## $ agency             <chr> "Badan Kependudukan dan Keluarga Berencana Nasional…
## $ admission_path     <chr> "CPNS", "CPNS", "CPNS", "CPNS", "CPNS", "CPNS", "CP…
## $ fomation_category  <chr> "UMUM", "UMUM", "UMUM", "UMUM", "UMUM", "UMUM", "UM…
## $ position           <chr> "PENYULUH KELUARGA BERENCANA AHLI PERTAMA", "PENYUL…
## $ location           <chr> "Perwakilan BKKBN Provinsi Jawa Timur", "Perwakilan…
## $ num_of_formation   <int> 13, 2, 16, 7, 1, 9, 7, 4, 1, 3, 21, 4, 2, 4, 19, 13…
## $ allow_disability   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ minimum_salary_idr <int> 2785700, 2785700, 2785700, 2785700, 2785700, 278570…
## $ maximum_salary_idr <int> 9024448, 9024448, 9024448, 9024448, 9024448, 902444…
## $ education_code     <int> 5106191, 5106191, 5106191, 5106191, 5106191, 510619…
## $ education_major    <chr> "S-1 ILMU GIZI MASYARAKAT", "S-1 ILMU GIZI MASYARAK…
## $ education_level_id <int> 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,…
## $ education_level    <chr> "S-1/Sarjana", "S-1/Sarjana", "S-1/Sarjana", "S-1/S…
# Ubah allow_disability jadi faktor biner: 0 = Tidak, 1 = Ya
cpns$allow_disability <- factor(
  cpns$allow_disability,
  levels = c(0, 1),
  labels = c("Tidak", "Ya")
)

2. Pilih variabel yang digunakan dalam model

data_final <- cpns %>%
  select(
    allow_disability,     # Y
    fomation_category,    # X1 (kategori formasi)
    num_of_formation,     # X2 (jumlah formasi)
    minimum_salary_idr,   # X3 (gaji minimum)
    education_level       # X4 (tingkat pendidikan)
  ) %>%
  drop_na()

3. Split data: 80% train, 20% test

set.seed(123)
idx_train <- createDataPartition(
  data_final$allow_disability,
  p = 0.8,
  list = FALSE
)

train <- data_final[idx_train, ]
test  <- data_final[-idx_train, ]

# Cek proporsi kelas 
prop.table(table(train$allow_disability))
## 
##     Tidak        Ya 
## 0.3026848 0.6973152
prop.table(table(test$allow_disability))
## 
##     Tidak        Ya 
## 0.3026835 0.6973165

4. Model Regresi Logistik Biner

model_logit <- glm(
  allow_disability ~ 
    fomation_category +
    education_level +
    num_of_formation * minimum_salary_idr,
  data = train,
  family = binomial
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_logit)
## 
## Call:
## glm(formula = allow_disability ~ fomation_category + education_level + 
##     num_of_formation * minimum_salary_idr, family = binomial, 
##     data = train)
## 
## Coefficients:
##                                                      Estimate Std. Error
## (Intercept)                                         3.330e+00  6.510e-01
## fomation_categoryLULUSAN TERBAIK                   -2.439e+00  5.076e-01
## fomation_categoryORANG ASLI PAPUA (OAP)            -4.078e+00  5.071e-01
## fomation_categoryPENYANDANG DISABILITAS             1.333e+01  1.598e+01
## fomation_categoryPUTRA/PUTRI DAERAH TERTINGGAL     -7.558e-01  5.214e-01
## fomation_categoryPUTRA/PUTRI KALIMANTAN            -2.403e+00  5.075e-01
## fomation_categoryPUTRA/PUTRI PAPUA DAN PAPUA BARAT -2.578e+00  5.080e-01
## fomation_categoryUMUM                              -2.227e+00  5.070e-01
## fomation_categoryUMUM (NON OAP)                    -4.220e+00  5.073e-01
## education_levelDiploma III/Sarjana Muda            -3.301e-01  4.083e-01
## education_levelDiploma IV                          -7.040e-02  4.083e-01
## education_levelS-1/Sarjana                         -6.510e-02  4.083e-01
## education_levelS-2                                  2.138e-01  4.085e-01
## education_levelS-3/Doktor                           1.489e+00  4.136e-01
## education_levelSLTA                                -1.028e+00  4.106e-01
## education_levelSLTA Keguruan                        1.034e+00  6.656e-01
## education_levelSLTP                                 6.392e-01  1.155e+00
## education_levelSMK/SLTA Kejuruan                   -1.002e+00  4.103e-01
## num_of_formation                                    1.970e-02  1.059e-03
## minimum_salary_idr                                 -1.003e-08  1.509e-09
## num_of_formation:minimum_salary_idr                -2.258e-09  1.487e-10
##                                                    z value Pr(>|z|)    
## (Intercept)                                          5.116 3.12e-07 ***
## fomation_categoryLULUSAN TERBAIK                    -4.805 1.55e-06 ***
## fomation_categoryORANG ASLI PAPUA (OAP)             -8.042 8.82e-16 ***
## fomation_categoryPENYANDANG DISABILITAS              0.834 0.404286    
## fomation_categoryPUTRA/PUTRI DAERAH TERTINGGAL      -1.450 0.147140    
## fomation_categoryPUTRA/PUTRI KALIMANTAN             -4.735 2.19e-06 ***
## fomation_categoryPUTRA/PUTRI PAPUA DAN PAPUA BARAT  -5.075 3.87e-07 ***
## fomation_categoryUMUM                               -4.392 1.12e-05 ***
## fomation_categoryUMUM (NON OAP)                     -8.319  < 2e-16 ***
## education_levelDiploma III/Sarjana Muda             -0.809 0.418796    
## education_levelDiploma IV                           -0.172 0.863113    
## education_levelS-1/Sarjana                          -0.159 0.873326    
## education_levelS-2                                   0.523 0.600730    
## education_levelS-3/Doktor                            3.602 0.000316 ***
## education_levelSLTA                                 -2.504 0.012273 *  
## education_levelSLTA Keguruan                         1.553 0.120415    
## education_levelSLTP                                  0.553 0.579946    
## education_levelSMK/SLTA Kejuruan                    -2.442 0.014588 *  
## num_of_formation                                    18.614  < 2e-16 ***
## minimum_salary_idr                                  -6.649 2.96e-11 ***
## num_of_formation:minimum_salary_idr                -15.188  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 641199  on 522896  degrees of freedom
## Residual deviance: 583633  on 522876  degrees of freedom
## AIC: 583675
## 
## Number of Fisher Scoring iterations: 15

5. Odds Ratio (OR) + Confidence Interval

or_table <- tidy(model_logit) %>%
  mutate(
    OR       = exp(estimate),
    CI_lower = exp(estimate - 1.96 * std.error),
    CI_upper = exp(estimate + 1.96 * std.error)
  )

or_table   
## # A tibble: 21 × 8
##    term          estimate std.error statistic  p.value      OR CI_lower CI_upper
##    <chr>            <dbl>     <dbl>     <dbl>    <dbl>   <dbl>    <dbl>    <dbl>
##  1 (Intercept)      3.33      0.651     5.12  3.12e- 7 2.79e+1  7.80e+0 1.00e+ 2
##  2 fomation_cat…   -2.44      0.508    -4.80  1.55e- 6 8.73e-2  3.23e-2 2.36e- 1
##  3 fomation_cat…   -4.08      0.507    -8.04  8.82e-16 1.69e-2  6.27e-3 4.58e- 2
##  4 fomation_cat…   13.3      16.0       0.834 4.04e- 1 6.15e+5  1.53e-8 2.47e+19
##  5 fomation_cat…   -0.756     0.521    -1.45  1.47e- 1 4.70e-1  1.69e-1 1.30e+ 0
##  6 fomation_cat…   -2.40      0.508    -4.74  2.19e- 6 9.04e-2  3.34e-2 2.45e- 1
##  7 fomation_cat…   -2.58      0.508    -5.08  3.87e- 7 7.59e-2  2.80e-2 2.05e- 1
##  8 fomation_cat…   -2.23      0.507    -4.39  1.12e- 5 1.08e-1  3.99e-2 2.91e- 1
##  9 fomation_cat…   -4.22      0.507    -8.32  8.87e-17 1.47e-2  5.44e-3 3.97e- 2
## 10 education_le…   -0.330     0.408    -0.809 4.19e- 1 7.19e-1  3.23e-1 1.60e+ 0
## # ℹ 11 more rows

6. Uji Goodness of Fit (Hosmer–Lemeshow)

prob_train <- predict(model_logit, type = "response")
y_train_num <- ifelse(train$allow_disability == "Ya", 1, 0)

hl <- hoslem.test(y_train_num, prob_train, g = 10)
hl   
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  y_train_num, prob_train
## X-squared = 13324, df = 8, p-value < 2.2e-16

7. Evaluasi Model di Data Test:

Confusion Matrix

prob_test <- predict(model_logit, newdata = test, type = "response")

pred_class <- ifelse(prob_test >= 0.5, "Ya", "Tidak") %>%
  factor(levels = c("Tidak", "Ya"))

cm <- confusionMatrix(
  data = pred_class,
  reference = test$allow_disability,
  positive = "Ya"
)
cm   
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Tidak    Ya
##      Tidak  8772  3372
##      Ya    30796 87784
##                                          
##                Accuracy : 0.7386         
##                  95% CI : (0.7362, 0.741)
##     No Information Rate : 0.6973         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.2298         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9630         
##             Specificity : 0.2217         
##          Pos Pred Value : 0.7403         
##          Neg Pred Value : 0.7223         
##              Prevalence : 0.6973         
##          Detection Rate : 0.6715         
##    Detection Prevalence : 0.9071         
##       Balanced Accuracy : 0.5924         
##                                          
##        'Positive' Class : Ya             
## 

ROC Curve & AUC

roc_obj <- roc(
  response = test$allow_disability,
  predictor = prob_test,
  levels = c("Tidak", "Ya"),
  direction = "<"
)

auc(roc_obj)  
## Area under the curve: 0.6602
plot(roc_obj, main = "ROC Curve Model Regresi Logistik Formasi CPNS 2024")