2023-06-12

Classification of Regional Aid (Bantuan Daerah) Receipt Status in Kota Bogor, 2020, Using Quasi-Binomial Logistic Regression (L-BFGS)

Members:

  1. Muhammad Nachnoer Novatron Fitra Arss (G1401201014)
  2. Alwi Sidiq (G1401201020)
  3. Dhiya Ulayya Tsabitah (G1401201013)
  4. Duwi Sulistianingsih (G1401201033)
  5. Nickyta Shavira Maharani (G1401201037)
  6. Hanaa Budinur Izdihaar (G1401201076)

Load Packages

lapply(c("caret","vip","readxl","tidyverse","h2o","cvms","DescTools","aod",
         "e1071","caTools","UBL","MLmetrics"),library,character.only=T)[[1]]
##  [1] "caret"     "lattice"   "ggplot2"   "stats"     "graphics"  "grDevices"
##  [7] "utils"     "datasets"  "methods"   "base"

Input Data

dataoss<-data.frame(read_excel("C:/Users/falco/Downloads/Data OS Penerima Bantuan Daerah.xlsx"))
head(dataoss)
##   R2209 R1804 R1809A R1809C R2001B R1817 R301 R1811A R1814A R2001H
## 1     5    50      1      1      5     4    3      2      2      1
## 2     5    35      1      1      1     4    4      1      4      1
## 3     5    70      1      1      1     4    2      2      2      5
## 4     5    60      1      1      1     4    4      2      2      1
## 5     5   239      1      1      1     3    3      1      2      1
## 6     5   100      1      1      5     4    4      1      5      1

Pre-process

dataoss$R2209<-as.factor(dataoss$R2209);dataos<-dataoss
# Treat all remaining numeric columns as categorical...
dataos[sapply(dataos,is.numeric)]<-lapply(dataos[sapply(dataos,is.numeric)],as.factor)
dataos$R1804<-dataoss$R1804  # ...except R1804, which is kept numeric (continuous predictor)
table(dataos$R2209)
## 
##   1   5 
##  32 732

Splitting

set.seed(123);i<-createDataPartition(dataos$R2209,p=0.75,list=F)  # stratified 75/25 split
str<-dataos[i,];table(str$R2209)  # training set (note: the name `str` shadows utils::str)
## 
##   1   5 
##  24 549
sts<-dataos[-i,]
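
As a check, the held-out counts follow by subtraction (a sketch; the totals above imply 8 ones and 183 fives in the test set):

table(sts$R2209)  # expected: 8 "1" and 183 "5", the complement of the training split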

Balancing

# SMOTE with HEOM distance (handles mixed numeric/categorical predictors);
# applied to the training set only, leaving the test set at its natural imbalance
str<-SmoteClassif(R2209~.,str,dist="HEOM")
str$R2209<-ifelse(str$R2209=="1",1,0)  # recode factor levels: "1" -> 1, "5" -> 0
sts$R2209<-ifelse(sts$R2209=="1",1,0)
table(str$R2209)
## 
##   0   1 
## 286 285

Modelling

h2o.init();hll<-as.h2o(str);hmm<-as.h2o(sts)  # start the cluster; push train/test frames to H2O
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         10 minutes 20 seconds 
##     H2O cluster timezone:       Asia/Bangkok 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.38.0.1 
##     H2O cluster version age:    8 months and 24 days !!! 
##     H2O cluster name:           H2O_started_from_R_Fitra_ysu792 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.78 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.2.2 (2022-10-31 ucrt)
## 
glm<-h2o.glm(x=names(hll)[-1],y="R2209",training_frame=hll,nfolds=15,seed=123,
             score_each_iteration=T,keep_cross_validation_predictions=T,
             family="quasibinomial",solver="L_BFGS",lambda_search=T,link="logit")
# note: the object name `glm` shadows stats::glm; calling glm(...) as a function
# later still resolves to stats::glm, but a distinct name would be safer
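
The fitted model can be inspected with standard h2o accessors (a sketch; not shown in the original run):

h2o.coef(glm)              # estimated coefficients on the logit scale
h2o.auc(glm, xval = TRUE)  # cross-validated AUC across the 15 folds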

Confusion Matrix (Training)

cm<-h2o.performance(glm);cm@metrics$cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##        0.0 1.0  Error       Rate
## 0.0    224  62 0.2168 = 62 / 286
## 1.0     20 265 0.0702 = 20 / 285
## Totals 244 327 0.1436 = 82 / 571

Accuracy (Training)

cm@metrics$max_criteria_and_metric_scores[4,"value"]  # row 4 = maximum accuracy over all thresholds
## [1] 0.8563923

Maximum Metrics (Training)

cm@metrics$max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.476357   0.866013 230
## 2                       max f2  0.347318   0.923949 257
## 3                 max f0point5  0.653570   0.865012 167
## 4                 max accuracy  0.477570   0.856392 228
## 5                max precision  0.997385   1.000000   0
## 6                   max recall  0.115436   1.000000 320
## 7              max specificity  0.997385   1.000000   0
## 8             max absolute_mcc  0.476357   0.720695 230
## 9   max min_per_class_accuracy  0.563058   0.835664 197
## 10 max mean_per_class_accuracy  0.476357   0.856521 230
## 11                     max tns  0.997385 286.000000   0
## 12                     max fns  0.997385 281.000000   0
## 13                     max fps  0.000137 286.000000 399
## 14                     max tps  0.115436 285.000000 320
## 15                     max tnr  0.997385   1.000000   0
## 16                     max fnr  0.997385   0.985965   0
## 17                     max fpr  0.000137   1.000000 399
## 18                     max tpr  0.115436   1.000000 320
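
The threshold behind any row of this table can also be extracted directly from the performance object (a sketch; per row 1 above, the F1-optimal threshold is roughly 0.476):

h2o.find_threshold_by_max_metric(cm, "f1")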

Learning Curve Plot

h2o.learning_curve_plot(glm,"auc",cv_ribbon=T,cv_lines=F)

Variable Importance Plot

h2o.permutation_importance_plot(glm,hmm)

Predictions

hp<-as.vector(h2o.predict(glm,hmm)$predict)  # h2o returns class labels "0.0"/"1.0"
HP<-ifelse(hp=="1.0",1,0);HP                 # recode to integer 0/1
##   [1] 0 0 0 1 0 0 1 1 0 0 1 1 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0
##  [38] 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 0 0 0 1 0 0 0 1 0 0 1
## [112] 0 0 0 1 0 0 1 1 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1
## [149] 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0
## [186] 0 1 0 1 0 1

Confusion Matrix (Test)

h<-h2o.performance(glm,hmm);h@metrics$cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##        0.0 1.0  Error       Rate
## 0.0     86  97 0.5301 = 97 / 183
## 1.0      1   7 0.1250 =    1 / 8
## Totals  87 104 0.5131 = 98 / 191

Accuracy (Test)

h@metrics$max_criteria_and_metric_scores[4,"value"]  # maximum accuracy over thresholds on the test set
## [1] 0.9528796

Maximum Metrics (Test)

h@metrics$max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.152779   0.125000  96
## 2                       max f2  0.152779   0.257353  96
## 3                 max f0point5  0.152779   0.082547  96
## 4                 max accuracy  0.990780   0.952880   0
## 5                max precision  0.152779   0.067308  96
## 6                   max recall  0.002976   1.000000 164
## 7              max specificity  0.990780   0.994536   0
## 8             max absolute_mcc  0.152779   0.138754  96
## 9   max min_per_class_accuracy  0.261984   0.557377  79
## 10 max mean_per_class_accuracy  0.152779   0.672473  96
## 11                     max tns  0.990780 182.000000   0
## 12                     max fns  0.990780   8.000000   0
## 13                     max fps  0.000167 183.000000 181
## 14                     max tps  0.002976   8.000000 164
## 15                     max tnr  0.990780   0.994536   0
## 16                     max fnr  0.990780   1.000000   0
## 17                     max fpr  0.000167   1.000000 181
## 18                     max tpr  0.002976   1.000000 164

Using Base R's optim

Recode Factors to Numeric

# Convert factor columns to their underlying numeric codes for the matrix-based optimizer
str[sapply(str,is.factor)]<-lapply(str[sapply(str,is.factor)],as.numeric)
sts[sapply(sts,is.factor)]<-lapply(sts[sapply(sts,is.factor)],as.numeric)

Define the negative log-likelihood function

negative_log_likelihood <- function(coef, X, y) {
  # Compute the linear combination of features and coefficients
  linear_pred <- X %*% coef
  
  # Compute the predicted probabilities using the logistic function
  predicted_probs <- 1 / (1 + exp(-linear_pred))
  
  # Negative log-likelihood. The "- min(.) + 1" shift is an ad hoc guard against
  # log(0); note it changes the objective, so the optimum is not the exact MLE.
  # A more conventional guard clamps the probabilities (see the sketch below).
  neg_log_likelihood <- -sum(y * log(predicted_probs - min(predicted_probs) + 1) +
                             (1 - y) * log((1 - predicted_probs) - min(1 - predicted_probs) + 1))
  
  return(neg_log_likelihood)
}
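
For reference, here is a minimal sketch of the standard clamped negative log-likelihood (assuming a small tolerance eps; this is the textbook guard, not the objective used in the run above):

negative_log_likelihood_clamped <- function(coef, X, y, eps = 1e-12) {
  p <- 1 / (1 + exp(-X %*% coef))
  p <- pmin(pmax(p, eps), 1 - eps)  # clamp away from exact 0 and 1 before taking logs
  -sum(y * log(p) + (1 - y) * log(1 - p))
}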

Specify the initial values for the coefficients

initial_coef <- matrix(1, 9)  # one starting value per predictor (9 features, no intercept column)

Specify the bounds for the coefficients

lower_bound <- rep(-Inf, 9)  # coefficients unbounded below
upper_bound <- rep(Inf, 9)   # coefficients unbounded above

Define other necessary arguments

X <- as.matrix(str[,-1])       # design matrix: all 9 predictors, no intercept column
X_test <- as.matrix(sts[,-1])  # test-set design matrix
y <- str$R2209                 # binary response labels (0/1)

Use the optim function with method = "L-BFGS-B" to minimize the negative log-likelihood

result <- optim(par = initial_coef, fn = negative_log_likelihood, gr = NULL, method = "L-BFGS-B",
                 X = X, y = y,lower=lower_bound,upper=upper_bound)
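
A convergence check is worth adding here (a sketch; optim reports convergence code 0 on success and a message otherwise):

if (result$convergence != 0) warning("optim did not converge: ", result$message)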

Retrieve the optimized coefficients

optimized_coef <- result$par;optimized_coef
##              [,1]
##  [1,]   -10.42480
##  [2,]    11.96905
##  [3,]  2017.75600
##  [4,]   882.61200
##  [5,] -3127.67329
##  [6,]  1385.68601
##  [7,]   521.05455
##  [8,]  1158.74730
##  [9,] -1611.36948

Prediction Performance

prob<-1/(1+exp(-X_test%*%optimized_coef))  # test-set probabilities from the optimized coefficients
pred<-ifelse(prob>=0.5,1,0)                # 0.5 probability cutoff
akt<-sts$R2209                             # actual labels
MLmetrics::Accuracy(pred,akt)
## [1] 0.7434555

Confusion Matrix

confusionMatrix(as.factor(pred),as.factor(akt))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 139   5
##          1  44   3
##                                           
##                Accuracy : 0.7435          
##                  95% CI : (0.6754, 0.8038)
##     No Information Rate : 0.9581          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0404          
##                                           
##  Mcnemar's Test P-Value : 5.681e-08       
##                                           
##             Sensitivity : 0.75956         
##             Specificity : 0.37500         
##          Pos Pred Value : 0.96528         
##          Neg Pred Value : 0.06383         
##              Prevalence : 0.95812         
##          Detection Rate : 0.72775         
##    Detection Prevalence : 0.75393         
##       Balanced Accuracy : 0.56728         
##                                           
##        'Positive' Class : 0               
## 

Logistic Regression

Modelling

reglog<-glm(R2209~.,data=str,family=binomial(link="logit"));summary(reglog)  # standard binomial logit fit on the balanced training data
## 
## Call:
## glm(formula = R2209 ~ ., family = binomial(link = "logit"), data = str)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2746  -0.8201  -0.0629   0.8934   2.2626  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.703160   1.224296  -0.574   0.5657    
## R1804       -0.001272   0.001529  -0.832   0.4054    
## R1809A      -0.738392   0.408969  -1.805   0.0710 .  
## R1809C       0.681914   0.103023   6.619 3.61e-11 ***
## R2001B       1.480136   0.317735   4.658 3.19e-06 ***
## R1817       -0.805247   0.150593  -5.347 8.93e-08 ***
## R301         0.580363   0.090966   6.380 1.77e-10 ***
## R1811A       0.038300   0.231908   0.165   0.8688    
## R1814A       0.281501   0.111474   2.525   0.0116 *  
## R2001H      -1.299778   0.283100  -4.591 4.41e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 791.57  on 570  degrees of freedom
## Residual deviance: 576.93  on 561  degrees of freedom
## AIC: 596.93
## 
## Number of Fisher Scoring iterations: 5

G Test

G2<-pscl::pR2(reglog)[3];G2  # G2: likelihood-ratio statistic (null deviance minus residual deviance)
## fitting null model for pseudo-r2
##       G2 
## 214.6467
chisq<-qchisq(0.05,reglog$df.null-reglog$df.residual);chisq  # caution: this is the lower 5% point; the usual upper-tail critical value is sketched below
## [1] 3.325113
pvalue<-1-pchisq(G2[[1]],df=reglog$df.null-reglog$df.residual);pvalue
## [1] 0
#G2 > chisq & p-value < 0.05 -> reject H0 (the predictors are jointly significant)
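
As an aside, the conventional critical value uses the upper tail (a sketch; for df = 9 this is about 16.92, and G2 = 214.65 rejects H0 either way):

qchisq(0.95,reglog$df.null-reglog$df.residual)  # upper 5% chi-square quantile, ~16.92 for df = 9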

Wald Test

car::Anova(reglog,type="II",test="Wald")
## Analysis of Deviance Table (Type II tests)
## 
## Response: R2209
##        Df   Chisq Pr(>Chisq)    
## R1804   1  0.6923    0.40537    
## R1809A  1  3.2598    0.07100 .  
## R1809C  1 43.8119  3.615e-11 ***
## R2001B  1 21.7007  3.187e-06 ***
## R1817   1 28.5922  8.934e-08 ***
## R301    1 40.7048  1.771e-10 ***
## R1811A  1  0.0273    0.86883    
## R1814A  1  6.3769    0.01156 *  
## R2001H  1 21.0794  4.406e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Prediction

prp<-predict(reglog,sts)    # note: log-odds (link scale) by default
pr<-ifelse(prp>0.5,1,0);pr  # 0.5 cutoff applied on the log-odds scale (~0.62 in probability)
##   1   3   9  17  18  22  27  28  32  35  42  43  44  54  57  59  61  63  64  69 
##   0   0   0   1   0   0   0   0   0   0   1   0   0   1   1   0   0   0   1   1 
##  71  76  78  83  85  88  94  95  99 101 103 104 112 128 131 138 142 148 151 152 
##   0   1   1   0   0   1   0   0   0   0   0   0   0   0   0   0   1   1   0   0 
## 153 154 156 157 161 164 180 181 187 190 191 201 207 216 217 223 228 229 235 243 
##   0   0   0   0   0   0   0   1   1   1   1   0   0   0   1   1   0   0   0   0 
## 247 249 261 263 265 269 273 274 285 288 290 293 294 295 296 297 305 306 315 322 
##   0   0   0   0   0   0   0   1   0   0   0   1   0   0   0   0   0   0   0   0 
## 334 335 336 340 342 343 344 351 355 362 363 364 367 377 381 382 387 391 398 400 
##   0   0   0   0   0   0   0   0   0   0   0   1   0   1   0   0   0   0   0   0 
## 407 409 415 416 422 427 429 433 441 442 450 457 464 465 468 469 477 479 495 497 
##   0   0   0   1   1   0   0   1   0   0   0   0   0   0   0   1   1   0   1   0 
## 509 511 512 513 518 521 528 529 533 534 539 542 543 549 555 562 564 577 578 580 
##   1   0   0   0   1   0   0   0   0   0   0   0   1   0   1   0   0   0   0   0 
## 587 588 592 600 601 603 607 614 615 617 618 627 630 632 635 636 641 643 644 645 
##   0   1   0   0   0   0   0   1   1   0   0   0   0   0   0   0   0   0   0   0 
## 647 648 649 655 662 664 665 667 670 674 683 689 699 706 715 716 721 723 725 726 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
## 728 738 739 740 745 748 752 754 759 760 762 
##   0   0   0   0   0   0   0   0   0   0   1
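
For probability-scale predictions, the usual idiom is type = "response" (a sketch; a 0.5 probability cutoff would label a few cases differently from the run above):

prp_prob<-predict(reglog,sts,type="response")  # P(R2209 = 1)
pr_prob<-ifelse(prp_prob>0.5,1,0)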

Accuracy

accrl<-Accuracy(pr,sts$R2209);accrl
## [1] 0.7958115

Confusion Matrix

confusionMatrix(as.factor(pr),as.factor(sts$R2209))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 150   6
##          1  33   2
##                                           
##                Accuracy : 0.7958          
##                  95% CI : (0.7316, 0.8506)
##     No Information Rate : 0.9581          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0267          
##                                           
##  Mcnemar's Test P-Value : 3.136e-05       
##                                           
##             Sensitivity : 0.81967         
##             Specificity : 0.25000         
##          Pos Pred Value : 0.96154         
##          Neg Pred Value : 0.05714         
##              Prevalence : 0.95812         
##          Detection Rate : 0.78534         
##    Detection Prevalence : 0.81675         
##       Balanced Accuracy : 0.53484         
##                                           
##        'Positive' Class : 0               
##