2023-06-12

Classification of Regional Aid (Bantuan Daerah) Receipt Status in Kota Bogor, 2020, Using Quasi-Binomial Logistic Regression (L-BFGS)

Members:

  1. Muhammad Nachnoer Novatron Fitra Arss (G1401201014)
  2. Alwi Sidiq (G1401201020)
  3. Dhiya Ulayya Tsabitah (G1401201013)
  4. Duwi Sulistianingsih (G1401201033)
  5. Nickyta Shavira Maharani (G1401201037)
  6. Hanaa Budinur Izdihaar (G1401201076)

Load Packages

lapply(c("caret","vip","readxl","tidyverse","h2o","cvms","DescTools","aod",
         "e1071","caTools","UBL","MLmetrics"),library,character.only=T)[[1]]
##  [1] "caret"     "lattice"   "ggplot2"   "stats"     "graphics"  "grDevices"
##  [7] "utils"     "datasets"  "methods"   "base"

Input Data

dataoss<-data.frame(read_excel("C:/Users/falco/Downloads/Data OS Penerima Bantuan Daerah.xlsx"))
head(dataoss)
##   R2209 R1804 R1809A R1809C R2001B R1817 R301 R1811A R1814A R2001H
## 1     5    50      1      1      5     4    3      2      2      1
## 2     5    35      1      1      1     4    4      1      4      1
## 3     5    70      1      1      1     4    2      2      2      5
## 4     5    60      1      1      1     4    4      2      2      1
## 5     5   239      1      1      1     3    3      1      2      1
## 6     5   100      1      1      5     4    4      1      5      1

Pre-process

dataoss$R2209<-as.factor(dataoss$R2209);dataos<-dataoss
# Treat all remaining numeric columns as categorical...
dataos[sapply(dataos,is.numeric)]<-lapply(dataos[sapply(dataos,is.numeric)],as.factor)
dataos$R1804<-dataoss$R1804  # ...except R1804, which is kept numeric (continuous predictor)
table(dataos$R2209)
## 
##   1   5 
##  32 732

Splitting

set.seed(123);i<-createDataPartition(dataos$R2209,p=0.75,list=F)  # stratified 75/25 split
str<-dataos[i,];table(str$R2209)  # training set (note: the name `str` shadows utils::str)
## 
##   1   5 
##  24 549
sts<-dataos[-i,]
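
As a check, the held-out counts follow by subtraction (a sketch; the totals above imply 8 ones and 183 fives in the test set):

table(sts$R2209)  # expected: 8 "1" and 183 "5", the complement of the training split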

Balancing

# SMOTE with HEOM distance (handles mixed numeric/categorical predictors);
# applied to the training set only, leaving the test set at its natural imbalance
str<-SmoteClassif(R2209~.,str,dist="HEOM")
str$R2209<-ifelse(str$R2209=="1",1,0)  # recode factor levels: "1" -> 1, "5" -> 0
sts$R2209<-ifelse(sts$R2209=="1",1,0)
table(str$R2209)
## 
##   0   1 
## 286 285

Modelling

h2o.init();hll<-as.h2o(str);hmm<-as.h2o(sts)  # start the cluster; push train/test frames to H2O
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         10 minutes 20 seconds 
##     H2O cluster timezone:       Asia/Bangkok 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.38.0.1 
##     H2O cluster version age:    8 months and 24 days !!! 
##     H2O cluster name:           H2O_started_from_R_Fitra_ysu792 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.78 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.2.2 (2022-10-31 ucrt)
## 
glm<-h2o.glm(x=names(hll)[-1],y="R2209",training_frame=hll,nfolds=15,seed=123,
             score_each_iteration=T,keep_cross_validation_predictions=T,
             family="quasibinomial",solver="L_BFGS",lambda_search=T,link="logit")
# note: the object name `glm` shadows stats::glm; calling glm(...) as a function
# later still resolves to stats::glm, but a distinct name would be safer
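
The fitted model can be inspected with standard h2o accessors (a sketch; not shown in the original run):

h2o.coef(glm)              # estimated coefficients on the logit scale
h2o.auc(glm, xval = TRUE)  # cross-validated AUC across the 15 folds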

Confusion Matrix (Training)

cm<-h2o.performance(glm);cm@metrics$cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##        0.0 1.0  Error       Rate
## 0.0    224  62 0.2168 = 62 / 286
## 1.0     20 265 0.0702 = 20 / 285
## Totals 244 327 0.1436 = 82 / 571

Accuracy (Training)

cm@metrics$max_criteria_and_metric_scores[4,"value"]  # row 4 = maximum accuracy over all thresholds
## [1] 0.8563923

Maximum Metrics (Training)

cm@metrics$max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.476357   0.866013 230
## 2                       max f2  0.347318   0.923949 257
## 3                 max f0point5  0.653570   0.865012 167
## 4                 max accuracy  0.477570   0.856392 228
## 5                max precision  0.997385   1.000000   0
## 6                   max recall  0.115436   1.000000 320
## 7              max specificity  0.997385   1.000000   0
## 8             max absolute_mcc  0.476357   0.720695 230
## 9   max min_per_class_accuracy  0.563058   0.835664 197
## 10 max mean_per_class_accuracy  0.476357   0.856521 230
## 11                     max tns  0.997385 286.000000   0
## 12                     max fns  0.997385 281.000000   0
## 13                     max fps  0.000137 286.000000 399
## 14                     max tps  0.115436 285.000000 320
## 15                     max tnr  0.997385   1.000000   0
## 16                     max fnr  0.997385   0.985965   0
## 17                     max fpr  0.000137   1.000000 399
## 18                     max tpr  0.115436   1.000000 320
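
The threshold behind any row of this table can also be extracted directly from the performance object (a sketch; per row 1 above, the F1-optimal threshold is roughly 0.476):

h2o.find_threshold_by_max_metric(cm, "f1")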

Learning Curve Plot

h2o.learning_curve_plot(glm,"auc",cv_ribbon=T,cv_lines=F)

Variable Importance Plot

h2o.permutation_importance_plot(glm,hmm)

Predictions

hp<-as.vector(h2o.predict(glm,hmm)$predict)  # h2o returns class labels "0.0"/"1.0"
HP<-ifelse(hp=="1.0",1,0);HP                 # recode to integer 0/1
##   [1] 0 0 0 1 0 0 1 1 0 0 1 1 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0
##  [38] 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 0 0 0 1 0 0 0 1 0 0 1
## [112] 0 0 0 1 0 0 1 1 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1
## [149] 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0
## [186] 0 1 0 1 0 1

Confusion Matrix (Test)

h<-h2o.performance(glm,hmm);h@metrics$cm$table
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##        0.0 1.0  Error       Rate
## 0.0     86  97 0.5301 = 97 / 183
## 1.0      1   7 0.1250 =    1 / 8
## Totals  87 104 0.5131 = 98 / 191

Accuracy (Test)

h@metrics$max_criteria_and_metric_scores[4,"value"]  # maximum accuracy over thresholds on the test set
## [1] 0.9528796

Maximum Metrics (Test)

h@metrics$max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.152779   0.125000  96
## 2                       max f2  0.152779   0.257353  96
## 3                 max f0point5  0.152779   0.082547  96
## 4                 max accuracy  0.990780   0.952880   0
## 5                max precision  0.152779   0.067308  96
## 6                   max recall  0.002976   1.000000 164
## 7              max specificity  0.990780   0.994536   0
## 8             max absolute_mcc  0.152779   0.138754  96
## 9   max min_per_class_accuracy  0.261984   0.557377  79
## 10 max mean_per_class_accuracy  0.152779   0.672473  96
## 11                     max tns  0.990780 182.000000   0
## 12                     max fns  0.990780   8.000000   0
## 13                     max fps  0.000167 183.000000 181
## 14                     max tps  0.002976   8.000000 164
## 15                     max tnr  0.990780   0.994536   0
## 16                     max fnr  0.990780   1.000000   0
## 17                     max fpr  0.000167   1.000000 181
## 18                     max tpr  0.002976   1.000000 164

Using Base R's optim

Recode Factors to Numeric

# Convert factor columns to their underlying numeric codes for the matrix-based optimizer
str[sapply(str,is.factor)]<-lapply(str[sapply(str,is.factor)],as.numeric)
sts[sapply(sts,is.factor)]<-lapply(sts[sapply(sts,is.factor)],as.numeric)

Define the negative log-likelihood function

negative_log_likelihood <- function(coef, X, y) {
  # Compute the linear combination of features and coefficients
  linear_pred <- X %*% coef
  
  # Compute the predicted probabilities using the logistic function
  predicted_probs <- 1 / (1 + exp(-linear_pred))
  
  # Negative log-likelihood. The "- min(.) + 1" shift is an ad hoc guard against
  # log(0); note it changes the objective, so the optimum is not the exact MLE.
  # A more conventional guard clamps the probabilities (see the sketch below).
  neg_log_likelihood <- -sum(y * log(predicted_probs - min(predicted_probs) + 1) +
                             (1 - y) * log((1 - predicted_probs) - min(1 - predicted_probs) + 1))
  
  return(neg_log_likelihood)
}
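
For reference, here is a minimal sketch of the standard clamped negative log-likelihood (assuming a small tolerance eps; this is the textbook guard, not the objective used in the run above):

negative_log_likelihood_clamped <- function(coef, X, y, eps = 1e-12) {
  p <- 1 / (1 + exp(-X %*% coef))
  p <- pmin(pmax(p, eps), 1 - eps)  # clamp away from exact 0 and 1 before taking logs
  -sum(y * log(p) + (1 - y) * log(1 - p))
}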

Specify the initial values for the coefficients

initial_coef <- matrix(1, 9)  # one starting value per predictor (9 features, no intercept column)

Specify the bounds for the coefficients

lower_bound <- rep(-Inf, 9)  # coefficients unbounded below
upper_bound <- rep(Inf, 9)   # coefficients unbounded above

Define other necessary arguments

X <- as.matrix(str[,-1])       # design matrix: all 9 predictors, no intercept column
X_test <- as.matrix(sts[,-1])  # test-set design matrix
y <- str$R2209                 # binary response labels (0/1)

Use the optim function with method = "L-BFGS-B" to minimize the negative log-likelihood

result <- optim(par = initial_coef, fn = negative_log_likelihood, gr = NULL, method = "L-BFGS-B",
                 X = X, y = y,lower=lower_bound,upper=upper_bound)
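
A convergence check is worth adding here (a sketch; optim reports convergence code 0 on success and a message otherwise):

if (result$convergence != 0) warning("optim did not converge: ", result$message)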

Retrieve the optimized coefficients

optimized_coef <- result$par;optimized_coef
##              [,1]
##  [1,]   -10.42480
##  [2,]    11.96905
##  [3,]  2017.75600
##  [4,]   882.61200
##  [5,] -3127.67329
##  [6,]  1385.68601
##  [7,]   521.05455
##  [8,]  1158.74730
##  [9,] -1611.36948

Prediction Performance

prob<-1/(1+exp(-X_test%*%optimized_coef))  # test-set probabilities from the optimized coefficients
pred<-ifelse(prob>=0.5,1,0)                # 0.5 probability cutoff
akt<-sts$R2209                             # actual labels
MLmetrics::Accuracy(pred,akt)
## [1] 0.7434555

Confusion Matrix

confusionMatrix(as.factor(pred),as.factor(akt))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 139   5
##          1  44   3
##                                           
##                Accuracy : 0.7435          
##                  95% CI : (0.6754, 0.8038)
##     No Information Rate : 0.9581          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0404          
##                                           
##  Mcnemar's Test P-Value : 5.681e-08       
##                                           
##             Sensitivity : 0.75956         
##             Specificity : 0.37500         
##          Pos Pred Value : 0.96528         
##          Neg Pred Value : 0.06383         
##              Prevalence : 0.95812         
##          Detection Rate : 0.72775         
##    Detection Prevalence : 0.75393         
##       Balanced Accuracy : 0.56728         
##                                           
##        'Positive' Class : 0               
## 

Logistic Regression

Modelling

reglog<-glm(R2209~.,data=str,family=binomial(link="logit"));summary(reglog)  # standard binomial logit fit on the balanced training data
## 
## Call:
## glm(formula = R2209 ~ ., family = binomial(link = "logit"), data = str)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2746  -0.8201  -0.0629   0.8934   2.2626  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.703160   1.224296  -0.574   0.5657    
## R1804       -0.001272   0.001529  -0.832   0.4054    
## R1809A      -0.738392   0.408969  -1.805   0.0710 .  
## R1809C       0.681914   0.103023   6.619 3.61e-11 ***
## R2001B       1.480136   0.317735   4.658 3.19e-06 ***
## R1817       -0.805247   0.150593  -5.347 8.93e-08 ***
## R301         0.580363   0.090966   6.380 1.77e-10 ***
## R1811A       0.038300   0.231908   0.165   0.8688    
## R1814A       0.281501   0.111474   2.525   0.0116 *  
## R2001H      -1.299778   0.283100  -4.591 4.41e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 791.57  on 570  degrees of freedom
## Residual deviance: 576.93  on 561  degrees of freedom
## AIC: 596.93
## 
## Number of Fisher Scoring iterations: 5

G Test

G2<-pscl::pR2(reglog)[3];G2  # G2: likelihood-ratio statistic (null deviance minus residual deviance)
## fitting null model for pseudo-r2
##       G2 
## 214.6467
chisq<-qchisq(0.05,reglog$df.null-reglog$df.residual);chisq  # caution: this is the lower 5% point; the usual upper-tail critical value is sketched below
## [1] 3.325113
pvalue<-1-pchisq(G2[[1]],df=reglog$df.null-reglog$df.residual);pvalue
## [1] 0
#G2 > chisq & p-value < 0.05 -> reject H0 (the predictors are jointly significant)
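
As an aside, the conventional critical value uses the upper tail (a sketch; for df = 9 this is about 16.92, and G2 = 214.65 rejects H0 either way):

qchisq(0.95,reglog$df.null-reglog$df.residual)  # upper 5% chi-square quantile, ~16.92 for df = 9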

Wald Test

car::Anova(reglog,type="II",test="Wald")
## Analysis of Deviance Table (Type II tests)
## 
## Response: R2209
##        Df   Chisq Pr(>Chisq)    
## R1804   1  0.6923    0.40537    
## R1809A  1  3.2598    0.07100 .  
## R1809C  1 43.8119  3.615e-11 ***
## R2001B  1 21.7007  3.187e-06 ***
## R1817   1 28.5922  8.934e-08 ***
## R301    1 40.7048  1.771e-10 ***
## R1811A  1  0.0273    0.86883    
## R1814A  1  6.3769    0.01156 *  
## R2001H  1 21.0794  4.406e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Prediction

prp<-predict(reglog,sts)    # note: log-odds (link scale) by default
pr<-ifelse(prp>0.5,1,0);pr  # 0.5 cutoff applied on the log-odds scale (~0.62 in probability)
##   1   3   9  17  18  22  27  28  32  35  42  43  44  54  57  59  61  63  64  69 
##   0   0   0   1   0   0   0   0   0   0   1   0   0   1   1   0   0   0   1   1 
##  71  76  78  83  85  88  94  95  99 101 103 104 112 128 131 138 142 148 151 152 
##   0   1   1   0   0   1   0   0   0   0   0   0   0   0   0   0   1   1   0   0 
## 153 154 156 157 161 164 180 181 187 190 191 201 207 216 217 223 228 229 235 243 
##   0   0   0   0   0   0   0   1   1   1   1   0   0   0   1   1   0   0   0   0 
## 247 249 261 263 265 269 273 274 285 288 290 293 294 295 296 297 305 306 315 322 
##   0   0   0   0   0   0   0   1   0   0   0   1   0   0   0   0   0   0   0   0 
## 334 335 336 340 342 343 344 351 355 362 363 364 367 377 381 382 387 391 398 400 
##   0   0   0   0   0   0   0   0   0   0   0   1   0   1   0   0   0   0   0   0 
## 407 409 415 416 422 427 429 433 441 442 450 457 464 465 468 469 477 479 495 497 
##   0   0   0   1   1   0   0   1   0   0   0   0   0   0   0   1   1   0   1   0 
## 509 511 512 513 518 521 528 529 533 534 539 542 543 549 555 562 564 577 578 580 
##   1   0   0   0   1   0   0   0   0   0   0   0   1   0   1   0   0   0   0   0 
## 587 588 592 600 601 603 607 614 615 617 618 627 630 632 635 636 641 643 644 645 
##   0   1   0   0   0   0   0   1   1   0   0   0   0   0   0   0   0   0   0   0 
## 647 648 649 655 662 664 665 667 670 674 683 689 699 706 715 716 721 723 725 726 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
## 728 738 739 740 745 748 752 754 759 760 762 
##   0   0   0   0   0   0   0   0   0   0   1
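
For probability-scale predictions, the usual idiom is type = "response" (a sketch; a 0.5 probability cutoff would label a few cases differently from the run above):

prp_prob<-predict(reglog,sts,type="response")  # P(R2209 = 1)
pr_prob<-ifelse(prp_prob>0.5,1,0)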

Accuracy

accrl<-Accuracy(pr,sts$R2209);accrl
## [1] 0.7958115

Confusion Matrix

confusionMatrix(as.factor(pr),as.factor(sts$R2209))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 150   6
##          1  33   2
##                                           
##                Accuracy : 0.7958          
##                  95% CI : (0.7316, 0.8506)
##     No Information Rate : 0.9581          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0267          
##                                           
##  Mcnemar's Test P-Value : 3.136e-05       
##                                           
##             Sensitivity : 0.81967         
##             Specificity : 0.25000         
##          Pos Pred Value : 0.96154         
##          Neg Pred Value : 0.05714         
##              Prevalence : 0.95812         
##          Detection Rate : 0.78534         
##    Detection Prevalence : 0.81675         
##       Balanced Accuracy : 0.53484         
##                                           
##        'Positive' Class : 0               
##