Classification of Regional Aid Receipt Status in Kota Bogor, 2020, Using Quasi-Binomial Logistic Regression (L-BFGS)
Members:
- Muhammad Nachnoer Novatron Fitra Arss (G1401201014)
- Alwi Sidiq (G1401201020)
- Dhiya Ulayya Tsabitah (G1401201013)
- Duwi Sulistianingsih (G1401201033)
- Nickyta Shavira Maharani (G1401201037)
- Hanaa Budinur Izdihaar (G1401201076)
Load Packages
lapply(c("caret","vip","readxl","tidyverse","h2o","cvms","DescTools","aod",
"e1071","caTools","UBL","MLmetrics"),library,character.only=T)[[1]]
## [1] "caret" "lattice" "ggplot2" "stats" "graphics" "grDevices"
## [7] "utils" "datasets" "methods" "base"
Pre-process
# dataoss is assumed to have been read in earlier (e.g., via readxl).
# Recode the response R2209 as a factor, then cast every numeric predictor to a
# factor; R1804 is restored afterwards so it stays numeric (continuous).
dataoss$R2209<-as.factor(dataoss$R2209);dataos<-dataoss
dataos[sapply(dataos,is.numeric)]<-lapply(dataos[sapply(dataos,is.numeric)],as.factor)
dataos$R1804<-dataoss$R1804
table(dataos$R2209)
##
## 1 5
## 32 732
Splitting
set.seed(123);i<-createDataPartition(dataos$R2209,p=0.75,list=F) # stratified 75/25 split on the response
str<-dataos[i,];table(str$R2209) # training set (note: `str` masks base::str here)
##
## 1 5
## 24 549
sts<-dataos[-i,] # holdout (test) set
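The holdout keeps the original imbalance (8 minority vs. 183 majority cases, as the holdout confusion matrices later confirm); this can be verified before balancing:
table(sts$R2209) # holdout class counts (untouched by balancing)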
Balancing
str<-SmoteClassif(R2209~.,str,dist="HEOM") # SMOTE-style balancing (UBL); HEOM distance handles mixed factor/numeric predictors
str$R2209<-ifelse(str$R2209=="1",1,0) # recode the response: class "1" -> 1 (positive), class "5" -> 0
sts$R2209<-ifelse(sts$R2209=="1",1,0)
table(str$R2209)
##
## 0 1
## 286 285
Modelling
h2o.init();hll<-as.h2o(str);hmm<-as.h2o(sts) # start the H2O cluster and upload the training/holdout frames
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 10 minutes 20 seconds
## H2O cluster timezone: Asia/Bangkok
## H2O data parsing timezone: UTC
## H2O cluster version: 3.38.0.1
## H2O cluster version age: 8 months and 24 days !!!
## H2O cluster name: H2O_started_from_R_Fitra_ysu792
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.78 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.2.2 (2022-10-31 ucrt)
##
# (naming the model `glm` shadows stats::glm as an object; glm(...) calls later still resolve to the function)
glm<-h2o.glm(x=names(hll)[-1],y="R2209",training_frame=hll,nfolds=15,seed=123,
             score_each_iteration=T,keep_cross_validation_predictions=T,
             family="quasibinomial",solver="L_BFGS",lambda_search=T,link="logit")
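Before scoring, the selected model can be inspected with standard h2o accessors; a minimal sketch, not part of the original output:
h2o.coef(glm)          # coefficients of the model chosen by lambda_search
h2o.std_coef_plot(glm) # standardized coefficient magnitudes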
Confusion Matrix
cm<-h2o.performance(glm);cm@metrics$cm$table # training-data performance; CM reported at the max-F1 threshold
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 0.0 1.0 Error Rate
## 0.0 224 62 0.2168 = 62 / 286
## 1.0 20 265 0.0702 = 20 / 285
## Totals 244 327 0.1436 = 82 / 571
Accuracy
cm@metrics$max_criteria_and_metric_scores[4,"value"] # row 4 = maximum accuracy over all thresholds
## [1] 0.8563923
Maximum Metrics
cm@metrics$max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.476357 0.866013 230
## 2 max f2 0.347318 0.923949 257
## 3 max f0point5 0.653570 0.865012 167
## 4 max accuracy 0.477570 0.856392 228
## 5 max precision 0.997385 1.000000 0
## 6 max recall 0.115436 1.000000 320
## 7 max specificity 0.997385 1.000000 0
## 8 max absolute_mcc 0.476357 0.720695 230
## 9 max min_per_class_accuracy 0.563058 0.835664 197
## 10 max mean_per_class_accuracy 0.476357 0.856521 230
## 11 max tns 0.997385 286.000000 0
## 12 max fns 0.997385 281.000000 0
## 13 max fps 0.000137 286.000000 399
## 14 max tps 0.115436 285.000000 320
## 15 max tnr 0.997385 1.000000 0
## 16 max fnr 0.997385 0.985965 0
## 17 max fpr 0.000137 1.000000 399
## 18 max tpr 0.115436 1.000000 320
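The confusion matrix above is reported at the F1-maximizing threshold (0.476); h2o can return that threshold directly:
h2o.find_threshold_by_max_metric(cm, "f1") # threshold used for the confusion matrix above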
Learning Curve Plot
h2o.learning_curve_plot(glm,"auc",cv_ribbon=T,cv_lines=F)
(plot: learning curve of training and cross-validation AUC)
Variable Importance Plot
h2o.permutation_importance_plot(glm,hmm)
(plot: permutation variable importance computed on the holdout frame)
Predictions
hp<-as.vector(h2o.predict(glm,hmm)$predict);HP<-ifelse(hp=="1.0",1,0);HP # predicted holdout labels, recoded to 0/1
## [1] 0 0 0 1 0 0 1 1 0 0 1 1 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0
## [38] 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 1 1 0 1 1 0 0 1 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 0 0 0 1 0 0 0 1 0 0 1
## [112] 0 0 0 1 0 0 1 1 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1
## [149] 1 0 1 1 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0
## [186] 0 1 0 1 0 1
h<-h2o.performance(glm,hmm);h@metrics$cm$table # holdout performance; CM reported at the max-F1 threshold
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 0.0 1.0 Error Rate
## 0.0 86 97 0.5301 = 97 / 183
## 1.0 1 7 0.1250 = 1 / 8
## Totals 87 104 0.5131 = 98 / 191
h@metrics$max_criteria_and_metric_scores[4,"value"] # maximum achievable accuracy over all thresholds, not the accuracy of the CM above
## [1] 0.9528796
h@metrics$max_criteria_and_metric_scores
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.152779 0.125000 96
## 2 max f2 0.152779 0.257353 96
## 3 max f0point5 0.152779 0.082547 96
## 4 max accuracy 0.990780 0.952880 0
## 5 max precision 0.152779 0.067308 96
## 6 max recall 0.002976 1.000000 164
## 7 max specificity 0.990780 0.994536 0
## 8 max absolute_mcc 0.152779 0.138754 96
## 9 max min_per_class_accuracy 0.261984 0.557377 79
## 10 max mean_per_class_accuracy 0.152779 0.672473 96
## 11 max tns 0.990780 182.000000 0
## 12 max fns 0.990780 8.000000 0
## 13 max fps 0.000167 183.000000 181
## 14 max tps 0.002976 8.000000 164
## 15 max tnr 0.990780 0.994536 0
## 16 max fnr 0.990780 1.000000 0
## 17 max fpr 0.000167 1.000000 181
## 18 max tpr 0.002976 1.000000 164
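Note that 0.9529 is the maximum accuracy over all thresholds, attained near threshold 0.99 where nearly every case is classified 0; the confusion matrix above, taken at the max-F1 threshold, corresponds to an accuracy of 93/191 ≈ 0.487. The accuracy of the predicted labels HP can be checked directly:
mean(HP == sts$R2209) # holdout accuracy of the predicted labels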
Using base R's optim()
Convert Factors to Numeric
str[sapply(str,is.factor)]<-lapply(str[sapply(str,is.factor)],as.numeric)
sts[sapply(sts,is.factor)]<-lapply(sts[sapply(sts,is.factor)],as.numeric)
Define the negative log-likelihood function
negative_log_likelihood <- function(coef, X, y) {
  # Linear combination of features and coefficients
  linear_pred <- X %*% coef
  # Predicted probabilities via the logistic function
  predicted_probs <- 1 / (1 + exp(-linear_pred))
  # Negative log-likelihood. The "- min(...) + 1" offsets shift every log
  # argument into [1, 2] to avoid log(0), but note this distorts the true
  # Bernoulli likelihood (a standard clipped version is sketched below).
  neg_log_likelihood <- -sum(y * log(predicted_probs - min(predicted_probs) + 1) +
                             (1 - y) * log((1 - predicted_probs) - min(1 - predicted_probs) + 1))
  return(neg_log_likelihood)
}
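For reference, a minimal sketch of the standard Bernoulli negative log-likelihood with the probabilities clipped away from 0 and 1 for numerical stability; eps is an illustrative constant, not part of the original analysis:
negative_log_likelihood_std <- function(coef, X, y, eps = 1e-12) {
  p <- 1 / (1 + exp(-(X %*% coef)))
  p <- pmin(pmax(p, eps), 1 - eps) # clip to (0, 1) so log() stays finite
  -sum(y * log(p) + (1 - y) * log(1 - p))
}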
Specify the initial values for the coefficients
initial_coef <- matrix(1, 9) # one starting value per predictor (9 features)
Specify the bounds for the coefficients
lower_bound <- rep(-Inf, 9) # unbounded below
upper_bound <- rep(Inf, 9)  # unbounded above (with infinite bounds, L-BFGS-B reduces to plain L-BFGS)
Define other necessary arguments
X <- as.matrix(str[,-1])      # design matrix: 9 predictors (no intercept column)
X_test <- as.matrix(sts[,-1]) # holdout design matrix
y <- str$R2209                # binary labels (0 or 1)
Use the optim function with method = “L-BFGS-B” to minimize the negative log-likelihood
result <- optim(par = initial_coef, fn = negative_log_likelihood, gr = NULL, method = "L-BFGS-B",
X = X, y = y,lower=lower_bound,upper=upper_bound)
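It is worth checking optim's convergence code before using the estimates; 0 indicates the optimizer reported success:
result$convergence # 0 = converged; 1 = iteration limit reached
result$message     # any diagnostic message from L-BFGS-B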
Retrieve the optimized coefficients
optimized_coef <- result$par;optimized_coef
## [,1]
## [1,] -10.42480
## [2,] 11.96905
## [3,] 2017.75600
## [4,] 882.61200
## [5,] -3127.67329
## [6,] 1385.68601
## [7,] 521.05455
## [8,] 1158.74730
## [9,] -1611.36948
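The objects pred and akt used in the confusion matrix below are not defined above. A plausible reconstruction, assuming holdout probabilities from the optimized coefficients thresholded at 0.5 (the cutoff is an assumption, not taken from the source):
p_test <- 1 / (1 + exp(-(X_test %*% optimized_coef))) # hypothetical reconstruction of holdout probabilities
pred <- ifelse(p_test > 0.5, 1, 0)                    # assumed 0.5 cutoff
akt <- sts$R2209                                      # actual holdout labels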
Confusion Matrix
confusionMatrix(as.factor(pred),as.factor(akt))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 139 5
## 1 44 3
##
## Accuracy : 0.7435
## 95% CI : (0.6754, 0.8038)
## No Information Rate : 0.9581
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0404
##
## Mcnemar's Test P-Value : 5.681e-08
##
## Sensitivity : 0.75956
## Specificity : 0.37500
## Pos Pred Value : 0.96528
## Neg Pred Value : 0.06383
## Prevalence : 0.95812
## Detection Rate : 0.72775
## Detection Prevalence : 0.75393
## Balanced Accuracy : 0.56728
##
## 'Positive' Class : 0
##
Logistic Regression
Modelling
reglog<-glm(R2209~.,data=str,family=binomial(link="logit"));summary(reglog)
##
## Call:
## glm(formula = R2209 ~ ., family = binomial(link = "logit"), data = str)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2746 -0.8201 -0.0629 0.8934 2.2626
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.703160 1.224296 -0.574 0.5657
## R1804 -0.001272 0.001529 -0.832 0.4054
## R1809A -0.738392 0.408969 -1.805 0.0710 .
## R1809C 0.681914 0.103023 6.619 3.61e-11 ***
## R2001B 1.480136 0.317735 4.658 3.19e-06 ***
## R1817 -0.805247 0.150593 -5.347 8.93e-08 ***
## R301 0.580363 0.090966 6.380 1.77e-10 ***
## R1811A 0.038300 0.231908 0.165 0.8688
## R1814A 0.281501 0.111474 2.525 0.0116 *
## R2001H -1.299778 0.283100 -4.591 4.41e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 791.57 on 570 degrees of freedom
## Residual deviance: 576.93 on 561 degrees of freedom
## AIC: 596.93
##
## Number of Fisher Scoring iterations: 5
G Test
G2<-pscl::pR2(reglog)[3];G2
## fitting null model for pseudo-r2
## G2
## 214.6467
chisq<-qchisq(0.95,reglog$df.null-reglog$df.residual);chisq # upper-tail critical value at alpha = 0.05, df = 9
## [1] 16.91898
pvalue<-1-pchisq(G2[[1]],df=reglog$df.null-reglog$df.residual);pvalue
## [1] 0
#G2 > chisq and p-value < 0.05 -> reject H0: the model is significant overall
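As a cross-check, the same likelihood-ratio statistic is the drop in deviance reported in the model summary (791.57 − 576.93 ≈ 214.65):
reglog$null.deviance - reglog$deviance # equals G2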
Wald Test
car::Anova(reglog,type="II",test="Wald")
## Analysis of Deviance Table (Type II tests)
##
## Response: R2209
## Df Chisq Pr(>Chisq)
## R1804 1 0.6923 0.40537
## R1809A 1 3.2598 0.07100 .
## R1809C 1 43.8119 3.615e-11 ***
## R2001B 1 21.7007 3.187e-06 ***
## R1817 1 28.5922 8.934e-08 ***
## R301 1 40.7048 1.771e-10 ***
## R1811A 1 0.0273 0.86883
## R1814A 1 6.3769 0.01156 *
## R2001H 1 21.0794 4.406e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
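Given the significant Wald tests, the coefficients are easier to interpret as odds ratios; a minimal sketch using Wald (confint.default) intervals:
exp(cbind(OR = coef(reglog), confint.default(reglog))) # odds ratios with 95% Wald CIs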
Prediction
# NB: predict() returns the linear predictor (log-odds) by default, so prp > 0.5
# thresholds the log-odds rather than probabilities; use predict(..., type = "response")
# to threshold predicted probabilities at 0.5 instead.
prp<-predict(reglog,sts);pr<-ifelse(prp>0.5,1,0);pr
## 1 3 9 17 18 22 27 28 32 35 42 43 44 54 57 59 61 63 64 69
## 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1
## 71 76 78 83 85 88 94 95 99 101 103 104 112 128 131 138 142 148 151 152
## 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0
## 153 154 156 157 161 164 180 181 187 190 191 201 207 216 217 223 228 229 235 243
## 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 1 0 0 0 0
## 247 249 261 263 265 269 273 274 285 288 290 293 294 295 296 297 305 306 315 322
## 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0
## 334 335 336 340 342 343 344 351 355 362 363 364 367 377 381 382 387 391 398 400
## 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0
## 407 409 415 416 422 427 429 433 441 442 450 457 464 465 468 469 477 479 495 497
## 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0
## 509 511 512 513 518 521 528 529 533 534 539 542 543 549 555 562 564 577 578 580
## 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0
## 587 588 592 600 601 603 607 614 615 617 618 627 630 632 635 636 641 643 644 645
## 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
## 647 648 649 655 662 664 665 667 670 674 683 689 699 706 715 716 721 723 725 726
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 728 738 739 740 745 748 752 754 759 760 762
## 0 0 0 0 0 0 0 0 0 0 1
Accuracy
accrl<-Accuracy(pr,sts$R2209);accrl
## [1] 0.7958115
Confusion Matrix
confusionMatrix(as.factor(pr),as.factor(sts$R2209))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 150 6
## 1 33 2
##
## Accuracy : 0.7958
## 95% CI : (0.7316, 0.8506)
## No Information Rate : 0.9581
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0267
##
## Mcnemar's Test P-Value : 3.136e-05
##
## Sensitivity : 0.81967
## Specificity : 0.25000
## Pos Pred Value : 0.96154
## Neg Pred Value : 0.05714
## Prevalence : 0.95812
## Detection Rate : 0.78534
## Detection Prevalence : 0.81675
## Balanced Accuracy : 0.53484
##
## 'Positive' Class : 0
##