2023-05-19

Load Packages

# Attach every package used by this analysis. Each package is listed once
# ("h2o" does not need to be attached twice) and TRUE is spelled out because
# T is an ordinary, reassignable variable.
# library() returns the character vector of attached packages; [[1]] prints
# the value returned by the first call (the one for "caret").
lapply(
  c("caret", "vip", "readxl", "tidyverse", "h2o", "cvms", "e1071", "caTools",
    "UBL"),
  library,
  character.only = TRUE
)[[1]]
##  [1] "caret"     "lattice"   "ggplot2"   "stats"     "graphics"  "grDevices"
##  [7] "utils"     "datasets"  "methods"   "base"

Input Data

# Read the Excel workbook with readxl and convert the result to a base
# data.frame.
# NOTE(review): the absolute path is specific to one machine.
dataoss <- data.frame(
  read_excel("C:/Users/falco/Downloads/Data OS Penerima Bantuan Daerah.xlsx")
)
# Preview the first rows of the imported data.
head(dataoss)
##   R2209 R1804 R1809A R1809C R2001B R1817 R301 R1811A R1814A R2001H
## 1     5    50      1      1      5     4    3      2      2      1
## 2     5    35      1      1      1     4    4      1      4      1
## 3     5    70      1      1      1     4    2      2      2      5
## 4     5    60      1      1      1     4    4      2      2      1
## 5     5   239      1      1      1     3    3      1      2      1
## 6     5   100      1      1      5     4    4      1      5      1

Pre-process

# Store the response R2209 as a factor in the raw data, then work on a copy.
dataoss$R2209 <- as.factor(dataoss$R2209)
dataos <- dataoss
# Turn every numeric column of the copy into a factor; other columns are
# passed through unchanged.
dataos[] <- lapply(dataos, function(column) {
  if (is.numeric(column)) as.factor(column) else column
})
# R1804 is restored from the raw data so that it stays numeric.
dataos$R1804 <- dataoss$R1804
# Class counts of the response before balancing.
table(dataos$R2209)
## 
##   1   5 
##  32 732

Balancing

# SmoteClassif resamples at random, so the seed is fixed first to make the
# balanced data set (and everything fitted on it) reproducible.
set.seed(123)
# Balance the classes of R2209 with UBL's SMOTE, using the HEOM distance.
# NOTE(review): balancing is done before the train/test split, so synthetic
# rows may be built from observations that later land in the hold-out set —
# confirm whether resampling should be limited to the training partition.
dataos <- SmoteClassif(R2209 ~ ., dataos, dist = "HEOM")
# Recode the response to numeric: 1 for original class "1", 0 otherwise.
dataos$R2209 <- ifelse(dataos$R2209 == "1", 1, 0)
# Class counts of the response after balancing.
table(dataos$R2209)
## 
##   0   1 
## 382 382

Splitting

# Draw a seeded 75% partition based on the response R2209.
set.seed(123)
i <- createDataPartition(dataos$R2209, p = 0.75, list = FALSE)
# Rows not selected by the partition form the hold-out set ...
sts <- dataos[-i, ]
# ... and the selected rows form the training set.
# NOTE(review): the name `str` masks utils::str() in this session.
str <- dataos[i, ]

Modelling

# Connect to the H2O cluster.
h2o.init()
# Upload the training partition (`str`) and the hold-out partition (`sts`)
# as H2O frames.
hll <- as.h2o(str)
hmm <- as.h2o(sts)
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         3 hours 26 minutes 
##     H2O cluster timezone:       Asia/Bangkok 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.38.0.1 
##     H2O cluster version age:    7 months and 29 days !!! 
##     H2O cluster name:           H2O_started_from_R_Fitra_ilh682 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.59 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.2.2 (2022-10-31 ucrt)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Fit a logit-link quasibinomial GLM for R2209 on the training frame with
# 15-fold cross-validation, L-BFGS solver and lambda search.
# Predictors are selected by name (every column except the response) rather
# than by dropping the first column, so the call no longer depends on R2209
# being in position 1.
# NOTE(review): the name `glm` masks stats::glm() in this session.
glm <- h2o.glm(
  x = setdiff(names(hll), "R2209"),
  y = "R2209",
  training_frame = hll,
  nfolds = 15,
  seed = 123,
  score_each_iteration = TRUE,
  keep_cross_validation_predictions = TRUE,
  family = "quasibinomial",
  solver = "L_BFGS",
  lambda_search = TRUE,
  link = "logit"
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   2%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |======================================================================| 100%

Confusion Matrix [Training]

# Performance metrics of the fitted model (no new data is supplied here).
cm <- h2o.performance(glm)
# Confusion matrix table stored in the metrics object.
cm@metrics[["cm"]][["table"]]
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##        0.0 1.0  Error        Rate
## 0.0    210  77 0.2683 =  77 / 287
## 1.0     40 247 0.1394 =  40 / 287
## Totals 250 324 0.2038 = 117 / 574

AUC [Training]

# AUC stored in the metrics object `cm`.
cm@metrics[["AUC"]]
## [1] 0.8701393

Maximum Metrics [Training]

# Table of the maximum value of each metric and the threshold where it occurs.
cm@metrics[["max_criteria_and_metric_scores"]]
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold      value idx
## 1                       max f1  0.481661   0.808511 210
## 2                       max f2  0.361903   0.886158 264
## 3                 max f0point5  0.546462   0.806330 170
## 4                 max accuracy  0.486330   0.796167 209
## 5                max precision  0.994034   1.000000   0
## 6                   max recall  0.050544   1.000000 367
## 7              max specificity  0.994034   1.000000   0
## 8             max absolute_mcc  0.481661   0.597319 210
## 9   max min_per_class_accuracy  0.532453   0.790941 183
## 10 max mean_per_class_accuracy  0.486330   0.796167 209
## 11                     max tns  0.994034 287.000000   0
## 12                     max fns  0.994034 286.000000   0
## 13                     max fps  0.001618 287.000000 399
## 14                     max tps  0.050544 287.000000 367
## 15                     max tnr  0.994034   1.000000   0
## 16                     max fnr  0.994034   0.996516   0
## 17                     max fpr  0.001618   1.000000 399
## 18                     max tpr  0.050544   1.000000 367

Learning Curve Plot

# Learning curve of the model on the AUC metric; cross-validation results are
# drawn as a ribbon instead of individual lines.
h2o.learning_curve_plot(
  glm,
  metric = "auc",
  cv_ribbon = TRUE,
  cv_lines = FALSE
)

Variable Importance Plot

# Permutation variable importance of the model, evaluated on the hold-out
# frame. Permutation is random, so a fixed seed is passed to make the plot
# reproducible between runs.
h2o.permutation_importance_plot(glm, newdata = hmm, seed = 123)

Predictions

# Predicted class labels for the hold-out frame, pulled into an R vector.
hp <- as.vector(h2o.predict(glm, hmm)$predict)
# Recode the labels to numeric 1/0. The label is parsed as a number instead
# of being compared with the exact string "1.0", so the recoding does not
# silently turn into all zeros if the label is formatted differently
# (for example "1").
HP <- ifelse(as.numeric(as.character(hp)) == 1, 1, 0)
HP
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
##   [1] 0 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0
##  [75] 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 0 0 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1
## [186] 1 1 1 1 1

Confusion Matrix [Prediction]

# Performance metrics of the model evaluated on the hold-out frame.
h <- h2o.performance(glm, newdata = hmm)
# Confusion matrix table stored in the metrics object.
h@metrics[["cm"]][["table"]]
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##        0.0 1.0  Error       Rate
## 0.0     64  31 0.3263 =  31 / 95
## 1.0      6  89 0.0632 =   6 / 95
## Totals  70 120 0.1947 = 37 / 190

AUC [Prediction]

# AUC stored in the hold-out metrics object `h`.
h@metrics[["AUC"]]
## [1] 0.866482

Maximum Metrics [Prediction]

# Table of the maximum value of each metric and the threshold where it occurs,
# taken from the hold-out metrics object `h`.
h@metrics[["max_criteria_and_metric_scores"]]
## Maximum Metrics: Maximum metrics at their respective thresholds
##                         metric threshold     value idx
## 1                       max f1  0.299752  0.827907 118
## 2                       max f2  0.272544  0.897436 125
## 3                 max f0point5  0.504486  0.806100  89
## 4                 max accuracy  0.376026  0.810526 111
## 5                max precision  0.994672  1.000000   0
## 6                   max recall  0.027103  1.000000 173
## 7              max specificity  0.994672  1.000000   0
## 8             max absolute_mcc  0.299752  0.632832 118
## 9   max min_per_class_accuracy  0.444822  0.789474  92
## 10 max mean_per_class_accuracy  0.376026  0.810526 111
## 11                     max tns  0.994672 95.000000   0
## 12                     max fns  0.994672 94.000000   0
## 13                     max fps  0.002740 95.000000 188
## 14                     max tps  0.027103 95.000000 173
## 15                     max tnr  0.994672  1.000000   0
## 16                     max fnr  0.994672  0.989474   0
## 17                     max fpr  0.002740  1.000000 188
## 18                     max tpr  0.027103  1.000000 173