Load Packages
# Attach every package used by the analysis. library() is applied to each
# name as a string (character.only = TRUE); "[[1]]" prints the value returned
# by the first call, i.e. the one that attaches "caret".
lapply(
  c("caret", "vip", "readxl", "tidyverse", "h2o", "cvms", "e1071",
    "caTools", "UBL"),
  library,
  character.only = TRUE
)[[1]]
## [1] "caret" "lattice" "ggplot2" "stats" "graphics" "grDevices"
## [7] "utils" "datasets" "methods" "base"
Pre-process
# Encode the response R2209 as a factor on the raw data, then work on a copy.
dataoss$R2209 <- as.factor(dataoss$R2209)
dataos <- dataoss
# Convert every numeric column of the copy to a factor. vapply() guarantees a
# logical mask with one entry per column.
numeric_cols <- vapply(dataos, is.numeric, logical(1))
dataos[numeric_cols] <- lapply(dataos[numeric_cols], as.factor)
# R1804 is the exception: it is restored unchanged from the raw data.
dataos$R1804 <- dataoss$R1804
# Class counts of the response before balancing.
table(dataos$R2209)
##
## 1 5
## 32 732
Balancing
# Balance the classes of R2209 with SMOTE (SmoteClassif, HEOM distance).
# The seed is fixed first so the generated cases are reproducible across runs.
# NOTE(review): balancing is done before the train/test split, so synthetic
# cases built from rows that later land in the test set can end up in the
# training set -- consider splitting first and balancing only the training
# partition.
set.seed(123)
dataos <- SmoteClassif(R2209 ~ ., dataos, dist = "HEOM")
# Recode the response to numeric: 1 for class "1", 0 for every other class.
dataos$R2209 <- as.numeric(dataos$R2209 == "1")
# Class counts of the response after balancing.
table(dataos$R2209)
##
## 0 1
## 382 382
Splitting
# Draw a reproducible 75% partition on the response; the selected rows form
# the training set and the remaining rows the test set.
set.seed(123)
i <- createDataPartition(dataos$R2209, p = 0.75, list = FALSE)
# NOTE: the name `str` shadows utils::str() for the rest of the session.
str <- dataos[i, ]
sts <- dataos[-i, ]
Modelling
# Connect to (or start) the H2O cluster, then upload both partitions:
# hll is the training frame, hmm the test frame.
h2o.init()
hll <- as.h2o(str)
hmm <- as.h2o(sts)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 3 hours 26 minutes
## H2O cluster timezone: Asia/Bangkok
## H2O data parsing timezone: UTC
## H2O cluster version: 3.38.0.1
## H2O cluster version age: 7 months and 29 days !!!
## H2O cluster name: H2O_started_from_R_Fitra_ilh682
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.59 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## R Version: R version 4.2.2 (2022-10-31 ucrt)
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
# Fit a quasibinomial GLM (logit link, L-BFGS solver, lambda search) on the
# training frame with 15-fold cross-validation, keeping the fold predictions.
# Predictors are chosen by name (every column except the response) rather
# than by position, so the call stays correct wherever R2209 sits in the frame.
# NOTE: the object name `glm` shadows stats::glm() for the rest of the session.
glm <- h2o.glm(
  x = setdiff(names(hll), "R2209"),
  y = "R2209",
  training_frame = hll,
  nfolds = 15,
  seed = 123,
  score_each_iteration = TRUE,
  keep_cross_validation_predictions = TRUE,
  family = "quasibinomial",
  solver = "L_BFGS",
  lambda_search = TRUE,
  link = "logit"
)
##
|
| | 0%
|
|== | 2%
|
|============= | 19%
|
|============== | 20%
|
|========================== | 37%
|
|================================ | 46%
|
|============================================ | 63%
|
|================================================== | 72%
|
|============================================================== | 88%
|
|================================================================ | 92%
|
|======================================================================| 100%
Confusion Matrix
# Performance object of the fitted model; its confusion-matrix table is shown.
# NOTE(review): no new data is passed, so these look like metrics on the
# training frame rather than cross-validated ones -- confirm this is intended.
cm <- h2o.performance(glm)
cm@metrics[["cm"]][["table"]]
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 0.0 1.0 Error Rate
## 0.0 210 77 0.2683 = 77 / 287
## 1.0 40 247 0.1394 = 40 / 287
## Totals 250 324 0.2038 = 117 / 574
AUC
# AUC stored in the performance object computed above.
cm@metrics[["AUC"]]
## [1] 0.8701393
Maximum Metrics
# Table of each metric's maximum value and the threshold at which it occurs.
cm@metrics[["max_criteria_and_metric_scores"]]
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.481661 0.808511 210
## 2 max f2 0.361903 0.886158 264
## 3 max f0point5 0.546462 0.806330 170
## 4 max accuracy 0.486330 0.796167 209
## 5 max precision 0.994034 1.000000 0
## 6 max recall 0.050544 1.000000 367
## 7 max specificity 0.994034 1.000000 0
## 8 max absolute_mcc 0.481661 0.597319 210
## 9 max min_per_class_accuracy 0.532453 0.790941 183
## 10 max mean_per_class_accuracy 0.486330 0.796167 209
## 11 max tns 0.994034 287.000000 0
## 12 max fns 0.994034 286.000000 0
## 13 max fps 0.001618 287.000000 399
## 14 max tps 0.050544 287.000000 367
## 15 max tnr 0.994034 1.000000 0
## 16 max fnr 0.994034 0.996516 0
## 17 max fpr 0.001618 1.000000 399
## 18 max tpr 0.050544 1.000000 367
Learning Curve Plot
# Learning curve of the model on AUC, drawing the cross-validation ribbon
# and hiding the individual cross-validation lines.
h2o.learning_curve_plot(
  glm,
  metric = "auc",
  cv_ribbon = TRUE,
  cv_lines = FALSE
)

Variable Importance Plot
# Permutation-based variable importance of the model, evaluated on the
# test frame.
h2o.permutation_importance_plot(glm, newdata = hmm)

Predictions
# Score the test frame and pull the predicted class labels back into R.
hp <- as.vector(h2o.predict(glm, hmm)$predict)
# Convert the labels to 0/1 by numeric comparison, so the result does not
# depend on the positive label being formatted exactly as "1.0".
HP <- as.numeric(as.numeric(as.character(hp)) == 1)
HP
##
|
| | 0%
|
|======================================================================| 100%
## [1] 0 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0
## [38] 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0
## [75] 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 0 0 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1
## [186] 1 1 1 1 1
Confusion Matrix [Prediction]
# Performance of the model on the test frame; its confusion-matrix table
# is shown.
h <- h2o.performance(glm, hmm)
h@metrics[["cm"]][["table"]]
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 0.0 1.0 Error Rate
## 0.0 64 31 0.3263 = 31 / 95
## 1.0 6 89 0.0632 = 6 / 95
## Totals 70 120 0.1947 = 37 / 190
AUC [Prediction]
# AUC of the model on the test frame.
h@metrics[["AUC"]]
## [1] 0.866482
Maximum Metrics [Prediction]
# Table of each metric's maximum value on the test frame and the threshold
# at which it occurs.
h@metrics[["max_criteria_and_metric_scores"]]
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.299752 0.827907 118
## 2 max f2 0.272544 0.897436 125
## 3 max f0point5 0.504486 0.806100 89
## 4 max accuracy 0.376026 0.810526 111
## 5 max precision 0.994672 1.000000 0
## 6 max recall 0.027103 1.000000 173
## 7 max specificity 0.994672 1.000000 0
## 8 max absolute_mcc 0.299752 0.632832 118
## 9 max min_per_class_accuracy 0.444822 0.789474 92
## 10 max mean_per_class_accuracy 0.376026 0.810526 111
## 11 max tns 0.994672 95.000000 0
## 12 max fns 0.994672 94.000000 0
## 13 max fps 0.002740 95.000000 188
## 14 max tps 0.027103 95.000000 173
## 15 max tnr 0.994672 1.000000 0
## 16 max fnr 0.994672 0.989474 0
## 17 max fpr 0.002740 1.000000 188
## 18 max tpr 0.027103 1.000000 173