1 H2O package

1.1 packages

library(h2o)
library(tidyverse)
library(caret)

1.2 start h2o & convert

# Start the H2O cluster and connect this R session to it.
# invisible() keeps the value returned by h2o.init() from being auto-printed.
# If start-up fails: install 64-bit Java in the folder location H2O asks for.
invisible(h2o.init())

## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\user\AppData\Local\Temp\RtmpO2ogjq/h2o_user_started_from_r.out
##     C:\Users\user\AppData\Local\Temp\RtmpO2ogjq/h2o_user_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: . Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         6 seconds 121 milliseconds 
##     H2O cluster timezone:       Asia/Seoul 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.28.0.4 
##     H2O cluster version age:    1 month and 13 days  
##     H2O cluster name:           H2O_started_from_R_user_pje900 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.97 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.2 (2019-12-12)

data(iris)

# Fix the RNG seed so the random train/test split (and everything derived
# from it) is the same on every run; the AutoML call already uses seed = 1.
set.seed(1)

# Draw a group label for every row: 1 = train (p = 0.7), 2 = test (p = 0.3).
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
train <- iris[ind == 1, ]
test <- iris[ind == 2, ]

# Show the resulting sizes (rows, columns) of both partitions.
dim(train)
dim(test)

## [1] 113   5

## [1] 37  5

# Convert the training data.frame into an H2OFrame, the data type the
# H2O functions below operate on.
train_h <- as.h2o(train)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Convert the held-out test data.frame into an H2OFrame as well.
test_h <- as.h2o(test)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Inspect the H2OFrame: its attributes, column types and a preview of the data.
str(train_h)

## Class 'H2OFrame' <environment: 0x0000000026387b50> 
##  - attr(*, "op")= chr "Parse"
##  - attr(*, "id")= chr "train_sid_bdc6_1"
##  - attr(*, "eval")= logi FALSE
##  - attr(*, "nrow")= int 113
##  - attr(*, "ncol")= int 5
##  - attr(*, "types")=List of 5
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "real"
##   ..$ : chr "enum"
##  - attr(*, "data")='data.frame': 10 obs. of  5 variables:
##   ..$ Sepal.Length: num  4.9 4.7 4.6 5 4.6 5 4.4 4.9 5.4 4.8
##   ..$ Sepal.Width : num  3 3.2 3.1 3.6 3.4 3.4 2.9 3.1 3.7 3
##   ..$ Petal.Length: num  1.4 1.3 1.5 1.4 1.4 1.5 1.4 1.5 1.5 1.4
##   ..$ Petal.Width : num  0.2 0.2 0.2 0.2 0.3 0.2 0.2 0.1 0.2 0.1
##   ..$ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1

# Response column name, and the predictor names (every other column of train).
y <- "Species"
pred <- setdiff(colnames(train), y)
print(y)

## [1] "Species"

# Make sure the response is categorical so the task is treated as
# classification.
# The R data.frames are converted for the later caret/automl steps ...
train[[y]] <- as.factor(train[[y]])
test[[y]] <- as.factor(test[[y]])
# ... and the H2O frames are converted too: they were created before this
# point, so changing only the data.frames would never reach the data that
# AutoML is actually trained and scored on.
train_h[, y] <- as.factor(train_h[, y])
test_h[, y] <- as.factor(test_h[, y])

1.3 Run AutoML

# Run AutoML on the training frame.
#   x: names of the predictor variables
#   y: name of the response variable
# The search is capped at 20 models and a 20-second runtime budget.
aml <- h2o.automl(
  x = pred,
  y = y,
  training_frame = train_h,
  max_models = 20,
  max_runtime_secs = 20,
  seed = 1
)

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   2%
## 14:30:45.626: AutoML: XGBoost is not available; skipping it.
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |===============================                                       |  45%
## 14:30:55.297: Skipping training of model GBM_5_AutoML_20200406_143045 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20200406_143045.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 113.0.
## 
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |======================================================================| 100%

# Generic summary of the AutoML result; for this H2OAutoML (S4) object it
# only reports length, class and mode.
summary(aml)

##    Length     Class      Mode 
##         1 H2OAutoML        S4

# AutoML leaderboard: one row per trained model with its performance metrics.
lb <- slot(aml, "leaderboard")
print(lb)

##                                              model_id mean_per_class_error
## 1                        DRF_1_AutoML_20200406_143045           0.02706767
## 2                        GBM_1_AutoML_20200406_143045           0.03583960
## 3          GBM_grid__1_AutoML_20200406_143045_model_6           0.03583960
## 4                        GBM_4_AutoML_20200406_143045           0.03659148
## 5          GBM_grid__1_AutoML_20200406_143045_model_5           0.03659148
## 6 StackedEnsemble_BestOfFamily_AutoML_20200406_143045           0.03659148
##     logloss      rmse        mse
## 1 0.0867549 0.1602558 0.02568193
## 2 0.1235628 0.1761163 0.03101696
## 3 0.2905164 0.1899214 0.03607015
## 4 0.0953745 0.1610637 0.02594153
## 5 1.0466614 0.6488654 0.42102635
## 6 0.2387510 0.2312317 0.05346812
## 
## [19 rows x 5 columns]

1.4 prediction & result

# Score the held-out test set with the leading AutoML model.
# The predictor columns are selected by name (pred, the same vector used for
# training) instead of dropping a hard-coded column position, so this keeps
# working if the column order of the data changes.
prediction <- h2o.predict(aml@leader, test_h[, pred]) %>%
  as.data.frame()

## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%

# Create a confusion matrix.
# caret expects the predicted classes as `data` and the true classes as
# `reference`; passing them by name avoids swapping the two roles (which would
# mislabel the table and exchange sensitivity with positive predictive value).
# The predictions are re-levelled to the levels of the true labels so both
# factors share the same level set even if a class is never predicted.
caret::confusionMatrix(
  data = factor(prediction$predict, levels = levels(test$Species)),
  reference = test$Species
)

## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         11         1
##   virginica       0          1        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9459          
##                  95% CI : (0.8181, 0.9934)
##     No Information Rate : 0.4054          
##     P-Value [Acc > NIR] : 4.618e-12       
##                                           
##                   Kappa : 0.9178          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9167           0.9333
## Specificity                 1.0000            0.9600           0.9545
## Pos Pred Value              1.0000            0.9167           0.9333
## Neg Pred Value              1.0000            0.9600           0.9545
## Prevalence                  0.2703            0.3243           0.4054
## Detection Rate              0.2703            0.2973           0.3784
## Detection Prevalence        0.2703            0.3243           0.4054
## Balanced Accuracy           1.0000            0.9383           0.9439

# Shut down the H2O cluster without asking for interactive confirmation.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
h2o.shutdown(prompt = FALSE)

## [1] TRUE

2 automl Package

The automl package is available on CRAN. It fits models ranging from simple regression to highly customizable deep neural networks, trained either with gradient descent or with a metaheuristic, and supports automatic hyperparameter tuning and custom cost functions. Its approach is a mix inspired by common Deep Learning tricks and Particle Swarm Optimization. Below is sample code showing how to use it in R.

2.1 packages

library(automl)

2.2 modeling amlmodel

# Train an automl model with manually specified hyper-parameters.
#   Xref: every column of train except Species
#   Yref: Species encoded as its numeric level code
amlmodel <- automl_train_manual(
  Xref = train[, names(train) != "Species", drop = FALSE],
  Yref = as.numeric(train$Species),
  hpar = list(
    learningrate = 0.01,
    minibatchsize = 2^2,
    numiterations = 100
  )
)

## (cost: mse)
## cost epoch10: 0.0138984111953427 (cv cost: 0.0371361326799912) (LR:  0.01 ) 
## cost epoch20: 0.0346146875590912 (cv cost: 0.0275259242254334) (LR:  0.01 ) 
## cost epoch30: 0.0200165412181943 (cv cost: 0.025126128933467) (LR:  0.01 ) 
## cost epoch40: 0.0153757259338059 (cv cost: 0.0252914182311191) (LR:  0.01 ) 
## cost epoch50: 0.0159535801066202 (cv cost: 0.0256464356209419) (LR:  0.01 ) 
## cost epoch60: 0.0180796810302528 (cv cost: 0.0264267550807272) (LR:  0.01 ) 
## cost epoch70: 0.0196639881138745 (cv cost: 0.0263238127972592) (LR:  0.01 ) 
## cost epoch80: 0.020272788827965 (cv cost: 0.0267909690501396) (LR:  0.01 ) 
## cost epoch90: 0.020742072177992 (cv cost: 0.0267062962575585) (LR:  0.01 ) 
## cost epoch100: 0.0209857691292358 (cv cost: 0.0266518382480363) (LR:  0.01 ) 
##    dim X: [4,102]
##    dim W1: [10,4] (min|max: -1.63519364947803, 1.95469902227911)
##    dim bB1: [10,1] (min|max: -0.654786433868327, 0.3323745421597)
##    dim W2: [1,10] (min|max: -0.308971702106429, 0.36554644154666)
##    dim bB2: [1,1] (min|max: 0.653129607145688, 0.653129607145688)
##    dim Y: [1,102]

2.3 prediction & result

# Predict on the first four columns of the test set.
prediction <- automl_predict(model = amlmodel, X = test[, 1:4])

# Bin the continuous model output into the class codes 1, 2 and 3 using the
# cut points 1.5 and 2.5. The level set is fixed to 1:3 so the factor keeps
# all three classes even when one of them is never predicted; otherwise the
# levels would not match the true labels in the confusion matrix.
prediction <- factor(
  ifelse(prediction > 2.5, 3, ifelse(prediction > 1.5, 2, 1)),
  levels = 1:3
)
prediction

##  [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## Levels: 1 2 3

# The types differ: test$Species holds species names while prediction holds
# numeric class codes, so the true labels are converted to their level codes.
# caret expects the predictions as `data` and the true classes as `reference`;
# both are given the same fixed level set so the comparison cannot fail when a
# class is missing from either side.
confusionMatrix(
  data = factor(prediction, levels = seq_len(nlevels(test$Species))),
  reference = factor(as.numeric(test$Species),
                     levels = seq_len(nlevels(test$Species)))
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 10  0  0
##          2  0 11  1
##          3  0  0 15
## 
## Overall Statistics
##                                           
##                Accuracy : 0.973           
##                  95% CI : (0.8584, 0.9993)
##     No Information Rate : 0.4324          
##     P-Value [Acc > NIR] : 1.675e-12       
##                                           
##                   Kappa : 0.9588          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   1.0000   0.9375
## Specificity            1.0000   0.9615   1.0000
## Pos Pred Value         1.0000   0.9167   1.0000
## Neg Pred Value         1.0000   1.0000   0.9545
## Prevalence             0.2703   0.2973   0.4324
## Detection Rate         0.2703   0.2973   0.4054
## Detection Prevalence   0.2703   0.3243   0.4054
## Balanced Accuracy      1.0000   0.9808   0.9688

autoML

updragon

2020 4 6