library(h2o)
h2o.init()
 Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         1 days 15 minutes 
    H2O cluster version:        3.14.0.3 
    H2O cluster version age:    12 days  
    H2O cluster name:           H2O_started_from_R_r631758_lcl606 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   3.16 GB 
    H2O cluster total cores:    8 
    H2O cluster allowed cores:  8 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
    R Version:                  R version 3.4.2 (2017-09-28) 
# Start from a clean cluster: drop any objects left over from earlier work
# in this (already running) H2O instance before loading new data.
h2o.removeAll()
[1] 0

load sample dataset

# Pull the Titanic passenger CSV from H2O's public test-data bucket into the
# cluster; `titanic` is an H2OFrame handle rather than a local data.frame.
titanic <- h2o.importFile(
  path = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
)

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |=====================================================================================| 100%
dim(titanic)
[1] 1309   14
# Inspect the first and last rows, then print per-column summary statistics.
head(titanic)
tail(titanic)
# Use TRUE rather than T: `T` is an ordinary variable that can be reassigned,
# which would silently change the meaning of this argument.
summary(titanic, exact_quantiles = TRUE)
 pclass          survived        name sex         age               sibsp           
 Min.   :1.000   Min.   :0.000        male  :843  Min.   : 0.1667   Min.   :0.0000  
 1st Qu.:2.000   1st Qu.:0.000        female:466  1st Qu.:21.0000   1st Qu.:0.0000  
 Median :3.000   Median :0.000                    Median :28.0000   Median :0.0000  
 Mean   :2.295   Mean   :0.382                    Mean   :29.8811   Mean   :0.4989  
 3rd Qu.:3.000   3rd Qu.:1.000                    3rd Qu.:39.0000   3rd Qu.:1.0000  
 Max.   :3.000   Max.   :1.000                    Max.   :80.0000   Max.   :8.0000  
                                                  NA's   :263                       
 parch           ticket            fare              cabin                 embarked
 Min.   :0.000   Min.   :    680   Min.   :  0.000   C23 C25 C27    :   6  S :914  
 1st Qu.:0.000   1st Qu.:  19950   1st Qu.:  7.896   B57 B59 B63 B66:   5  C :270  
 Median :0.000   Median : 234604   Median : 14.454   G6             :   5  Q :123  
 Mean   :0.385   Mean   : 249039   Mean   : 33.295   B96 B98        :   4  NA:  2  
 3rd Qu.:0.000   3rd Qu.: 347468   3rd Qu.: 31.275   C22 C26        :   4          
 Max.   :9.000   Max.   :3101298   Max.   :512.329   C78            :   4          
                 NA's   :352       NA's   :1         NA             :1014          
 boat             body            home.dest                
 Min.   : 1.000   Min.   :  1.0   New York  NY        : 64 
 1st Qu.: 5.000   1st Qu.: 72.0   London              : 14 
 Median :10.000   Median :155.0   Montreal  PQ        : 10 
 Mean   : 9.405   Mean   :160.8   Cornwall / Akron  OH:  9 
 3rd Qu.:13.000   3rd Qu.:256.0   Paris  France       :  9 
 Max.   :16.000   Max.   :328.0   Philadelphia  PA    :  8 
 NA's   :911      NA's   :1188    NA                  :564 
str(titanic)
Class 'H2OFrame' <environment: 0x0000000020321850> 
 - attr(*, "op")= chr "Parse"
 - attr(*, "id")= chr "Key_Frame__http___s3_amazonaws_com_h2o_public_test_data_smalldata_gbm_test_titanic.hex_sid_888a_41"
 - attr(*, "eval")= logi FALSE
 - attr(*, "nrow")= int 1309
 - attr(*, "ncol")= int 14
 - attr(*, "types")=List of 14
  ..$ : chr "int"
  ..$ : chr "int"
  ..$ : chr "string"
  ..$ : chr "enum"
  ..$ : chr "real"
  ..$ : chr "int"
  ..$ : chr "int"
  ..$ : chr "int"
  ..$ : chr "real"
  ..$ : chr "enum"
  ..$ : chr "enum"
  ..$ : chr "int"
  ..$ : chr "int"
  ..$ : chr "enum"
 - attr(*, "data")='data.frame':    10 obs. of  14 variables:
  ..$ pclass   : num  1 1 1 1 1 1 1 1 1 1
  ..$ survived : num  1 1 0 0 0 1 1 0 1 0
  ..$ name     : chr  "Allen  Miss. Elisabeth Walton" "Allison  Master. Hudson Trevor" "Allison  Miss. Helen Loraine" "Allison  Mr. Hudson Joshua Creighton" ...
  ..$ sex      : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2
  ..$ age      : num  29 0.917 2 30 25 ...
  ..$ sibsp    : num  0 1 1 1 1 0 1 0 2 0
  ..$ parch    : num  0 2 2 2 2 0 0 0 0 0
  ..$ ticket   : num  24160 113781 113781 113781 113781 ...
  ..$ fare     : num  211 152 152 152 152 ...
  ..$ cabin    : Factor w/ 186 levels "A10","A11","A14",..: 44 80 80 80 80 150 146 16 62 NA
  ..$ embarked : Factor w/ 3 levels "C","Q","S": 3 3 3 3 3 3 3 3 3 1
  ..$ boat     : num  2 11 NaN NaN NaN 3 10 NaN NaN NaN
  ..$ body     : num  NaN NaN NaN 135 NaN NaN NaN NaN NaN 22
  ..$ home.dest: Factor w/ 369 levels "?Havana  Cuba",..: 309 231 231 231 231 237 162 24 22 229

set response and predictors

# Convert the 0/1 outcome to a factor so the model is fit as a binomial
# classifier instead of a regression on a numeric column.
titanic$survived <- as.factor(titanic$survived)
response <- "survived"

# Use every column except the response and the free-text passenger name.
# Excluding by name (rather than by column position) keeps this correct if
# the column order of the input file ever changes.
# NOTE(review): `boat` and `body` look like they may only be known after the
# outcome -- confirm they are not leaking the response into the predictors.
predictors <- setdiff(colnames(titanic), c(response, "name"))
predictors
 [1] "pclass"    "sex"       "age"       "sibsp"     "parch"     "ticket"    "fare"     
 [8] "cabin"     "embarked"  "boat"      "body"      "home.dest"

split data

# Randomly partition the frame into roughly 60% / 20% / 20%; the third
# destination frame receives whatever is left after the two listed ratios.
# The fixed seed makes the partition repeatable.
splits <- h2o.splitFrame(
  data = titanic,
  ratios = c(0.6, 0.2),
  destination_frames = c("train.hex", "valid.hex", "test.hex"),
  seed = 1234
)
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]

start with a very basic model (all GBM settings left at their defaults)

# Baseline GBM: only the predictors, response and training frame are given.
model.gbm1 <- h2o.gbm(
  x = predictors,
  y = response,
  training_frame = train
)

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |=================                                                                    |  20%
  |                                                                                           
  |=====================================================================================| 100%
 model.gbm1
Model Details:
==============

H2OBinomialModel: gbm
Model ID:  GBM_model_R_1507141024239_18396 
Model Summary: 
  number_of_trees number_of_internal_trees model_size_in_bytes min_depth max_depth mean_depth
1              50                       50               22845         2         5    4.94000
  min_leaves max_leaves mean_leaves
1          3         21    13.02000


H2OBinomialMetrics: gbm
** Reported on training data. **

MSE:  0.02096719
RMSE:  0.1448005
LogLoss:  0.08788473
Mean Per-Class Error:  0.02596078
AUC:  0.9960535
Gini:  0.992107

Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
         0   1    Error     Rate
0      478   1 0.002088   =1/479
1       15 286 0.049834  =15/301
Totals 493 287 0.020513  =16/780

Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.499288 0.972789 165
2                       max f2  0.140574 0.970684 191
3                 max f0point5  0.499288 0.986888 165
4                 max accuracy  0.499288 0.979487 165
5                max precision  0.996316 1.000000   0
6                   max recall  0.056272 1.000000 235
7              max specificity  0.996316 1.000000   0
8             max absolute_mcc  0.499288 0.957042 165
9   max min_per_class_accuracy  0.275850 0.966777 174
10 max mean_per_class_accuracy  0.499288 0.974039 165

Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`

get the AUC of the basic model on the validation set

# Score the baseline model on the held-out validation frame and report AUC.
h2o.auc(
  h2o.performance(model.gbm1, newdata = valid)
)
[1] 0.9500141

train on 80% of the data (train and valid combined) using 4-fold cross-validation

# Same default GBM, but trained on train + valid stacked together and
# evaluated with 4-fold cross-validation instead of a separate validation frame.
model.gbm2 <- h2o.gbm(
  x = predictors,
  y = response,
  training_frame = h2o.rbind(train, valid),
  nfolds = 4,
  seed = 100000
)

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |=====================                                                                |  25%
  |                                                                                           
  |===================================================================================  |  98%
  |                                                                                           
  |=====================================================================================| 100%

Show a detailed summary of the cross validation metrics

This gives you an idea of the variance between the folds

model.gbm2@model$cross_validation_metrics_summary
Cross-Validation Metrics Summary: 
                               mean           sd  cv_1_valid  cv_2_valid  cv_3_valid
accuracy                 0.93055546  0.008629293   0.9166667  0.92134833  0.94716984
auc                       0.9485867 0.0090652285   0.9281929   0.9501641  0.95242697
err                     0.069444545  0.008629293 0.083333336  0.07865169  0.05283019
err_count                     18.25    2.0841665        21.0        21.0        14.0
f0point5                 0.93016684  0.008899874       0.925   0.9259259  0.95137423
f1                        0.9050068  0.014668015  0.87573963   0.8955224  0.92783505
f2                        0.8818632   0.02512356  0.83146065    0.867052   0.9054326
lift_top_group            2.6012814  0.063610904   2.7391305    2.518868   2.6237624
logloss                  0.23374325  0.026640536  0.28481033  0.25281268  0.20978114
max_per_class_error      0.13256821  0.032050658  0.19565217   0.1509434  0.10891089
mcc                       0.8534678  0.018669134  0.82108814  0.83597547  0.88815725
mean_per_class_accuracy  0.91830176  0.012959264   0.8927989  0.90900034   0.9363982
mean_per_class_error     0.08169827  0.012959264 0.107201084  0.09099965 0.063601784
mse                       0.0636791  0.008084995  0.07762853 0.072123334 0.054258037
precision                  0.948204  0.013882402  0.96103895  0.94736844   0.9677419
r2                        0.7304616  0.035632342   0.6651004   0.6987226   0.7699668
recall                    0.8674318  0.032050658   0.8043478   0.8490566   0.8910891
rmse                     0.25132284  0.016061164  0.27861896  0.26855788  0.23293355
specificity               0.9691717  0.010593492     0.98125   0.9689441  0.98170733
                         cv_4_valid
accuracy                 0.93703705
auc                      0.96356285
err                     0.062962964
err_count                      17.0
f0point5                  0.9183673
f1                        0.9209302
f2                       0.92350745
lift_top_group            2.5233645
logloss                  0.18756889
max_per_class_error      0.07476635
mcc                       0.8686504
mean_per_class_accuracy   0.9350095
mean_per_class_error     0.06499054
mse                      0.05070648
precision                 0.9166667
r2                       0.78805673
recall                   0.92523366
rmse                     0.22518098
specificity               0.9447853

compare the training AUC (first call) with the cross-validated AUC (second call), which is obtained by scoring the combined holdout predictions

# AUC from the model's default performance object (no validation/xval flag set).
h2o.auc(
  h2o.performance(model.gbm2)
)
[1] 0.9953856
# AUC from the cross-validation performance object of the same model.
h2o.auc(
  h2o.performance(model.gbm2, xval = TRUE)
)
[1] 0.9474834

fine tuning parameters

# Record the start time so the build duration can be reported afterwards.
starttime <- Sys.time()

# GBM with a small learning rate and a very large tree budget; early stopping
# on validation AUC (checked every 10 trees) decides when to stop adding trees.
# Rows and columns are both subsampled at 80%.
model.gbm3 <- h2o.gbm(
  x = predictors,
  y = response,
  training_frame = train,
  validation_frame = valid,
  ntrees = 10000,
  learn_rate = 0.01,
  stopping_rounds = 5,
  stopping_tolerance = 1e-4,
  stopping_metric = "AUC",
  sample_rate = 0.8,
  col_sample_rate = 0.8,
  seed = 1234,
  score_tree_interval = 10
)

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |=====================================================================================| 100%
# Elapsed wall-clock time since `starttime`; the unit is whatever the
# difftime result carries, so it is printed alongside the rounded value.
gbm3_time <- Sys.time() - starttime
print(
  paste(
    "Took",
    round(gbm3_time, digits = 2),
    units(gbm3_time),
    "to build GBM3 model."
  )
)
[1] "Took 1.54 secs to build GBM3 model."

get the AUC on the validation set

the result does not get better: the validation AUC (0.9425) is slightly below that of the basic model (0.9500)

# AUC of model.gbm3 on the validation frame it was trained with.
h2o.auc(
  h2o.performance(model.gbm3, valid = TRUE)
)
[1] 0.9424908

exploring more fine tuning parameters in depth

depth 10 is usually plenty of depth for most datasets, but you never know

# Exhaustive (Cartesian) sweep over odd tree depths 1, 3, ..., 29 with all
# other settings held fixed, to find the depth range worth searching further.
hyper.params <- list(max_depth = seq(1, 29, 2))

grid <- h2o.grid(
  hyper_params = hyper.params,
  search_criteria = list(strategy = "Cartesian"),
  algorithm = "gbm",
  grid_id = "depth_grid",
  x = predictors,
  y = response,
  training_frame = train,
  validation_frame = valid,
  # large tree budget; early stopping below ends training well before this
  ntrees = 10000,
  learn_rate = 0.05,
  learn_rate_annealing = 0.99,
  sample_rate = 0.8,
  col_sample_rate = 0.8,
  seed = 1234,
  # early stopping on validation AUC, evaluated every 10 trees
  stopping_rounds = 5,
  stopping_tolerance = 1e-4,
  stopping_metric = "AUC",
  score_tree_interval = 10
)

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |======                                                                               |   7%
  |                                                                                           
  |===========                                                                          |  13%
  |                                                                                           
  |============                                                                         |  14%
  |                                                                                           
  |=================                                                                    |  20%
  |                                                                                           
  |=======================                                                              |  27%
  |                                                                                           
  |============================                                                         |  33%
  |                                                                                           
  |=============================                                                        |  34%
  |                                                                                           
  |==================================                                                   |  40%
  |                                                                                           
  |========================================                                             |  47%
  |                                                                                           
  |=============================================                                        |  53%
  |                                                                                           
  |=============================================                                        |  54%
  |                                                                                           
  |==============================================                                       |  54%
  |                                                                                           
  |===================================================                                  |  60%
  |                                                                                           
  |=========================================================                            |  67%
  |                                                                                           
  |==============================================================                       |  73%
  |                                                                                           
  |==============================================================                       |  74%
  |                                                                                           
  |===============================================================                      |  74%
  |                                                                                           
  |====================================================================                 |  80%
  |                                                                                           
  |==========================================================================           |  87%
  |                                                                                           
  |===============================================================================      |  93%
  |                                                                                           
  |===============================================================================      |  94%
  |                                                                                           
  |================================================================================     |  94%
  |                                                                                           
  |=====================================================================================| 100%
grid
H2O Grid Details
================

Grid ID: depth_grid 
Used hyper parameters: 
  -  max_depth 
Number of models: 15 
Number of failed models: 0 

Hyper-Parameter Search Summary: ordered by increasing logloss
   max_depth           model_ids             logloss
1         13  depth_grid_model_6 0.20109637892392768
2          9  depth_grid_model_4 0.20160720998146248
3          7  depth_grid_model_3  0.2024624226746261
4          5  depth_grid_model_2  0.2029008034398234
5         11  depth_grid_model_5 0.20343494648988525
6         29 depth_grid_model_14 0.20446595941168913
7         19  depth_grid_model_9 0.20446595941168913
8         21 depth_grid_model_10 0.20446595941168913
9         25 depth_grid_model_12 0.20446595941168913
10        27 depth_grid_model_13 0.20446595941168913
11        23 depth_grid_model_11 0.20446595941168913
12        17  depth_grid_model_8  0.2044659596864782
13        15  depth_grid_model_7 0.20463752833415869
14         3  depth_grid_model_1 0.20971798928576324
15         1  depth_grid_model_0 0.23401163708609685

sort the grid models by decreasing AUC

# Re-fetch the depth grid by its id, ranked best-first by AUC, and print it.
sortedGrid <- h2o.getGrid(
  "depth_grid",
  sort_by = "AUC",
  decreasing = TRUE
)
sortedGrid
H2O Grid Details
================

Grid ID: depth_grid 
Used hyper parameters: 
  -  max_depth 
Number of models: 15 
Number of failed models: 0 

Hyper-Parameter Search Summary: ordered by decreasing AUC
   max_depth           model_ids                auc
1         13  depth_grid_model_6 0.9525218371372218
2          9  depth_grid_model_4 0.9519019442096365
3         11  depth_grid_model_5 0.9512820512820513
4          7  depth_grid_model_3 0.9512256973795435
5          5  depth_grid_model_2 0.9511411665257818
6         29 depth_grid_model_14 0.9505494505494505
7         17  depth_grid_model_8 0.9505494505494505
8         19  depth_grid_model_9 0.9505494505494505
9         21 depth_grid_model_10 0.9505494505494505
10        25 depth_grid_model_12 0.9505494505494505
11        27 depth_grid_model_13 0.9505494505494505
12        23 depth_grid_model_11 0.9505494505494505
13        15  depth_grid_model_7 0.9503240349394196
14         1  depth_grid_model_0 0.9462383770076077
15         3  depth_grid_model_1 0.9458157227387998

find the range of max_depth for the top 5 models

# max_depth values of the five best models in the AUC-sorted grid.
topDepths <- sortedGrid@summary_table$max_depth[1:5]
# Convert to numeric before taking min/max so the comparison is numeric.
minDepth <- min(as.numeric(topDepths))
maxDepth <- max(as.numeric(topDepths))
minDepth
[1] 5
maxDepth
[1] 13

search sequentially: with the max_depth range narrowed down, run a random search over the remaining hyper-parameters

# Candidate values for the random search. Each entry lists the values one
# GBM parameter may take; the search draws combinations from these lists.
hyper.params <- list(
  # tree depth: only the range found by the depth sweep above
  max_depth = seq(minDepth, maxDepth, 1),

  # fraction of rows sampled for each tree
  sample_rate = seq(0.2, 1, 0.01),

  # fraction of columns sampled at each split
  col_sample_rate = seq(0.2, 1, 0.01),

  # fraction of columns sampled for each tree
  col_sample_rate_per_tree = seq(0.2, 1, 0.01),

  # how the per-split column sampling rate changes with the depth of the split
  col_sample_rate_change_per_level = seq(0.9, 1.1, 0.01),

  # minimum number of rows in a terminal node: powers of two derived from
  # the size of the training frame
  min_rows = 2^seq(0, log2(nrow(train)) - 1, 1),

  # histogram bins used for split-finding on continuous and integer columns
  nbins = 2^seq(4, 10, 1),

  # histogram bins used for split-finding on categorical columns
  nbins_cats = 2^seq(4, 12, 1),

  # minimum relative error improvement required for a split to happen
  min_split_improvement = c(0, 1e-8, 1e-6, 1e-4),

  # all histogram types (QuantilesGlobal and RoundRobin are good for numeric
  # columns with outliers)
  histogram_type = c("UniformAdaptive", "QuantilesGlobal", "RoundRobin")
)
# Controls for the random search as a whole (not for individual models).
search.criteria <- list(
  strategy = "RandomDiscrete",

  # overall time budget: 3600 seconds = 60 minutes
  max_runtime_secs = 3600,

  # overall model budget: at most 100 models
  max_models = 100,

  # seed for the sampling of parameter combinations, so the search is reproducible
  seed = 1234,

  # stop the search early once the leaderboard of the top 5 models has
  # converged to within 0.1% relative difference in AUC
  stopping_rounds = 5,
  stopping_metric = "AUC",
  stopping_tolerance = 1e-3
)
# Random search over `hyper.params`, governed by `search.criteria`.
# Note: this reuses the name `grid`, replacing the depth-sweep grid object.
grid <- h2o.grid(
  # values to sample from
  hyper_params = hyper.params,

  # search strategy and overall budgets (defined above)
  search_criteria = search.criteria,

  # algorithm to run
  algorithm = "gbm",

  # id under which the grid can be retrieved later
  grid_id = "final_grid",

  # standard model parameters
  x = predictors,
  y = response,
  training_frame = train,
  validation_frame = valid,

  # more trees is better if the learning rate is small enough;
  # use "more than enough" trees, since early stopping is enabled
  ntrees = 10000,

  # smaller learning rate is better; because annealing is enabled we can
  # afford to start with a bigger learning rate
  learn_rate = 0.05,

  # learning rate annealing: the learning rate shrinks by 1% after every tree
  # (use 1.00 to disable, but then lower learn_rate)
  learn_rate_annealing = 0.99,

  # per-model time limit: 600 seconds (10 minutes) -- modify as needed
  max_runtime_secs = 600,

  # per-model early stopping once the validation AUC doesn't improve by at
  # least 0.01% for 5 consecutive scoring events
  stopping_rounds = 5,
  stopping_tolerance = 1e-4,
  stopping_metric = "AUC",

  # score every 10 trees to make early stopping reproducible
  # (it depends on the scoring interval)
  score_tree_interval = 10,

  # base random number generator seed for each model
  # (automatically gets incremented internally for each model)
  seed = 1234
)

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |=                                                                                    |   1%
  |                                                                                           
  |===                                                                                  |   4%
  |                                                                                           
  |====                                                                                 |   5%
  |                                                                                           
  |=====                                                                                |   6%
  |                                                                                           
  |=======                                                                              |   8%
  |                                                                                           
  |========                                                                             |   9%
  |                                                                                           
  |=========                                                                            |  10%
  |                                                                                           
  |=========                                                                            |  11%
  |                                                                                           
  |==========                                                                           |  12%
  |                                                                                           
  |===========                                                                          |  13%
  |                                                                                           
  |============                                                                         |  14%
  |                                                                                           
  |=============                                                                        |  15%
  |                                                                                           
  |==============                                                                       |  16%
  |                                                                                           
  |===============                                                                      |  18%
  |                                                                                           
  |================                                                                     |  19%
  |                                                                                           
  |=================                                                                    |  20%
  |                                                                                           
  |==================                                                                   |  21%
  |                                                                                           
  |===================                                                                  |  22%
  |                                                                                           
  |====================                                                                 |  23%
  |                                                                                           
  |======================                                                               |  26%
  |                                                                                           
  |========================                                                             |  28%
  |                                                                                           
  |=========================                                                            |  29%
  |                                                                                           
  |==========================                                                           |  30%
  |                                                                                           
  |===========================                                                          |  32%
  |                                                                                           
  |============================                                                         |  33%
  |                                                                                           
  |==============================                                                       |  35%
  |                                                                                           
  |===============================                                                      |  36%
  |                                                                                           
  |===============================                                                      |  37%
  |                                                                                           
  |================================                                                     |  38%
  |                                                                                           
  |=================================                                                    |  39%
  |                                                                                           
  |==================================                                                   |  40%
  |                                                                                           
  |===================================                                                  |  41%
  |                                                                                           
  |====================================                                                 |  42%
  |                                                                                           
  |=====================================                                                |  44%
  |                                                                                           
  |======================================                                               |  45%
  |                                                                                           
  |=======================================                                              |  46%
  |                                                                                           
  |========================================                                             |  47%
  |                                                                                           
  |==========================================                                           |  49%
  |                                                                                           
  |===========================================                                          |  50%
  |                                                                                           
  |============================================                                         |  52%
  |                                                                                           
  |=============================================                                        |  53%
  |                                                                                           
  |==============================================                                       |  54%
  |                                                                                           
  |===============================================                                      |  55%
  |                                                                                           
  |================================================                                     |  56%
  |                                                                                           
  |================================================                                     |  57%
  |                                                                                           
  |=================================================                                    |  58%
  |                                                                                           
  |==================================================                                   |  59%
  |                                                                                           
  |===================================================                                  |  60%
  |                                                                                           
  |====================================================                                 |  61%
  |                                                                                           
  |=====================================================                                |  62%
  |                                                                                           
  |======================================================                               |  63%
  |                                                                                           
  |======================================================                               |  64%
  |                                                                                           
  |=======================================================                              |  65%
  |                                                                                           
  |========================================================                             |  66%
  |                                                                                           
  |=========================================================                            |  67%
  |                                                                                           
  |==========================================================                           |  68%
  |                                                                                           
  |===========================================================                          |  69%
  |                                                                                           
  |=============================================================                        |  72%
  |                                                                                           
  |==============================================================                       |  73%
  |                                                                                           
  |===============================================================                      |  74%
  |                                                                                           
  |================================================================                     |  75%
  |                                                                                           
  |=================================================================                    |  76%
  |                                                                                           
  |=================================================================                    |  77%
  |                                                                                           
  |==================================================================                   |  78%
  |                                                                                           
  |===================================================================                  |  79%
  |                                                                                           
  |====================================================================                 |  80%
  |                                                                                           
  |=====================================================================                |  81%
  |                                                                                           
  |======================================================================               |  82%
  |                                                                                           
  |=======================================================================              |  83%
  |                                                                                           
  |=======================================================================              |  84%
  |                                                                                           
  |=========================================================================            |  86%
  |                                                                                           
  |==========================================================================           |  87%
  |                                                                                           
  |===========================================================================          |  88%
  |                                                                                           
  |=============================================================================        |  90%
  |                                                                                           
  |=============================================================================        |  91%
  |                                                                                           
  |==============================================================================       |  92%
  |                                                                                           
  |================================================================================     |  94%
  |                                                                                           
  |==================================================================================   |  96%
  |                                                                                           
  |==================================================================================   |  97%
  |                                                                                           
  |==================================================================================== |  99%
  |                                                                                           
  |=====================================================================================| 100%

Sort the grid models by AUC

## Fetch the finished grid with its models ranked from highest to lowest AUC
sortedGrid <- h2o.getGrid(grid_id = "final_grid", sort_by = "auc", decreasing = TRUE)
sortedGrid
H2O Grid Details
================

Grid ID: final_grid 
Used hyper parameters: 
  -  col_sample_rate 
  -  col_sample_rate_change_per_level 
  -  col_sample_rate_per_tree 
  -  histogram_type 
  -  max_depth 
  -  min_rows 
  -  min_split_improvement 
  -  nbins 
  -  nbins_cats 
  -  sample_rate 
Number of models: 100 
Number of failed models: 0 

Hyper-Parameter Search Summary: ordered by decreasing auc
  col_sample_rate col_sample_rate_change_per_level col_sample_rate_per_tree  histogram_type
1            0.92                             0.93                     0.56 QuantilesGlobal
2            0.49                             1.04                     0.94 QuantilesGlobal
3            0.35                             1.09                     0.83 QuantilesGlobal
4            0.61                             1.04                     0.61 UniformAdaptive
5            0.81                             0.94                     0.89 QuantilesGlobal
  max_depth min_rows min_split_improvement nbins nbins_cats sample_rate           model_ids
1         6      4.0                   0.0   128        128        0.93 final_grid_model_96
2         9      2.0                   0.0    32        256        0.86 final_grid_model_68
3         5      4.0                1.0E-8    64        128        0.69 final_grid_model_38
4        11      1.0                1.0E-4    64         16        0.69 final_grid_model_81
5         8     16.0                1.0E-8  1024         32        0.71 final_grid_model_69
                 auc
1  0.974218089602705
2 0.9738799661876585
3 0.9698224852071006
4 0.9691462383770075
5 0.9684699915469147

---
    col_sample_rate col_sample_rate_change_per_level col_sample_rate_per_tree  histogram_type
95              0.7                             1.08                     0.99 QuantilesGlobal
96              0.5                             1.03                     0.45      RoundRobin
97             0.87                              1.0                      0.2      RoundRobin
98             0.24                             1.08                      0.3 UniformAdaptive
99             0.57                              1.1                     0.68      RoundRobin
100            0.96                             0.94                     0.62 QuantilesGlobal
    max_depth min_rows min_split_improvement nbins nbins_cats sample_rate           model_ids
95          7    256.0                1.0E-4    32         16        0.49 final_grid_model_86
96         13    256.0                1.0E-8   512         16        0.28 final_grid_model_58
97         12    256.0                1.0E-6   512       1024        0.97 final_grid_model_51
98          5    256.0                1.0E-4    32         64        0.97 final_grid_model_44
99         12    256.0                   0.0    16       4096        0.58  final_grid_model_8
100         8    256.0                1.0E-6    64       4096        0.57 final_grid_model_95
                   auc
95  0.8014370245139476
96  0.7997464074387151
97  0.7965624119470274
98  0.7854888701042547
99  0.7836573682727528
100 0.7608058608058608

Get the validation AUC of the best 5 models

## Print the validation AUC of the top-ranked grid models.
## At most 5 models are visited; bounding the index by the grid size avoids a
## "subscript out of bounds" error when the grid holds fewer than 5 models.
for (i in seq_len(min(5, length(sortedGrid@model_ids)))) {
  gbm <- h2o.getModel(sortedGrid@model_ids[[i]])
  print(h2o.auc(h2o.performance(gbm, valid = TRUE)))
}
[1] 0.9742181
[1] 0.97388
[1] 0.9698225
[1] 0.9691462
[1] 0.96847

apply the best model to test data

## Take the top-ranked grid model and report its AUC on the held-out test frame
gbm <- h2o.getModel(sortedGrid@model_ids[[1]])
print(h2o.auc(h2o.performance(gbm, newdata = test)))
[1] 0.9824898
## Show the parameter list stored on the selected model object
gbm@parameters
$model_id
[1] "final_grid_model_96"

$training_frame
[1] "train.hex"

$validation_frame
[1] "valid.hex"

$score_tree_interval
[1] 10

$ntrees
[1] 10000

$max_depth
[1] 6

$min_rows
[1] 4

$nbins
[1] 128

$nbins_cats
[1] 128

$stopping_rounds
[1] 5

$stopping_metric
[1] "AUC"

$stopping_tolerance
[1] 1e-04

$max_runtime_secs
[1] 600

$seed
[1] 1234

$learn_rate
[1] 0.05

$learn_rate_annealing
[1] 0.99

$distribution
[1] "bernoulli"

$sample_rate
[1] 0.93

$col_sample_rate
[1] 0.92

$col_sample_rate_change_per_level
[1] 0.93

$col_sample_rate_per_tree
[1] 0.56

$min_split_improvement
[1] 0

$histogram_type
[1] "QuantilesGlobal"

$x
 [1] "pclass"    "sex"       "age"       "sibsp"     "parch"     "ticket"    "fare"     
 [8] "cabin"     "embarked"  "boat"      "body"      "home.dest"

$y
[1] "survived"

Now we can confirm that these parameters are generally sound, by building a GBM model on the whole dataset (instead of the 60%) and using internal 5-fold cross-validation (re-using all other parameters including the seed):

## Re-fit the selected model's parameters with 5-fold cross-validation on the
## full dataset. The argument list is prepared first, then handed to h2o.gbm().
p <- gbm@parameters
p[c("model_id", "validation_frame")] <- NULL  ## keep the original grid model; no validation frame
p$training_frame <- titanic                   ## use the full dataset
p$nfolds <- 5                                 ## cross-validation
model <- do.call(h2o.gbm, p)

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |=                                                                                    |   1%
  |                                                                                           
  |==                                                                                   |   2%
  |                                                                                           
  |==============                                                                       |  17%
  |                                                                                           
  |============================                                                         |  34%
  |                                                                                           
  |===========================================                                          |  50%
  |                                                                                           
  |===========================================                                          |  51%
  |                                                                                           
  |=========================================================                            |  67%
  |                                                                                           
  |=======================================================================              |  84%
  |                                                                                           
  |=====================================================================================| 100%
## Per-fold cross-validation metrics together with their mean and standard deviation
model@model$cross_validation_metrics_summary
Cross-Validation Metrics Summary: 
                               mean           sd  cv_1_valid  cv_2_valid  cv_3_valid
accuracy                 0.94809973  0.003993344   0.9400749  0.94833946   0.9457364
auc                       0.9743477  0.006040138   0.9674539   0.9610417   0.9794005
err                     0.051900264  0.003993344 0.059925094 0.051660515 0.054263566
err_count                      13.6    1.1489125        16.0        14.0        14.0
f0point5                 0.95091534  0.011208471   0.9623016  0.95454544    0.944206
f1                        0.9295287 0.0049488554   0.9238095   0.9230769   0.9263158
f2                        0.9096094  0.013264459  0.88827837  0.89361703  0.90909094
lift_top_group            2.6258688  0.099894695   2.3839285   2.8229167    2.632653
logloss                  0.19542515  0.015181999  0.20480314  0.23214972  0.19031271
max_per_class_error     0.102922216  0.019955961  0.13392857       0.125  0.10204082
mcc                      0.89094704 0.0075886473   0.8800855   0.8873967   0.8845431
mean_per_class_accuracy   0.9385944 0.0053582946   0.9298099   0.9317857  0.93647957
mean_per_class_error    0.061405573 0.0053582946 0.070190094  0.06821428  0.06352041
mse                     0.051655047 0.0043810816  0.05615356 0.061237488 0.049908444
precision                 0.9660636  0.018879278   0.9897959   0.9767442  0.95652175
r2                        0.7805782  0.019705383   0.7694049  0.73230106    0.788131
recall                    0.8970778  0.019955961   0.8660714       0.875   0.8979592
rmse                     0.22687589  0.009549358  0.23696741   0.2474621  0.22340198
specificity              0.98011106  0.011789808   0.9935484   0.9885714       0.975
                         cv_4_valid  cv_5_valid
accuracy                  0.9488189  0.95752895
auc                       0.9819927  0.98184973
err                     0.051181104 0.042471044
err_count                      13.0        11.0
f0point5                 0.92402464  0.96949893
f1                       0.93264246   0.9417989
f2                        0.9414226  0.91563785
lift_top_group            2.6736841   2.6161616
logloss                  0.17594479  0.17391542
max_per_class_error      0.05263158   0.1010101
mcc                      0.89166886    0.911041
mean_per_class_accuracy    0.948527  0.94636995
mean_per_class_error     0.05147302  0.05363005
mse                      0.04610445   0.0448713
precision                 0.9183673  0.98888886
r2                       0.80308014    0.809974
recall                   0.94736844   0.8989899
rmse                     0.21471947  0.21182847
specificity               0.9496855     0.99375

To save time, let’s just scan through the top 5 models and cross-validate their parameters with nfolds=5 on the entire dataset:

## Cross-validate the parameters of the top-ranked grid models on the full
## dataset and print each model's id followed by its cross-validated AUC row.
## At most 5 models are visited; bounding the index by the grid size avoids a
## "subscript out of bounds" error when the grid holds fewer than 5 models.
for (i in seq_len(min(5, length(sortedGrid@model_ids)))) {
  gbm <- h2o.getModel(sortedGrid@model_ids[[i]])

  p <- gbm@parameters
  p$model_id <- NULL           ## do not overwrite the original grid model
  p$training_frame <- titanic  ## use the full dataset
  p$validation_frame <- NULL   ## no validation frame
  p$nfolds <- 5                ## cross-validation
  cvgbm <- do.call(h2o.gbm, p)

  print(gbm@model_id)
  print(cvgbm@model$cross_validation_metrics_summary[2, ]) ## Pick out the "AUC" row
}

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |=                                                                                    |   1%
  |                                                                                           
  |==                                                                                   |   2%
  |                                                                                           
  |==============                                                                       |  17%
  |                                                                                           
  |=============================                                                        |  34%
  |                                                                                           
  |===========================================                                          |  50%
  |                                                                                           
  |===========================================                                          |  51%
  |                                                                                           
  |=========================================================                            |  67%
  |                                                                                           
  |=======================================================================              |  83%
  |                                                                                           
  |=======================================================================              |  84%
  |                                                                                           
  |=====================================================================================| 100%
[1] "final_grid_model_96"
Cross-Validation Metrics Summary: 
         mean          sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
auc 0.9743477 0.006040138  0.9674539  0.9610417  0.9794005  0.9819927 0.98184973

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |=                                                                                    |   1%
  |                                                                                           
  |==                                                                                   |   2%
  |                                                                                           
  |==============                                                                       |  17%
  |                                                                                           
  |===============                                                                      |  17%
  |                                                                                           
  |============================                                                         |  33%
  |                                                                                           
  |=============================                                                        |  34%
  |                                                                                           
  |===========================================                                          |  50%
  |                                                                                           
  |=========================================================                            |  67%
  |                                                                                           
  |=======================================================================              |  83%
  |                                                                                           
  |=======================================================================              |  84%
  |                                                                                           
  |=====================================================================================| 100%
[1] "final_grid_model_68"
Cross-Validation Metrics Summary: 
         mean           sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
auc 0.9741264 0.0058573517 0.96854836  0.9610417 0.97665817  0.9807349   0.983649

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |=                                                                                    |   1%
  |                                                                                           
  |==                                                                                   |   2%
  |                                                                                           
  |===                                                                                  |   3%
  |                                                                                           
  |==============                                                                       |  17%
  |                                                                                           
  |===============                                                                      |  17%
  |                                                                                           
  |=============================                                                        |  34%
  |                                                                                           
  |===========================================                                          |  50%
  |                                                                                           
  |===========================================                                          |  51%
  |                                                                                           
  |=========================================================                            |  67%
  |                                                                                           
  |=======================================================================              |  84%
  |                                                                                           
  |=====================================================================================| 100%
[1] "final_grid_model_38"
Cross-Validation Metrics Summary: 
         mean           sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
auc 0.9724971 0.0057914597  0.9625576  0.9624107 0.97927296 0.97835153  0.9798927

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |=                                                                                    |   1%
  |                                                                                           
  |==                                                                                   |   2%
  |                                                                                           
  |==============                                                                       |  17%
  |                                                                                           
  |============================                                                         |  33%
  |                                                                                           
  |===========================================                                          |  50%
  |                                                                                           
  |===========================================                                          |  51%
  |                                                                                           
  |=========================================================                            |  67%
  |                                                                                           
  |=======================================================================              |  83%
  |                                                                                           
  |=======================================================================              |  84%
  |                                                                                           
  |=====================================================================================| 100%
[1] "final_grid_model_81"
Cross-Validation Metrics Summary: 
         mean          sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
auc 0.9689918 0.006921219 0.96209675  0.9530357 0.97786987  0.9755048   0.976452

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |=                                                                                    |   1%
  |                                                                                           
  |==                                                                                   |   2%
  |                                                                                           
  |==============                                                                       |  17%
  |                                                                                           
  |============================                                                         |  33%
  |                                                                                           
  |============================                                                         |  34%
  |                                                                                           
  |=============================                                                        |  34%
  |                                                                                           
  |===========================================                                          |  50%
  |                                                                                           
  |===========================================                                          |  51%
  |                                                                                           
  |=========================================================                            |  67%
  |                                                                                           
  |=======================================================================              |  83%
  |                                                                                           
  |=======================================================================              |  84%
  |                                                                                           
  |=====================================================================================| 100%
[1] "final_grid_model_69"
Cross-Validation Metrics Summary: 
          mean           sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid cv_5_valid
auc 0.97103506 0.0053187287 0.96313363 0.96068454 0.97589284  0.9776233  0.9778409

apply the best model to test data

## Score the test frame with the top-ranked grid model
gbm <- h2o.getModel(sortedGrid@model_ids[[1]])
preds <- h2o.predict(gbm, test)

  |                                                                                           
  |                                                                                     |   0%
  |                                                                                           
  |=====================================================================================| 100%
## Preview the first rows of the predictions frame
head(preds)
## Validation metrics: the maximum value of each metric and the threshold at which it occurs
gbm@model$validation_metrics@metrics$max_criteria_and_metric_scores
Maximum Metrics: Maximum metrics at their respective thresholds
                        metric threshold    value idx
1                       max f1  0.415247 0.929293  92
2                       max f2  0.207864 0.924528 109
3                 max f0point5  0.523349 0.970149  90
4                 max accuracy  0.523349 0.948905  90
5                max precision  0.990276 1.000000   0
6                   max recall  0.057998 1.000000 205
7              max specificity  0.990276 1.000000   0
8             max absolute_mcc  0.523349 0.894631  90
9   max min_per_class_accuracy  0.207864 0.928994 109
10 max mean_per_class_accuracy  0.415247 0.935137  92

While this is running, we can actually look at the model. To do this we simply need a new connection to H2O. This R console is busy running the model, so we need either another R console or the web browser (or Python, etc.). In the demo, we will use Flow in our web browser, and the focus will be on model performance, since we are using R to control H2O. In Flow we can simply type:

getModel "final_grid_model_96"

https://github.com/h2oai/h2o-3/blob/master/h2o-docs/src/product/tutorials/gbm/gbmTuning.Rmd

---
title: "Fine tuning GBM"
output: html_notebook
---


```{r}
# Attach the h2o package, connect to the H2O cluster,
# and remove any objects already held by that cluster.
library(h2o)
h2o.init()
h2o.removeAll()
```

#load sample dataset
```{r}
## Import the Titanic passenger data into the H2O cluster.
titanic <- h2o.importFile(
  path = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
)

## Quick look at the data: dimensions, first and last rows, per-column summary
## and structure.
dim(titanic)
head(titanic)
tail(titanic)
## Spell out TRUE: `T` is an ordinary variable that can be reassigned.
summary(titanic, exact_quantiles = TRUE)
str(titanic)
```


#set response and predictors
```{r}
## Name of the column to predict.
response <- "survived"

## The response is stored as 0/1 numbers; turn it into a factor so it is
## treated as a categorical (two-class) target.
titanic[[response]] <- as.factor(titanic[[response]])

## Every column except the response and the passenger name is a predictor.
## Selecting by name rather than by position (columns 2 and 3) keeps this
## correct if the column order of the file ever changes.
predictors <- setdiff(colnames(titanic), c(response, "name"))
predictors
```

#split data
```{r}
## Randomly partition the rows into roughly 60% training, 20% validation and
## the remainder for testing; the seed makes the partition reproducible.
splits <- h2o.splitFrame(
  data = titanic,
  ratios = c(0.6, 0.2),
  destination_frames = c("train.hex", "valid.hex", "test.hex"),
  seed = 1234
)
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]
```

#start with very basic model
```{r}
## Baseline: a GBM on the training frame with no tuning parameters supplied.
model.gbm1 <- h2o.gbm(
  x = predictors,
  y = response,
  training_frame = train
)
model.gbm1
```
#get AUC
```{r}
## Score the baseline model on the validation frame and report its AUC.
h2o.auc(
  h2o.performance(model.gbm1, newdata = valid)
)
```
#trained with 80% of the data
```{r}
## Refit the GBM on train and valid stacked together, using 4-fold
## cross-validation instead of a separate validation frame.
model.gbm2 <- h2o.gbm(
  x = predictors,
  y = response,
  training_frame = h2o.rbind(train, valid),
  nfolds = 4,
  seed = 100000
)
```

## Show a detailed summary of the cross validation metrics
## This gives you an idea of the variance between the folds
```{r}
## Print the cross-validation metrics summary stored on the model
## (one row per metric; mean, sd and one column per fold).
model.gbm2@model$cross_validation_metrics_summary
```

#get the cross-validated AUC by scoring the combined holdout predictions
```{r}
## AUC from the model's default metrics (no frame or flag requested).
## NOTE(review): presumably these are the training metrics - confirm.
h2o.auc(
  h2o.performance(model.gbm2)
)
## AUC from the cross-validation metrics.
h2o.auc(
  h2o.performance(model.gbm2, xval = TRUE)
)
```

#fine tuning parameters
```{r}
## Build (and time) a GBM with a small learning rate, a generous tree budget
## and early stopping on the validation AUC.
starttime <- Sys.time()
model.gbm3 <- h2o.gbm(
  x = predictors,
  y = response,
  training_frame = train,
  validation_frame = valid,
  ## upper bound on trees; early stopping decides the actual number
  ntrees = 10000,
  learn_rate = 0.01,
  ## early-stopping settings, evaluated on AUC
  stopping_rounds = 5,
  stopping_tolerance = 1e-4,
  stopping_metric = "AUC",
  ## row and column sampling rates
  sample_rate = 0.8,
  col_sample_rate = 0.8,
  seed = 1234,
  ## score every 10 trees
  score_tree_interval = 10
)
gbm3_time <- Sys.time() - starttime
print(
  paste(
    "Took",
    round(gbm3_time, digits = 2),
    units(gbm3_time),
    "to build GBM3 model."
  )
)
```
#get the AUC on the validation set
#result does not get better
```{r}
## AUC of the early-stopped model, taken from its validation metrics.
h2o.auc(
  h2o.performance(model.gbm3, valid = TRUE)
)
```

#exploring more fine tuning parameters in depth
#depth 10 is usually plenty of depth for most datasets, but you never know
```{r}
## Step 1 of the tuning: scan max_depth alone (odd depths 1, 3, ..., 29) with
## every other setting held fixed.
hyper.params <- list(max_depth = seq(from = 1, to = 29, by = 2))

grid <- h2o.grid(
  ## what to search and how (exhaustive over the depth values)
  hyper_params = hyper.params,
  search_criteria = list(strategy = "Cartesian"),
  algorithm = "gbm",
  grid_id = "depth_grid",

  ## data
  x = predictors,
  y = response,
  training_frame = train,
  validation_frame = valid,

  ## fixed model parameters shared by every grid member
  ntrees = 10000,
  learn_rate = 0.05,
  learn_rate_annealing = 0.99,
  sample_rate = 0.8,
  col_sample_rate = 0.8,
  seed = 1234,

  ## early stopping on validation AUC, scored every 10 trees
  stopping_rounds = 5,
  stopping_tolerance = 1e-4,
  stopping_metric = "AUC",
  score_tree_interval = 10
)
grid
```

#sort the grid models by decreasing AUC
```{r}
## Retrieve the depth grid with its models ordered from highest to lowest AUC.
## The metric name is written in lower case to match the later
## h2o.getGrid("final_grid", sort_by = "auc", ...) call in this notebook.
sortedGrid <- h2o.getGrid("depth_grid", sort_by = "auc", decreasing = TRUE)
sortedGrid
```
#find the range of max_depth for the top 5 models
```{r}
## max_depth values of the five best models in the sorted grid.
topDepths <- sortedGrid@summary_table$max_depth[1:5]

## The summary table is converted with as.numeric before taking the extremes.
depthRange <- range(as.numeric(topDepths))
minDepth <- depthRange[1]
maxDepth <- depthRange[2]
minDepth
maxDepth
```

#random search over the remaining hyperparameters, with max_depth restricted to the range found above
```{r}
## Candidate values for each hyper-parameter of the final search.
hyper.params <- list(
  ## tree depth: only the range found by the depth scan above
  max_depth = seq(from = minDepth, to = maxDepth, by = 1),

  ## row sampling rate per tree
  sample_rate = seq(from = 0.2, to = 1, by = 0.01),

  ## column sampling rate per split
  col_sample_rate = seq(from = 0.2, to = 1, by = 0.01),

  ## column sampling rate per tree
  col_sample_rate_per_tree = seq(from = 0.2, to = 1, by = 0.01),

  ## how the per-split column sampling changes with the depth of the split
  col_sample_rate_change_per_level = seq(from = 0.9, to = 1.1, by = 0.01),

  ## minimum rows in a terminal node: powers of two below the training size
  min_rows = 2^seq(from = 0, to = log2(nrow(train)) - 1, by = 1),

  ## histogram bins for continuous and integer columns: 2^4 ... 2^10
  nbins = 2^seq(from = 4, to = 10, by = 1),

  ## histogram bins for categorical columns: 2^4 ... 2^12
  nbins_cats = 2^seq(from = 4, to = 12, by = 1),

  ## minimum relative error improvement required for a split to happen
  min_split_improvement = c(0, 1e-8, 1e-6, 1e-4),

  ## all histogram types (QuantilesGlobal and RoundRobin are good for numeric
  ## columns with outliers)
  histogram_type = c("UniformAdaptive", "QuantilesGlobal", "RoundRobin")
)
## How the grid is explored: random draws from the value lists above, bounded
## by time, by model count and by convergence of the leaderboard.
search.criteria <- list(
  strategy = "RandomDiscrete",

  ## whole search may run for at most 60 minutes
  max_runtime_secs = 3600,

  ## and may build at most 100 models
  max_models = 100,

  ## seed for drawing parameter combinations, so the search is reproducible
  seed = 1234,

  ## stop early once the top-5 leaderboard AUC has converged to within 0.1%
  ## relative difference
  stopping_rounds = 5,
  stopping_metric = "AUC",
  stopping_tolerance = 1e-3
)
## Run the random hyper-parameter search defined by hyper.params and
## search.criteria; the result is retrievable later through its grid_id.
grid <- h2o.grid(
  ## hyper parameters
  hyper_params = hyper.params,

  ## hyper-parameter search configuration (see above)
  search_criteria = search.criteria,

  ## which algorithm to run
  algorithm = "gbm",

  ## identifier for the grid, to later retrieve it
  grid_id = "final_grid",

  ## standard model parameters
  x = predictors,
  y = response,
  training_frame = train,
  validation_frame = valid,

  ## more trees is better if the learning rate is small enough
  ## use "more than enough" trees - we have early stopping
  ntrees = 10000,

  ## smaller learning rate is better
  ## since we have learn_rate_annealing, we can afford to start with a bigger learning rate
  learn_rate = 0.05,

  ## learning rate annealing: learn_rate shrinks by 1% after every tree
  ## (use 1.00 to disable, but then lower the learn_rate)
  learn_rate_annealing = 0.99,

  ## early stopping based on timeout (no model should take more than 10 minutes,
  ## i.e. 600 seconds - modify as needed)
  max_runtime_secs = 600,

  ## early stopping once the validation AUC doesn't improve by at least 0.01% for 5 consecutive scoring events
  stopping_rounds = 5, stopping_tolerance = 1e-4, stopping_metric = "AUC",

  ## score every 10 trees to make early stopping reproducible (it depends on the scoring interval)
  score_tree_interval = 10,

  ## base random number generator seed for each model (automatically gets incremented internally for each model)
  seed = 1234
)

```

## Sort the grid models by AUC
```{r}
## Fetch the "final_grid" search results ordered by AUC, best model first.
## Note: this overwrites the sortedGrid object created for the depth grid.
sortedGrid <- h2o.getGrid("final_grid", sort_by = "auc", decreasing = TRUE)    
## Print the sorted grid.
sortedGrid
```

#get the validation AUC of the best 5 models
```{r}
## Print the validation AUC of the best models in the sorted grid (up to five).
## The search can stop early (time limit, convergence), so the grid may hold
## fewer than five models; seq_len(min(...)) avoids indexing past its end.
for (i in seq_len(min(5, length(sortedGrid@model_ids)))) {
  gbm <- h2o.getModel(sortedGrid@model_ids[[i]])
  print(h2o.auc(h2o.performance(gbm, valid = TRUE)))
}
```

#apply the best model to test data
```{r}
## Take the top-ranked model of the sorted grid ...
gbm <- h2o.getModel(sortedGrid@model_ids[[1]])

## ... report its AUC on the held-out test frame ...
print(
  h2o.auc(
    h2o.performance(gbm, newdata = test)
  )
)

## ... and show the parameters it was built with.
gbm@parameters
```

#Now we can confirm that these parameters are generally sound, by building a GBM model on the whole dataset (instead of the 60%) and using internal 5-fold cross-validation (re-using all other parameters including the seed):

```{r}
## Start from the parameters of the current `gbm` model and adapt them for a
## cross-validated refit.
p <- gbm@parameters
p$model_id <- NULL          ## do not overwrite the original grid model
p$training_frame <- titanic ## use the full dataset
p$validation_frame <- NULL  ## no validation frame
p$nfolds <- 5               ## cross-validation

## Rebuild the GBM with the adapted parameter list and show the per-fold
## metrics summary.
model <- do.call(h2o.gbm, p)
model@model$cross_validation_metrics_summary
```

#to save time, let's just scan through the top 5 models and cross-validate their parameters with nfolds=5 on the entire dataset:
```{r}
## Cross-validate the parameters of the best grid models (up to five) with
## nfolds = 5 on the entire dataset and print each model's AUC row.
## seq_len(min(...)) guards against a grid that finished with fewer than five
## models.
for (i in seq_len(min(5, length(sortedGrid@model_ids)))) {
  gbm <- h2o.getModel(sortedGrid@model_ids[[i]])

  p <- gbm@parameters
  p$model_id <- NULL          ## do not overwrite the original grid model
  p$training_frame <- titanic ## use the full dataset
  p$validation_frame <- NULL  ## no validation frame
  p$nfolds <- 5               ## cross-validation
  cvgbm <- do.call(h2o.gbm, p)

  print(gbm@model_id)
  ## Select the AUC row by name instead of by position, so the output does not
  ## silently switch to another metric if the row order of the table changes.
  print(cvgbm@model$cross_validation_metrics_summary["auc", ])
}
```

#apply the best model to test data
```{r}
## Re-fetch the top-ranked grid model (the loop above left `gbm` pointing at a
## different model).
gbm <- h2o.getModel(sortedGrid@model_ids[[1]])

## Predict on the held-out test frame and peek at the first predictions.
preds <- h2o.predict(gbm, test)
head(preds)

## Threshold-dependent metric maxima; note these are read from the model's
## validation metrics, not computed on the test predictions above.
gbm@model$validation_metrics@metrics$max_criteria_and_metric_scores
```

#### While this is running, we can actually look at the model.
#### To do this we simply need a new connection to H2O.
#### This R console will run the model, so we need either another R console
####   or the web browser (or python, etc.).
#### In the demo, we will use Flow in our web browser
####  http://localhost:54321
#### And the focus will be to look at model performance, since we are using R to 
####  control H2O. So we can simply type in:
####  getModel "final_grid_model_96"


#https://github.com/h2oai/h2o-3/blob/master/h2o-docs/src/product/tutorials/gbm/gbmTuning.Rmd
