Purpose

I run through some examples of hyperparameter tuning with caret and mlr, then show how h2o can make the process more efficient. I start by loading the machine learning packages and sourcing the data.
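
As a rough sketch of that setup (the package list is inferred from the calls further down, and the sourcing path is only a placeholder):

# Packages used throughout this post.
library(caret)    # train(), trainControl(), createDataPartition()
library(tictoc)   # tic()/toc() timers
library(ggplot2)  # plotting
library(mlr)      # makeClassifTask(), makeLearner(), tuneParams()
library(h2o)      # h2o grids and AutoML

# The data frames used below (breast_cancer_data, knowledge_train_data,
# seeds_train_data) come from a local script; this path is illustrative.
# source("data/load_data.R")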

Simple Hyperparameter Tuning Example

I start with the traditional, manual process of hyperparameter tuning: fitting a simple linear model by hand, then training a GBM with caret using an automatically generated tuning grid and comparing it to a manually defined grid.

# Fit a linear model on the breast_cancer_data.
linear_model <- lm(concavity_mean ~ symmetry_mean, breast_cancer_data)

# Look at the summary of the linear_model.
summary(linear_model)
## 
## Call:
## lm(formula = concavity_mean ~ symmetry_mean, data = breast_cancer_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.201877 -0.039201 -0.008432  0.030655  0.226150 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -0.15311    0.04086  -3.747 0.000303 ***
## symmetry_mean  1.33366    0.21257   6.274 9.57e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06412 on 98 degrees of freedom
## Multiple R-squared:  0.2866, Adjusted R-squared:  0.2793 
## F-statistic: 39.36 on 1 and 98 DF,  p-value: 9.575e-09
# Extract the coefficients.
linear_model$coefficients
##   (Intercept) symmetry_mean 
##    -0.1531055     1.3336568
# Plot linear relationship.
ggplot(data = breast_cancer_data, 
       aes(x = symmetry_mean, y = concavity_mean)) +
  geom_point(color = "grey") +
  geom_abline(slope = linear_model$coefficients[2], 
              intercept = linear_model$coefficients[1])

# Create a partition index and split into training and test sets.
set.seed(42)
index <- createDataPartition(breast_cancer_data$diagnosis, p = 0.7, list = FALSE)
bc_train_data <- breast_cancer_data[index, ]
bc_test_data <- breast_cancer_data[-index, ]

# Set seed.
set.seed(42)
# Start timer.
tic()
# Train model.
gbm_model <- caret::train(diagnosis ~ ., 
                   data = bc_train_data, 
                   method = "gbm", 
                   trControl = trainControl(method = "repeatedcv", number = 5, repeats = 3),
                   verbose = FALSE,
                   tuneLength = 4)
# Stop timer.
toc()
## 2.52 sec elapsed
# Define hyperparameter grid.
hyperparams <- expand.grid(n.trees = 200, 
                           interaction.depth = 1, 
                           shrinkage = 0.1, 
                           n.minobsinnode = 10)

# Apply hyperparameter grid to train().
set.seed(42)
gbm_model <- caret::train(diagnosis ~ ., 
                   data = bc_train_data, 
                   method = "gbm", 
                   trControl = trainControl(method = "repeatedcv", number = 5, repeats = 3),
                   verbose = FALSE,
                   tuneGrid = hyperparams)
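
After either run, caret stores the selected hyperparameter combination and the resampled performance on the fitted object, so a quick sanity check looks like this:

# Inspect the chosen hyperparameters and the resampling results.
gbm_model$bestTune
gbm_model$results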

Dynamically Create a Hyperparameter Grid

I make the tuning slightly more robust by testing a range of hyperparameter values in a single pass.

# Define a Cartesian grid of SVM hyperparameters
man_grid <- expand.grid(degree = c(1, 2, 3), 
                        scale = c(0.1, 0.01, 0.001), 
                        C = 0.5)

# Plot the candidate hyperparameter combinations
plot(man_grid)
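
The degree, scale and C columns line up with the tuning parameters of caret's "svmPoly" method, so the grid can be passed straight to train(). A minimal sketch (not run here), assuming the bc_train_data split created earlier:

# Tune a polynomial-kernel SVM over the manual Cartesian grid.
svm_model_grid <- caret::train(diagnosis ~ ., 
                     data = bc_train_data, 
                     method = "svmPoly", 
                     trControl = trainControl(method = "cv", number = 5),
                     tuneGrid = man_grid)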

# Define the grid with hyperparameter ranges
big_grid <- expand.grid(size = seq(from = 1, to = 5, by = 1),
                        decay = c(0, 1))
big_grid
##    size decay
## 1     1     0
## 2     2     0
## 3     3     0
## 4     4     0
## 5     5     0
## 6     1     1
## 7     2     1
## 8     3     1
## 9     4     1
## 10    5     1
# Train control with random search
fitControl <- trainControl(method = "repeatedcv",
                           number = 3,
                           repeats = 5,
                           search = "random")

# Alternative: train control with adaptive resampling
fitControl <- trainControl(method = "adaptive_cv",
                           number = 3, repeats = 3)
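
The fitControl object (here, adaptive resampling) can then be combined with the size/decay grid in caret::train(), which tunes an nnet model over those values. A sketch of how the pieces fit together (not run here), again using the bc_train_data split:

# Combine the dynamic grid with the adaptive train control.
set.seed(42)
nnet_model <- caret::train(diagnosis ~ ., 
                   data = bc_train_data, 
                   method = "nnet", 
                   trControl = fitControl,
                   tuneGrid = big_grid,
                   trace = FALSE)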

mlr Functions Help Streamline the Hyperparameter Tuning Process

I show how the mlr package can be a really useful tool for streamlining the tuning process.

# Create classification task
task <- makeClassifTask(data = knowledge_train_data, 
            target = "UNS")

# Define learner
lrn <- makeLearner("classif.nnet", predict.type = "prob", fix.factors.prediction = TRUE)

# Define set of parameters
param_set <- makeParamSet(
  makeDiscreteParam("size", values = c(2,3,5)),
  makeNumericParam("decay", lower = 0.0001, upper = 0.1)
)

# Get the parameter set for neural networks of the nnet package
getParamSet("classif.nnet")
##            Type len    Def      Constr Req Tunable Trafo
## size    integer   -      3    0 to Inf   -    TRUE     -
## maxit   integer   -    100    1 to Inf   -    TRUE     -
## skip    logical   -  FALSE           -   -    TRUE     -
## rang    numeric   -    0.7 -Inf to Inf   -    TRUE     -
## decay   numeric   -      0 -Inf to Inf   -    TRUE     -
## Hess    logical   -  FALSE           -   -    TRUE     -
## trace   logical   -   TRUE           -   -   FALSE     -
## MaxNWts integer   -   1000    1 to Inf   -   FALSE     -
## abstol  numeric   - 0.0001 -Inf to Inf   -    TRUE     -
## reltol  numeric   -  1e-08 -Inf to Inf   -    TRUE     -
# Define a random search tuning method.
ctrl_random <- makeTuneControlRandom(maxit = 6)

# Create holdout sampling
holdout <- makeResampleDesc("Holdout")

# Perform tuning
lrn_tune <- tuneParams(learner = lrn, task = task, resampling = holdout, control = ctrl_random, par.set = param_set)
## # weights:  30
## initial  value 88.276312 
## iter  10 value 32.719882
## iter  20 value 28.501970
## iter  30 value 27.896941
## iter  40 value 27.852690
## iter  50 value 27.820994
## iter  60 value 27.815146
## final  value 27.815100 
## converged
## # weights:  21
## initial  value 102.087794 
## iter  10 value 79.245636
## iter  20 value 24.493575
## iter  30 value 21.553594
## iter  40 value 18.602719
## iter  50 value 18.306930
## iter  60 value 17.813602
## iter  70 value 17.785617
## iter  80 value 17.785417
## final  value 17.785416 
## converged
## # weights:  48
## initial  value 95.090120 
## iter  10 value 39.457704
## iter  20 value 29.360027
## iter  30 value 27.380836
## iter  40 value 27.313609
## iter  50 value 27.253186
## iter  60 value 27.223225
## iter  70 value 27.210176
## iter  80 value 27.196560
## iter  90 value 27.191039
## iter 100 value 27.189062
## final  value 27.189062 
## stopped after 100 iterations
## # weights:  30
## initial  value 91.616546 
## iter  10 value 25.105422
## iter  20 value 13.126203
## iter  30 value 11.754646
## iter  40 value 11.463565
## iter  50 value 11.382937
## iter  60 value 11.328545
## iter  70 value 11.289303
## iter  80 value 11.059538
## iter  90 value 9.609672
## iter 100 value 9.184671
## final  value 9.184671 
## stopped after 100 iterations
## # weights:  48
## initial  value 89.213986 
## iter  10 value 48.399811
## iter  20 value 27.295199
## iter  30 value 26.699349
## iter  40 value 26.375264
## iter  50 value 26.307667
## iter  60 value 26.279739
## iter  70 value 26.239254
## iter  80 value 26.226135
## iter  90 value 26.195305
## iter 100 value 26.192000
## final  value 26.192000 
## stopped after 100 iterations
## # weights:  30
## initial  value 95.371253 
## iter  10 value 54.320666
## iter  20 value 40.642532
## iter  30 value 38.357368
## iter  40 value 37.316452
## iter  50 value 36.331431
## iter  60 value 36.295579
## iter  70 value 36.249260
## iter  80 value 36.243204
## iter  90 value 36.243134
## iter  90 value 36.243134
## iter  90 value 36.243134
## final  value 36.243134 
## converged
# Create holdout sampling
holdout <- makeResampleDesc("Holdout", predict = "both")

# Perform tuning
lrn_tune <- tuneParams(learner = lrn, 
                       task = task, 
                       resampling = holdout, 
                       control = ctrl_random, 
                       par.set = param_set,
                       measures = list(acc, setAggregation(acc, train.mean), mmce, setAggregation(mmce, train.mean)))
## # weights:  30
## initial  value 90.285152 
## iter  10 value 33.529980
## iter  20 value 29.081884
## iter  30 value 28.338633
## iter  40 value 28.112368
## iter  50 value 27.776494
## iter  60 value 27.584923
## iter  70 value 27.432008
## iter  80 value 27.371791
## iter  90 value 27.365573
## final  value 27.365525 
## converged
## # weights:  30
## initial  value 91.399022 
## iter  10 value 19.863632
## iter  20 value 7.807128
## iter  30 value 6.266995
## iter  40 value 6.067984
## iter  50 value 5.942195
## iter  60 value 5.846610
## iter  70 value 5.823793
## iter  80 value 5.769556
## iter  90 value 5.704664
## iter 100 value 5.645761
## final  value 5.645761 
## stopped after 100 iterations
## # weights:  30
## initial  value 117.140093 
## iter  10 value 55.573086
## iter  20 value 27.778096
## iter  30 value 27.345771
## iter  40 value 27.227039
## iter  50 value 27.113231
## iter  60 value 26.768572
## iter  70 value 26.637632
## iter  80 value 26.583758
## iter  90 value 26.560183
## iter 100 value 26.559881
## final  value 26.559881 
## stopped after 100 iterations
## # weights:  21
## initial  value 108.373769 
## iter  10 value 75.197924
## iter  20 value 26.576844
## iter  30 value 26.154237
## iter  40 value 25.919051
## iter  50 value 25.865777
## iter  60 value 25.851037
## final  value 25.850974 
## converged
## # weights:  48
## initial  value 101.401373 
## iter  10 value 55.728384
## iter  20 value 35.165787
## iter  30 value 33.852393
## iter  40 value 33.529308
## iter  50 value 33.350026
## iter  60 value 33.259396
## iter  70 value 33.204076
## iter  80 value 33.177365
## iter  90 value 33.176549
## iter 100 value 33.176209
## final  value 33.176209 
## stopped after 100 iterations
## # weights:  21
## initial  value 102.089357 
## iter  10 value 64.293318
## iter  20 value 27.359183
## iter  30 value 26.487057
## iter  40 value 26.424772
## iter  50 value 26.423979
## iter  60 value 26.423688
## final  value 26.423682 
## converged
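
The tuning result keeps the best hyperparameter values and their aggregated performance, which can be pulled out directly:

# Best hyperparameter values found by the random search.
lrn_tune$x
# Aggregated performance for those values.
lrn_tune$y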
# Set hyperparameters
lrn_best <- setHyperPars(lrn, par.vals = list(size = 1, 
                                              maxit = 150, 
                                              decay = 0))

# Train model
model_best <- train(lrn_best, task)
## # weights:  12
## initial  value 136.836022 
## iter  10 value 110.284864
## iter  20 value 42.408320
## iter  30 value 13.849382
## iter  40 value 13.605203
## iter  50 value 13.514116
## iter  60 value 13.445979
## iter  70 value 13.166502
## iter  80 value 13.159266
## iter  90 value 13.151792
## iter 100 value 13.131009
## iter 110 value 13.129824
## iter 120 value 13.122746
## iter 130 value 13.119993
## iter 140 value 13.119079
## iter 150 value 13.115702
## final  value 13.115702 
## stopped after 150 iterations
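
With the trained model, predictions follow mlr's predict() interface. A short sketch, where knowledge_test_data stands in for a hypothetical hold-out set with the same columns as the training data:

# Predict on unseen data and evaluate accuracy and misclassification error.
pred <- predict(model_best, newdata = knowledge_test_data)
performance(pred, measures = list(acc, mmce))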

h2o Clusters Are More Efficient

By running hyperparameter tuning on an h2o cluster, I can try many different parameter combinations across several model types while the Java Virtual Machine manages the memory, which helps prevent out-of-memory errors on my computer.

# Initialise h2o cluster
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         28 minutes 13 seconds 
##     H2O cluster timezone:       America/Chicago 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.28.0.2 
##     H2O cluster version age:    2 months and 14 days  
##     H2O cluster name:           H2O_started_from_R_deangc_smo670 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   7.05 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.0 (2019-04-26)
# Convert data to h2o frame
seeds_train_data_hf <- as.h2o(seeds_train_data)
y <- "seed_type"
x <- setdiff(colnames(seeds_train_data_hf), y)

seeds_train_data_hf[, y] <- as.factor(seeds_train_data_hf[, y])

sframe <- h2o.splitFrame(seeds_train_data_hf, seed = 42)
train <- sframe[[1]]
valid <- sframe[[2]]

# Train random forest model
rf_model <- h2o.randomForest(x = x,
                             y = y,
                             training_frame = train,
                             validation_frame = valid)
# Define hyperparameters (start with a single learning-rate parameter)
dl_params <- list(rate = c(0.001, 0.005, 0.01))

# Define search criteria
search_criteria <- list(strategy = "RandomDiscrete", 
                        stopping_rounds = 10, # stop the random search early once the best models stop improving
                        seed = 42)

dl_params <- list(hidden = list(c(50, 50), c(100, 100)),
                  epochs = c(5, 10, 15),
                  rate = c(0.001, 0.005, 0.01))

# Train with random search
dl_grid <- h2o.grid("deeplearning", 
                    grid_id = "dl_grid",
                    x = x, 
                    y = y,
                    training_frame = train,
                    validation_frame = valid,
                    seed = 42,
                    hyper_params = dl_params,
                    search_criteria = search_criteria)
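
Once the grid finishes, h2o.getGrid() retrieves the trained models sorted by a chosen metric:

# Retrieve the grid results, sorted by validation logloss.
dl_gridperf <- h2o.getGrid(grid_id = "dl_grid", 
                           sort_by = "logloss", 
                           decreasing = FALSE)
dl_gridperf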
# Define search criteria with early stopping on misclassification
stopping_params <- list(strategy = "RandomDiscrete", 
                        stopping_metric = "misclassification",
                        seed = 42)

# Run automatic machine learning
automl_model <- h2o.automl(x = x, 
                    y = y,
                    training_frame = train,
                    max_runtime_secs = 10,
                    seed = 42)
## 14:13:13.888: AutoML: XGBoost is not available; skipping it.
## 14:13:19.918: Skipping training of model GBM_5_AutoML_20200404_141313 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20200404_141313.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 82.0.
# Extract the leaderboard
lb <- automl_model@leaderboard
head(lb)
##                                              model_id mean_per_class_error
## 1                        GLM_1_AutoML_20200404_141313           0.00000000
## 2 StackedEnsemble_BestOfFamily_AutoML_20200404_141313           0.02564103
## 3                        DRF_1_AutoML_20200404_141313           0.03846154
## 4                        GBM_1_AutoML_20200404_141313           0.03846154
## 5                        GBM_3_AutoML_20200404_141313           0.03846154
## 6    StackedEnsemble_AllModels_AutoML_20200404_141313           0.03846154
##      logloss      rmse        mse
## 1 0.06309225 0.1059173 0.01121848
## 2 0.20981895 0.2061637 0.04250348
## 3 0.08865308 0.1595088 0.02544305
## 4 0.09358504 0.1656964 0.02745530
## 5 0.10844147 0.1783977 0.03182573
## 6 0.20556243 0.2078477 0.04320066
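
The GLM sits at the top of the leaderboard. The fitted leader can be pulled out for scoring new data, and the cluster shut down once everything is finished:

# Extract the best model from the AutoML run and score the validation frame.
aml_leader <- automl_model@leader
aml_pred <- h2o.predict(aml_leader, newdata = valid)

# Shut down the h2o cluster when done.
# h2o.shutdown(prompt = FALSE)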