I run through some examples of hyperparameter tuning and show how h2o can be used to make the process more efficient. I start by loading the machine learning packages and sourcing the data.
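The package-loading chunk is not echoed in this post; a minimal set of libraries that covers the code below looks something like this (an approximation, not the exact chunk):
# Approximate package list for the examples below.
library(ggplot2)   # plotting
library(caret)     # manual and grid-based tuning
library(tictoc)    # tic()/toc() timers
library(mlr)       # task/learner-based tuning
library(h2o)       # distributed grids and AutoML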
I then walk through the traditional, manual process of hyperparameter tuning.
# Fit a linear model on the breast_cancer_data.
linear_model <- lm(concavity_mean ~ symmetry_mean, breast_cancer_data)
# Look at the summary of the linear_model.
summary(linear_model)
##
## Call:
## lm(formula = concavity_mean ~ symmetry_mean, data = breast_cancer_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.201877 -0.039201 -0.008432 0.030655 0.226150
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.15311 0.04086 -3.747 0.000303 ***
## symmetry_mean 1.33366 0.21257 6.274 9.57e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06412 on 98 degrees of freedom
## Multiple R-squared: 0.2866, Adjusted R-squared: 0.2793
## F-statistic: 39.36 on 1 and 98 DF, p-value: 9.575e-09
# Extract the coefficients.
linear_model$coefficients
## (Intercept) symmetry_mean
## -0.1531055 1.3336568
# Plot linear relationship.
ggplot(data = breast_cancer_data,
aes(x = symmetry_mean, y = concavity_mean)) +
geom_point(color = "grey") +
geom_abline(slope = linear_model$coefficients[2],
intercept = linear_model$coefficients[1])
# Create partition index and split into training and test sets.
index <- createDataPartition(breast_cancer_data$diagnosis, p = 0.7, list = FALSE)
bc_train_data <- breast_cancer_data[index, ]
bc_test_data <- breast_cancer_data[-index, ]
# Set seed.
set.seed(42)
# Start timer.
tic()
# Train model.
gbm_model <- caret::train(diagnosis ~ .,
data = bc_train_data,
method = "gbm",
trControl = trainControl(method = "repeatedcv", number = 5, repeats = 3),
verbose = FALSE,
tuneLength = 4)
# Stop timer.
toc()
## 2.52 sec elapsed
# Define hyperparameter grid.
hyperparams <- expand.grid(n.trees = 200,
interaction.depth = 1,
shrinkage = 0.1,
n.minobsinnode = 10)
# Apply hyperparameter grid to train().
set.seed(42)
gbm_model <- caret::train(diagnosis ~ .,
data = bc_train_data,
method = "gbm",
trControl = trainControl(method = "repeatedcv", number = 5, repeats = 3),
verbose = FALSE,
tuneGrid = hyperparams)
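With a single-row grid the winning combination is fixed in advance, but the fitted caret object still records it, along with the resampled performance, which is worth checking:
# Selected hyperparameter combination.
gbm_model$bestTune
# Cross-validated performance for each candidate (here just the one row).
gbm_model$results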
I make the model slightly more robust by testing a range of hyperparameter values in a single pass.
# Define Cartesian grid
man_grid <- expand.grid(degree = c(1,2,3),
scale = c(0.1,0.01,0.001),
C = 0.5)
# Plot the grid of candidate hyperparameter combinations
plot(man_grid)
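The grid itself is never passed to a model in this post; as a minimal sketch (reusing bc_train_data, with svm_model_grid as an illustrative name), it could drive caret's polynomial SVM, whose tuning parameters are exactly degree, scale and C:
# A sketch only: polynomial SVM tuned over the Cartesian grid above.
set.seed(42)
svm_model_grid <- caret::train(diagnosis ~ .,
                               data = bc_train_data,
                               method = "svmPoly",
                               preProcess = c("center", "scale"),
                               trControl = trainControl(method = "repeatedcv", number = 5, repeats = 3),
                               tuneGrid = man_grid)
# Plotting the fitted object shows accuracy across the grid.
plot(svm_model_grid)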
# Define the grid with hyperparameter ranges
big_grid <- expand.grid(size = seq(from = 1, to = 5, by = 1),
decay = c(0, 1))
big_grid
## size decay
## 1 1 0
## 2 2 0
## 3 3 0
## 4 4 0
## 5 5 0
## 6 1 1
## 7 2 1
## 8 3 1
## 9 4 1
## 10 5 1
# Train control with random search
fitControl <- trainControl(method = "repeatedcv",
number = 3,
repeats = 5,
search = "random")
# Train control with adaptive resampling
fitControl <- trainControl(method = "adaptive_cv",
number = 3, repeats = 3)
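Neither control object is applied to a model above, so here is a minimal sketch (again reusing bc_train_data; nnet_model is just an illustrative name) of how the grid and the adaptive resampling control fit together in train():
# A sketch only: neural network tuned over big_grid with adaptive resampling.
set.seed(42)
nnet_model <- caret::train(diagnosis ~ .,
                           data = bc_train_data,
                           method = "nnet",
                           preProcess = c("center", "scale"),
                           trControl = fitControl,
                           tuneGrid = big_grid,
                           trace = FALSE)
nnet_model$bestTune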
Next I show how the mlr package can be a really useful tool for streamlining the tuning process.
# Create classification task
task <- makeClassifTask(data = knowledge_train_data,
target = "UNS")
# Define learner
lrn <- makeLearner("classif.nnet", predict.type = "prob", fix.factors.prediction = TRUE)
# Define set of parameters
param_set <- makeParamSet(
makeDiscreteParam("size", values = c(2,3,5)),
makeNumericParam("decay", lower = 0.0001, upper = 0.1)
)
# Get the parameter set for neural networks of the nnet package
getParamSet("classif.nnet")
## Type len Def Constr Req Tunable Trafo
## size integer - 3 0 to Inf - TRUE -
## maxit integer - 100 1 to Inf - TRUE -
## skip logical - FALSE - - TRUE -
## rang numeric - 0.7 -Inf to Inf - TRUE -
## decay numeric - 0 -Inf to Inf - TRUE -
## Hess logical - FALSE - - TRUE -
## trace logical - TRUE - - FALSE -
## MaxNWts integer - 1000 1 to Inf - FALSE -
## abstol numeric - 0.0001 -Inf to Inf - TRUE -
## reltol numeric - 1e-08 -Inf to Inf - TRUE -
# Define a random search tuning method.
ctrl_random <- makeTuneControlRandom(maxit = 6)
# Create holdout sampling
holdout <- makeResampleDesc("Holdout")
# Perform tuning
lrn_tune <- tuneParams(learner = lrn, task = task, resampling = holdout, control = ctrl_random, par.set = param_set)
## # weights: 30
## initial value 88.276312
## iter 10 value 32.719882
## iter 20 value 28.501970
## iter 30 value 27.896941
## iter 40 value 27.852690
## iter 50 value 27.820994
## iter 60 value 27.815146
## final value 27.815100
## converged
## # weights: 21
## initial value 102.087794
## iter 10 value 79.245636
## iter 20 value 24.493575
## iter 30 value 21.553594
## iter 40 value 18.602719
## iter 50 value 18.306930
## iter 60 value 17.813602
## iter 70 value 17.785617
## iter 80 value 17.785417
## final value 17.785416
## converged
## # weights: 48
## initial value 95.090120
## iter 10 value 39.457704
## iter 20 value 29.360027
## iter 30 value 27.380836
## iter 40 value 27.313609
## iter 50 value 27.253186
## iter 60 value 27.223225
## iter 70 value 27.210176
## iter 80 value 27.196560
## iter 90 value 27.191039
## iter 100 value 27.189062
## final value 27.189062
## stopped after 100 iterations
## # weights: 30
## initial value 91.616546
## iter 10 value 25.105422
## iter 20 value 13.126203
## iter 30 value 11.754646
## iter 40 value 11.463565
## iter 50 value 11.382937
## iter 60 value 11.328545
## iter 70 value 11.289303
## iter 80 value 11.059538
## iter 90 value 9.609672
## iter 100 value 9.184671
## final value 9.184671
## stopped after 100 iterations
## # weights: 48
## initial value 89.213986
## iter 10 value 48.399811
## iter 20 value 27.295199
## iter 30 value 26.699349
## iter 40 value 26.375264
## iter 50 value 26.307667
## iter 60 value 26.279739
## iter 70 value 26.239254
## iter 80 value 26.226135
## iter 90 value 26.195305
## iter 100 value 26.192000
## final value 26.192000
## stopped after 100 iterations
## # weights: 30
## initial value 95.371253
## iter 10 value 54.320666
## iter 20 value 40.642532
## iter 30 value 38.357368
## iter 40 value 37.316452
## iter 50 value 36.331431
## iter 60 value 36.295579
## iter 70 value 36.249260
## iter 80 value 36.243204
## iter 90 value 36.243134
## final value 36.243134
## converged
# Create holdout sampling
holdout <- makeResampleDesc("Holdout", predict = "both")
# Perform tuning
lrn_tune <- tuneParams(learner = lrn,
task = task,
resampling = holdout,
control = ctrl_random,
par.set = param_set,
measures = list(acc, setAggregation(acc, train.mean), mmce, setAggregation(mmce, train.mean)))
## # weights: 30
## initial value 90.285152
## iter 10 value 33.529980
## iter 20 value 29.081884
## iter 30 value 28.338633
## iter 40 value 28.112368
## iter 50 value 27.776494
## iter 60 value 27.584923
## iter 70 value 27.432008
## iter 80 value 27.371791
## iter 90 value 27.365573
## final value 27.365525
## converged
## # weights: 30
## initial value 91.399022
## iter 10 value 19.863632
## iter 20 value 7.807128
## iter 30 value 6.266995
## iter 40 value 6.067984
## iter 50 value 5.942195
## iter 60 value 5.846610
## iter 70 value 5.823793
## iter 80 value 5.769556
## iter 90 value 5.704664
## iter 100 value 5.645761
## final value 5.645761
## stopped after 100 iterations
## # weights: 30
## initial value 117.140093
## iter 10 value 55.573086
## iter 20 value 27.778096
## iter 30 value 27.345771
## iter 40 value 27.227039
## iter 50 value 27.113231
## iter 60 value 26.768572
## iter 70 value 26.637632
## iter 80 value 26.583758
## iter 90 value 26.560183
## iter 100 value 26.559881
## final value 26.559881
## stopped after 100 iterations
## # weights: 21
## initial value 108.373769
## iter 10 value 75.197924
## iter 20 value 26.576844
## iter 30 value 26.154237
## iter 40 value 25.919051
## iter 50 value 25.865777
## iter 60 value 25.851037
## final value 25.850974
## converged
## # weights: 48
## initial value 101.401373
## iter 10 value 55.728384
## iter 20 value 35.165787
## iter 30 value 33.852393
## iter 40 value 33.529308
## iter 50 value 33.350026
## iter 60 value 33.259396
## iter 70 value 33.204076
## iter 80 value 33.177365
## iter 90 value 33.176549
## iter 100 value 33.176209
## final value 33.176209
## stopped after 100 iterations
## # weights: 21
## initial value 102.089357
## iter 10 value 64.293318
## iter 20 value 27.359183
## iter 30 value 26.487057
## iter 40 value 26.424772
## iter 50 value 26.423979
## iter 60 value 26.423688
## final value 26.423682
## converged
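Before fixing hyperparameters for a final model, the tuning result is worth inspecting: mlr stores the best combination found and its performance in the x and y slots.
# Best hyperparameter combination from the random search.
lrn_tune$x
# Performance of that combination (test and train aggregations of acc and mmce).
lrn_tune$y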
# Set hyperparameters
lrn_best <- setHyperPars(lrn, par.vals = list(size = 1,
maxit = 150,
decay = 0))
# Train model
model_best <- train(lrn_best, task)
## # weights: 12
## initial value 136.836022
## iter 10 value 110.284864
## iter 20 value 42.408320
## iter 30 value 13.849382
## iter 40 value 13.605203
## iter 50 value 13.514116
## iter 60 value 13.445979
## iter 70 value 13.166502
## iter 80 value 13.159266
## iter 90 value 13.151792
## iter 100 value 13.131009
## iter 110 value 13.129824
## iter 120 value 13.122746
## iter 130 value 13.119993
## iter 140 value 13.119079
## iter 150 value 13.115702
## final value 13.115702
## stopped after 150 iterations
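The trained model can then be used for prediction in the usual mlr way; a minimal sketch, assuming a hold-out set knowledge_test_data with the same columns as the training data (it is not created in this post):
# A sketch only: knowledge_test_data is assumed, not created above.
pred <- predict(model_best, newdata = knowledge_test_data)
# Confusion matrix of true vs. predicted classes.
calculateConfusionMatrix(pred)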
By running hyperparameter tuning through an h2o cluster I can try several different parameter combinations across several different model types, while the Java Virtual Machine manages the memory and helps prevent out-of-memory errors on my computer.
# Initialise h2o cluster
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 28 minutes 13 seconds
## H2O cluster timezone: America/Chicago
## H2O data parsing timezone: UTC
## H2O cluster version: 3.28.0.2
## H2O cluster version age: 2 months and 14 days
## H2O cluster name: H2O_started_from_R_deangc_smo670
## H2O cluster total nodes: 1
## H2O cluster total memory: 7.05 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 3.6.0 (2019-04-26)
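h2o prints progress bars while parsing frames and training models; to keep the rendered document clean they can be switched off for the session:
# Suppress h2o progress bars in the knitted output.
h2o.no_progress()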
# Convert data to h2o frame
seeds_train_data_hf <- as.h2o(seeds_train_data)
y <- "seed_type"
x <- setdiff(colnames(seeds_train_data_hf), y)
seeds_train_data_hf[, y] <- as.factor(seeds_train_data_hf[, y])
sframe <- h2o.splitFrame(seeds_train_data_hf, seed = 42)
train <- sframe[[1]]
valid <- sframe[[2]]
# Train random forest model
rf_model <- h2o.randomForest(x = x,
                             y = y,
                             training_frame = train,
                             validation_frame = valid)
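The random forest fit itself is not examined above; its performance on the validation split can be pulled straight from the model object:
# Performance metrics on the validation frame.
h2o.performance(rf_model, valid = TRUE)
# Confusion matrix on the validation split.
h2o.confusionMatrix(rf_model, valid = TRUE)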
# Define hyperparameters (learning rates to search over)
dl_params <- list(rate = c(0.001, 0.005, 0.01))
# Define search criteria
search_criteria <- list(strategy = "RandomDiscrete",
                        stopping_rounds = 10, # this is way too short & only used to keep runtime short!
                        seed = 42)
# Extend the grid with hidden layer sizes and epochs
dl_params <- list(hidden = list(c(50, 50), c(100, 100)),
                  epochs = c(5, 10, 15),
                  rate = c(0.001, 0.005, 0.01))
# Train with random search
dl_grid <- h2o.grid("deeplearning",
grid_id = "dl_grid",
x = x,
y = y,
training_frame = train,
validation_frame = valid,
seed = 42,
hyper_params = dl_params,
search_criteria = search_criteria)
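Once the grid has finished, the candidate models can be ranked on a validation metric and the best one retrieved (dl_gridperf and best_dl are illustrative names):
# Sort the grid results by validation log loss (lowest first).
dl_gridperf <- h2o.getGrid(grid_id = "dl_grid",
                           sort_by = "logloss",
                           decreasing = FALSE)
# Pull out the best-performing model.
best_dl <- h2o.getModel(dl_gridperf@model_ids[[1]])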
# Define early stopping
stopping_params <- list(strategy = "RandomDiscrete",
stopping_metric = "misclassification",
seed = 42)
# Run automatic machine learning
automl_model <- h2o.automl(x = x,
y = y,
training_frame = train,
max_runtime_secs = 10,
seed = 42)
## 14:13:13.888: AutoML: XGBoost is not available; skipping it.
## 14:13:19.918: Skipping training of model GBM_5_AutoML_20200404_141313 due to exception: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20200404_141313. Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 82.0.
# Extract the leaderboard
lb <- automl_model@leaderboard
head(lb)
## model_id mean_per_class_error
## 1 GLM_1_AutoML_20200404_141313 0.00000000
## 2 StackedEnsemble_BestOfFamily_AutoML_20200404_141313 0.02564103
## 3 DRF_1_AutoML_20200404_141313 0.03846154
## 4 GBM_1_AutoML_20200404_141313 0.03846154
## 5 GBM_3_AutoML_20200404_141313 0.03846154
## 6 StackedEnsemble_AllModels_AutoML_20200404_141313 0.03846154
## logloss rmse mse
## 1 0.06309225 0.1059173 0.01121848
## 2 0.20981895 0.2061637 0.04250348
## 3 0.08865308 0.1595088 0.02544305
## 4 0.09358504 0.1656964 0.02745530
## 5 0.10844147 0.1783977 0.03182573
## 6 0.20556243 0.2078477 0.04320066
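The leading model can be extracted from the AutoML object and used to score new data; here the validation frame stands in for unseen observations:
# Best model found by AutoML.
leader <- automl_model@leader
# Predictions on the validation frame.
pred <- h2o.predict(leader, newdata = valid)
head(pred)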