Loading the libraries used across the various regression-tree-based methods.
library(tidyverse) # because you want it pretty
## ── Attaching packages ─────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rsample) # data splitting
library(rpart) # performing regression trees
library(rpart.plot) # plotting regression trees
library(ipred) # fitting a bagged tree
library(randomForest) # basic implementation
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(ranger) # a faster implementation of randomForest
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
library(caret) # an aggregator package for performing many machine learning models
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(h2o) # an extremely fast java-based platform
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(e1071) # caret uses it for cross-validation
library(gbm) # fitting a boosted tree
## Loaded gbm 2.1.5
library(TDboost) # fitting a Tweedie boosted tree
## Loaded TDboost 1.2
##
## Attaching package: 'TDboost'
## The following object is masked from 'package:gbm':
##
## relative.influence
library(vip) # plotting variable importance
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(readxl)
Creating training and test data.
# preparing the data set
Telematics <- read_excel("FullDrivers Performance Report for R.xlsx")
teletrck <- Telematics %>%
select(Gender, Marital_Status, Age, Status, Phone, Pol_Pfx, Distance, Score) %>%
filter(!is.na(Score)) %>%
filter(Distance > 100) %>%
filter(Distance < 30000) %>%
mutate(Pol_Pfx = fct_recode(Pol_Pfx,
'Annual' = 'EAA',
'Bravo' = 'EAB',
'Enhanced' = 'EAL',
'Select' = 'EAS'
)) %>%
mutate(LSP = 100 - Score) %>% # Lost Score Points (LSP) model
select(-Score)
teletrck$Gender <- as.factor(teletrck$Gender)
teletrck$Marital_Status <- as.factor(teletrck$Marital_Status)
teletrck$Status <- as.factor(teletrck$Status)
teletrck$Phone <- as.factor(teletrck$Phone)
teletrck$Pol_Pfx <- as.factor(teletrck$Pol_Pfx)
# Create training (70%) and test (30%) sets
set.seed(123)
ttrk_split <- initial_split(teletrck, prop = .7)
ttrk_train <- training(ttrk_split)
ttrk_test <- testing(ttrk_split)
Fitting a basic tree
ttrk_trees <- rpart(LSP ~ ., ttrk_train, method = 'anova')
ttrk_trees
## n= 1312
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1312 13953.750 8.063262
## 2) Phone=android 701 6490.833 7.196862
## 4) Age>=36.5 443 3658.248 6.693002 *
## 5) Age< 36.5 258 2527.008 8.062016 *
## 3) Phone=iPhone 611 6332.995 9.057283
## 6) Age>=38.5 256 2551.527 8.457031 *
## 7) Age< 38.5 355 3622.715 9.490141 *
rpart.plot(ttrk_trees)
plotcp(ttrk_trees)
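To read the candidate cp values behind plotcp() directly, the complexity-parameter table can be printed as well (a minimal sketch using rpart's printcp(); output not reproduced here).
# cp table that plotcp() visualizes: cp, number of splits, relative error,
# cross-validated error and its standard deviation
printcp(ttrk_trees)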
Tuning the tree
# Hyperparameters Grid 1
hyper_grid <- expand.grid(
minsplit = seq(2, 10, 1),
maxdepth = seq(2, 5, 1)
)
models <- list()
for (i in 1:nrow(hyper_grid)) {
# get minsplit, maxdepth values at row i
minsplit <- hyper_grid$minsplit[i]
maxdepth <- hyper_grid$maxdepth[i]
# train a model and store in the list
models[[i]] <- rpart(
formula = LSP ~ .,
data = ttrk_train,
method = "anova",
control = list(minsplit = minsplit, maxdepth = maxdepth)
)
}
# function to get optimal cp
get_cp <- function(x) {
min <- which.min(x$cptable[, "xerror"])
cp <- x$cptable[min, "CP"]
}
# function to get minimum error
get_min_error <- function(x) {
min <- which.min(x$cptable[, "xerror"])
xerror <- x$cptable[min, "xerror"]
}
hgo <- hyper_grid %>%
mutate(
cp = map_dbl(models, get_cp),
error = map_dbl(models, get_min_error)
) %>%
arrange(error) %>%
head()
hgo
## # A tibble: 6 x 4
## minsplit maxdepth cp error
## <dbl> <dbl> <dbl> <dbl>
## 1 8 2 0.01 0.902
## 2 10 3 0.01 0.904
## 3 4 5 0.01 0.904
## 4 5 3 0.01 0.905
## 5 3 2 0.01 0.905
## 6 3 3 0.01 0.905
optimal_tree <- rpart(
formula = LSP ~ .,
data = ttrk_train,
method = "anova",
control = list(minsplit = 8, maxdepth = 2, cp = 0.01)
)
optimal_tree
## n= 1312
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1312 13953.750 8.063262
## 2) Phone=android 701 6490.833 7.196862
## 4) Age>=36.5 443 3658.248 6.693002 *
## 5) Age< 36.5 258 2527.008 8.062016 *
## 3) Phone=iPhone 611 6332.995 9.057283
## 6) Age>=38.5 256 2551.527 8.457031 *
## 7) Age< 38.5 355 3622.715 9.490141 *
rpart.plot(optimal_tree)
plotcp(optimal_tree)
pred <- predict(optimal_tree, newdata = ttrk_test)
RMSE(pred, ttrk_test$LSP)
## [1] 3.11548
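For context, a naive baseline that always predicts the training-set mean of LSP gives a reference RMSE to compare the value above against (a small sketch, not part of the original run; baseline_pred is just an illustrative name).
# RMSE of a mean-only baseline on the test set
baseline_pred <- rep(mean(ttrk_train$LSP), nrow(ttrk_test))
RMSE(baseline_pred, ttrk_test$LSP)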
Bagging
# make bootstrapping reproducible
set.seed(123)
# train bagged model with ipred
bagged_m1 <- bagging(
formula = LSP ~ .,
data = ttrk_train,
coob = TRUE,
nbagg = 100 # default is 25
)
bagged_m1
##
## Bagging regression trees with 100 bootstrap replications
##
## Call: bagging.data.frame(formula = LSP ~ ., data = ttrk_train, coob = TRUE,
## nbagg = 100)
##
## Out-of-bag estimate of root mean squared error: 3.1007
# assess 10-100 bagged trees
ntree <- 10:100
# create empty vector to store OOB RMSE values
rmse <- vector(mode = "numeric", length = length(ntree))
for (i in seq_along(ntree)) {
# reproducibility
set.seed(123)
# perform bagged model
model <- bagging(
formula = LSP ~ .,
data = ttrk_train,
coob = TRUE,
nbagg = ntree[i]
)
# get OOB error
rmse[i] <- model$err
}
plot(ntree, rmse, type = 'l', lwd = 2)
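The number of bagged trees with the lowest OOB RMSE can be read off the same vectors used for the plot (a minimal sketch).
# number of bags minimizing the OOB RMSE, and the corresponding error
ntree[which.min(rmse)]
min(rmse)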
# cross-validation and variable importance with caret
# Specify 10-fold cross validation
ctrl <- trainControl(method = "cv", number = 10)
# CV bagged model
bagged_cv <- train(
LSP ~ .,
data = ttrk_train,
method = "treebag",
trControl = ctrl,
importance = TRUE
)
bagged_cv
## Bagged CART
##
## 1312 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1182, 1181, 1180, 1182, 1181, 1180, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 3.102777 0.09860987 2.450765
# plot most important variables
plot(varImp(bagged_cv))
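As with the single tree, the cross-validated bagged model can be scored on the held-out test set (a small sketch; for regression, predict() on a caret train object returns a numeric vector, and pred_bag is just an illustrative name).
# test-set RMSE for the CV bagged model
pred_bag <- predict(bagged_cv, newdata = ttrk_test)
RMSE(pred_bag, ttrk_test$LSP)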
Random Forests
# for reproducibility
set.seed(123)
# default RF model
RF1 <- randomForest(
formula = LSP ~ .,
data = ttrk_train
)
RF1
##
## Call:
## randomForest(formula = LSP ~ ., data = ttrk_train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 9.799114
## % Var explained: 7.86
plot(RF1)
# number of trees with lowest MSE
which.min(RF1$mse)
## [1] 69
# RMSE of this optimal random forest
sqrt(RF1$mse[which.min(RF1$mse)])
## [1] 3.125713
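The default forest can also be evaluated on the test set for comparison with the OOB estimate (a minimal sketch using randomForest's predict method; pred_rf1 is just an illustrative name).
# test-set RMSE for the default random forest
pred_rf1 <- predict(RF1, newdata = ttrk_test)
RMSE(pred_rf1, ttrk_test$LSP)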
# create training and validation data
set.seed(123)
valid_split <- initial_split(ttrk_train, .8)
# training data
ttrk_train_v2 <- analysis(valid_split)
# validation data
ttrk_valid <- assessment(valid_split)
x_test <- ttrk_valid[setdiff(names(ttrk_valid), "LSP")]
y_test <- ttrk_valid$LSP
rf_oob_comp <- randomForest(
formula = LSP ~ .,
data = ttrk_train_v2,
xtest = x_test,
ytest = y_test
)
# extract OOB & validation errors
oob <- sqrt(rf_oob_comp$mse)
validation <- sqrt(rf_oob_comp$test$mse)
# compare error rates
RF_plot <- tibble(
`Out of Bag Error` = oob,
`Test error` = validation,
ntrees = 1:rf_oob_comp$ntree
) %>%
gather(Metric, RMSE, -ntrees) %>%
ggplot(aes(ntrees, RMSE, color = Metric)) +
geom_line() +
ylab("RMSE (score points)") +
xlab("Number of trees")
RF_plot
# tuning the forest
features <- setdiff(names(ttrk_train), "LSP")
set.seed(123)
m2 <- tuneRF(
x = ttrk_train[features],
y = ttrk_train$LSP,
ntreeTry = 500,
mtryStart = 5,
stepFactor = 1.5,
improve = 0.01,
trace = FALSE # to not show real-time progress
) %>%
head()
## 0.006923681 0.01
## -0.01292261 0.01
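The mtry value with the lowest OOB error can be pulled out of the matrix returned by tuneRF (a small sketch, assuming the usual "mtry" and "OOBError" column names of randomForest::tuneRF).
# mtry with the lowest OOB error in the tuneRF result
m2[which.min(m2[, "OOBError"]), "mtry"]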
# hyperparameter ranger grid search
hyper_grid_rf <- expand.grid(
mtry = seq(2, 8, by = 1),
node_size = seq(3, 9, by = 2),
sample_size = c(.55, .632, .70, .80),
OOB_RMSE = 0
)
for(i in 1:nrow(hyper_grid_rf)) {
# train model
model_rf <- ranger(
formula = LSP ~ .,
data = ttrk_train,
num.trees = 500,
mtry = hyper_grid_rf$mtry[i],
min.node.size = hyper_grid_rf$node_size[i],
sample.fraction = hyper_grid_rf$sample_size[i],
seed = 123
)
# add OOB error to grid
hyper_grid_rf$OOB_RMSE[i] <- sqrt(model_rf$prediction.error)
}
hgf <- hyper_grid_rf %>%
arrange(OOB_RMSE) %>%
head()
hgf
## # A tibble: 6 x 4
## mtry node_size sample_size OOB_RMSE
## <dbl> <dbl> <dbl> <dbl>
## 1 2 9 0.55 3.11
## 2 2 7 0.55 3.11
## 3 2 7 0.7 3.12
## 4 2 9 0.7 3.12
## 5 2 9 0.632 3.12
## 6 2 9 0.8 3.12
# optimal ranger
optimal_rf <- ranger(
formula = LSP ~ .,
data = ttrk_train,
num.trees = 500,
mtry = 2,
min.node.size = 9,
sample.fraction = 0.55,
seed = 123,
importance = 'impurity'
)
optimal_rf
## Ranger result
##
## Call:
## ranger(formula = LSP ~ ., data = ttrk_train, num.trees = 500, mtry = 2, min.node.size = 9, sample.fraction = 0.55, seed = 123, importance = "impurity")
##
## Type: Regression
## Number of trees: 500
## Sample size: 1312
## Number of independent variables: 8
## Mtry: 2
## Target node size: 9
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 9.676832
## R squared (OOB): 0.09083027
imp_ranger <- as.data.frame(optimal_rf$variable.importance) %>%
rownames_to_column('Variable') %>%
as_tibble()
colnames(imp_ranger)[2] <- 'Impurity'
imp_ranger <- imp_ranger %>%
arrange(desc(Impurity))
imp_plot <- imp_ranger %>%
ggplot(aes(reorder(Variable, Impurity), Impurity)) +
geom_col() +
coord_flip() +
xlab('Variable') +
ylab('Impurity')
imp_plot
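The tuned ranger forest can be scored on the test set as well (a minimal sketch; ranger's predict() returns a list whose $predictions element holds the predicted values, and pred_ranger is just an illustrative name).
# test-set RMSE for the tuned ranger model
pred_ranger <- predict(optimal_rf, data = ttrk_test)$predictions
RMSE(pred_ranger, ttrk_test$LSP)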
Gradient Boosting Machines
ttrk_boosted <- train(LSP ~ .,
data = ttrk_train,
method = 'gbm',
preProcess = c('scale', 'center'),
trControl = trainControl(method = 'repeatedcv',
number = 5,
repeats = 3,
verboseIter = FALSE),
verbose = 0)
ttrk_boosted
## Stochastic Gradient Boosting
##
## 1312 samples
## 8 predictor
##
## Pre-processing: scaled (12), centered (12)
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 1050, 1049, 1050, 1050, 1049, 1050, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared MAE
## 1 50 3.076759 0.11411868 2.413394
## 1 100 3.082923 0.11110649 2.415808
## 1 150 3.087277 0.10924428 2.420023
## 2 50 3.095434 0.10249101 2.424033
## 2 100 3.117028 0.09431837 2.442313
## 2 150 3.127816 0.09187910 2.454448
## 3 50 3.112207 0.09492203 2.435534
## 3 100 3.130910 0.08912133 2.446676
## 3 150 3.159073 0.07942147 2.468635
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 50, interaction.depth =
## 1, shrinkage = 0.1 and n.minobsinnode = 10.
ttrk_prdcted <- predict(ttrk_boosted, ttrk_test)
ttrk_test$Score_Obs <- 100 - ttrk_test$LSP
ttrk_test$Score_Pre <- 100 - ttrk_prdcted
ttrk_test %>% ggplot(aes(Score_Obs, Score_Pre)) +
geom_point() +
xlab('Observed Score') +
ylab('Predicted Score') +
xlim(75, 100) +
ylim(75, 100) +
geom_smooth(method = 'glm')
## `geom_smooth()` using formula 'y ~ x'
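The same predictions can also be summarized as a test-set RMSE, to compare against the resampling estimate above (a small sketch).
# test-set RMSE for the caret-tuned GBM
RMSE(ttrk_prdcted, ttrk_test$LSP)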
Tweedie Boosting
TDboost1 <- TDboost(LSP ~ Phone + Age + Status + Pol_Pfx + Gender + Distance,
data = ttrk_train,
var.monotone = c(0,0,0,0,0,0),
distribution = list(name="EDM", alpha=1.5),
n.trees = 3000,
shrinkage = 0.005,
interaction.depth=3,
bag.fraction = 0.5,
train.fraction = 0.5,
n.minobsinnode = 10,
cv.folds = 5,
keep.data=TRUE,
verbose=FALSE)
# print out the optimal iteration number M
best.iter <- TDboost.perf(TDboost1,method="test")
print(best.iter)
## [1] 317
# check performance using 5-fold cross-validation
best.iter <- TDboost.perf(TDboost1,method="cv")
print(best.iter)
## [1] 410
summary(TDboost1,n.trees=1)
## # A tibble: 6 x 2
## var rel.inf
## <chr> <dbl>
## 1 Phone 61.5
## 2 Age 38.5
## 3 Status 0
## 4 Pol_Pfx 0
## 5 Gender 0
## 6 Distance 0
summary(TDboost1,n.trees=best.iter) # at the best iteration
## # A tibble: 6 x 2
## var rel.inf
## <chr> <dbl>
## 1 Age 34.2
## 2 Phone 29.7
## 3 Distance 19.1
## 4 Status 8.15
## 5 Pol_Pfx 6.47
## 6 Gender 2.35
# making predictions on the test set
f.predict <- predict.TDboost(TDboost1, ttrk_test, best.iter)
# least squares error on the test set
print(sum((ttrk_test$LSP - f.predict)^2))
# partial dependence of the first variable (Phone) at the best iteration
plot.TDboost(TDboost1,1,best.iter)
# joint plot of the first two variables (Phone and Age) at the best iteration
plot.TDboost(TDboost1,c(1,2),best.iter)
pretty.gbm.tree(TDboost1, i.tree = 337)
## # A tibble: 10 x 8
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight
## <int> <dbl> <int> <int> <int> <dbl> <dbl>
## 1 1 54.5 1 8 9 7.59 328
## 2 1 22.5 2 3 7 4.57 299
## 3 -1 -0.000346 -1 -1 -1 0 43
## 4 1 36.5 4 5 6 7.99 256
## 5 -1 0.000549 -1 -1 -1 0 122
## 6 -1 -0.0000227 -1 -1 -1 0 134
## 7 -1 0.000250 -1 -1 -1 0 256
## 8 -1 0.000164 -1 -1 -1 0 299
## 9 -1 -0.000894 -1 -1 -1 0 29
## 10 -1 0.0000707 -1 -1 -1 0 328
## # … with 1 more variable: Prediction <dbl>
# create hyperparameter grid
hyper_grid_gbm <- expand.grid(
shrinkage = c(0.01, 0.1, 0.3),
interaction.depth = c(1, 3, 5),
n.minobsinnode = c(5, 10, 15),
bag.fraction = c(0.65, 0.8, 1),
optimal_trees = 0,
min_RMSE = 0
)
# total number of combinations
nrow(hyper_grid_gbm)
## [1] 81
# randomize data
random_index <- sample(1:nrow(ttrk_train), nrow(ttrk_train))
random_ttrk_train <- ttrk_train[random_index, ]
# grid search
for(i in 1:nrow(hyper_grid_gbm)) {
# reproducibility
set.seed(123)
# train model
gbm.tune <- gbm(
formula = LSP ~ .,
distribution = "gaussian",
data = random_ttrk_train,
n.trees = 5000,
interaction.depth = hyper_grid_gbm$interaction.depth[i],
shrinkage = hyper_grid_gbm$shrinkage[i],
n.minobsinnode = hyper_grid_gbm$n.minobsinnode[i],
bag.fraction = hyper_grid_gbm$bag.fraction[i],
train.fraction = .75,
n.cores = NULL, # will use all cores by default
verbose = FALSE
)
# add minimum validation error and optimal tree count to grid
hyper_grid_gbm$optimal_trees[i] <- which.min(gbm.tune$valid.error)
hyper_grid_gbm$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error))
}
hyper_grid_gbm %>%
arrange(min_RMSE) %>%
tibble()
## # A tibble: 81 x 6
## shrinkage interaction.dep… n.minobsinnode bag.fraction optimal_trees min_RMSE
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.1 1 5 1 31 2.92
## 2 0.1 1 10 1 31 2.92
## 3 0.1 1 15 1 31 2.92
## 4 0.01 1 15 1 386 2.92
## 5 0.01 1 5 1 377 2.92
## 6 0.01 1 10 1 377 2.92
## 7 0.01 1 15 0.8 310 2.92
## 8 0.01 1 10 0.8 310 2.92
## 9 0.3 1 5 1 12 2.92
## 10 0.3 1 10 1 12 2.92
## # … with 71 more rows
# for reproducibility
set.seed(123)
# train GBM model
gbm.fit.final <- gbm(
formula = LSP ~ .,
distribution = "gaussian",
data = ttrk_train,
n.trees = 157,
interaction.depth = 5,
shrinkage = 0.01,
n.minobsinnode = 10,
bag.fraction = 1,
train.fraction = 1,
n.cores = NULL, # will use all cores by default
verbose = FALSE
)
par(mar = c(5, 8, 1, 1))
summary(
gbm.fit.final,
method = gbm::relative.influence, # use gbm's version explicitly (TDboost masks it); permutation.test.gbm also works
las = 2
)
## # A tibble: 8 x 2
## var rel.inf
## <chr> <dbl>
## 1 Phone 55.8
## 2 Age 29.9
## 3 Distance 7.08
## 4 Status 3.51
## 5 Pol_Pfx 3.07
## 6 Gender 0.567
## 7 Marital_Status 0
## 8 pol_Pfx 0
vip(gbm.fit.final)
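For comparability with the other models, the final GBM can also be scored on the test set (a minimal sketch; gbm's predict() needs the number of trees, and pred_gbm is just an illustrative name).
# test-set RMSE for the final gbm model
pred_gbm <- predict(gbm.fit.final, newdata = ttrk_test, n.trees = gbm.fit.final$n.trees)
RMSE(pred_gbm, ttrk_test$LSP)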
H2O Boosting
h2o.no_progress()
h2o.init(max_mem_size = "1g")
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /tmp/RtmpUvzMZD/file19e65872a56/h2o_rstudio_user_started_from_r.out
## /tmp/RtmpUvzMZD/file19e25431b3/h2o_rstudio_user_started_from_r.err
##
##
## Starting H2O JVM and connecting: .. Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 4 seconds 28 milliseconds
## H2O cluster timezone: Etc/UTC
## H2O data parsing timezone: UTC
## H2O cluster version: 3.30.0.1
## H2O cluster version age: 6 months and 19 days !!!
## H2O cluster name: H2O_started_from_R_rstudio-user_jzy813
## H2O cluster total nodes: 1
## H2O cluster total memory: 0.97 GB
## H2O cluster total cores: 1
## H2O cluster allowed cores: 1
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 4.0.2 (2020-06-22)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is too old (6 months and 19 days)!
## Please download and install the latest version from http://h2o.ai/download/
y <- 'LSP'
x <- setdiff(names(ttrk_train), y)
# turn training set into h2o object
train.h2o <- as.h2o(ttrk_train)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
# training basic GBM model with defaults
h2o.fit1 <- h2o.gbm(
x = x,
y = y,
training_frame = train.h2o,
nfolds = 5
)
h2o.fit1
## Model Details:
## ==============
##
## H2ORegressionModel: gbm
## Model ID: GBM_model_R_1603409901012_1
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 50 50 16579 5
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 5 5.00000 9 32 21.66000
##
##
## H2ORegressionMetrics: gbm
## ** Reported on training data. **
##
## MSE: 6.767943
## RMSE: 2.601527
## MAE: 2.037768
## RMSLE: 0.3047606
## Mean Residual Deviance : 6.767943
##
##
##
## H2ORegressionMetrics: gbm
## ** Reported on cross-validation data. **
## ** 5-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 10.16471
## RMSE: 3.188215
## MAE: 2.498621
## RMSLE: 0.3676748
## Mean Residual Deviance : 10.16471
##
##
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid
## mae 2.4988215 0.02977455 2.5020099 2.4707112
## mean_residual_deviance 10.168217 0.54227483 10.01449 10.224343
## mse 10.168217 0.54227483 10.01449 10.224343
## r2 0.0365747 0.046777964 0.012430682 -0.026870534
## residual_deviance 10.168217 0.54227483 10.01449 10.224343
## rmse 3.187869 0.08446447 3.164568 3.1975527
## rmsle 0.36724392 0.011001318 0.34939647 0.3716534
## cv_3_valid cv_4_valid cv_5_valid
## mae 2.5476723 2.4808135 2.492901
## mean_residual_deviance 11.021231 9.529255 10.051765
## mse 11.021231 9.529255 10.051765
## r2 0.07128285 0.03525179 0.090778716
## residual_deviance 11.021231 9.529255 10.051765
## rmse 3.319824 3.086949 3.1704519
## rmsle 0.37691563 0.37389708 0.364357
h2o.fit2 <- h2o.gbm(
x = x,
y = y,
training_frame = train.h2o,
nfolds = 5,
ntrees = 5000,
stopping_rounds = 10,
stopping_tolerance = 0,
seed = 123
)
# model stopped after 25 trees
h2o.fit2@parameters$ntrees
## [1] 25
# cross validated RMSE
h2o.rmse(h2o.fit2, xval = TRUE)
## [1] 3.18596
h2o.varimp_plot(h2o.fit2)
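Finally, the H2O model can be evaluated on the test set, and the local cluster shut down once you are done (a minimal sketch, assuming the H2O cluster started above is still running; test.h2o is just an illustrative name).
# score the tuned H2O GBM on the test set
test.h2o <- as.h2o(ttrk_test)
h2o.performance(h2o.fit2, newdata = test.h2o)
# shut down the local H2O cluster when finished
h2o.shutdown(prompt = FALSE)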