Loading libraries for the various regression-tree-based methods.

library(readxl)           # reading Excel, duh!
library(tidyverse)        # because you want it pretty
## -- Attaching packages ----------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts -------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rsample)          # data splitting 
library(rpart)            # performing regression trees
library(rpart.plot)       # plotting regression trees
library(data.table)       # managing data tables more efficiently
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(ipred)            # fitting a bagged tree
library(randomForest)     # the classic random forest implementation
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ranger)           # a faster implementation of randomForest
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
library(caret)            # an aggregator package for training many machine learning models
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)            # caret uses it for cross-validation
library(gbm)              # fitting a boosted tree
## Loaded gbm 2.1.5
library(TDboost)          # fitting a Tweedie boosted tree
## Loaded TDboost 1.2
## 
## Attaching package: 'TDboost'
## The following object is masked from 'package:gbm':
## 
##     relative.influence
library(xgboost)          # fitting an XGBoost tree
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(vip)              # plotting variable importance
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
library(vtreat)           # encoding data frames for xgboost

Creating training and test data.

# preparing the data set

Telematics <- read_excel("~/Telematics/Full Drivers Performance Report for R.xlsx")
teletrck <- Telematics %>%
  select(Gender, Marital_Status, Age, Status, Phone, Pol_Pfx, Distance, Score) %>%
  filter(!is.na(Score)) %>%
  filter(Distance > 100) %>%
  filter(Distance < 30000) %>%
  mutate(Pol_Pfx = fct_recode(Pol_Pfx,
                              'Annual'   = 'EAA',
                              'Bravo'    = 'EAB',
                              'Enhanced' = 'EAL',
                              'Select'   = 'EAS')) %>%
  mutate(LSP = 100 - Score) %>%                             # Lost Score Points (LSP) model
  select(-Score)

teletrck$Gender <- as.factor(teletrck$Gender)
teletrck$Marital_Status <- as.factor(teletrck$Marital_Status)
teletrck$Status <- as.factor(teletrck$Status)
teletrck$Phone <- as.factor(teletrck$Phone)
teletrck$Pol_Pfx <- as.factor(teletrck$Pol_Pfx)

# Create training (70%) and test (30%) sets

set.seed(123)
ttrk_split <- initial_split(teletrck, prop = .7)
ttrk_train <- training(ttrk_split)
ttrk_test  <- testing(ttrk_split)

Fitting a basic tree

ttrk_trees <- rpart(LSP ~ ., ttrk_train, method = 'anova')
ttrk_trees
## n= 1312 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1312 13953.750 8.063262  
##   2) Phone=android 701  6490.833 7.196862  
##     4) Age>=36.5 443  3658.248 6.693002 *
##     5) Age< 36.5 258  2527.008 8.062016 *
##   3) Phone=iPhone 611  6332.995 9.057283  
##     6) Age>=38.5 256  2551.527 8.457031 *
##     7) Age< 38.5 355  3622.715 9.490141 *
rpart.plot(ttrk_trees)

plotcp(ttrk_trees)
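
The numbers behind that plot can be printed directly; a quick sketch added here (not part of the original run), assuming ttrk_trees from the chunk above.

# complexity parameter table: CP, number of splits, and cross-validated error
ttrk_trees$cptable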

Tuning the tree

# hyperparameter grid for minsplit and maxdepth
hyper_grid <- expand.grid(
  minsplit = seq(2, 10, 1),
  maxdepth = seq(2, 5, 1)
)

models <- list()

for (i in 1:nrow(hyper_grid)) {
  
  # get minsplit, maxdepth values at row i
  minsplit <- hyper_grid$minsplit[i]
  maxdepth <- hyper_grid$maxdepth[i]
  
  # train a model and store in the list
  models[[i]] <- rpart(
    formula = LSP ~ .,
    data    = ttrk_train,
    method  = "anova",
    control = list(minsplit = minsplit, maxdepth = maxdepth)
  )
}

# function to get the cp associated with the minimum cross-validation error
get_cp <- function(x) {
  min <- which.min(x$cptable[, "xerror"])
  x$cptable[min, "CP"]
}

# function to get the minimum cross-validation error
get_min_error <- function(x) {
  min <- which.min(x$cptable[, "xerror"])
  x$cptable[min, "xerror"]
}

hgo <- hyper_grid %>%
  mutate(
    cp    = map_dbl(models, get_cp),
    error = map_dbl(models, get_min_error)
    ) %>%
  arrange(error) %>%
  head()
hgo
##   minsplit maxdepth   cp     error
## 1        8        2 0.01 0.9024600
## 2       10        3 0.01 0.9036202
## 3        4        5 0.01 0.9037967
## 4        5        3 0.01 0.9051272
## 5        3        2 0.01 0.9052680
## 6        3        3 0.01 0.9054220
optimal_tree <- rpart(
  formula = LSP ~ .,
  data    = ttrk_train,
  method  = "anova",
  control = list(minsplit = 8, maxdepth = 2, cp = 0.01)
)

optimal_tree
## n= 1312 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1312 13953.750 8.063262  
##   2) Phone=android 701  6490.833 7.196862  
##     4) Age>=36.5 443  3658.248 6.693002 *
##     5) Age< 36.5 258  2527.008 8.062016 *
##   3) Phone=iPhone 611  6332.995 9.057283  
##     6) Age>=38.5 256  2551.527 8.457031 *
##     7) Age< 38.5 355  3622.715 9.490141 *
rpart.plot(optimal_tree)

plotcp(optimal_tree)

pred <- predict(optimal_tree, newdata = ttrk_test)
RMSE(pred, ttrk_test$LSP)
## [1] 3.11548

Bagging

# make bootstrapping reproducible
set.seed(123)

# train bagged model with ipred
bagged_m1 <- bagging(
  formula = LSP ~ .,
  data    = ttrk_train,
  coob    = TRUE,
  nbagg      = 100  # default is 25
)

bagged_m1
## 
## Bagging regression trees with 100 bootstrap replications 
## 
## Call: bagging.data.frame(formula = LSP ~ ., data = ttrk_train, coob = TRUE, 
##     nbagg = 100)
## 
## Out-of-bag estimate of root mean squared error:  3.1007
# assess 10-100 bagged trees
ntree <- 10:100

# create empty vector to store OOB RMSE values
rmse <- vector(mode = "numeric", length = length(ntree))

for (i in seq_along(ntree)) {
  # reproducibility
  set.seed(123)
  
  # fit a bagged model with ntree[i] bootstrap replicates
  model <- bagging(
    formula = LSP ~ .,
    data    = ttrk_train,
    coob    = TRUE,
    nbagg   = ntree[i]
  )
  
  # store the OOB RMSE
  rmse[i] <- model$err
}

plot(ntree, rmse, type = 'l', lwd = 2)
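
A small sketch (using the ntree and rmse vectors just built) pulls the winner out of that curve.

# number of bagged trees with the lowest OOB RMSE, and that RMSE
ntree[which.min(rmse)]
min(rmse)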

# cross-validation and variable importance with caret
# Specify 10-fold cross validation

ctrl <- trainControl(method = "cv",  number = 10) 

# CV bagged model
bagged_cv <- train(
  LSP ~ .,
  data      = ttrk_train,
  method    = "treebag",
  trControl = ctrl,
  importance = TRUE
  )
bagged_cv
## Bagged CART 
## 
## 1312 samples
##    8 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1182, 1181, 1180, 1182, 1181, 1180, ... 
## Resampling results:
## 
##   RMSE      Rsquared    MAE     
##   3.102777  0.09860987  2.450765
# plot most important variables
plot(varImp(bagged_cv))  
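
For comparison with the single tree, the cross-validated bagged model can also be scored on the held-out data; a sketch using bagged_cv and ttrk_test from above, not part of the original output.

# test-set RMSE of the caret bagged model
pred_bag <- predict(bagged_cv, newdata = ttrk_test)
RMSE(pred_bag, ttrk_test$LSP)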

Random Forests

# for reproducibility
set.seed(123)

# default RF model
RF1 <- randomForest(
  formula = LSP ~ .,
  data    = ttrk_train
)
RF1
## 
## Call:
##  randomForest(formula = LSP ~ ., data = ttrk_train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##           Mean of squared residuals: 9.799114
##                     % Var explained: 7.86
plot(RF1)

# number of trees with lowest MSE
which.min(RF1$mse)
## [1] 69
# RMSE of this optimal random forest
sqrt(RF1$mse[which.min(RF1$mse)])
## [1] 3.125713
# create training and validation data 
set.seed(123)
valid_split <- initial_split(ttrk_train, .8)

# training data
ttrk_train_v2 <- analysis(valid_split)

# validation data
ttrk_valid <- assessment(valid_split)
x_test <- ttrk_valid[setdiff(names(ttrk_valid), "LSP")]
y_test <- ttrk_valid$LSP

rf_oob_comp <- randomForest(
  formula = LSP ~ .,
  data    = ttrk_train_v2,
  xtest   = x_test,
  ytest   = y_test
)

# extract OOB & validation errors
oob <- sqrt(rf_oob_comp$mse)
validation <- sqrt(rf_oob_comp$test$mse)

# compare error rates
RF_plot <- tibble(
  `Out of Bag Error` = oob,
  `Test error` = validation,
  ntrees = 1:rf_oob_comp$ntree
) %>%
  gather(Metric, RMSE, -ntrees) %>%
  ggplot(aes(ntrees, RMSE, color = Metric)) +
  geom_line() +
  ylab("RMSE (score points)") +
  xlab("Number of trees")
RF_plot
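
As a small numeric companion to the plot, a sketch using the validation vector created above.

# number of trees at the lowest validation RMSE, and that RMSE
which.min(validation)
min(validation)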

# tuning the forest
features <- setdiff(names(ttrk_train), "LSP")

set.seed(123)

m2 <- tuneRF(
  x          = ttrk_train[features],
  y          = ttrk_train$LSP,
  ntreeTry   = 500,
  mtryStart  = 5,
  stepFactor = 1.5,
  improve    = 0.01,
  trace      = FALSE      # to not show real-time progress 
) %>%
  head()
## 0.006923681 0.01 
## -0.01292261 0.01

# ranger hyperparameter grid search
hyper_grid_rf <- expand.grid(
  mtry        = seq(2, 8, by = 1),
  node_size   = seq(3, 9, by = 2),
  sample_size = c(.55, .632, .70, .80),
  OOB_RMSE    = 0
)

for(i in 1:nrow(hyper_grid_rf)) {
  
  # train model
  model_rf <- ranger(
    formula         = LSP ~ ., 
    data            = ttrk_train, 
    num.trees       = 500,
    mtry            = hyper_grid_rf$mtry[i],
    min.node.size   = hyper_grid_rf$node_size[i],
    sample.fraction = hyper_grid_rf$sample_size[i],
    seed            = 123
  )
  
  # add OOB error to grid
  hyper_grid_rf$OOB_RMSE[i] <- sqrt(model_rf$prediction.error)
}

hgf <- hyper_grid_rf %>% 
  arrange(OOB_RMSE) %>%
  head()
hgf
##   mtry node_size sample_size OOB_RMSE
## 1    2         9       0.550 3.110761
## 2    2         7       0.550 3.114517
## 3    2         7       0.700 3.117972
## 4    2         9       0.700 3.119676
## 5    2         9       0.632 3.120601
## 6    2         9       0.800 3.122090
# optimal ranger

optimal_rf <- ranger(
    formula         = LSP ~ ., 
    data            = ttrk_train, 
    num.trees       = 500,
    mtry            = 2,
    min.node.size   = 9,
    sample.fraction = 0.55,
    seed            = 123,
    importance      = 'impurity'
)
optimal_rf
## Ranger result
## 
## Call:
##  ranger(formula = LSP ~ ., data = ttrk_train, num.trees = 500,      mtry = 2, min.node.size = 9, sample.fraction = 0.55, seed = 123,      importance = "impurity") 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      1312 
## Number of independent variables:  8 
## Mtry:                             2 
## Target node size:                 9 
## Variable importance mode:         impurity 
## Splitrule:                        variance 
## OOB prediction error (MSE):       9.676832 
## R squared (OOB):                  0.09083027
imp_ranger <- as.data.frame(optimal_rf$variable.importance) %>%
  rownames_to_column('Variable') %>%
  as_tibble()
colnames(imp_ranger)[2] <- 'Impurity'
imp_ranger <- imp_ranger %>%
  arrange(desc(Impurity))
imp_plot <- imp_ranger %>%
  ggplot(aes(reorder(Variable, Impurity), Impurity)) +
    geom_col() +
    coord_flip() +
    xlab('Variable') +
    ylab('Impurity')
imp_plot
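
To put the tuned forest on the same footing as the other models, a sketch of a test-set score (assuming optimal_rf and ttrk_test as created above).

# test-set RMSE of the tuned ranger forest
pred_ranger <- predict(optimal_rf, data = ttrk_test)$predictions
RMSE(pred_ranger, ttrk_test$LSP)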

Gradient Boosting Machines

ttrk_boosted <- train(LSP ~ .,
                data = ttrk_train,
                method = 'gbm',
                preProcess = c('scale', 'center'),
                trControl = trainControl(method = 'repeatedcv', 
                                         number = 5, 
                                         repeats = 3, 
                                         verboseIter = FALSE),
                verbose = 0)
ttrk_boosted
## Stochastic Gradient Boosting 
## 
## 1312 samples
##    8 predictor
## 
## Pre-processing: scaled (12), centered (12) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 1050, 1049, 1050, 1050, 1049, 1050, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE      Rsquared    MAE     
##   1                   50      3.076759  0.11411868  2.413394
##   1                  100      3.082923  0.11110649  2.415808
##   1                  150      3.087277  0.10924428  2.420023
##   2                   50      3.095434  0.10249101  2.424033
##   2                  100      3.117028  0.09431837  2.442313
##   2                  150      3.127816  0.09187910  2.454448
##   3                   50      3.112207  0.09492203  2.435534
##   3                  100      3.130910  0.08912133  2.446676
##   3                  150      3.159073  0.07942147  2.468635
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 50, interaction.depth =
##  1, shrinkage = 0.1 and n.minobsinnode = 10.
ttrk_prdcted <- predict(ttrk_boosted, ttrk_test)
ttrk_test$Score_Obs <- 100 - ttrk_test$LSP
ttrk_test$Score_Pre <- 100 - ttrk_prdcted

ttrk_test %>% ggplot(aes(Score_Obs, Score_Pre)) +
              geom_point() +
              xlab('Observed Score') +
              ylab('Predicted Score') +
              xlim(75, 100) +
              ylim(75, 100) +
              geom_smooth(method = 'glm')
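
The same predictions can be condensed into a single number; a sketch using the objects above.

# test-set RMSE of the caret-tuned GBM
RMSE(ttrk_prdcted, ttrk_test$LSP)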

# Tweedie Boosting

TDboost1 <- TDboost(LSP ~ Phone + Age + Status + Pol_Pfx + Gender + Distance,
                    data = ttrk_train,
                    var.monotone = c(0,0,0,0,0,0),
                    distribution = list(name="EDM", alpha=1.5),
                    n.trees = 3000,
                    shrinkage = 0.005,
                    interaction.depth=3,
                    bag.fraction = 0.5,
                    train.fraction = 0.5,
                    n.minobsinnode = 10,
                    cv.folds = 5,
                    keep.data=TRUE,
                    verbose=FALSE)

# print out the optimal iteration number M
best.iter <- TDboost.perf(TDboost1,method="test")
print(best.iter)
## [1] 317
# check performance using 5-fold cross-validation
best.iter <- TDboost.perf(TDboost1,method="cv")

print(best.iter)
## [1] 410
summary(TDboost1, n.trees = 1)           # relative influence using only the first tree

##        var  rel.inf
## 1    Phone 61.51449
## 2      Age 38.48551
## 3   Status  0.00000
## 4  Pol_Pfx  0.00000
## 5   Gender  0.00000
## 6 Distance  0.00000
summary(TDboost1, n.trees = best.iter)   # relative influence at the best iteration

##        var   rel.inf
## 1      Age 34.237837
## 2    Phone 29.734512
## 3 Distance 19.061247
## 4   Status  8.154111
## 5  Pol_Pfx  6.467005
## 6   Gender  2.345287
# making predictions on the test data
f.predict <- predict.TDboost(TDboost1, ttrk_test, best.iter)

# sum of squared errors on the test set
sum((ttrk_test$LSP - f.predict)^2)
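
For comparability with the other models, the same predictions expressed as a test RMSE (a sketch added here).

# test-set RMSE of the TDboost model at the best iteration
sqrt(mean((ttrk_test$LSP - f.predict)^2))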
# partial dependence of the first predictor (Phone) at the best iteration
plot.TDboost(TDboost1, 1, best.iter)

# joint partial dependence of the first two predictors (Phone and Age) at the best iteration
plot.TDboost(TDboost1, c(1, 2), best.iter)

pretty.gbm.tree(TDboost1, i.tree = 337)   # inspect a single tree from the ensemble
##   SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight
## 0        1  5.450000e+01        1         8           9       7.585837    328
## 1        1  2.250000e+01        2         3           7       4.567356    299
## 2       -1 -3.455305e-04       -1        -1          -1       0.000000     43
## 3        1  3.650000e+01        4         5           6       7.985867    256
## 4       -1  5.493514e-04       -1        -1          -1       0.000000    122
## 5       -1 -2.269121e-05       -1        -1          -1       0.000000    134
## 6       -1  2.499228e-04       -1        -1          -1       0.000000    256
## 7       -1  1.642891e-04       -1        -1          -1       0.000000    299
## 8       -1 -8.943381e-04       -1        -1          -1       0.000000     29
## 9       -1  7.069095e-05       -1        -1          -1       0.000000    328
##      Prediction
## 0  7.069095e-05
## 1  1.642891e-04
## 2 -3.455305e-04
## 3  2.499228e-04
## 4  5.493514e-04
## 5 -2.269121e-05
## 6  2.499228e-04
## 7  1.642891e-04
## 8 -8.943381e-04
## 9  7.069095e-05
# create hyperparameter grid
hyper_grid_gbm <- expand.grid(
  shrinkage = c(0.01, 0.1, 0.3),
  interaction.depth = c(1, 3, 5),
  n.minobsinnode = c(5, 10, 15),
  bag.fraction = c(0.65, 0.8, 1), 
  optimal_trees = 0,
  min_RMSE = 0
)

# total number of combinations
nrow(hyper_grid_gbm)
## [1] 81
# randomize data
random_index <- sample(1:nrow(ttrk_train), nrow(ttrk_train))
random_ttrk_train <- ttrk_train[random_index, ]

# grid search 
for(i in 1:nrow(hyper_grid_gbm)) {
  
  # reproducibility
  set.seed(123)
  
  # train model
  gbm.tune <- gbm(
    formula = LSP ~ .,
    distribution = "gaussian",
    data = random_ttrk_train,
    n.trees = 5000,
    interaction.depth = hyper_grid_gbm$interaction.depth[i],
    shrinkage = hyper_grid_gbm$shrinkage[i],
    n.minobsinnode = hyper_grid_gbm$n.minobsinnode[i],
    bag.fraction = hyper_grid_gbm$bag.fraction[i],
    train.fraction = .75,
    n.cores = NULL, # will use all cores by default
    verbose = FALSE
  )
  
  # add the minimum validation error and corresponding tree count to the grid
  hyper_grid_gbm$optimal_trees[i] <- which.min(gbm.tune$valid.error)
  hyper_grid_gbm$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error))
}

hyper_grid_gbm %>% 
  arrange(min_RMSE) %>%
  tibble()
## # A tibble: 81 x 1
##    .$shrinkage $interaction.de~ $n.minobsinnode $bag.fraction $optimal_trees
##          <dbl>            <dbl>           <dbl>         <dbl>          <dbl>
##  1        0.1                 1               5           1               31
##  2        0.1                 1              10           1               31
##  3        0.1                 1              15           1               31
##  4        0.01                1              15           1              386
##  5        0.01                1               5           1              377
##  6        0.01                1              10           1              377
##  7        0.01                1              15           0.8            310
##  8        0.01                1              10           0.8            310
##  9        0.3                 1               5           1               12
## 10        0.3                 1              10           1               12
## # ... with 71 more rows, and 1 more variable: $min_RMSE <dbl>
# for reproducibility
set.seed(123)

# train GBM model
gbm.fit.final <- gbm(
  formula = LSP ~ .,
  distribution = "gaussian",
  data = ttrk_train,
  n.trees = 157,
  interaction.depth = 5,
  shrinkage = 0.01,
  n.minobsinnode = 10,
  bag.fraction = 1, 
  train.fraction = 1,
  n.cores = NULL, # will use all cores by default
  verbose = FALSE
)  

par(mar = c(5, 8, 1, 1))
summary(
  gbm.fit.final,
  method = relative.influence, # also can use permutation.test.gbm
  las = 2
)

##              var    rel.inf
## 1          Phone 55.8395816
## 2            Age 29.9286543
## 3       Distance  7.0833915
## 4         Status  3.5100212
## 5        Pol_Pfx  3.0709190
## 6         Gender  0.5674323
## 7 Marital_Status  0.0000000
vip(gbm.fit.final)
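
A sketch of the final gbm fit scored on the test set, mirroring the earlier models (not part of the original output).

# test-set RMSE of the final gbm model
pred_gbm <- predict(gbm.fit.final, newdata = ttrk_test, n.trees = gbm.fit.final$n.trees)
RMSE(pred_gbm, ttrk_test$LSP)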

# Extreme Gradient Boosting (XGBoost)

# variable names
features <- setdiff(names(ttrk_train), "LSP")

# Create the treatment plan from the training data
treatplan <- designTreatmentsZ(ttrk_train, features, verbose = FALSE)

# Get the "clean" variable names from the scoreFrame
new_vars <- treatplan %>%
  magrittr::use_series(scoreFrame) %>%        
  dplyr::filter(code %in% c("clean", "lev")) %>% 
  magrittr::use_series(varName)     

# Prepare the training data
features_train <- prepare(treatplan, ttrk_train, varRestriction = new_vars) %>% as.matrix()
response_train <- ttrk_train$LSP

# Prepare the test data
features_test <- prepare(treatplan, ttrk_test, varRestriction = new_vars) %>% as.matrix()
response_test <- ttrk_test$LSP

# dimensions of one-hot encoded data
dim(features_train)
## [1] 1312   18
dim(features_test)
## [1] 562  18
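
To see what the vtreat treatment plan actually produced, the encoded column names can be listed (a quick sketch, assuming features_train as prepared above).

# names of the cleaned / one-hot columns passed to xgboost
colnames(features_train)
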
# reproducibility
set.seed(123)

xgb.fit1 <- xgb.cv(
  data = features_train,
  label = response_train,
  nrounds = 1000,
  nfold = 5,
  objective = "reg:linear",  # for regression models
  verbose = FALSE               # silent,
)

# get number of trees that minimize error
xgb.fit1$evaluation_log %>%
  summarise(
    ntrees.train = which(train_rmse_mean == min(train_rmse_mean))[1],
    rmse.train   = min(train_rmse_mean),
    ntrees.test  = which(test_rmse_mean == min(test_rmse_mean))[1],
    rmse.test   = min(test_rmse_mean)
  )
##   ntrees.train rmse.train ntrees.test rmse.test
## 1          676   0.004042          11  3.283794
# plot error vs number of trees
ggplot(xgb.fit1$evaluation_log) +
  geom_line(aes(iter, train_rmse_mean), color = "red") +
  geom_line(aes(iter, test_rmse_mean), color = "blue")

# with early stopping

# create parameter list
params <- list(
  eta = .1,
  max_depth = 5,
  min_child_weight = 2,
  subsample = .8,
  colsample_bytree = .9
)

# reproducibility
set.seed(123)

# train model
xgb.fit3 <- xgb.cv(
  params = params,
  data = features_train,
  label = response_train,
  nrounds = 1000,
  nfold = 5,
  objective = "reg:linear",  # for regression models
  verbose = 0,               # silent,
  early_stopping_rounds = 10 # stop if no improvement for 10 consecutive trees
)

# assess results
xgb.fit3$evaluation_log %>%
  summarise(
    ntrees.train = which(train_rmse_mean == min(train_rmse_mean))[1],
    rmse.train   = min(train_rmse_mean),
    ntrees.test  = which(test_rmse_mean == min(test_rmse_mean))[1],
    rmse.test   = min(test_rmse_mean),
  )
##   ntrees.train rmse.train ntrees.test rmse.test
## 1           45   2.420189          35  3.181312
# with hyperparameter grid

# create hyperparameter grid
hyper_grid <- expand.grid(
  eta = c(.01, .05, .1, .3),
  max_depth = c(1, 3, 5, 7),
  min_child_weight = c(1, 3, 5, 7),
  subsample = c(.65, .8, 1), 
  colsample_bytree = c(.8, .9, 1),
  optimal_trees = 0,               # a place to dump results
  min_RMSE = 0                     # a place to dump results
)

for(i in 1:nrow(hyper_grid)) {
  
  # create parameter list
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i],
    min_child_weight = hyper_grid$min_child_weight[i],
    subsample = hyper_grid$subsample[i],
    colsample_bytree = hyper_grid$colsample_bytree[i]
  )
  
  # reproducibility
  set.seed(123)
  
  # train model
  xgb.tune <- xgb.cv(
    params = params,
    data = features_train,
    label = response_train,
    nrounds = 5000,
    nfold = 5,
    objective = "reg:linear",  # for regression models
    verbose = 0,               # silent,
    early_stopping_rounds = 10 # stop if no improvement for 10 consecutive trees
  )
  
  # add the minimum CV test error and corresponding tree count to the grid
  hyper_grid$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
  hyper_grid$min_RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
}

hyper_grid %>%
  dplyr::arrange(min_RMSE) %>%
  tibble()
## # A tibble: 576 x 1
##    .$eta $max_depth $min_child_weig~ $subsample $colsample_bytr~ $optimal_trees
##    <dbl>      <dbl>            <dbl>      <dbl>            <dbl>          <dbl>
##  1   0.1          1                5       0.65              0.8             54
##  2   0.1          1                7       0.65              0.8             54
##  3   0.1          1                1       0.65              0.8             54
##  4   0.1          1                3       0.65              0.8             54
##  5   0.1          1                1       0.65              0.9             41
##  6   0.1          1                3       0.65              0.9             41
##  7   0.1          1                5       0.65              0.9             41
##  8   0.1          1                7       0.65              0.9             41
##  9   0.3          1                1       0.65              0.8             23
## 10   0.3          1                3       0.65              0.8             17
## # ... with 566 more rows, and 1 more variable: $min_RMSE <dbl>
# parameter list
params <- list(
  eta = 0.1,
  max_depth = 1,
  min_child_weight = 5,
  subsample = 0.65,
  colsample_bytree = 0.8
)

# train final model
xgb.fit.final <- xgboost(
  params = params,
  data = features_train,
  label = response_train,
  nrounds = 54,
  objective = "reg:linear",
  verbose = 0
)

# create importance matrix
importance_matrix <- xgb.importance(model = xgb.fit.final)

# variable importance plot
xgb.plot.importance(importance_matrix, top_n = 10, measure = "Gain")

# predict values for test data
pred <- predict(xgb.fit.final, features_test)

# results
RMSE(pred, response_test)
## [1] 3.090804
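
To close the loop visually, a sketch mirroring the earlier observed-versus-predicted plot, now for the XGBoost model (using response_test and pred from above).

# observed vs predicted scores for the final XGBoost model
tibble(Score_Obs = 100 - response_test,
       Score_Pre = 100 - pred) %>%
  ggplot(aes(Score_Obs, Score_Pre)) +
  geom_point() +
  xlab('Observed Score') +
  ylab('Predicted Score') +
  xlim(75, 100) +
  ylim(75, 100) +
  geom_smooth(method = 'glm')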