Loading the libraries for the various regression-tree-based methods.
library(readxl) # reading Excel, duh!
library(tidyverse) # because you want it pretty
## -- Attaching packages ----------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rsample) # data splitting
library(rpart) # performing regression trees
library(rpart.plot) # plotting regression trees
library(data.table) # handling data tables more efficiently
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(ipred) # fitting a bagged tree
library(randomForest) # basic implementation
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(ranger) # a faster implementation of randomForest
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
library(caret) # an aggregator package for training many machine learning models
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071) # caret uses it for cross-validation
library(gbm) # fitting a boosted tree
## Loaded gbm 2.1.5
library(TDboost) # fitting a Tweedie boosted tree
## Loaded TDboost 1.2
##
## Attaching package: 'TDboost'
## The following object is masked from 'package:gbm':
##
## relative.influence
library(xgboost) # fitting an XGBoost model
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(vip) # plotting variable importance
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(vtreat) # encoding data frames for xgboost
Creating training and test data.
# preparing the data set
Telematics <- read_excel("~/Telematics/Full Drivers Performance Report for R.xlsx")
teletrck <- Telematics %>%
select(Gender, Marital_Status, Age, Status, Phone, Pol_Pfx, Distance, Score) %>%
filter(!is.na(Score)) %>%
filter(Distance > 100) %>%
filter(Distance < 30000) %>%
mutate(Pol_Pfx = fct_recode(Pol_Pfx,
'Annual' = 'EAA',
'Bravo' = 'EAB',
'Enhanced' = 'EAL',
'Select' = 'EAS'
)) %>%
mutate(LSP = 100 - Score) %>% # Lost Score Points (LSP) model
select(-Score)
teletrck$Gender <- as.factor(teletrck$Gender)
teletrck$Marital_Status <- as.factor(teletrck$Marital_Status)
teletrck$Status <- as.factor(teletrck$Status)
teletrck$Phone <- as.factor(teletrck$Phone)
teletrck$pol_Pfx <- as.factor(teletrck$Pol_Pfx)
# Create training (70%) and test (30%) sets
set.seed(123)
ttrk_split <- initial_split(teletrck, prop = .7)
ttrk_train <- training(ttrk_split)
ttrk_test <- testing(ttrk_split)
Fitting a basic tree
ttrk_trees <- rpart(LSP ~ ., ttrk_train, method = 'anova')
ttrk_trees
## n= 1312
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1312 13953.750 8.063262
## 2) Phone=android 701 6490.833 7.196862
## 4) Age>=36.5 443 3658.248 6.693002 *
## 5) Age< 36.5 258 2527.008 8.062016 *
## 3) Phone=iPhone 611 6332.995 9.057283
## 6) Age>=38.5 256 2551.527 8.457031 *
## 7) Age< 38.5 355 3622.715 9.490141 *
rpart.plot(ttrk_trees)
plotcp(ttrk_trees)
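The plotcp() chart shows where the cross-validated error levels off; a minimal pruning sketch, using the standard cptable columns, would cut the tree at the cp of the lowest xerror:
# prune at the cp associated with the lowest cross-validated error (sketch)
best_cp <- ttrk_trees$cptable[which.min(ttrk_trees$cptable[, "xerror"]), "CP"]
ttrk_pruned <- prune(ttrk_trees, cp = best_cp)
rpart.plot(ttrk_pruned)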
Tuning the tree
# Hyperparameters Grid 1
hyper_grid <- expand.grid(
minsplit = seq(2, 10, 1),
maxdepth = seq(2, 5, 1)
)
models <- list()
for (i in 1:nrow(hyper_grid)) {
# get minsplit, maxdepth values at row i
minsplit <- hyper_grid$minsplit[i]
maxdepth <- hyper_grid$maxdepth[i]
# train a model and store in the list
models[[i]] <- rpart(
formula = LSP ~ .,
data = ttrk_train,
method = "anova",
control = list(minsplit = minsplit, maxdepth = maxdepth)
)
}
# function to get the cp associated with the lowest cross-validated error
get_cp <- function(x) {
  min <- which.min(x$cptable[, "xerror"])
  x$cptable[min, "CP"]
}
# function to get the minimum cross-validated error
get_min_error <- function(x) {
  min <- which.min(x$cptable[, "xerror"])
  x$cptable[min, "xerror"]
}
hgo <- hyper_grid %>%
mutate(
cp = map_dbl(models, get_cp),
error = map_dbl(models, get_min_error)
) %>%
arrange(error) %>%
head()
hgo
## minsplit maxdepth cp error
## 1 8 2 0.01 0.9024600
## 2 10 3 0.01 0.9036202
## 3 4 5 0.01 0.9037967
## 4 5 3 0.01 0.9051272
## 5 3 2 0.01 0.9052680
## 6 3 3 0.01 0.9054220
optimal_tree <- rpart(
formula = LSP ~ .,
data = ttrk_train,
method = "anova",
control = list(minsplit = 8, maxdepth = 2, cp = 0.01)
)
optimal_tree
## n= 1312
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1312 13953.750 8.063262
## 2) Phone=android 701 6490.833 7.196862
## 4) Age>=36.5 443 3658.248 6.693002 *
## 5) Age< 36.5 258 2527.008 8.062016 *
## 3) Phone=iPhone 611 6332.995 9.057283
## 6) Age>=38.5 256 2551.527 8.457031 *
## 7) Age< 38.5 355 3622.715 9.490141 *
rpart.plot(optimal_tree)
plotcp(optimal_tree)
pred <- predict(optimal_tree, newdata = ttrk_test)
RMSE(pred, ttrk_test$LSP)
## [1] 3.11548
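RMSE() here is caret's helper; as a sanity check, the same figure can be computed by hand:
# hand-computed root mean squared error on the test set (should match RMSE() above)
sqrt(mean((pred - ttrk_test$LSP)^2))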
Bagging
# make bootstrapping reproducible
set.seed(123)
# train bagged model with ipred
bagged_m1 <- bagging(
formula = LSP ~ .,
data = ttrk_train,
coob = TRUE,
nbagg = 100 # default is 25
)
bagged_m1
##
## Bagging regression trees with 100 bootstrap replications
##
## Call: bagging.data.frame(formula = LSP ~ ., data = ttrk_train, coob = TRUE,
## nbagg = 100)
##
## Out-of-bag estimate of root mean squared error: 3.1007
# assess 10-100 bagged trees
ntree <- 10:100
# create empty vector to store OOB RMSE values
rmse <- vector(mode = "numeric", length = length(ntree))
for (i in seq_along(ntree)) {
# reproducibility
set.seed(123)
# perform bagged model
model <- bagging(
formula = LSP ~ .,
data = ttrk_train,
coob = TRUE,
nbagg = ntree[i]
)
# get OOB error
rmse[i] <- model$err
}
plot(ntree, rmse, type = 'l', lwd = 2)
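To read the best setting off the curve, one can simply pick the number of bagged trees with the lowest OOB RMSE in this sweep:
# number of bagged trees with the lowest OOB RMSE (sketch)
ntree[which.min(rmse)]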
# cross-validation and variable importance with caret
# Specify 10-fold cross validation
ctrl <- trainControl(method = "cv", number = 10)
# CV bagged model
bagged_cv <- train(
LSP ~ .,
data = ttrk_train,
method = "treebag",
trControl = ctrl,
importance = TRUE
)
bagged_cv
## Bagged CART
##
## 1312 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1182, 1181, 1180, 1182, 1181, 1180, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 3.102777 0.09860987 2.450765
# plot most important variables
plot(varImp(bagged_cv))
Random Forests
# for reproducibility
set.seed(123)
# default RF model
RF1 <- randomForest(
formula = LSP ~ .,
data = ttrk_train
)
RF1
##
## Call:
## randomForest(formula = LSP ~ ., data = ttrk_train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 2
##
## Mean of squared residuals: 9.799114
## % Var explained: 7.86
plot(RF1)
# number of trees with lowest MSE
which.min(RF1$mse)
## [1] 69
# RMSE of this optimal random forest
sqrt(RF1$mse[which.min(RF1$mse)])
## [1] 3.125713
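For a comparison with the single tree on the same footing, a quick sketch of the default forest's test-set RMSE, reusing caret's RMSE() as above:
# test-set RMSE of the default random forest (sketch)
rf_pred <- predict(RF1, newdata = ttrk_test)
RMSE(rf_pred, ttrk_test$LSP)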
# create training and validation data
set.seed(123)
valid_split <- initial_split(ttrk_train, .8)
# training data
ttrk_train_v2 <- analysis(valid_split)
# validation data
ttrk_valid <- assessment(valid_split)
x_test <- ttrk_valid[setdiff(names(ttrk_valid), "LSP")]
y_test <- ttrk_valid$LSP
rf_oob_comp <- randomForest(
formula = LSP ~ .,
data = ttrk_train_v2,
xtest = x_test,
ytest = y_test
)
# extract OOB & validation errors
oob <- sqrt(rf_oob_comp$mse)
validation <- sqrt(rf_oob_comp$test$mse)
# compare error rates
RF_plot <- tibble(
`Out of Bag Error` = oob,
`Test error` = validation,
ntrees = 1:rf_oob_comp$ntree
) %>%
gather(Metric, RMSE, -ntrees) %>%
ggplot(aes(ntrees, RMSE, color = Metric)) +
geom_line() +
xlab("Number of trees")
RF_plot
# tuning the forest
features <- setdiff(names(ttrk_train), "LSP")
set.seed(123)
m2 <- tuneRF(
x = ttrk_train[features],
y = ttrk_train$LSP,
ntreeTry = 500,
mtryStart = 5,
stepFactor = 1.5,
improve = 0.01,
trace = FALSE # to not show real-time progress
) %>%
head()
## 0.006923681 0.01
## -0.01292261 0.01
# hyperparameter ranger grid search
hyper_grid_rf <- expand.grid(
mtry = seq(2, 8, by = 1),
node_size = seq(3, 9, by = 2),
sample_size = c(.55, .632, .70, .80),
OOB_RMSE = 0
)
for(i in 1:nrow(hyper_grid_rf)) {
# train model
model_rf <- ranger(
formula = LSP ~ .,
data = ttrk_train,
num.trees = 500,
mtry = hyper_grid_rf$mtry[i],
min.node.size = hyper_grid_rf$node_size[i],
sample.fraction = hyper_grid_rf$sample_size[i],
seed = 123
)
# add OOB error to grid
hyper_grid_rf$OOB_RMSE[i] <- sqrt(model_rf$prediction.error)
}
hgf <- hyper_grid_rf %>%
arrange(OOB_RMSE) %>%
head()
hgf
## mtry node_size sample_size OOB_RMSE
## 1 2 9 0.550 3.110761
## 2 2 7 0.550 3.114517
## 3 2 7 0.700 3.117972
## 4 2 9 0.700 3.119676
## 5 2 9 0.632 3.120601
## 6 2 9 0.800 3.122090
# optimal ranger
optimal_rf <- ranger(
formula = LSP ~ .,
data = ttrk_train,
num.trees = 500,
mtry = 2,
min.node.size = 9,
sample.fraction = 0.55,
seed = 123,
importance = 'impurity'
)
optimal_rf
## Ranger result
##
## Call:
## ranger(formula = LSP ~ ., data = ttrk_train, num.trees = 500, mtry = 2, min.node.size = 9, sample.fraction = 0.55, seed = 123, importance = "impurity")
##
## Type: Regression
## Number of trees: 500
## Sample size: 1312
## Number of independent variables: 8
## Mtry: 2
## Target node size: 9
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 9.676832
## R squared (OOB): 0.09083027
imp_ranger <- as.data.frame(optimal_rf$variable.importance) %>%
rownames_to_column('Variable') %>%
as_tibble()
colnames(imp_ranger)[2] <- 'Impurity'
imp_ranger <- imp_ranger %>%
arrange(desc(Impurity))
imp_plot <- imp_ranger %>%
ggplot(aes(reorder(Variable, Impurity), Impurity)) +
geom_col() +
coord_flip() +
xlab('Variable') +
ylab('Impurity')
imp_plot
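Because importance = 'impurity' was requested, the already-loaded vip package can draw the same ranking directly from the ranger fit; a minimal alternative sketch:
# same impurity-based importance drawn with vip (sketch)
vip(optimal_rf, num_features = 8)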
Gradient Boosting Machines
ttrk_boosted <- train(LSP ~ .,
data = ttrk_train,
method = 'gbm',
preProcess = c('scale', 'center'),
trControl = trainControl(method = 'repeatedcv',
number = 5,
repeats = 3,
verboseIter = FALSE),
verbose = 0)
ttrk_boosted
## Stochastic Gradient Boosting
##
## 1312 samples
## 8 predictor
##
## Pre-processing: scaled (12), centered (12)
## Resampling: Cross-Validated (5 fold, repeated 3 times)
## Summary of sample sizes: 1050, 1049, 1050, 1050, 1049, 1050, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared MAE
## 1 50 3.076759 0.11411868 2.413394
## 1 100 3.082923 0.11110649 2.415808
## 1 150 3.087277 0.10924428 2.420023
## 2 50 3.095434 0.10249101 2.424033
## 2 100 3.117028 0.09431837 2.442313
## 2 150 3.127816 0.09187910 2.454448
## 3 50 3.112207 0.09492203 2.435534
## 3 100 3.130910 0.08912133 2.446676
## 3 150 3.159073 0.07942147 2.468635
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 50, interaction.depth =
## 1, shrinkage = 0.1 and n.minobsinnode = 10.
ttrk_prdcted <- predict(ttrk_boosted, ttrk_test)
ttrk_test$Score_Obs <- 100 - ttrk_test$LSP
ttrk_test$Score_Pre <- 100 - ttrk_prdcted
ttrk_test %>% ggplot(aes(Score_Obs, Score_Pre)) +
geom_point() +
xlab('Observed Score') +
ylab('Predicted Score') +
xlim(75, 100) +
ylim(75, 100) +
geom_smooth(method = 'glm')
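Beyond the scatter plot, the caret-tuned GBM can be scored on the test set the same way as the earlier models (a sketch using caret's RMSE()):
# test-set RMSE of the caret-tuned GBM (sketch)
RMSE(ttrk_prdcted, ttrk_test$LSP)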
# Tweedie Boosting
TDboost1 <- TDboost(LSP ~ Phone + Age + Status + Pol_Pfx + Gender + Distance,
data = ttrk_train,
var.monotone = c(0,0,0,0,0,0),
distribution = list(name="EDM", alpha=1.5),
n.trees = 3000,
shrinkage = 0.005,
interaction.depth=3,
bag.fraction = 0.5,
train.fraction = 0.5,
n.minobsinnode = 10,
cv.folds = 5,
keep.data=TRUE,
verbose=FALSE)
# print out the optimal iteration number M
best.iter <- TDboost.perf(TDboost1,method="test")
print(best.iter)
## [1] 317
# check performance using 5-fold cross-validation
best.iter <- TDboost.perf(TDboost1,method="cv")
print(best.iter)
## [1] 410
summary(TDboost1,n.trees=1)
## var rel.inf
## 1 Phone 61.51449
## 2 Age 38.48551
## 3 Status 0.00000
## 4 Pol_Pfx 0.00000
## 5 Gender 0.00000
## 6 Distance 0.00000
summary(TDboost1,n.trees=best.iter) # at the best iteration
## var rel.inf
## 1 Age 34.237837
## 2 Phone 29.734512
## 3 Distance 19.061247
## 4 Status 8.154111
## 5 Pol_Pfx 6.467005
## 6 Gender 2.345287
# making predictions on the test data
f.predict <- predict.TDboost(TDboost1, ttrk_test, best.iter)
# least squares error on the test set
print(sum((ttrk_test$LSP - f.predict)^2))
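For comparability with the other models, the Tweedie fit can also be reported as a test-set RMSE; a sketch, assuming (as in the least-squares check above) that f.predict is on the response scale:
# test-set RMSE of the Tweedie boosted model (sketch; assumes f.predict is on
# the response scale rather than the log link scale)
RMSE(f.predict, ttrk_test$LSP)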
# partial dependence of the first variable (Phone) at the best iteration
plot.TDboost(TDboost1, 1, best.iter)
# joint plot of variables 1 and 2 (Phone and Age) at the best iteration
plot.TDboost(TDboost1, c(1, 2), best.iter)
pretty.gbm.tree(TDboost1, i.tree = 337)
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction Weight
## 0 1 5.450000e+01 1 8 9 7.585837 328
## 1 1 2.250000e+01 2 3 7 4.567356 299
## 2 -1 -3.455305e-04 -1 -1 -1 0.000000 43
## 3 1 3.650000e+01 4 5 6 7.985867 256
## 4 -1 5.493514e-04 -1 -1 -1 0.000000 122
## 5 -1 -2.269121e-05 -1 -1 -1 0.000000 134
## 6 -1 2.499228e-04 -1 -1 -1 0.000000 256
## 7 -1 1.642891e-04 -1 -1 -1 0.000000 299
## 8 -1 -8.943381e-04 -1 -1 -1 0.000000 29
## 9 -1 7.069095e-05 -1 -1 -1 0.000000 328
## Prediction
## 0 7.069095e-05
## 1 1.642891e-04
## 2 -3.455305e-04
## 3 2.499228e-04
## 4 5.493514e-04
## 5 -2.269121e-05
## 6 2.499228e-04
## 7 1.642891e-04
## 8 -8.943381e-04
## 9 7.069095e-05
# create hyperparameter grid
hyper_grid_gbm <- expand.grid(
shrinkage = c(0.01, 0.1, 0.3),
interaction.depth = c(1, 3, 5),
n.minobsinnode = c(5, 10, 15),
bag.fraction = c(0.65, 0.8, 1),
optimal_trees = 0,
min_RMSE = 0
)
# total number of combinations
nrow(hyper_grid_gbm)
## [1] 81
# randomize data
random_index <- sample(1:nrow(ttrk_train), nrow(ttrk_train))
random_ttrk_train <- ttrk_train[random_index, ]
# grid search
for(i in 1:nrow(hyper_grid_gbm)) {
# reproducibility
set.seed(123)
# train model
gbm.tune <- gbm(
formula = LSP ~ .,
distribution = "gaussian",
data = random_ttrk_train,
n.trees = 5000,
interaction.depth = hyper_grid_gbm$interaction.depth[i],
shrinkage = hyper_grid_gbm$shrinkage[i],
n.minobsinnode = hyper_grid_gbm$n.minobsinnode[i],
bag.fraction = hyper_grid_gbm$bag.fraction[i],
train.fraction = .75,
n.cores = NULL, # will use all cores by default
verbose = FALSE
)
# add min training error and trees to grid
hyper_grid_gbm$optimal_trees[i] <- which.min(gbm.tune$valid.error)
hyper_grid_gbm$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error))
}
hyper_grid_gbm %>%
arrange(min_RMSE) %>%
tibble()
## # A tibble: 81 x 1
## .$shrinkage $interaction.de~ $n.minobsinnode $bag.fraction $optimal_trees
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.1 1 5 1 31
## 2 0.1 1 10 1 31
## 3 0.1 1 15 1 31
## 4 0.01 1 15 1 386
## 5 0.01 1 5 1 377
## 6 0.01 1 10 1 377
## 7 0.01 1 15 0.8 310
## 8 0.01 1 10 0.8 310
## 9 0.3 1 5 1 12
## 10 0.3 1 10 1 12
## # ... with 71 more rows, and 1 more variable: $min_RMSE <dbl>
# for reproducibility
set.seed(123)
# train GBM model
gbm.fit.final <- gbm(
formula = LSP ~ .,
distribution = "gaussian",
data = ttrk_train,
n.trees = 157,
interaction.depth = 5,
shrinkage = 0.01,
n.minobsinnode = 10,
bag.fraction = 1,
train.fraction = 1,
n.cores = NULL, # will use all cores by default
verbose = FALSE
)
par(mar = c(5, 8, 1, 1))
summary(
gbm.fit.final,
method = gbm::relative.influence, # namespaced because TDboost masks it; permutation.test.gbm also works
las = 2
)
## var rel.inf
## 1 Phone 55.8395816
## 2 Age 29.9286543
## 3 Distance 7.0833915
## 4 Status 3.5100212
## 5 Pol_Pfx 3.0709190
## 6 Gender 0.5674323
## 7 Marital_Status 0.0000000
## 8 pol_Pfx 0.0000000
vip(gbm.fit.final)
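vip() shows which variables matter; to see how a single predictor acts, gbm's plot method can draw a partial dependence curve, for example for Age (a sketch using the number of trees fitted above):
# partial dependence of LSP on Age for the final GBM (sketch)
plot(gbm.fit.final, i.var = "Age", n.trees = gbm.fit.final$n.trees)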
# eXtreme Gradient Boosting (XGBoost)
# variable names
features <- setdiff(names(ttrk_train), "LSP")
# Create the treatment plan from the training data
treatplan <- designTreatmentsZ(ttrk_train, features, verbose = FALSE)
# Get the "clean" variable names from the scoreFrame
new_vars <- treatplan %>%
magrittr::use_series(scoreFrame) %>%
dplyr::filter(code %in% c("clean", "lev")) %>%
magrittr::use_series(varName)
# Prepare the training data
features_train <- prepare(treatplan, ttrk_train, varRestriction = new_vars) %>% as.matrix()
response_train <- ttrk_train$LSP
# Prepare the test data
features_test <- prepare(treatplan, ttrk_test, varRestriction = new_vars) %>% as.matrix()
response_test <- ttrk_test$LSP
# dimensions of one-hot encoded data
dim(features_train)
## [1] 1312 18
dim(features_test)
## [1] 562 18
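To see exactly which columns the treatment plan produced (the cleaned numeric variables plus one indicator per factor level), the generated names can be inspected:
# column names of the vtreat-encoded training matrix (sketch)
colnames(features_train)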
# reproducibility
set.seed(123)
xgb.fit1 <- xgb.cv(
data = features_train,
label = response_train,
nrounds = 1000,
nfold = 5,
objective = "reg:linear", # for regression models
verbose = FALSE # silent
)
# get number of trees that minimize error
xgb.fit1$evaluation_log %>%
summarise(
ntrees.train = which(train_rmse_mean == min(train_rmse_mean))[1],
rmse.train = min(train_rmse_mean),
ntrees.test = which(test_rmse_mean == min(test_rmse_mean))[1],
rmse.test = min(test_rmse_mean),
)
## ntrees.train rmse.train ntrees.test rmse.test
## 1 676 0.004042 11 3.283794
# plot error vs number trees
ggplot(xgb.fit1$evaluation_log) +
geom_line(aes(iter, train_rmse_mean), color = "red") +
geom_line(aes(iter, test_rmse_mean), color = "blue")
# with early stopping
# create parameter list
params <- list(
eta = .1,
max_depth = 5,
min_child_weight = 2,
subsample = .8,
colsample_bytree = .9
)
# reproducibility
set.seed(123)
# train model
xgb.fit3 <- xgb.cv(
params = params,
data = features_train,
label = response_train,
nrounds = 1000,
nfold = 5,
objective = "reg:linear", # for regression models
verbose = 0, # silent
early_stopping_rounds = 10 # stop if no improvement for 10 consecutive trees
)
# assess results
xgb.fit3$evaluation_log %>%
summarise(
ntrees.train = which(train_rmse_mean == min(train_rmse_mean))[1],
rmse.train = min(train_rmse_mean),
ntrees.test = which(test_rmse_mean == min(test_rmse_mean))[1],
rmse.test = min(test_rmse_mean),
)
## ntrees.train rmse.train ntrees.test rmse.test
## 1 45 2.420189 35 3.181312
# with hyperparameter grid
# create hyperparameter grid
hyper_grid <- expand.grid(
eta = c(.01, .05, .1, .3),
max_depth = c(1, 3, 5, 7),
min_child_weight = c(1, 3, 5, 7),
subsample = c(.65, .8, 1),
colsample_bytree = c(.8, .9, 1),
optimal_trees = 0, # a place to dump results
min_RMSE = 0 # a place to dump results
)
for(i in 1:nrow(hyper_grid)) {
# create parameter list
params <- list(
eta = hyper_grid$eta[i],
max_depth = hyper_grid$max_depth[i],
min_child_weight = hyper_grid$min_child_weight[i],
subsample = hyper_grid$subsample[i],
colsample_bytree = hyper_grid$colsample_bytree[i]
)
# reproducibility
set.seed(123)
# train model
xgb.tune <- xgb.cv(
params = params,
data = features_train,
label = response_train,
nrounds = 5000,
nfold = 5,
objective = "reg:linear", # for regression models
verbose = 0, # silent
early_stopping_rounds = 10 # stop if no improvement for 10 consecutive trees
)
# add min training error and trees to grid
hyper_grid$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_rmse_mean)
hyper_grid$min_RMSE[i] <- min(xgb.tune$evaluation_log$test_rmse_mean)
}
hyper_grid %>%
dplyr::arrange(min_RMSE) %>%
tibble()
## # A tibble: 576 x 1
## .$eta $max_depth $min_child_weig~ $subsample $colsample_bytr~ $optimal_trees
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.1 1 5 0.65 0.8 54
## 2 0.1 1 7 0.65 0.8 54
## 3 0.1 1 1 0.65 0.8 54
## 4 0.1 1 3 0.65 0.8 54
## 5 0.1 1 1 0.65 0.9 41
## 6 0.1 1 3 0.65 0.9 41
## 7 0.1 1 5 0.65 0.9 41
## 8 0.1 1 7 0.65 0.9 41
## 9 0.3 1 1 0.65 0.8 23
## 10 0.3 1 3 0.65 0.8 17
## # ... with 566 more rows, and 1 more variable: $min_RMSE <dbl>
# parameter list
params <- list(
eta = 0.1,
max_depth = 1,
min_child_weight = 5,
subsample = 0.65,
colsample_bytree = 0.8
)
# train final model
xgb.fit.final <- xgboost(
params = params,
data = features_train,
label = response_train,
nrounds = 54,
objective = "reg:linear",
verbose = 0
)
# create importance matrix
importance_matrix <- xgb.importance(model = xgb.fit.final)
# variable importance plot
xgb.plot.importance(importance_matrix, top_n = 10, measure = "Gain")
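vip also understands xgb.Booster objects, so the same ranking can be drawn with the syntax used for the GBM above (a sketch):
# variable importance of the final xgboost model via vip (sketch)
vip(xgb.fit.final, num_features = 10)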
# predict values for test data
pred <- predict(xgb.fit.final, features_test)
# results
RMSE(pred, response_test)
## [1] 3.090804
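To wrap up, a minimal sketch that gathers the test-set RMSEs of the main tuned models into one tibble for a side-by-side look (it assumes the objects fitted above are still in the session; ttrk_test_x just drops the Score_* helper columns added in the GBM section):
# side-by-side test-set RMSEs of the tuned models (sketch)
ttrk_test_x <- ttrk_test[, names(ttrk_train)] # keep only the original modelling columns
tibble(
  Model = c("Tuned rpart tree", "Random forest (ranger)", "GBM", "XGBoost"),
  Test_RMSE = c(
    RMSE(predict(optimal_tree, newdata = ttrk_test_x), ttrk_test_x$LSP),
    RMSE(predict(optimal_rf, data = ttrk_test_x)$predictions, ttrk_test_x$LSP),
    RMSE(predict(gbm.fit.final, newdata = ttrk_test_x, n.trees = gbm.fit.final$n.trees), ttrk_test_x$LSP),
    RMSE(pred, response_test) # pred and response_test come from the xgboost chunk above
  )
)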