Exercise
Use the Hitters dataset from the previous tutorial.
Hitters Dataset: Baseball Data from the 1986 and 1987 seasons
A data frame with 322 observations of major league players on the following 20 variables.
library(dplyr)
library(rsample)
library(recipes)
library(caret)
library(yardstick)
data(Hitters, package = "ISLR")
head(Hitters)
## AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun
## -Andy Allanson 293 66 1 30 29 14 1 293 66 1
## -Alan Ashby 315 81 7 24 38 39 14 3449 835 69
## -Alvin Davis 479 130 18 66 72 76 3 1624 457 63
## -Andre Dawson 496 141 20 65 78 37 11 5628 1575 225
## -Andres Galarraga 321 87 10 39 42 30 2 396 101 12
## -Alfredo Griffin 594 169 4 74 51 35 11 4408 1133 19
## CRuns CRBI CWalks League Division PutOuts Assists Errors
## -Andy Allanson 30 29 14 A E 446 33 20
## -Alan Ashby 321 414 375 N W 632 43 10
## -Alvin Davis 224 266 263 A W 880 82 14
## -Andre Dawson 828 838 354 N E 200 11 3
## -Andres Galarraga 48 46 33 N E 805 40 4
## -Alfredo Griffin 501 336 194 A W 282 421 25
## Salary NewLeague
## -Andy Allanson NA A
## -Alan Ashby 475.0 N
## -Alvin Davis 480.0 A
## -Andre Dawson 500.0 N
## -Andres Galarraga 91.5 N
## -Alfredo Griffin 750.0 A
Notes: when preparing the data:
# Remove rows with NA
Hitters <- Hitters |>
filter(complete.cases(Hitters))
set.seed(42)
splits <- initial_split(Hitters, prop = 0.5)
Hitters.train <- training(splits)
Hitters.test <- testing(splits)
A. Fit a regression trees to predict Salary from the other variables
rec <- recipe(Salary ~ ., data = Hitters.train)
tc <- trainControl(method = "cv", number = 5) # using 5-folds CV
tg <- expand.grid(
cp = seq(0, 0.3, length = 100) # Check 100 values between 0 to 0.3
)
fit.tree <- train(
x = rec,
data = Hitters.train,
method = "rpart", # A simple decision tree is fitted with method = "rpart"
tuneGrid = tg,
trControl = tc
)
Best CP:
fit.tree$bestTune
## cp
## 1 0
plot(fit.tree) # Plot CP RMSE
plot(fit.tree$finalModel) # display the tree structure
text(fit.tree$finalModel, pretty = 0, cex = 0.5)
# Add prediction column to test data
Hitters.test$tree_pred <- predict(fit.tree, newdata = Hitters.test)
rmse(Hitters.test, truth = Salary, estimate = tree_pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 379.
rsq(Hitters.test, truth = Salary, estimate = tree_pred)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.467
Random Forrest - find the optimal mtry with CV (one value should include the bagging option for this model). What was the best mtry?
tg <- expand.grid(
mtry = c(1:19) # Evaluate 1 to 19(p) predictors
)
set.seed(1234)
rf.Hitters <- train(
x = rec,
data = Hitters.train,
method = "rf",
# ntree = 500,
tuneGrid = tg,
trControl = tc
)
## Loading required namespace: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
plot(rf.Hitters) # Plot mtry RMSE
rf.Hitters$bestTune
## mtry
## 2 2
Two predictors yielded the best RMSE
Hitters.test$pred_rf <- predict(rf.Hitters, newdata = Hitters.test)
rmse(Hitters.test, truth = Salary, estimate = pred_rf)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 355.
rsq(Hitters.test, truth = Salary, estimate = pred_rf)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.564
3. Boosting - tune at least one of shrinkage and interaction.depth with CV. What was the best value(s)?
tg <- expand.grid(
## Complexity
max_depth = 1, # [1, Inf] limits the depth of each tree
min_child_weight = 5, # [1, Inf] don't split if you get less obs in a node
gamma = c(0, 0.1, 0.2, 0.5, 1, 2, 5, 10), # [0, Inf] node splitting regularization
## Gradient
eta = 0.1, # [0, 1] learning rate
nrounds = 1000, # [1, Inf] number of trees
# lower eta should come with higher nrounds
## Randomness
colsample_bytree = 1, # [0, 1] like mtry in rf
subsample = 1 # [0, 1] like bagging / rf
)
Apply Dummy Coding on factor predictors: for using
xgboost algorithm
rec <- recipe(Salary ~ ., data = Hitters.train) |>
step_dummy(all_factor())
rec <- prep(rec)
set.seed(1234)
boost.Hitters <- train(
x = rec,
data = Hitters.train,
method = "xgbTree",
tuneGrid = tg,
trControl = tc
)
## Loading required namespace: xgboost
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
Best Gamma
boost.Hitters$bestTune
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 1 1000 1 0.1 0 1 5 1
Hitters.test$pred_boost <- predict(boost.Hitters, newdata = Hitters.test)
rmse(Hitters.test, truth = Salary, estimate = pred_boost)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 365.
rsq(Hitters.test, truth = Salary, estimate = pred_boost)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rsq standard 0.519
Compare best RMSE of cv for each model:
min(fit.tree$results$RMSE)
## [1] 279.7316
min(rf.Hitters$results$RMSE)
## [1] 249.4588
min(boost.Hitters$results$RMSE)
## [1] 269.826
Random Forest seems to have the best RMSE of cv(249.459).
basictree_rmse <- rmse(Hitters.test, truth = Salary, estimate = tree_pred)
rf_rmse <- rmse(Hitters.test, truth = Salary, estimate = pred_rf)
boost_rmse <- rmse(Hitters.test, truth = Salary, estimate = pred_boost)
print(c(basictree_rmse, rf_rmse, boost_rmse))
## $.metric
## [1] "rmse"
##
## $.estimator
## [1] "standard"
##
## $.estimate
## [1] 379.0146
##
## $.metric
## [1] "rmse"
##
## $.estimator
## [1] "standard"
##
## $.estimate
## [1] 355.308
##
## $.metric
## [1] "rmse"
##
## $.estimator
## [1] "standard"
##
## $.estimate
## [1] 364.5122
Random Forest method seems the have the best test RMSE as well(355.308)
varImp(fit.tree) |> plot()
varImp(boost.Hitters) |> plot()
varImp(rf.Hitters) |> plot()
Basic Tree - CRBI > CAtBat > CHits > CRun
Random Forest - CRun > CRBI > Walks > CHits > Years
Boosting - CHits > CAtBat > CRBI > CWalks > CRun