Exercise

  • Use the Hitters dataset from the previous tutorial.

  • Hitters Dataset: Baseball Data from the 1986 and 1987 seasons

  • A data frame with 322 observations of major league players on the following 20 variables.

Pre Process

Load Libraries

library(dplyr)

library(rsample)
library(recipes)
library(caret)
library(yardstick)

Load Data

data(Hitters, package = "ISLR")
head(Hitters)
##                   AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun
## -Andy Allanson      293   66     1   30  29    14     1    293    66      1
## -Alan Ashby         315   81     7   24  38    39    14   3449   835     69
## -Alvin Davis        479  130    18   66  72    76     3   1624   457     63
## -Andre Dawson       496  141    20   65  78    37    11   5628  1575    225
## -Andres Galarraga   321   87    10   39  42    30     2    396   101     12
## -Alfredo Griffin    594  169     4   74  51    35    11   4408  1133     19
##                   CRuns CRBI CWalks League Division PutOuts Assists Errors
## -Andy Allanson       30   29     14      A        E     446      33     20
## -Alan Ashby         321  414    375      N        W     632      43     10
## -Alvin Davis        224  266    263      A        W     880      82     14
## -Andre Dawson       828  838    354      N        E     200      11      3
## -Andres Galarraga    48   46     33      N        E     805      40      4
## -Alfredo Griffin    501  336    194      A        W     282     421     25
##                   Salary NewLeague
## -Andy Allanson        NA         A
## -Alan Ashby        475.0         N
## -Alvin Davis       480.0         A
## -Andre Dawson      500.0         N
## -Andres Galarraga   91.5         N
## -Alfredo Griffin   750.0         A

Notes: when preparing the data:

  • remove cases with NA values.
  • split to train and test with p=0.5
# Remove rows with NA 
Hitters <- Hitters |>
  filter(complete.cases(Hitters))

Split Data

set.seed(42)
splits <- initial_split(Hitters, prop = 0.5)
Hitters.train <- training(splits)
Hitters.test <- testing(splits)

A. Fit a regression trees to predict Salary from the other variables

Basic Tree

Model Configuration

rec <- recipe(Salary ~ ., data = Hitters.train)

tc <- trainControl(method = "cv", number = 5) # using 5-folds CV

tg <- expand.grid(
  cp = seq(0, 0.3, length = 100) # Check 100 values between 0 to 0.3
)

Train Model

fit.tree <- train(
  x = rec,
  data = Hitters.train,
  method = "rpart", # A simple decision tree is fitted with method = "rpart"
  tuneGrid = tg,
  trControl = tc
) 

Best CP:

fit.tree$bestTune
##   cp
## 1  0

Plot Model

plot(fit.tree) # Plot CP RMSE

plot(fit.tree$finalModel) # display the tree structure
text(fit.tree$finalModel, pretty = 0, cex = 0.5)

Predictions - Model Performance

# Add prediction column to test data
Hitters.test$tree_pred <- predict(fit.tree, newdata = Hitters.test)
rmse(Hitters.test, truth = Salary, estimate = tree_pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        379.
rsq(Hitters.test, truth = Salary, estimate = tree_pred)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.467

Random Forest

Random Forrest - find the optimal mtry with CV (one value should include the bagging option for this model). What was the best mtry?

m Tuning configuration

tg <- expand.grid(
  mtry = c(1:19) # Evaluate 1 to 19(p) predictors
)

Train Model

set.seed(1234)
rf.Hitters <- train(
  x = rec, 
  data = Hitters.train,
  method = "rf",
  # ntree = 500,
  tuneGrid = tg,
  trControl = tc
)
## Loading required namespace: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine

Plot mtry Tuning

plot(rf.Hitters) # Plot mtry RMSE

rf.Hitters$bestTune
##   mtry
## 2    2

Two predictors yielded the best RMSE

Predictions - Model Performance

Hitters.test$pred_rf <- predict(rf.Hitters, newdata = Hitters.test)
rmse(Hitters.test, truth = Salary, estimate = pred_rf)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        355.
rsq(Hitters.test, truth = Salary, estimate = pred_rf)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.564

Boosting

3. Boosting - tune at least one of shrinkage and interaction.depth with CV. What was the best value(s)?

Tune Gamma

tg <- expand.grid(
  ## Complexity
  max_depth = 1, # [1, Inf] limits the depth of each tree
  min_child_weight = 5, # [1, Inf] don't split if you get less obs in a node
  gamma = c(0, 0.1, 0.2, 0.5, 1, 2, 5, 10), # [0, Inf] node splitting regularization
  
  ## Gradient
  eta = 0.1, # [0, 1] learning rate
  nrounds = 1000, # [1, Inf] number of trees
  # lower eta should come with higher nrounds
  
  ## Randomness
  colsample_bytree = 1, # [0, 1] like mtry in rf
  subsample = 1 # [0, 1] like bagging / rf
)

Apply Dummy Coding on factor predictors: for using xgboost algorithm

rec <- recipe(Salary ~ ., data = Hitters.train) |>
  step_dummy(all_factor())
rec <- prep(rec)
set.seed(1234)
boost.Hitters <- train(
  x = rec, 
  data = Hitters.train,
  method = "xgbTree",
  tuneGrid = tg,
  trControl = tc
)
## Loading required namespace: xgboost
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

Best Gamma

boost.Hitters$bestTune
##   nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 1    1000         1 0.1     0                1                5         1

Predictions - Model Performance

Hitters.test$pred_boost <- predict(boost.Hitters, newdata = Hitters.test)
rmse(Hitters.test, truth = Salary, estimate = pred_boost)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        365.
rsq(Hitters.test, truth = Salary, estimate = pred_boost)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rsq     standard       0.519

Compare The Models

1. Which has the best CV RMSE?

Compare best RMSE of cv for each model:

min(fit.tree$results$RMSE)
## [1] 279.7316
min(rf.Hitters$results$RMSE)
## [1] 249.4588
min(boost.Hitters$results$RMSE)
## [1] 269.826

Random Forest seems to have the best RMSE of cv(249.459).

2. Which has the best test RMSE?

basictree_rmse <- rmse(Hitters.test, truth = Salary, estimate = tree_pred)

rf_rmse <- rmse(Hitters.test, truth = Salary, estimate = pred_rf)

boost_rmse <- rmse(Hitters.test, truth = Salary, estimate = pred_boost)

print(c(basictree_rmse, rf_rmse, boost_rmse))
## $.metric
## [1] "rmse"
## 
## $.estimator
## [1] "standard"
## 
## $.estimate
## [1] 379.0146
## 
## $.metric
## [1] "rmse"
## 
## $.estimator
## [1] "standard"
## 
## $.estimate
## [1] 355.308
## 
## $.metric
## [1] "rmse"
## 
## $.estimator
## [1] "standard"
## 
## $.estimate
## [1] 364.5122

Random Forest method seems the have the best test RMSE as well(355.308)

3. Which predictors were most “important” in each method?

varImp(fit.tree) |> plot()

varImp(boost.Hitters) |> plot()

varImp(rf.Hitters) |> plot()

Most Informative Predictors

  1. Basic Tree - CRBI > CAtBat > CHits > CRun

  2. Random Forest - CRun > CRBI > Walks > CHits > Years

  3. Boosting - CHits > CAtBat > CRBI > CWalks > CRun