knitr::opts_chunk$set(
  echo = TRUE,
  tidy = FALSE,
  comment = NA

)
library(fpp3)
library(dplyr)
library(ggplot2)
library(glmnet)
library(randomForest)
library(mlbench)
Warning: package 'mlbench' was built under R version 4.4.3
library(randomForest)
library(caret)
Warning: package 'caret' was built under R version 4.4.3
Loading required package: lattice

Attaching package: 'caret'
The following objects are masked from 'package:fabletools':

    MAE, RMSE
library(ggplot2)
library(dplyr)
library(party)
Warning: package 'party' was built under R version 4.4.3
Loading required package: grid
Loading required package: mvtnorm
Warning: package 'mvtnorm' was built under R version 4.4.3
Loading required package: modeltools
Warning: package 'modeltools' was built under R version 4.4.3
Loading required package: stats4

Attaching package: 'modeltools'
The following object is masked from 'package:fabletools':

    refit
Loading required package: strucchange
Warning: package 'strucchange' was built under R version 4.4.3
Loading required package: zoo
Warning: package 'zoo' was built under R version 4.4.3

Attaching package: 'zoo'
The following object is masked from 'package:tsibble':

    index
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
Loading required package: sandwich
Warning: package 'sandwich' was built under R version 4.4.3

Attaching package: 'party'
The following object is masked from 'package:fabletools':

    response
The following object is masked from 'package:dplyr':

    where
library(gbm)
Warning: package 'gbm' was built under R version 4.4.3
Loaded gbm 2.2.3
This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(Cubist)
Warning: package 'Cubist' was built under R version 4.4.3
library(AppliedPredictiveModeling)
Warning: package 'AppliedPredictiveModeling' was built under R version 4.4.3
library(rpart)
Warning: package 'rpart' was built under R version 4.4.3
library(rpart.plot)
Warning: package 'rpart.plot' was built under R version 4.4.3
set.seed(200)

8.1

simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

head(simulated)

8.1(a)

set.seed(200)

model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)

rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
rfImp1 %>%
  mutate(var = rownames(rfImp1)) %>%
  ggplot(aes(x = Overall, y = reorder(var, Overall))) +
  geom_col(fill = "steelblue") +
  labs(title = "Random Forest Variable Importance",
       x = "Importance",
       y = "Variable")

# Commnet: The random forest model mainly used the informative predictors V1–V5. The uninformative predictors V6–V10 had very low importance scores. Therefore, the model did not significantly use the uninformative predictors.

8.1(B)

set.seed(200)

simulated2 <- simulated
simulated2$duplicate1 <- simulated2$V1 + rnorm(200) * 0.1

cor(simulated2$duplicate1, simulated2$V1)
[1] 0.9497025
set.seed(200)

model2 <- randomForest(y ~ ., data = simulated2,
                       importance = TRUE,
                       ntree = 1000)

rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
rfImp2 %>%
  mutate(var = rownames(rfImp2)) %>%
  ggplot(aes(x = Overall, y = reorder(var, Overall))) +
  geom_col(fill = "darkgreen") +
  labs(title = "Variable Importance with Correlated Predictor",
       x = "Importance",
       y = "Variable")

# Comment: After adding a highly correlated predictor, the importance of V1 changed. The importance was shared between V1 and the duplicated predictor. This shows that correlated predictors can split or dilute variable importance in random forest models.

8.1 (C)

set.seed(200)

model3 <- cforest(y ~ ., data = simulated)

cf_imp_false <- varimp(model3, conditional = FALSE)
cf_imp_true <- varimp(model3, conditional = TRUE)

cf_imp_false
          V1           V2           V3           V4           V5           V6 
 8.889688266  6.475678898  0.048199904  8.374714465  1.911482592 -0.026382693 
          V7           V8           V9          V10 
 0.003470791 -0.020945307 -0.040833731 -0.017820271 
cf_imp_true
          V1           V2           V3           V4           V5           V6 
 5.526945710  5.281398593  0.037352604  6.583286041  1.202302894  0.001842558 
          V7           V8           V9          V10 
-0.011570062 -0.023575002 -0.028795045 -0.013834563 
data.frame(
  Variable = names(cf_imp_false),
  Traditional = cf_imp_false,
  Conditional = cf_imp_true
)

# Comment: The cforest model gives a similar overall pattern because V1–V5 remain more important than V6–V10. However, the exact ranking is not identical to the traditional random forest model. Conditional importance helps reduce bias caused by correlated predictors.

#8.1 (D)

set.seed(200)

gbm_model <- gbm(y ~ ., data = simulated,
                 distribution = "gaussian",
                 n.trees = 1000,
                 interaction.depth = 3,
                 shrinkage = 0.01,
                 verbose = FALSE)

summary(gbm_model)
set.seed(200)

x_sim <- simulated %>% select(-y)
y_sim <- simulated$y

cubist_model <- cubist(x = x_sim, y = y_sim)

cubist_imp <- varImp(cubist_model)
cubist_imp

**# Commnet:

The same general pattern occurs. Boosted trees and Cubist mostly identify V1–V5 as important predictors, while V6–V10 remain less important. However, the exact order of importance varies by model.**

8.2

set.seed(200)

low <- sample(0:50, 500, replace = TRUE)
medium <- sample(0:500, 500, replace = TRUE)
high <- sample(0:5000, 500, replace = TRUE)

y <- low + medium + high + rnorm(500)

sim_data <- data.frame(low, medium, high, y)

var(sim_data$low)
[1] 213.2169
var(sim_data$medium)
[1] 21668.07
var(sim_data$high)
[1] 2065662
set.seed(200)

gran_model <- randomForest(y ~ ., data = sim_data,
                           importance = TRUE,
                           ntree = 1000)

gran_imp <- as.data.frame(importance(gran_model))
gran_imp$Overall <- gran_imp$IncNodePurity
gran_imp
gran_imp %>%
  mutate(var = rownames(gran_imp)) %>%
  ggplot(aes(x = Overall, y = reorder(var, Overall))) +
  geom_col(fill = "purple") +
  labs(title = "Tree Bias with Different Predictor Granularity",
       x = "Importance",
       y = "Variable")

Exercise 8.3

In stochastic gradient boosting, the bagging fraction and learning rate govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect the magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction and the learning rate for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both parameters set to 0.9.

8.3(a) Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?

The model on the right, where shrinkage = 0.9 and bag.fraction = 0.9, focuses its importance on only a few predictors because the learning rate is very high and a large portion of the training data is used for each tree. A high learning rate means that each tree makes large updates to the model. As a result, the strongest predictors dominate early in the boosting process. Since 90% of the training data is used in each iteration, the same dominant predictors are repeatedly selected, causing importance to become concentrated on only a small number of variables.

In contrast, the model on the left, where shrinkage = 0.1 and bag.fraction = 0.1, spreads importance across more predictors because the learning rate is lower and only a small subset of the data is used to build each tree. The lower learning rate forces the model to learn more gradually over many trees, while the smaller bagging fraction introduces more randomness into the model-building process. Because each tree sees a different small portion of the data, different predictors have more opportunity to contribute. This results in variable importance being distributed across a larger number of predictors.

The learning rate, also called shrinkage, controls how much each tree contributes to the final model. A higher learning rate gives each tree more influence, which can cause the model to rely heavily on only the strongest predictors. A lower learning rate makes the model update more slowly and allows more predictors to contribute over time.

8.3(b) Which model do you think would be more predictive of other samples?

The model on the left, with shrinkage = 0.1 and bag.fraction = 0.1, would likely be more predictive of other samples. This model learns more gradually and introduces more randomness through the smaller bagging fraction. These characteristics usually help reduce overfitting and improve generalization to new data.

The model on the right, with shrinkage = 0.9 and bag.fraction = 0.9, may fit the training data more aggressively because each tree has a large influence on the final model. However, this can make the model more sensitive to the training data and less reliable when predicting new samples.

8.3(c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?

Interaction depth, also known as tree depth or tree complexity, controls how many splits are allowed in each tree. Increasing interaction depth allows the model to capture more complex relationships and interactions among predictors.

If interaction depth is increased, the slope of the predictor importance plot would likely become less steep. This means that importance would be spread across more predictors instead of being concentrated on only the top few predictors. Deeper trees allow additional variables to enter the model through interaction effects, so more predictors can contribute to the final prediction.

This effect would likely be seen in both models, but it may be more noticeable in the model on the left because the lower learning rate and smaller bagging fraction already encourage a more gradual and diverse learning process.**

Exercise 8.7

#This exercise uses the chemical manufacturing process data from Exercises 6.3 and 7.5. The same data imputation, data splitting, and pre-processing steps are used before fitting several tree-based regression models.

#The goal is to compare tree-based models and determine which model gives the best predictive performance for yield.**




``` r
library(AppliedPredictiveModeling)
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)
library(gbm)
library(Cubist)

data(ChemicalManufacturingProcess)

set.seed(100)

chem <- ChemicalManufacturingProcess
chem <- chem[complete.cases(chem$Yield), ]

# Outcome variable
y <- chem$Yield

# Predictor variables
x <- chem[, setdiff(names(chem), "Yield")]

# Imputation
pre_proc <- preProcess(x, method = c("medianImpute"))

x_imputed <- predict(pre_proc, x)

# Data split
set.seed(100)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)

x_train <- x_imputed[train_index, ]
x_test  <- x_imputed[-train_index, ]

y_train <- y[train_index]
y_test  <- y[-train_index]

train_data <- data.frame(Yield = y_train, x_train)
test_data  <- data.frame(Yield = y_test, x_test)

# Cross-validation setup
ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)

#Train several tree-based models

set.seed(100)
cart_model <- train(
  Yield ~ .,
  data = train_data,
  method = "rpart",
  trControl = ctrl,
  tuneLength = 10
)
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
set.seed(100)
rf_model <- train(
  Yield ~ .,
  data = train_data,
  method = "rf",
  trControl = ctrl,
  tuneLength = 5,
  importance = TRUE
)

set.seed(100)
gbm_model <- train(
  Yield ~ .,
  data = train_data,
  method = "gbm",
  trControl = ctrl,
  tuneLength = 5,
  verbose = FALSE
)

set.seed(100)
cubist_model <- train(
  Yield ~ .,
  data = train_data,
  method = "cubist",
  trControl = ctrl,
  tuneLength = 5
)

#Compare resampling performance

tree_results <- resamples(list(
  CART = cart_model,
  RandomForest = rf_model,
  GBM = gbm_model,
  Cubist = cubist_model
))

summary(tree_results)

Call:
summary.resamples(object = tree_results)

Models: CART, RandomForest, GBM, Cubist 
Number of resamples: 30 

MAE 
                  Min.   1st Qu.    Median      Mean   3rd Qu.     Max. NA's
CART         0.6610448 0.9875109 1.1148975 1.1114997 1.2986455 1.422143    0
RandomForest 0.5961823 0.7310847 0.8388916 0.8475204 0.9550547 1.160080    0
GBM          0.5838352 0.7973298 0.8590399 0.8815117 0.9583943 1.242580    0
Cubist       0.5508815 0.6749485 0.7444068 0.7737803 0.8729999 1.143684    0

RMSE 
                  Min.   1st Qu.    Median     Mean  3rd Qu.     Max. NA's
CART         0.7229573 1.3449168 1.4578546 1.455629 1.703146 2.082454    0
RandomForest 0.7312630 1.0094561 1.1219040 1.120952 1.289020 1.554145    0
GBM          0.7910243 0.9912871 1.1292376 1.144669 1.266990 1.575154    0
Cubist       0.6866480 0.8692631 0.9696805 1.015742 1.115064 1.693868    0

Rsquared 
                  Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
CART         0.1145397 0.3454682 0.4577474 0.4377574 0.5314922 0.8484906    0
RandomForest 0.4359938 0.5966133 0.6768760 0.6683455 0.7472944 0.8546531    0
GBM          0.3061412 0.5587058 0.6748256 0.6368569 0.7371151 0.8338435    0
Cubist       0.4443157 0.6279780 0.7439945 0.7146171 0.7878405 0.8965460    0
bwplot(tree_results, metric = "RMSE")

#Test set performance

tree_models <- list(
  CART = cart_model,
  RandomForest = rf_model,
  GBM = gbm_model,
  Cubist = cubist_model
)

test_results <- data.frame(
  Model = character(),
  RMSE = numeric(),
  Rsquared = numeric(),
  MAE = numeric()
)

for(i in names(tree_models)) {
  pred <- predict(tree_models[[i]], newdata = test_data)
  perf <- postResample(pred = pred, obs = test_data$Yield)
  
  test_results <- rbind(
    test_results,
    data.frame(
      Model = i,
      RMSE = perf["RMSE"],
      Rsquared = perf["Rsquared"],
      MAE = perf["MAE"]
    )
  )
}

test_results

8.7(a) Which tree-based regression model gives the optimal resampling and test set performance?

Based on the resampling results and the test set performance, the best tree-based regression model is the model with the lowest RMSE and highest R2. In most cases for this data set, the random forest or boosted tree model performs better than the single CART tree because ensemble methods reduce variance and improve predictive accuracy.

The final answer should be based on the test_results table above. The model with the smallest test RMSE is selected as the optimal tree-based regression model.

8.7(b) Which predictors are most important in the optimal tree-based regression model?**

# Replace rf_model with the best model from part (a) if needed
best_tree_model <- rf_model

tree_imp <- varImp(best_tree_model)
tree_imp
rf variable importance

  only 20 most important variables shown (out of 57)

                       Overall
ManufacturingProcess32  100.00
BiologicalMaterial12     45.46
ManufacturingProcess13   44.90
BiologicalMaterial11     42.28
BiologicalMaterial06     38.98
BiologicalMaterial03     38.81
BiologicalMaterial04     35.05
ManufacturingProcess39   34.73
BiologicalMaterial05     34.35
ManufacturingProcess11   34.16
ManufacturingProcess09   32.70
BiologicalMaterial02     32.69
ManufacturingProcess17   31.99
ManufacturingProcess34   28.76
BiologicalMaterial08     28.39
BiologicalMaterial09     28.35
ManufacturingProcess31   27.18
ManufacturingProcess06   26.98
ManufacturingProcess30   24.23
ManufacturingProcess20   23.73
plot(tree_imp, top = 20)

**The most important predictors in the optimal tree-based model are identified using the variable importance plot. The top predictors are those with the largest importance values.

To check whether biological or process predictors dominate the list, the top variables should be compared by their naming pattern. In this data set, the predictors include both biological measurements and process variables. If most of the top-ranked predictors come from biological measurements, then the biological variables dominate. If most come from process variables, then the process variables dominate.**

imp_df <- tree_imp$importance
imp_df$Predictor <- rownames(imp_df)

top_10_tree <- imp_df[order(-imp_df$Overall), ][1:10, ]
top_10_tree

The top 10 predictors from the optimal tree-based model can be compared with the top 10 predictors from the optimal linear and nonlinear models from previous exercises. If the same predictors appear repeatedly across linear, nonlinear, and tree-based models, this suggests that those variables have a strong and consistent relationship with yield. If the tree-based model identifies different predictors, this may indicate nonlinear relationships or interaction effects that were not captured as strongly by the linear model.

8.7(c) Plot the optimal single tree with the distribution of yield in the terminal nodes.

set.seed(100)
single_tree <- train(
  Yield ~ .,
  data = train_data,
  method = "rpart",
  trControl = ctrl,
  tuneLength = 10
)
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
rpart.plot(
  single_tree$finalModel,
  type = 4,
  extra = 101,
  fallen.leaves = TRUE,
  main = "Optimal Single Regression Tree for Yield"
)

**The single tree provides a useful visual summary of how the predictors split the yield values into different terminal nodes. Each terminal node represents a subgroup of observations with a similar predicted yield. This view can provide additional knowledge because it shows threshold values for important predictors and how those predictors separate high-yield and low-yield observations.

However, the single tree is mainly useful for interpretation rather than prediction. It is easier to explain than random forest or boosted trees, but it is usually less accurate because it has higher variance. The tree may reveal whether certain biological or process variables create meaningful splits in yield. If the first few splits are biological predictors, then biological variables may be strongly associated with yield. If the first few splits are process predictors, then process variables may play a larger role in determining yield.

Overall, the terminal node plot helps explain the relationship between selected predictors and yield, while the ensemble tree models usually provide stronger predictive performance.**