1 Exercise 8.1

Recreate the simulated data from Exercise 7.2:

>library(mlbench)
>set.seed(200)
>simulated <- mlbench.friedman1(200, sd = 1)
>simulated <- cbind(simulated$x, simulated$y)
>simulated <- as.data.frame(simulated)
>colnames(simulated)[ncol(simulated)] <- "y"

1.1 8.1(a):

Fit a random forest model to all of the predictors, then estimate the variable importance scores:

library(randomForest)
library(caret)
model1 <- randomForest(y ~., data = simulated,
                        importance = TRUE,
                        ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)

Did the random forest model significantly use the uninformative predictors (v6-v10)?

set.seed(200)
simulated_raw <- mlbench.friedman1(200, sd = 1)

# recreate the Friedman simulation with 10 predictors and one response
simulated <- as.data.frame(cbind(simulated_raw$x, y = simulated_raw$y))
colnames(simulated) <- c(paste0("V", 1:10), "y")

# V1 to V5 are informative predictors, while V6 to V10 are noise variables.
head(simulated)
set.seed(RANDOM_STATE)
model_rf_original <- randomForest(
  y ~ .,
  data = simulated,
  importance = TRUE,
  ntree = 1000
)

# use %IncMSE to measure random forest variable importance for regression
rf_imp_original <- importance(model_rf_original, scale = FALSE) %>%
  as.data.frame() %>%
  rownames_to_column("Predictor")

importance_col <- ifelse("%IncMSE" %in% names(rf_imp_original), "%IncMSE", names(rf_imp_original)[2])

rf_imp_original <- rf_imp_original %>%
  mutate(
    Importance = .data[[importance_col]],
    Predictor_Type = ifelse(Predictor %in% paste0("V", 1:5),
                            "Informative: V1-V5",
                            "Noise: V6-V10")
  ) %>%
  arrange(desc(Importance))

kable(rf_imp_original[, c("Predictor", "Predictor_Type", "Importance")],
      digits = 3,
      caption = "Random forest variable importance for the original Friedman data")
Random forest variable importance for the original Friedman data
Predictor Predictor_Type Importance
V1 Informative: V1-V5 8.841
V4 Informative: V1-V5 7.760
V2 Informative: V1-V5 6.481
V5 Informative: V1-V5 2.156
V3 Informative: V1-V5 0.814
V6 Noise: V6-V10 0.103
V7 Noise: V6-V10 0.039
V10 Noise: V6-V10 -0.090
V9 Noise: V6-V10 -0.145
V8 Noise: V6-V10 -0.161
# plot importance scores to compare informative and noise predictors
ggplot(rf_imp_original, aes(x = reorder(Predictor, Importance), y = Importance)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Random Forest Variable Importance: Original Data",
    x = "Predictor",
    y = "Importance (%IncMSE)"
  )

rf_imp_original_summary <- rf_imp_original %>%
  group_by(Predictor_Type) %>%
  summarize(
    Mean_Importance = mean(Importance),
    Max_Importance = max(Importance),
    .groups = "drop"
  )

kable(rf_imp_original_summary,
      digits = 3,
      caption = "Average and maximum importance by predictor type")
Average and maximum importance by predictor type
Predictor_Type Mean_Importance Max_Importance
Informative: V1-V5 5.211 8.841
Noise: V6-V10 -0.051 0.103

The random forest mostly gives larger importance to the truly informative variables (V1 to V5). The uninformative predictors (V6 to V10) may still receive small nonzero importance values because of random sampling variation, but they should not dominate the importance list. Based on the table and plot, I would not say the model significantly used the uninformative predictors unless one of V6 to V10 appears near the top with importance similar to the true predictors.

1.2 8.1(b):

Now add an additional predictor that is highly correlated with one of the informative predictors. For example: simulated\(duplicate1 <- simulated\)V1 + rnorm(200) * .1 cor(simulated\(duplicated1, simulated\)V1)

Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?

set.seed(RANDOM_STATE)

# add duplicate1 as a highly correlated copy of V1
simulated_dup1 <- simulated %>%
  mutate(duplicate1 = V1 + rnorm(n(), sd = 0.1))

cor(simulated_dup1$duplicate1, simulated_dup1$V1)
## [1] 0.9429376
set.seed(RANDOM_STATE)
model_rf_duplicate1 <- randomForest(
  y ~ .,
  data = simulated_dup1,
  importance = TRUE,
  ntree = 1000
)

# extract variable importance after adding one correlated predictor
rf_imp_duplicate1 <- importance(model_rf_duplicate1, scale = FALSE) %>%
  as.data.frame() %>%
  rownames_to_column("Predictor")

importance_col_dup1 <- ifelse(
  "%IncMSE" %in% names(rf_imp_duplicate1),
  "%IncMSE",
  names(rf_imp_duplicate1)[2]
)

rf_imp_duplicate1 <- rf_imp_duplicate1 %>%
  mutate(
    Importance = .data[[importance_col_dup1]],
    Predictor_Type = case_when(
      Predictor %in% paste0("V", 1:5) ~ "Informative: V1-V5",
      Predictor == "duplicate1" ~ "Highly correlated copy of V1",
      TRUE ~ "Noise: V6-V10"
    )
  ) %>%
  arrange(desc(Importance))

kable(
  rf_imp_duplicate1[, c("Predictor", "Predictor_Type", "Importance")],
  digits = 3,
  caption = "Random forest variable importance after adding duplicate1"
)
Random forest variable importance after adding duplicate1
Predictor Predictor_Type Importance
V4 Informative: V1-V5 7.242
V2 Informative: V1-V5 6.219
V1 Informative: V1-V5 5.971
duplicate1 Highly correlated copy of V1 3.696
V5 Informative: V1-V5 2.004
V3 Informative: V1-V5 0.634
V6 Noise: V6-V10 0.168
V7 Noise: V6-V10 0.078
V8 Noise: V6-V10 -0.028
V9 Noise: V6-V10 -0.083
V10 Noise: V6-V10 -0.091
set.seed(RANDOM_STATE + 1)

# add duplicate2 as another highly correlated copy of V1
simulated_dup2 <- simulated_dup1 %>%
  mutate(duplicate2 = V1 + rnorm(n(), sd = 0.1))

cor(simulated_dup2[, c("V1", "duplicate1", "duplicate2")])
##                   V1 duplicate1 duplicate2
## V1         1.0000000  0.9429376  0.9419378
## duplicate1 0.9429376  1.0000000  0.8887843
## duplicate2 0.9419378  0.8887843  1.0000000
set.seed(RANDOM_STATE)

model_rf_duplicate2 <- randomForest(
  y ~ .,
  data = simulated_dup2,
  importance = TRUE,
  ntree = 1000
)

# extract variable importance after adding two correlated predictos
rf_imp_duplicate2 <- importance(model_rf_duplicate2, scale = FALSE) %>%
  as.data.frame() %>%
  rownames_to_column("Predictor")

importance_col_dup2 <- ifelse(
  "%IncMSE" %in% names(rf_imp_duplicate2),
  "%IncMSE",
  names(rf_imp_duplicate2)[2]
)

rf_imp_duplicate2 <- rf_imp_duplicate2 %>%
  mutate(
    Importance = .data[[importance_col_dup2]],
    Predictor_Type = case_when(
      Predictor %in% paste0("V", 1:5) ~ "Informative: V1-V5",
      Predictor %in% c("duplicate1", "duplicate2") ~ "Highly correlated copy of V1",
      TRUE ~ "Noise: V6-V10"
    )
  ) %>%
  arrange(desc(Importance))

kable(
  rf_imp_duplicate2[, c("Predictor", "Predictor_Type", "Importance")],
  digits = 3,
  caption = "Random forest variable importance after adding duplicate1 and duplicate2"
)
Random forest variable importance after adding duplicate1 and duplicate2
Predictor Predictor_Type Importance
V4 Informative: V1-V5 7.493
V2 Informative: V1-V5 6.388
V1 Informative: V1-V5 4.788
duplicate2 Highly correlated copy of V1 2.820
duplicate1 Highly correlated copy of V1 2.557
V5 Informative: V1-V5 1.877
V3 Informative: V1-V5 0.480
V6 Noise: V6-V10 0.142
V7 Noise: V6-V10 0.031
V9 Noise: V6-V10 -0.009
V10 Noise: V6-V10 -0.009
V8 Noise: V6-V10 -0.042
# compare V1 importance before and after adding the correlated predictor
compare_v1 <- bind_rows(
  rf_imp_original %>%
    filter(Predictor == "V1") %>%
    transmute(Model = "Original data", Predictor, Importance),
  
  rf_imp_duplicate1 %>%
    filter(Predictor %in% c("V1", "duplicate1")) %>%
    transmute(Model = "With duplicate1", Predictor, Importance),
  
  rf_imp_duplicate2 %>%
    filter(Predictor %in% c("V1", "duplicate1", "duplicate2")) %>%
    transmute(Model = "With duplicate1 and duplicate2", Predictor, Importance)
)

kable(
  compare_v1,
  digits = 3,
  caption = "Change in V1 importance after adding correlated predictors"
)
Change in V1 importance after adding correlated predictors
Model Predictor Importance
Original data V1 8.841
With duplicate1 V1 5.971
With duplicate1 duplicate1 3.696
With duplicate1 and duplicate2 V1 4.788
With duplicate1 and duplicate2 duplicate2 2.820
With duplicate1 and duplicate2 duplicate1 2.557
ggplot(rf_imp_duplicate2, aes(x = reorder(Predictor, Importance), y = Importance)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Random Forest Variable Importance: With duplicate1 and duplicate2",
    x = "Predictor",
    y = "Importance (%IncMSE)"
  )

After adding duplicate1, the importance of V1 decreases because the new predictor contains almost the same information as V1. The random forest can split on either variable, so the importance gets shared between them.

After adding duplicate2, the same pattern becomes stronger. The information originally represented by V1 is now spread across V1, duplicate1, and duplicate2. This shows that random forest variable importance can be diluted when several predictors are highly correlated with each other. The predictor may still be important, but its importance score may look smaller because other correlated variables are competing with it.

1.3 8.1(c):

Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?

set.seed(RANDOM_STATE)

# fit a conditional inference forest to compare traditional and conditional importance
cforest_fit <- cforest(
  y ~ .,
  data = simulated_dup1,
  controls = cforest_unbiased(ntree = 500, mtry = 3)
)

# conditional = FALSE gives the regular permutation importance
cf_imp_unconditional <- varimp(cforest_fit, conditional = FALSE)

#  conditional = TRUE adjusts importance for correlated predictors
cf_imp_conditional <- varimp(cforest_fit, conditional = TRUE)

cf_importance <- tibble(
  Predictor = names(cf_imp_unconditional),
  Unconditional_Importance = as.numeric(cf_imp_unconditional),
  Conditional_Importance = as.numeric(cf_imp_conditional)
) %>%
  mutate(
    Predictor_Type = case_when(
      Predictor %in% paste0("V", 1:5) ~ "Informative: V1-V5", 
      Predictor == "duplicate1" ~ "Highly correlated duplicate of V1",
      TRUE ~ "Noise: V6-V10"
    )
  ) %>%
  arrange(desc(Conditional_Importance))

kable(cf_importance,
      digits = 3,
      caption = "Conditional inference forest variable importance")
Conditional inference forest variable importance
Predictor Unconditional_Importance Conditional_Importance Predictor_Type
V4 5.926 4.237 Informative: V1-V5
V2 4.847 3.445 Informative: V1-V5
V1 5.896 2.570 Informative: V1-V5
duplicate1 3.169 1.185 Highly correlated duplicate of V1
V5 1.869 1.003 Informative: V1-V5
V3 0.067 0.059 Informative: V1-V5
V9 -0.028 0.021 Noise: V6-V10
V7 -0.045 0.002 Noise: V6-V10
V8 -0.014 0.000 Noise: V6-V10
V6 -0.010 0.000 Noise: V6-V10
V10 -0.054 -0.036 Noise: V6-V10
cf_importance_long <- cf_importance %>%
  pivot_longer(
    cols = c(Unconditional_Importance, Conditional_Importance),
    names_to = "Importance_Type",
    values_to = "Importance"
  )

ggplot(cf_importance_long,
       aes(x = reorder(Predictor, Importance), y = Importance)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ Importance_Type, scales = "free_x") +
  labs(
    title = "cforest Variable Importance",
    x = "Predictor",
    y = "Importance"
  )

The cforest results show a similar overall pattern to the traditional random forest. The informative predictors still have the largest importance values, while the noise variables have importance values close to zero. However, the conditional importance measure reduces the effect of correlated predictors. In this case, V1 and duplicate1 are still important, but the conditional importance gives a more cautious estimate because the two variables contain similar information.

1.4 8.1(d):

Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?

# use a small repeated cross-validation setup for this simulation exercise
ctrl_small <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2
)

# set a small tuning grid for boosted regression trees
gbm_grid <- expand.grid(
  n.trees = c(100, 300),
  interaction.depth = c(1, 3),
  shrinkage = c(0.05, 0.10),
  n.minobsinnode = c(5, 10)
)

# set a small tuning grid for Cubist models
cubist_grid <- expand.grid(
  committees = c(1, 10, 50),
  neighbors = c(0, 5)
)
set.seed(RANDOM_STATE)
gbm_original <- train(
  y ~ .,
  data = simulated,
  method = "gbm",
  trControl = ctrl_small,
  tuneGrid = gbm_grid,
  metric = "RMSE",
  verbose = FALSE
)

set.seed(RANDOM_STATE)
cubist_original <- train(
  y ~ .,
  data = simulated,
  method = "cubist",
  trControl = ctrl_small,
  tuneGrid = cubist_grid,
  metric = "RMSE"
)
set.seed(RANDOM_STATE)
gbm_duplicate <- train(
  y ~ .,
  data = simulated_dup1,
  method = "gbm",
  trControl = ctrl_small,
  tuneGrid = gbm_grid,
  metric = "RMSE",
  verbose = FALSE
)

set.seed(RANDOM_STATE)
cubist_duplicate <- train(
  y ~ .,
  data = simulated_dup1,
  method = "cubist",
  trControl = ctrl_small,
  tuneGrid = cubist_grid,
  metric = "RMSE"
)
# use this helper function to extract caret variable importance in a clean format
extract_caret_imp <- function(fit, model_name) {
  out <- varImp(fit, scale = FALSE)$importance %>%
    as.data.frame() %>%
    rownames_to_column("Predictor")
  
  if (!"Overall" %in% names(out)) {
    numeric_cols <- names(out)[sapply(out, is.numeric)]
    out$Overall <- rowMeans(out[, numeric_cols, drop = FALSE])
  }
  
  out %>% 
    transmute(Model = model_name, Predictor, Importance = Overall) %>%
    arrange(desc(Importance))
}

imp_gbm_original <- extract_caret_imp(gbm_original, "GBM: original")
imp_gbm_duplicate <- extract_caret_imp(gbm_duplicate, "GBM: with duplicate1")
imp_cubist_original <- extract_caret_imp(cubist_original, "Cubist: original")
imp_cubist_duplicate <- extract_caret_imp(cubist_duplicate, "Cubist: with duplicate1")

imp_tree_methods <- bind_rows(
  imp_gbm_original,
  imp_gbm_duplicate,
  imp_cubist_original,
  imp_cubist_duplicate
)

kable(imp_tree_methods %>% filter(Predictor %in% c("V1", "duplicate1")),
      digits = 3,
      caption = "Importance of V1 and duplicate1 for GBM and Cubist")
Importance of V1 and duplicate1 for GBM and Cubist
Model Predictor Importance
GBM: original V1 8215.241
GBM: with duplicate1 V1 5415.667
GBM: with duplicate1 duplicate1 3211.120
Cubist: original V1 70.500
Cubist: with duplicate1 V1 69.500
Cubist: with duplicate1 duplicate1 0.000
# keep only positive-importance predictors and remove extra tied rows
top_predictors_tree_methods <- imp_tree_methods %>%
  filter(Importance > 0) %>%
  group_by(Model) %>%
  slice_max(Importance, n = 8, with_ties = FALSE) %>%
  ungroup()

kable(
  top_predictors_tree_methods,
  digits = 3,
  caption = "Top positive-importance predictors from boosted trees and Cubist"
)
Top positive-importance predictors from boosted trees and Cubist
Model Predictor Importance
Cubist: original V1 70.500
Cubist: original V2 54.500
Cubist: original V4 50.000
Cubist: original V5 47.500
Cubist: original V3 41.000
Cubist: with duplicate1 V1 69.500
Cubist: with duplicate1 V2 54.500
Cubist: with duplicate1 V4 50.000
Cubist: with duplicate1 V5 47.000
Cubist: with duplicate1 V3 41.000
Cubist: with duplicate1 V6 14.000
GBM: original V4 9021.022
GBM: original V1 8215.241
GBM: original V2 6756.323
GBM: original V5 4038.332
GBM: original V3 2699.320
GBM: original V7 501.379
GBM: original V6 399.003
GBM: original V8 249.501
GBM: with duplicate1 V4 9180.565
GBM: with duplicate1 V2 6458.622
GBM: with duplicate1 V1 5415.667
GBM: with duplicate1 V5 3972.630
GBM: with duplicate1 duplicate1 3211.120
GBM: with duplicate1 V3 2792.299
GBM: with duplicate1 V7 453.513
GBM: with duplicate1 V6 417.806

The same general issue appears in boosted trees, but not exactly the same way for Cubist. In the GBM model, adding duplicate1 reduces the importance of V1 and gives duplicate1 a noticeable importance score. This suggests that boosted trees can split importance between two highly correlated predictors, similar to what happened in the random forest model.

For Cubist, the pattern is different. The model mostly keeps V1 and gives duplicate1 almost no importance. This means Cubist selected one of the correlated predictors and largely ignored the other. In this case, Cubist did not split importance between V1 and duplicate1 the same way GBM did.

Overall, correlated predictors can affect variable importance, but the effect depends on the tree-based method. GBM spreads some importance across correlated predictors, while Cubist tends to choose one of the correlated predictors. Even though a few noise variables may receive small positive importance values, the truly informative predictors still dominate the importance lists.

Fig.8.24: A comparison of variable importance magnitudes for differing values of the bagging fraction and shrinkage parameters. Both tuning parameters are set to 0.1 in the left figure. Both are set to 0.9 in the right figure

2 Exercise 8.2

Use a simulation to show tree bias with different granularities.

# create simulated data where all predictors are noise but have different granularities
make_bias_data <- function(n = 300) {
  tibble(
    y = rnorm(n),
    x_binary = sample(0:1, n, replace = TRUE),
    x_5_levels = sample(1:5, n, replace = TRUE),
    x_10_levels = sample(1:10, n, replace = TRUE),
    x_50_levels = sample(1:50, n, replace = TRUE),
    x_continuous = runif(n)
  )
}

# fit one tree per simulation and record which variable is chosen for the first split
one_tree_bias_run <- function(seed_id, n = 300) {
  set.seed(seed_id)
  dat <- make_bias_data(n)
  
  fit <- rpart(
    y ~ .,
    data = dat,
    method = "anova",
    control = rpart.control(cp = 0, minsplit = 20, maxdepth = 3)
  )
  
  tibble(
    Replication = seed_id,
    Root_Split = fit$frame$var[1]
  )
}
set.seed (RANDOM_STATE)
root_results <- map_dfr(1:200, one_tree_bias_run)

# count how often each noise predictor is selected as the root split
root_summary <- root_results %>%
  count(Root_Split, name = "Times_Selected") %>%
  mutate(Percent_Selected = Times_Selected / sum(Times_Selected) * 100) %>%
  arrange(desc(Times_Selected))

kable(root_summary,
      digits = 2,
      caption = "How often each noise predictor was selected as the root split")
How often each noise predictor was selected as the root split
Root_Split Times_Selected Percent_Selected
x_continuous 91 45.5
x_50_levels 56 28.0
x_5_levels 24 12.0
x_10_levels 23 11.5
x_binary 6 3.0
ggplot(root_summary, aes(x = reorder(Root_Split, Percent_Selected), y = Percent_Selected)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Tree Bias Simulation: Root Split Selection",
    x = "Predictor",
    y = "Percent selected as root split"
  )

set.seed(RANDOM_STATE)
bias_data <- make_bias_data(n = 1000)

# fit a random forest to check importance bias when all predictors are noise
bias_rf <- randomForest(
  y ~ .,
  data = bias_data,
  ntree = 1000,
  importance = TRUE
)

bias_rf_imp <- importance(bias_rf, scale = FALSE) %>%
  as.data.frame() %>%
  rownames_to_column("Predictor")

importance_col_bias <- ifelse("%IncMSE" %in% names(bias_rf_imp), "%IncMSE", names(bias_rf_imp)[2])

bias_rf_imp <- bias_rf_imp %>%
  transmute(Predictor, Importance = .data[[importance_col_bias]]) %>%
  arrange(desc(Importance))

kable(bias_rf_imp,
      digits = 3,
      caption = "Random forest importance when all predictors are noise")
Random forest importance when all predictors are noise
Predictor Importance
x_50_levels 0.007
x_5_levels 0.006
x_10_levels 0.005
x_continuous 0.002
x_binary -0.001
ggplot(bias_rf_imp, aes(x = reorder(Predictor, Importance), y = Importance)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Variable Importance Bias When All Predictors Are Noise",
    x = "Predictor",
    y = "Importance (%IncMSE)"
  )

In this simulation, all predictors are noise, so no predictor should truly be important. However, the single-tree simulation shows that the tree often selects predictors with more possible split points as the root split. The continuous predictor was selected most often, followed by the predictor with 50 levels, while the binary predictor was selected least often. This happens because predictors with more possible splits have more chances to create an apparently useful split by random chance.

The random forest importance results show the same general issue, although the importance values are very small because none of the predictors are truly related to the response. Overall, this simulation shows that tree-based models can have selection bias when predictors have different granularities.

3 Exercise 8.3

In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:

3.1 8.3(a):

Why does the model on the right focus its importance on just the first few of predictors, whereas the model on the left spreads importance across more predictors?

The model on the right focuses its importance on only a few predictors because both tuning parameters are large. A high learning rate allows each tree to make a large correction to the model, and a high bagging fraction means each tree is trained on a larger portion of the data. Together, these settings make the boosting process move more aggressively toward the strongest predictors early in model building.

The model on the left spreads importance across more predictors because the learning rate and bagging fraction are both smaller. Each tree has a smaller effect, so the model learns more gradually. As a result, more predictors have a chance to contribute to the final model.

3.2 8.3(b):

which model do you think would be more predictive of other samples?

I would expect the model on the left to be more predictive for new samples, assuming enough trees are used. The lower learning rate and smaller bagging fraction make the model more regularized, which usually helps reduce overfitting. The model on the right may fit the training data quickly, but it is more likely to overfit because the variable importance is concentrated heavily on only a few predictors.

3.3 8.3(c):

How would increasing interaction depth affect the slope of predictor importance for either model in Fig.8.24?

Increasing the interaction depth would allow each tree to use more splits and capture more complex interactions among predictors. This could make the variable importance curve less steep because more predictors could become useful through interactions, not only through their individual effects. However, if the true signal is concentrated in only a few predictors, those predictors may still stay near the top. A larger interaction depth can improve flexibility, but it can also increase the risk of overfitting.

4 Exercise 8.7

Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:

# load the chemical manufacturing data and separate predictors from Yield
data(ChemicalManufacturingProcess)

chem_data <- ChemicalManufacturingProcess
predictors <- chem_data %>% select(-Yield)
yield <- chem_data$Yield

data_overview <- tibble(
  Rows = nrow(chem_data),
  Predictors = ncol(predictors),
  Missing_Predictor_Cells = sum(is.na(predictors)),
  Yield_Mean = mean(yield),
  Yield_SD = sd(yield)
)

kable(
  data_overview,
  digits = 3,
  caption = "Chemical manufacturing data overview"
)
Chemical manufacturing data overview
Rows Predictors Missing_Predictor_Cells Yield_Mean Yield_SD
176 57 106 40.177 1.846
# use the same 70/30 split and preprocessing workflow from Exercise 7.5 for a consistent comparison
set.seed(901)

training_rows <- createDataPartition(yield, p = 0.70, list = FALSE)

train_predictors <- predictors[training_rows, ]
train_yield <- yield[training_rows]

test_predictors <- predictors[-training_rows, ]
test_yield <- yield[-training_rows]

# apply the same transformation, scaling, and kNN imputation from Exercise 7.5
pp <- preProcess(
  train_predictors,
  method = c("YeoJohnson", "center", "scale", "knnImpute")
)

pp_train_predictors <- predict(pp, train_predictors)
pp_test_predictors <- predict(pp, test_predictors)

# remove near-zero variance predictors after preprocessing
nzvpp <- nearZeroVar(pp_train_predictors)

if (length(nzvpp) > 0) {
  pp_train_predictors <- pp_train_predictors[, -nzvpp]
  pp_test_predictors <- pp_test_predictors[, -nzvpp]
}

# remove highly correlated predictors to match the reduced predictor set from Exercise 7.5
predcorr <- cor(pp_train_predictors)
highCorrpp <- findCorrelation(predcorr)

if (length(highCorrpp) > 0) {
  pp_train_predictors <- pp_train_predictors[, -highCorrpp]
  pp_test_predictors <- pp_test_predictors[, -highCorrpp]
}

# rename the processed training and test data for easier model fitting
x_train <- pp_train_predictors
y_train <- train_yield

x_test <- pp_test_predictors
y_test <- test_yield

preprocess_overview <- tibble(
  Training_Rows = nrow(x_train),
  Training_Predictors = ncol(x_train),
  Test_Rows = nrow(x_test),
  Test_Predictors = ncol(x_test),
  Missing_Training_Values_After_Preprocessing = sum(is.na(x_train)),
  Missing_Test_Values_After_Preprocessing = sum(is.na(x_test))
)

kable(
  preprocess_overview,
  caption = "Exercise 8.7 data after preprocessing"
)
Exercise 8.7 data after preprocessing
Training_Rows Training_Predictors Test_Rows Test_Predictors Missing_Training_Values_After_Preprocessing Missing_Test_Values_After_Preprocessing
124 46 52 46 0 0
# use bootstrap resampling to stay consistent with the earlier nonlinear model comparison
ctrl_87 <- trainControl(
  method = "boot",
  number = 25
)

4.1 8.7(a):

Which tree-based regression model gives the optimal resampling and test set performance?

# fit a single regression tree as an interpretable baseline model
set.seed(415)
rpart_fit <- train(
  x = x_train,
  y = y_train,
  method = "rpart",
  trControl = ctrl_87,
  tuneLength = 20,
  metric = "RMSE"
)

# fit a random forest to improve stability by averaging many trees
set.seed(415)
rf_fit <- train(
  x = x_train,
  y = y_train,
  method = "rf",
  trControl = ctrl_87,
  tuneGrid = expand.grid(
    mtry = unique(c(2, floor(sqrt(ncol(x_train))), floor(ncol(x_train) / 3), floor(ncol(x_train) / 2), ncol(x_train)))
  ),
  metric = "RMSE",
  importance = TRUE
)

# fit boosted trees using a moderate tuning grid.
gbm_grid <- expand.grid(
  n.trees = c(100, 300, 500),
  interaction.depth = c(1, 3, 5),
  shrinkage = c(0.05, 0.10),
  n.minobsinnode = 10
)

set.seed(415)
gbm_fit <- train(
  x = x_train,
  y = y_train,
  method = "gbm",
  trControl = ctrl_87,
  tuneGrid = gbm_grid,
  metric = "RMSE",
  verbose = FALSE
)

# fit Cubist because it combines rule-based splits with linear models
cubist_grid <- expand.grid(
  committees = c(1, 10, 20, 50),
  neighbors = c(0, 5, 9)
)

set.seed(415)
cubist_fit <- train(
  x = x_train,
  y = y_train,
  method = "cubist",
  trControl = ctrl_87,
  tuneGrid = cubist_grid,
  metric = "RMSE"
)
# extract each model's best resampling result for a clean comparison table
get_best_resample <- function(model, model_name) {
  best <- model$results
  
  for (col in names(model$bestTune)) {
    best <- best[best[[col]] == model$bestTune[[col]], ]
  }
  
  best %>%
    slice(1) %>%
    mutate(Model = model_name) %>%
    select(Model, everything())
}

tree_resampling_results <- bind_rows(
  get_best_resample(rpart_fit, "Single Regression Tree"),
  get_best_resample(rf_fit, "Random Forest"),
  get_best_resample(gbm_fit, "Boosted Trees"),
  get_best_resample(cubist_fit, "Cubist")
) %>%
  arrange(RMSE)

kable(
  tree_resampling_results,
  digits = 3,
  caption = "Best resampling performance for tree-based models"
)
Best resampling performance for tree-based models
Model cp RMSE Rsquared MAE RMSESD RsquaredSD MAESD mtry shrinkage interaction.depth n.minobsinnode n.trees committees neighbors
Cubist NA 1.097 0.604 0.839 0.153 0.096 0.121 NA NA NA NA NA 50 5
Random Forest NA 1.156 0.576 0.896 0.129 0.093 0.114 23 NA NA NA NA NA NA
Boosted Trees NA 1.192 0.533 0.909 0.095 0.078 0.094 NA 0.05 5 10 300 NA NA
Single Regression Tree 0.16 1.427 0.353 1.131 0.148 0.107 0.119 NA NA NA NA NA NA NA
# calculate test-set RMSE, R-squared, and MAE for each model
get_test_performance <- function(model, model_name) {
  pred <- predict(model, newdata = x_test)
  perf <- postResample(pred = pred, obs = y_test)
  
  tibble(
    Model = model_name,
    Test_RMSE = unname(perf["RMSE"]),
    Test_Rsquared = unname(perf["Rsquared"]),
    Test_MAE = unname(perf["MAE"])
  )
}

tree_test_results <- bind_rows(
  get_test_performance(rpart_fit, "Single Regression Tree"),
  get_test_performance(rf_fit, "Random Forest"),
  get_test_performance(gbm_fit, "Boosted Trees"),
  get_test_performance(cubist_fit, "Cubist")
) %>%
  arrange(Test_RMSE)

kable(
  tree_test_results,
  digits = 3,
  caption = "Test-set performance for tree-based models"
)
Test-set performance for tree-based models
Model Test_RMSE Test_Rsquared Test_MAE
Cubist 1.083 0.722 0.821
Random Forest 1.247 0.673 0.997
Boosted Trees 1.267 0.613 1.016
Single Regression Tree 1.544 0.426 1.225
# select the best tree-based model based on the lowest test-set RMSE
best_tree_model_name <- tree_test_results$Model[1]

model_list <- list(
  "Single Regression Tree" = rpart_fit,
  "Random Forest" = rf_fit,
  "Boosted Trees" = gbm_fit,
  "Cubist" = cubist_fit
)

best_tree_model <- model_list[[best_tree_model_name]]

best_model_summary <- tibble(
  Best_Model = best_tree_model_name,
  Best_Test_RMSE = tree_test_results$Test_RMSE[1],
  Best_Test_Rsquared = tree_test_results$Test_Rsquared[1],
  Best_Test_MAE = tree_test_results$Test_MAE[1]
)

kable(
  best_model_summary,
  digits = 3,
  caption = "Best tree-based model based on test-set RMSE"
)
Best tree-based model based on test-set RMSE
Best_Model Best_Test_RMSE Best_Test_Rsquared Best_Test_MAE
Cubist 1.083 0.722 0.821

The tree-based regression models were compared using bootstrap resampling and then evaluated on the hold-out test set. I used RMSE as the main comparison metric because Yield is a continuous response variable, and RMSE measures prediction error in the original units of yield.

Based on the test-set results, the optimal tree-based model is Cubist, because it has the lowest test RMSE among the models considered. The resampling results are useful for tuning the models, but the test-set results are the more important final comparison because they evaluate prediction performance on new data.

4.2 8.7(b):

Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?

# extract variable importance from the best tree-based model
best_varimp_raw <- varImp(best_tree_model, scale = FALSE)$importance

if (!"Overall" %in% names(best_varimp_raw)) {
  numeric_cols <- names(best_varimp_raw)[sapply(best_varimp_raw, is.numeric)]
  best_varimp_raw$Overall <- rowMeans(best_varimp_raw[, numeric_cols, drop = FALSE], na.rm = TRUE)
}

best_varimp <- best_varimp_raw %>%
  rownames_to_column("Predictor") %>%
  select(Predictor, Overall) %>%
  arrange(desc(Overall))

top10_tree_predictors <- best_varimp %>%
  slice(1:10) %>%
  mutate(
    Variable_Group = case_when(
      str_detect(Predictor, "^BiologicalMaterial") ~ "Biological",
      str_detect(Predictor, "^ManufacturingProcess") ~ "Process",
      TRUE ~ "Other"
    )
  )

kable(
  top10_tree_predictors,
  digits = 3,
  caption = "Top 10 important predictors from the best tree-based model"
)
Top 10 important predictors from the best tree-based model
Predictor Overall Variable_Group
ManufacturingProcess32 48.0 Process
ManufacturingProcess09 28.0 Process
ManufacturingProcess33 24.0 Process
ManufacturingProcess04 17.5 Process
ManufacturingProcess45 15.0 Process
ManufacturingProcess15 12.0 Process
ManufacturingProcess13 11.5 Process
BiologicalMaterial06 11.0 Biological
BiologicalMaterial03 9.5 Biological
ManufacturingProcess11 6.5 Process
# plot the top 10 predictors from the best tree-based model
ggplot(top10_tree_predictors,
       aes(x = reorder(Predictor, Overall), y = Overall, fill = Variable_Group)) +
  geom_col() +
  coord_flip() +
  labs(
    title = paste("Top 10 Important Predictors from", best_tree_model_name),
    x = "Predictor",
    y = "Importance",
    fill = "Variable Group"
  )

# count how many top predictors are biological versus process variables
top10_group_summary <- top10_tree_predictors %>%
  count(Variable_Group)

kable(
  top10_group_summary,
  caption = "Biological versus process variables in the top 10 tree-based predictors"
)
Biological versus process variables in the top 10 tree-based predictors
Variable_Group n
Biological 2
Process 8
# compare the tree-based top predictors with the previous linear and nonlinear models
linear_top10 <- c(
  "ManufacturingProcess32",
  "ManufacturingProcess36",
  "ManufacturingProcess13",
  "ManufacturingProcess09",
  "ManufacturingProcess17",
  "BiologicalMaterial06",
  "ManufacturingProcess33",
  "BiologicalMaterial08",
  "BiologicalMaterial01",
  "BiologicalMaterial03"
)

nonlinear_top_predictors <- c(
  "ManufacturingProcess32",
  "ManufacturingProcess09"
)

predictor_comparison <- top10_tree_predictors %>%
  mutate(
    In_Top10_Linear_Model = Predictor %in% linear_top10,
    In_Nonlinear_MARS_Model = Predictor %in% nonlinear_top_predictors
  )

kable(
  predictor_comparison,
  digits = 3,
  caption = "Comparison of top tree-based predictors with previous linear and nonlinear models"
)
Comparison of top tree-based predictors with previous linear and nonlinear models
Predictor Overall Variable_Group In_Top10_Linear_Model In_Nonlinear_MARS_Model
ManufacturingProcess32 48.0 Process TRUE TRUE
ManufacturingProcess09 28.0 Process TRUE TRUE
ManufacturingProcess33 24.0 Process TRUE FALSE
ManufacturingProcess04 17.5 Process FALSE FALSE
ManufacturingProcess45 15.0 Process FALSE FALSE
ManufacturingProcess15 12.0 Process FALSE FALSE
ManufacturingProcess13 11.5 Process TRUE FALSE
BiologicalMaterial06 11.0 Biological TRUE FALSE
BiologicalMaterial03 9.5 Biological TRUE FALSE
ManufacturingProcess11 6.5 Process FALSE FALSE
overlap_summary <- tibble(
  Comparison = c(
    "Tree top 10 and linear top 10",
    "Tree top 10 and nonlinear MARS"
  ),
  Number_Overlapping = c(
    sum(top10_tree_predictors$Predictor %in% linear_top10),
    sum(top10_tree_predictors$Predictor %in% nonlinear_top_predictors)
  )
)

kable(
  overlap_summary,
  caption = "Overlap between tree-based predictors and previous model predictors"
)
Overlap between tree-based predictors and previous model predictors
Comparison Number_Overlapping
Tree top 10 and linear top 10 6
Tree top 10 and nonlinear MARS 2

The optimal tree-based model was Cubist. Its top predictors were ManufacturingProcess32, ManufacturingProcess09, ManufacturingProcess33, ManufacturingProcess04, ManufacturingProcess45, ManufacturingProcess15, ManufacturingProcess13, BiologicalMaterial06, BiologicalMaterial03, and ManufacturingProcess11. Among these top 10 predictors, 8 are manufacturing process variables and 2 are biological material variables. Therefore, the process variables clearly dominate the importance list, although BiologicalMaterial06 and BiologicalMaterial03 still show that raw-material measurements have some predictive value.

Compared with the optimal linear model from Exercise 6.3, the tree-based model has 6 overlapping predictors: ManufacturingProcess32, ManufacturingProcess09, ManufacturingProcess33, ManufacturingProcess13, BiologicalMaterial06, and BiologicalMaterial03. Compared with the optimal nonlinear MARS model from Exercise 7.5, the tree-based model overlaps on ManufacturingProcess32 and ManufacturingProcess09. This means ManufacturingProcess32 and ManufacturingProcess09 are especially stable predictors because they appear in the linear, nonlinear, and tree-based models.

Overall, the tree-based model agrees with the earlier models that manufacturing process variables are the main drivers of yield. The Cubist model spreads importance across more process variables than the MARS model, but the main conclusion is consistent: process-related predictors dominate the prediction of yield.

4.3 8.7(c):

Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?

# plot the single regression tree for interpretation
rpart.plot(
  rpart_fit$finalModel,
  type = 2,
  extra = 101,
  fallen.leaves = TRUE,
  main = "Optimal Single Regression Tree for Yield"
)

# use finalModel$where to get each training observation's terminal node
terminal_nodes <- rpart_fit$finalModel$where

# convert terminal-node row positions into readable node labels
terminal_node_labels <- rownames(rpart_fit$finalModel$frame)[terminal_nodes]

tree_node_data <- tibble(
  Yield = y_train,
  Terminal_Node = factor(terminal_node_labels)
)

head(tree_node_data)
# plot Yield by terminal node to check whether the tree separates low and high yield groups
ggplot(tree_node_data, aes(x = Terminal_Node, y = Yield)) +
  geom_boxplot() +
  geom_jitter(width = 0.15, alpha = 0.5) +
  labs(
    title = "Distribution of Yield in the Terminal Nodes",
    x = "Terminal Node",
    y = "Yield"
  )

# summarize each terminal node by sample size, average yield, and spread
terminal_node_summary <- tree_node_data %>%
  group_by(Terminal_Node) %>%
  summarise(
    n = n(),
    mean_yield = mean(Yield),
    median_yield = median(Yield),
    sd_yield = sd(Yield),
    min_yield = min(Yield),
    max_yield = max(Yield),
    .groups = "drop"
  ) %>%
  arrange(mean_yield)

kable(
  terminal_node_summary,
  digits = 3,
  caption = "Yield summary by terminal node"
)
Yield summary by terminal node
Terminal_Node n mean_yield median_yield sd_yield min_yield max_yield
2 74 39.286 39.155 1.362 35.25 43.12
3 50 41.500 41.605 1.453 36.83 44.16

The optimal single regression tree uses ManufacturingProcess32 as the main splitting variable. This provides a simple interpretation: observations with lower preprocessed values of ManufacturingProcess32 are placed into the lower-yield terminal node, while observations with higher values are placed into the higher-yield terminal node.

The terminal node summary shows that node 2 has an average yield of 39.286, while node 3 has an average yield of 41.500. The boxplot also shows that node 3 has a higher median yield than node 2. This suggests that ManufacturingProcess32 is strongly related to yield and can separate the data into lower- and higher-yield groups.

This view provides additional knowledge because it gives a simple rule-based explanation of the yield pattern. However, the single tree is not the best predictive model because its test RMSE is higher than Cubist. I would use Cubist for prediction, but the single tree is useful for explaining the main relationship between ManufacturingProcess32 and yield.