Recreate the simulated data from Exercise 7.2:
>library(mlbench)
>set.seed(200)
>simulated <- mlbench.friedman1(200, sd = 1)
>simulated <- cbind(simulated$x, simulated$y)
>simulated <- as.data.frame(simulated)
>colnames(simulated)[ncol(simulated)] <- "y"
Fit a random forest model to all of the predictors, then estimate the variable importance scores:
library(randomForest)
library(caret)
model1 <- randomForest(y ~., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
Did the random forest model significantly use the uninformative predictors (V6-V10)?
set.seed(200)
simulated_raw <- mlbench.friedman1(200, sd = 1)
# recreate the Friedman simulation with 10 predictors and one response
simulated <- as.data.frame(cbind(simulated_raw$x, y = simulated_raw$y))
colnames(simulated) <- c(paste0("V", 1:10), "y")
# V1 to V5 are informative predictors, while V6 to V10 are noise variables.
head(simulated)
set.seed(RANDOM_STATE)
model_rf_original <- randomForest(
y ~ .,
data = simulated,
importance = TRUE,
ntree = 1000
)
# use %IncMSE to measure random forest variable importance for regression
rf_imp_original <- importance(model_rf_original, scale = FALSE) %>%
as.data.frame() %>%
rownames_to_column("Predictor")
importance_col <- ifelse("%IncMSE" %in% names(rf_imp_original), "%IncMSE", names(rf_imp_original)[2])
rf_imp_original <- rf_imp_original %>%
mutate(
Importance = .data[[importance_col]],
Predictor_Type = ifelse(Predictor %in% paste0("V", 1:5),
"Informative: V1-V5",
"Noise: V6-V10")
) %>%
arrange(desc(Importance))
kable(rf_imp_original[, c("Predictor", "Predictor_Type", "Importance")],
digits = 3,
caption = "Random forest variable importance for the original Friedman data")
| Predictor | Predictor_Type | Importance |
|---|---|---|
| V1 | Informative: V1-V5 | 8.841 |
| V4 | Informative: V1-V5 | 7.760 |
| V2 | Informative: V1-V5 | 6.481 |
| V5 | Informative: V1-V5 | 2.156 |
| V3 | Informative: V1-V5 | 0.814 |
| V6 | Noise: V6-V10 | 0.103 |
| V7 | Noise: V6-V10 | 0.039 |
| V10 | Noise: V6-V10 | -0.090 |
| V9 | Noise: V6-V10 | -0.145 |
| V8 | Noise: V6-V10 | -0.161 |
# plot importance scores to compare informative and noise predictors
ggplot(rf_imp_original, aes(x = reorder(Predictor, Importance), y = Importance)) +
geom_col() +
coord_flip() +
labs(
title = "Random Forest Variable Importance: Original Data",
x = "Predictor",
y = "Importance (%IncMSE)"
)
rf_imp_original_summary <- rf_imp_original %>%
group_by(Predictor_Type) %>%
summarize(
Mean_Importance = mean(Importance),
Max_Importance = max(Importance),
.groups = "drop"
)
kable(rf_imp_original_summary,
digits = 3,
caption = "Average and maximum importance by predictor type")
| Predictor_Type | Mean_Importance | Max_Importance |
|---|---|---|
| Informative: V1-V5 | 5.211 | 8.841 |
| Noise: V6-V10 | -0.051 | 0.103 |
The random forest assigns much larger importance values to the truly informative predictors (V1 to V5). The uninformative predictors (V6 to V10) still receive small nonzero scores because of random sampling variation, but they do not dominate the importance list. Based on the table and plot, the model did not significantly use the uninformative predictors: every one of V6 to V10 has an importance score near zero, well below even the weakest informative predictor (V3).
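A quick numeric check makes this concrete (a small sketch built from the rf_imp_original table computed above): compare the largest importance among the noise predictors with the smallest importance among the informative ones.
# sketch: compare the largest noise importance with the smallest informative importance
noise_max <- max(rf_imp_original$Importance[rf_imp_original$Predictor %in% paste0("V", 6:10)])
informative_min <- min(rf_imp_original$Importance[rf_imp_original$Predictor %in% paste0("V", 1:5)])
c(Largest_Noise = noise_max, Smallest_Informative = informative_min)
noise_max < informative_min  # TRUE means no noise predictor outranks any informative predictor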
Now add an additional predictor that is highly correlated with one of the informative predictors. For example:
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
set.seed(RANDOM_STATE)
# add duplicate1 as a highly correlated copy of V1
simulated_dup1 <- simulated %>%
mutate(duplicate1 = V1 + rnorm(n(), sd = 0.1))
cor(simulated_dup1$duplicate1, simulated_dup1$V1)
## [1] 0.9429376
set.seed(RANDOM_STATE)
model_rf_duplicate1 <- randomForest(
y ~ .,
data = simulated_dup1,
importance = TRUE,
ntree = 1000
)
# extract variable importance after adding one correlated predictor
rf_imp_duplicate1 <- importance(model_rf_duplicate1, scale = FALSE) %>%
as.data.frame() %>%
rownames_to_column("Predictor")
importance_col_dup1 <- ifelse(
"%IncMSE" %in% names(rf_imp_duplicate1),
"%IncMSE",
names(rf_imp_duplicate1)[2]
)
rf_imp_duplicate1 <- rf_imp_duplicate1 %>%
mutate(
Importance = .data[[importance_col_dup1]],
Predictor_Type = case_when(
Predictor %in% paste0("V", 1:5) ~ "Informative: V1-V5",
Predictor == "duplicate1" ~ "Highly correlated copy of V1",
TRUE ~ "Noise: V6-V10"
)
) %>%
arrange(desc(Importance))
kable(
rf_imp_duplicate1[, c("Predictor", "Predictor_Type", "Importance")],
digits = 3,
caption = "Random forest variable importance after adding duplicate1"
)
| Predictor | Predictor_Type | Importance |
|---|---|---|
| V4 | Informative: V1-V5 | 7.242 |
| V2 | Informative: V1-V5 | 6.219 |
| V1 | Informative: V1-V5 | 5.971 |
| duplicate1 | Highly correlated copy of V1 | 3.696 |
| V5 | Informative: V1-V5 | 2.004 |
| V3 | Informative: V1-V5 | 0.634 |
| V6 | Noise: V6-V10 | 0.168 |
| V7 | Noise: V6-V10 | 0.078 |
| V8 | Noise: V6-V10 | -0.028 |
| V9 | Noise: V6-V10 | -0.083 |
| V10 | Noise: V6-V10 | -0.091 |
set.seed(RANDOM_STATE + 1)
# add duplicate2 as another highly correlated copy of V1
simulated_dup2 <- simulated_dup1 %>%
mutate(duplicate2 = V1 + rnorm(n(), sd = 0.1))
cor(simulated_dup2[, c("V1", "duplicate1", "duplicate2")])
## V1 duplicate1 duplicate2
## V1 1.0000000 0.9429376 0.9419378
## duplicate1 0.9429376 1.0000000 0.8887843
## duplicate2 0.9419378 0.8887843 1.0000000
set.seed(RANDOM_STATE)
model_rf_duplicate2 <- randomForest(
y ~ .,
data = simulated_dup2,
importance = TRUE,
ntree = 1000
)
# extract variable importance after adding two correlated predictors
rf_imp_duplicate2 <- importance(model_rf_duplicate2, scale = FALSE) %>%
as.data.frame() %>%
rownames_to_column("Predictor")
importance_col_dup2 <- ifelse(
"%IncMSE" %in% names(rf_imp_duplicate2),
"%IncMSE",
names(rf_imp_duplicate2)[2]
)
rf_imp_duplicate2 <- rf_imp_duplicate2 %>%
mutate(
Importance = .data[[importance_col_dup2]],
Predictor_Type = case_when(
Predictor %in% paste0("V", 1:5) ~ "Informative: V1-V5",
Predictor %in% c("duplicate1", "duplicate2") ~ "Highly correlated copy of V1",
TRUE ~ "Noise: V6-V10"
)
) %>%
arrange(desc(Importance))
kable(
rf_imp_duplicate2[, c("Predictor", "Predictor_Type", "Importance")],
digits = 3,
caption = "Random forest variable importance after adding duplicate1 and duplicate2"
)
| Predictor | Predictor_Type | Importance |
|---|---|---|
| V4 | Informative: V1-V5 | 7.493 |
| V2 | Informative: V1-V5 | 6.388 |
| V1 | Informative: V1-V5 | 4.788 |
| duplicate2 | Highly correlated copy of V1 | 2.820 |
| duplicate1 | Highly correlated copy of V1 | 2.557 |
| V5 | Informative: V1-V5 | 1.877 |
| V3 | Informative: V1-V5 | 0.480 |
| V6 | Noise: V6-V10 | 0.142 |
| V7 | Noise: V6-V10 | 0.031 |
| V9 | Noise: V6-V10 | -0.009 |
| V10 | Noise: V6-V10 | -0.009 |
| V8 | Noise: V6-V10 | -0.042 |
# compare V1 importance before and after adding the correlated predictor
compare_v1 <- bind_rows(
rf_imp_original %>%
filter(Predictor == "V1") %>%
transmute(Model = "Original data", Predictor, Importance),
rf_imp_duplicate1 %>%
filter(Predictor %in% c("V1", "duplicate1")) %>%
transmute(Model = "With duplicate1", Predictor, Importance),
rf_imp_duplicate2 %>%
filter(Predictor %in% c("V1", "duplicate1", "duplicate2")) %>%
transmute(Model = "With duplicate1 and duplicate2", Predictor, Importance)
)
kable(
compare_v1,
digits = 3,
caption = "Change in V1 importance after adding correlated predictors"
)
| Model | Predictor | Importance |
|---|---|---|
| Original data | V1 | 8.841 |
| With duplicate1 | V1 | 5.971 |
| With duplicate1 | duplicate1 | 3.696 |
| With duplicate1 and duplicate2 | V1 | 4.788 |
| With duplicate1 and duplicate2 | duplicate2 | 2.820 |
| With duplicate1 and duplicate2 | duplicate1 | 2.557 |
ggplot(rf_imp_duplicate2, aes(x = reorder(Predictor, Importance), y = Importance)) +
geom_col() +
coord_flip() +
labs(
title = "Random Forest Variable Importance: With duplicate1 and duplicate2",
x = "Predictor",
y = "Importance (%IncMSE)"
)
After adding duplicate1, the importance of V1 decreases (from 8.841 to 5.971) because the new predictor carries almost the same information as V1. The random forest can split on either variable, so the importance is shared between them. After adding duplicate2, the same pattern becomes stronger: the information originally captured by V1 is now spread across V1, duplicate1, and duplicate2 (4.788, 2.557, and 2.820, respectively). This shows that random forest variable importance can be diluted when several predictors are highly correlated with one another. A predictor may still be genuinely important, but its importance score shrinks because correlated variables compete with it.
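One way to see the dilution (a sketch using the importance tables already computed above) is to add up the importance attributed to V1 and its copies in each model; the combined total stays roughly comparable even though V1's own score drops.
# sketch: total importance attributed to V1 and its highly correlated copies in each model
v1_family <- c("V1", "duplicate1", "duplicate2")
sapply(
  list(Original = rf_imp_original, With_duplicate1 = rf_imp_duplicate1, With_both = rf_imp_duplicate2),
  function(imp) sum(imp$Importance[imp$Predictor %in% v1_family])
)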
Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?
set.seed(RANDOM_STATE)
# fit a conditional inference forest to compare traditional and conditional importance
cforest_fit <- cforest(
y ~ .,
data = simulated_dup1,
controls = cforest_unbiased(ntree = 500, mtry = 3)
)
# conditional = FALSE gives the regular permutation importance
cf_imp_unconditional <- varimp(cforest_fit, conditional = FALSE)
# conditional = TRUE adjusts importance for correlated predictors
cf_imp_conditional <- varimp(cforest_fit, conditional = TRUE)
cf_importance <- tibble(
Predictor = names(cf_imp_unconditional),
Unconditional_Importance = as.numeric(cf_imp_unconditional),
Conditional_Importance = as.numeric(cf_imp_conditional)
) %>%
mutate(
Predictor_Type = case_when(
Predictor %in% paste0("V", 1:5) ~ "Informative: V1-V5",
Predictor == "duplicate1" ~ "Highly correlated duplicate of V1",
TRUE ~ "Noise: V6-V10"
)
) %>%
arrange(desc(Conditional_Importance))
kable(cf_importance,
digits = 3,
caption = "Conditional inference forest variable importance")
| Predictor | Unconditional_Importance | Conditional_Importance | Predictor_Type |
|---|---|---|---|
| V4 | 5.926 | 4.237 | Informative: V1-V5 |
| V2 | 4.847 | 3.445 | Informative: V1-V5 |
| V1 | 5.896 | 2.570 | Informative: V1-V5 |
| duplicate1 | 3.169 | 1.185 | Highly correlated duplicate of V1 |
| V5 | 1.869 | 1.003 | Informative: V1-V5 |
| V3 | 0.067 | 0.059 | Informative: V1-V5 |
| V9 | -0.028 | 0.021 | Noise: V6-V10 |
| V7 | -0.045 | 0.002 | Noise: V6-V10 |
| V8 | -0.014 | 0.000 | Noise: V6-V10 |
| V6 | -0.010 | 0.000 | Noise: V6-V10 |
| V10 | -0.054 | -0.036 | Noise: V6-V10 |
cf_importance_long <- cf_importance %>%
pivot_longer(
cols = c(Unconditional_Importance, Conditional_Importance),
names_to = "Importance_Type",
values_to = "Importance"
)
ggplot(cf_importance_long,
aes(x = reorder(Predictor, Importance), y = Importance)) +
geom_col() +
coord_flip() +
facet_wrap(~ Importance_Type, scales = "free_x") +
labs(
title = "cforest Variable Importance",
x = "Predictor",
y = "Importance"
)
The cforest results show a similar overall pattern to the traditional random forest. The informative predictors still have the largest importance values, while the noise variables have importance values close to zero. However, the conditional importance measure reduces the effect of correlated predictors. In this case, V1 and duplicate1 are still important, but the conditional importance gives a more cautious estimate because the two variables contain similar information.
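To check how much the two measures agree beyond the raw values, a small sketch can compare the predictor rankings implied by the unconditional and conditional importances (using the cf_importance table built above).
# sketch: rank predictors under each importance measure to compare the orderings
cf_importance %>%
  mutate(
    Rank_Unconditional = rank(-Unconditional_Importance),
    Rank_Conditional = rank(-Conditional_Importance)
  ) %>%
  select(Predictor, Predictor_Type, Rank_Unconditional, Rank_Conditional)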
Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?
# use a small repeated cross-validation setup for this simulation exercise
ctrl_small <- trainControl(
method = "repeatedcv",
number = 5,
repeats = 2
)
# set a small tuning grid for boosted regression trees
gbm_grid <- expand.grid(
n.trees = c(100, 300),
interaction.depth = c(1, 3),
shrinkage = c(0.05, 0.10),
n.minobsinnode = c(5, 10)
)
# set a small tuning grid for Cubist models
cubist_grid <- expand.grid(
committees = c(1, 10, 50),
neighbors = c(0, 5)
)
set.seed(RANDOM_STATE)
gbm_original <- train(
y ~ .,
data = simulated,
method = "gbm",
trControl = ctrl_small,
tuneGrid = gbm_grid,
metric = "RMSE",
verbose = FALSE
)
set.seed(RANDOM_STATE)
cubist_original <- train(
y ~ .,
data = simulated,
method = "cubist",
trControl = ctrl_small,
tuneGrid = cubist_grid,
metric = "RMSE"
)
set.seed(RANDOM_STATE)
gbm_duplicate <- train(
y ~ .,
data = simulated_dup1,
method = "gbm",
trControl = ctrl_small,
tuneGrid = gbm_grid,
metric = "RMSE",
verbose = FALSE
)
set.seed(RANDOM_STATE)
cubist_duplicate <- train(
y ~ .,
data = simulated_dup1,
method = "cubist",
trControl = ctrl_small,
tuneGrid = cubist_grid,
metric = "RMSE"
)
# use this helper function to extract caret variable importance in a clean format
extract_caret_imp <- function(fit, model_name) {
out <- varImp(fit, scale = FALSE)$importance %>%
as.data.frame() %>%
rownames_to_column("Predictor")
if (!"Overall" %in% names(out)) {
numeric_cols <- names(out)[sapply(out, is.numeric)]
out$Overall <- rowMeans(out[, numeric_cols, drop = FALSE])
}
out %>%
transmute(Model = model_name, Predictor, Importance = Overall) %>%
arrange(desc(Importance))
}
imp_gbm_original <- extract_caret_imp(gbm_original, "GBM: original")
imp_gbm_duplicate <- extract_caret_imp(gbm_duplicate, "GBM: with duplicate1")
imp_cubist_original <- extract_caret_imp(cubist_original, "Cubist: original")
imp_cubist_duplicate <- extract_caret_imp(cubist_duplicate, "Cubist: with duplicate1")
imp_tree_methods <- bind_rows(
imp_gbm_original,
imp_gbm_duplicate,
imp_cubist_original,
imp_cubist_duplicate
)
kable(imp_tree_methods %>% filter(Predictor %in% c("V1", "duplicate1")),
digits = 3,
caption = "Importance of V1 and duplicate1 for GBM and Cubist")
| Model | Predictor | Importance |
|---|---|---|
| GBM: original | V1 | 8215.241 |
| GBM: with duplicate1 | V1 | 5415.667 |
| GBM: with duplicate1 | duplicate1 | 3211.120 |
| Cubist: original | V1 | 70.500 |
| Cubist: with duplicate1 | V1 | 69.500 |
| Cubist: with duplicate1 | duplicate1 | 0.000 |
# keep only positive-importance predictors and remove extra tied rows
top_predictors_tree_methods <- imp_tree_methods %>%
filter(Importance > 0) %>%
group_by(Model) %>%
slice_max(Importance, n = 8, with_ties = FALSE) %>%
ungroup()
kable(
top_predictors_tree_methods,
digits = 3,
caption = "Top positive-importance predictors from boosted trees and Cubist"
)
| Model | Predictor | Importance |
|---|---|---|
| Cubist: original | V1 | 70.500 |
| Cubist: original | V2 | 54.500 |
| Cubist: original | V4 | 50.000 |
| Cubist: original | V5 | 47.500 |
| Cubist: original | V3 | 41.000 |
| Cubist: with duplicate1 | V1 | 69.500 |
| Cubist: with duplicate1 | V2 | 54.500 |
| Cubist: with duplicate1 | V4 | 50.000 |
| Cubist: with duplicate1 | V5 | 47.000 |
| Cubist: with duplicate1 | V3 | 41.000 |
| Cubist: with duplicate1 | V6 | 14.000 |
| GBM: original | V4 | 9021.022 |
| GBM: original | V1 | 8215.241 |
| GBM: original | V2 | 6756.323 |
| GBM: original | V5 | 4038.332 |
| GBM: original | V3 | 2699.320 |
| GBM: original | V7 | 501.379 |
| GBM: original | V6 | 399.003 |
| GBM: original | V8 | 249.501 |
| GBM: with duplicate1 | V4 | 9180.565 |
| GBM: with duplicate1 | V2 | 6458.622 |
| GBM: with duplicate1 | V1 | 5415.667 |
| GBM: with duplicate1 | V5 | 3972.630 |
| GBM: with duplicate1 | duplicate1 | 3211.120 |
| GBM: with duplicate1 | V3 | 2792.299 |
| GBM: with duplicate1 | V7 | 453.513 |
| GBM: with duplicate1 | V6 | 417.806 |
The same general issue appears in boosted trees, but not exactly the same way for Cubist. In the GBM model, adding duplicate1 reduces the importance of V1 and gives duplicate1 a noticeable importance score. This suggests that boosted trees can split importance between two highly correlated predictors, similar to what happened in the random forest model.
For Cubist, the pattern is different. The model mostly keeps V1 and gives duplicate1 almost no importance. This means Cubist selected one of the correlated predictors and largely ignored the other. In this case, Cubist did not split importance between V1 and duplicate1 the same way GBM did.
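This behavior can be inspected in the fitted Cubist model itself. The sketch below assumes the final Cubist object stored by caret exposes its variable usage table (the usage component of a Cubist fit), which reports how often each predictor appears in rule conditions and in the rule-level linear models.
# sketch: how often V1 and duplicate1 are used in Cubist rule conditions and linear models
# (assumes the usage table is available in the final Cubist model object)
cubist_duplicate$finalModel$usage %>%
  filter(Variable %in% c("V1", "duplicate1"))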
Overall, correlated predictors can affect variable importance, but the effect depends on the tree-based method. GBM spreads some importance across correlated predictors, while Cubist tends to choose one of the correlated predictors. Even though a few noise variables may receive small positive importance values, the truly informative predictors still dominate the importance lists.
Fig. 8.24 (referenced in the stochastic gradient boosting exercise below): a comparison of variable importance magnitudes for differing values of the bagging fraction and shrinkage parameters. Both tuning parameters are set to 0.1 in the left panel and to 0.9 in the right panel.
Use a simulation to show tree bias with different granularities.
# create simulated data where all predictors are noise but have different granularities
make_bias_data <- function(n = 300) {
tibble(
y = rnorm(n),
x_binary = sample(0:1, n, replace = TRUE),
x_5_levels = sample(1:5, n, replace = TRUE),
x_10_levels = sample(1:10, n, replace = TRUE),
x_50_levels = sample(1:50, n, replace = TRUE),
x_continuous = runif(n)
)
}
# fit one tree per simulation and record which variable is chosen for the first split
one_tree_bias_run <- function(seed_id, n = 300) {
set.seed(seed_id)
dat <- make_bias_data(n)
fit <- rpart(
y ~ .,
data = dat,
method = "anova",
control = rpart.control(cp = 0, minsplit = 20, maxdepth = 3)
)
tibble(
Replication = seed_id,
Root_Split = fit$frame$var[1]
)
}
set.seed(RANDOM_STATE)
root_results <- map_dfr(1:200, one_tree_bias_run)
# count how often each noise predictor is selected as the root split
root_summary <- root_results %>%
count(Root_Split, name = "Times_Selected") %>%
mutate(Percent_Selected = Times_Selected / sum(Times_Selected) * 100) %>%
arrange(desc(Times_Selected))
kable(root_summary,
digits = 2,
caption = "How often each noise predictor was selected as the root split")
| Root_Split | Times_Selected | Percent_Selected |
|---|---|---|
| x_continuous | 91 | 45.5 |
| x_50_levels | 56 | 28.0 |
| x_5_levels | 24 | 12.0 |
| x_10_levels | 23 | 11.5 |
| x_binary | 6 | 3.0 |
ggplot(root_summary, aes(x = reorder(Root_Split, Percent_Selected), y = Percent_Selected)) +
geom_col() +
coord_flip() +
labs(
title = "Tree Bias Simulation: Root Split Selection",
x = "Predictor",
y = "Percent selected as root split"
)
set.seed(RANDOM_STATE)
bias_data <- make_bias_data(n = 1000)
# fit a random forest to check importance bias when all predictors are noise
bias_rf <- randomForest(
y ~ .,
data = bias_data,
ntree = 1000,
importance = TRUE
)
bias_rf_imp <- importance(bias_rf, scale = FALSE) %>%
as.data.frame() %>%
rownames_to_column("Predictor")
importance_col_bias <- ifelse("%IncMSE" %in% names(bias_rf_imp), "%IncMSE", names(bias_rf_imp)[2])
bias_rf_imp <- bias_rf_imp %>%
transmute(Predictor, Importance = .data[[importance_col_bias]]) %>%
arrange(desc(Importance))
kable(bias_rf_imp,
digits = 3,
caption = "Random forest importance when all predictors are noise")
| Predictor | Importance |
|---|---|
| x_50_levels | 0.007 |
| x_5_levels | 0.006 |
| x_10_levels | 0.005 |
| x_continuous | 0.002 |
| x_binary | -0.001 |
ggplot(bias_rf_imp, aes(x = reorder(Predictor, Importance), y = Importance)) +
geom_col() +
coord_flip() +
labs(
title = "Variable Importance Bias When All Predictors Are Noise",
x = "Predictor",
y = "Importance (%IncMSE)"
)
In this simulation, all predictors are noise, so no predictor should truly be important. However, the single-tree simulation shows that the tree often selects predictors with more possible split points as the root split. The continuous predictor was selected most often, followed by the predictor with 50 levels, while the binary predictor was selected least often. This happens because predictors with more possible splits have more chances to create an apparently useful split by random chance.
The random forest importance results show the same general issue, although the importance values are very small because none of the predictors are truly related to the response. Overall, this simulation shows that tree-based models can have selection bias when predictors have different granularities.
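The mechanism behind this bias can be made explicit with a quick sketch: counting the number of distinct values each predictor offers shows how many candidate split points the tree can search over, which is what drives the selection frequencies above.
# sketch: number of distinct values (candidate split points) for each noise predictor
bias_data %>%
  summarise(across(-y, n_distinct))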
In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?
The model on the right focuses its importance on only a few predictors because both tuning parameters are large. A high learning rate allows each tree to make a large correction to the model, and a high bagging fraction means each tree is trained on a larger portion of the data. Together, these settings make the boosting process move more aggressively toward the strongest predictors early in model building.
The model on the left spreads importance across more predictors because the learning rate and bagging fraction are both smaller. Each tree has a smaller effect, so the model learns more gradually. As a result, more predictors have a chance to contribute to the final model.
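Although Fig. 8.24 is based on the solubility data, the same concentration effect can be illustrated on the simulated Friedman data used earlier. The sketch below is only illustrative, with assumed settings: it fits gbm directly under the two extreme configurations (0.1/0.1 versus 0.9/0.9) and compares the relative influence profiles; the exact numbers will differ from the book's figure.
# illustrative sketch on the simulated data (not the solubility data from the book)
library(gbm)
set.seed(RANDOM_STATE)
gbm_low <- gbm(y ~ ., data = simulated, distribution = "gaussian",
               n.trees = 500, shrinkage = 0.1, bag.fraction = 0.1,
               interaction.depth = 2, n.minobsinnode = 5)
set.seed(RANDOM_STATE)
gbm_high <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 500, shrinkage = 0.9, bag.fraction = 0.9,
                interaction.depth = 2, n.minobsinnode = 5)
summary(gbm_low, plotit = FALSE)   # relative influence spread across more predictors
summary(gbm_high, plotit = FALSE)  # relative influence concentrated on the top few predictors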
Which model do you think would be more predictive of other samples?
I would expect the model on the left to be more predictive for new samples, assuming enough trees are used. The lower learning rate and smaller bagging fraction make the model more regularized, which usually helps reduce overfitting. The model on the right may fit the training data quickly, but it is more likely to overfit because the variable importance is concentrated heavily on only a few predictors.
How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?
Increasing the interaction depth would allow each tree to use more splits and capture more complex interactions among predictors. This could make the variable importance curve less steep because more predictors could become useful through interactions, not only through their individual effects. However, if the true signal is concentrated in only a few predictors, those predictors may still stay near the top. A larger interaction depth can improve flexibility, but it can also increase the risk of overfitting.
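A small follow-up sketch on the simulated data can illustrate this point: refit with a deeper interaction depth and compare how quickly the relative influence falls off across the ranked predictors (again only an illustration with assumed settings, not the solubility analysis behind Fig. 8.24).
# illustrative sketch: the importance profile with a deeper interaction depth
set.seed(RANDOM_STATE)
gbm_deep <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 500, shrinkage = 0.1, bag.fraction = 0.5,
                interaction.depth = 8, n.minobsinnode = 5)
summary(gbm_deep, plotit = FALSE)  # compare the drop-off in relative influence with gbm_low above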
Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
# load the chemical manufacturing data and separate predictors from Yield
data(ChemicalManufacturingProcess)
chem_data <- ChemicalManufacturingProcess
predictors <- chem_data %>% select(-Yield)
yield <- chem_data$Yield
data_overview <- tibble(
Rows = nrow(chem_data),
Predictors = ncol(predictors),
Missing_Predictor_Cells = sum(is.na(predictors)),
Yield_Mean = mean(yield),
Yield_SD = sd(yield)
)
kable(
data_overview,
digits = 3,
caption = "Chemical manufacturing data overview"
)
| Rows | Predictors | Missing_Predictor_Cells | Yield_Mean | Yield_SD |
|---|---|---|---|---|
| 176 | 57 | 106 | 40.177 | 1.846 |
# use the same 70/30 split and preprocessing workflow from Exercise 7.5 for a consistent comparison
set.seed(901)
training_rows <- createDataPartition(yield, p = 0.70, list = FALSE)
train_predictors <- predictors[training_rows, ]
train_yield <- yield[training_rows]
test_predictors <- predictors[-training_rows, ]
test_yield <- yield[-training_rows]
# apply the same transformation, scaling, and kNN imputation from Exercise 7.5
pp <- preProcess(
train_predictors,
method = c("YeoJohnson", "center", "scale", "knnImpute")
)
pp_train_predictors <- predict(pp, train_predictors)
pp_test_predictors <- predict(pp, test_predictors)
# remove near-zero variance predictors after preprocessing
nzvpp <- nearZeroVar(pp_train_predictors)
if (length(nzvpp) > 0) {
pp_train_predictors <- pp_train_predictors[, -nzvpp]
pp_test_predictors <- pp_test_predictors[, -nzvpp]
}
# remove highly correlated predictors to match the reduced predictor set from Exercise 7.5
predcorr <- cor(pp_train_predictors)
highCorrpp <- findCorrelation(predcorr)
if (length(highCorrpp) > 0) {
pp_train_predictors <- pp_train_predictors[, -highCorrpp]
pp_test_predictors <- pp_test_predictors[, -highCorrpp]
}
# rename the processed training and test data for easier model fitting
x_train <- pp_train_predictors
y_train <- train_yield
x_test <- pp_test_predictors
y_test <- test_yield
preprocess_overview <- tibble(
Training_Rows = nrow(x_train),
Training_Predictors = ncol(x_train),
Test_Rows = nrow(x_test),
Test_Predictors = ncol(x_test),
Missing_Training_Values_After_Preprocessing = sum(is.na(x_train)),
Missing_Test_Values_After_Preprocessing = sum(is.na(x_test))
)
kable(
preprocess_overview,
caption = "Exercise 8.7 data after preprocessing"
)
| Training_Rows | Training_Predictors | Test_Rows | Test_Predictors | Missing_Training_Values_After_Preprocessing | Missing_Test_Values_After_Preprocessing |
|---|---|---|---|---|---|
| 124 | 46 | 52 | 46 | 0 | 0 |
# use bootstrap resampling to stay consistent with the earlier nonlinear model comparison
ctrl_87 <- trainControl(
method = "boot",
number = 25
)
Which tree-based regression model gives the optimal resampling and test set performance?
# fit a single regression tree as an interpretable baseline model
set.seed(415)
rpart_fit <- train(
x = x_train,
y = y_train,
method = "rpart",
trControl = ctrl_87,
tuneLength = 20,
metric = "RMSE"
)
# fit a random forest to improve stability by averaging many trees
set.seed(415)
rf_fit <- train(
x = x_train,
y = y_train,
method = "rf",
trControl = ctrl_87,
tuneGrid = expand.grid(
mtry = unique(c(2, floor(sqrt(ncol(x_train))), floor(ncol(x_train) / 3), floor(ncol(x_train) / 2), ncol(x_train)))
),
metric = "RMSE",
importance = TRUE
)
# fit boosted trees using a moderate tuning grid.
gbm_grid <- expand.grid(
n.trees = c(100, 300, 500),
interaction.depth = c(1, 3, 5),
shrinkage = c(0.05, 0.10),
n.minobsinnode = 10
)
set.seed(415)
gbm_fit <- train(
x = x_train,
y = y_train,
method = "gbm",
trControl = ctrl_87,
tuneGrid = gbm_grid,
metric = "RMSE",
verbose = FALSE
)
# fit Cubist because it combines rule-based splits with linear models
cubist_grid <- expand.grid(
committees = c(1, 10, 20, 50),
neighbors = c(0, 5, 9)
)
set.seed(415)
cubist_fit <- train(
x = x_train,
y = y_train,
method = "cubist",
trControl = ctrl_87,
tuneGrid = cubist_grid,
metric = "RMSE"
)
# extract each model's best resampling result for a clean comparison table
get_best_resample <- function(model, model_name) {
best <- model$results
for (col in names(model$bestTune)) {
best <- best[best[[col]] == model$bestTune[[col]], ]
}
best %>%
slice(1) %>%
mutate(Model = model_name) %>%
select(Model, everything())
}
tree_resampling_results <- bind_rows(
get_best_resample(rpart_fit, "Single Regression Tree"),
get_best_resample(rf_fit, "Random Forest"),
get_best_resample(gbm_fit, "Boosted Trees"),
get_best_resample(cubist_fit, "Cubist")
) %>%
arrange(RMSE)
kable(
tree_resampling_results,
digits = 3,
caption = "Best resampling performance for tree-based models"
)
| Model | cp | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD | mtry | shrinkage | interaction.depth | n.minobsinnode | n.trees | committees | neighbors |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Cubist | NA | 1.097 | 0.604 | 0.839 | 0.153 | 0.096 | 0.121 | NA | NA | NA | NA | NA | 50 | 5 |
| Random Forest | NA | 1.156 | 0.576 | 0.896 | 0.129 | 0.093 | 0.114 | 23 | NA | NA | NA | NA | NA | NA |
| Boosted Trees | NA | 1.192 | 0.533 | 0.909 | 0.095 | 0.078 | 0.094 | NA | 0.05 | 5 | 10 | 300 | NA | NA |
| Single Regression Tree | 0.16 | 1.427 | 0.353 | 1.131 | 0.148 | 0.107 | 0.119 | NA | NA | NA | NA | NA | NA | NA |
# calculate test-set RMSE, R-squared, and MAE for each model
get_test_performance <- function(model, model_name) {
pred <- predict(model, newdata = x_test)
perf <- postResample(pred = pred, obs = y_test)
tibble(
Model = model_name,
Test_RMSE = unname(perf["RMSE"]),
Test_Rsquared = unname(perf["Rsquared"]),
Test_MAE = unname(perf["MAE"])
)
}
tree_test_results <- bind_rows(
get_test_performance(rpart_fit, "Single Regression Tree"),
get_test_performance(rf_fit, "Random Forest"),
get_test_performance(gbm_fit, "Boosted Trees"),
get_test_performance(cubist_fit, "Cubist")
) %>%
arrange(Test_RMSE)
kable(
tree_test_results,
digits = 3,
caption = "Test-set performance for tree-based models"
)
| Model | Test_RMSE | Test_Rsquared | Test_MAE |
|---|---|---|---|
| Cubist | 1.083 | 0.722 | 0.821 |
| Random Forest | 1.247 | 0.673 | 0.997 |
| Boosted Trees | 1.267 | 0.613 | 1.016 |
| Single Regression Tree | 1.544 | 0.426 | 1.225 |
# select the best tree-based model based on the lowest test-set RMSE
best_tree_model_name <- tree_test_results$Model[1]
model_list <- list(
"Single Regression Tree" = rpart_fit,
"Random Forest" = rf_fit,
"Boosted Trees" = gbm_fit,
"Cubist" = cubist_fit
)
best_tree_model <- model_list[[best_tree_model_name]]
best_model_summary <- tibble(
Best_Model = best_tree_model_name,
Best_Test_RMSE = tree_test_results$Test_RMSE[1],
Best_Test_Rsquared = tree_test_results$Test_Rsquared[1],
Best_Test_MAE = tree_test_results$Test_MAE[1]
)
kable(
best_model_summary,
digits = 3,
caption = "Best tree-based model based on test-set RMSE"
)
| Best_Model | Best_Test_RMSE | Best_Test_Rsquared | Best_Test_MAE |
|---|---|---|---|
| Cubist | 1.083 | 0.722 | 0.821 |
The tree-based regression models were compared using bootstrap
resampling and then evaluated on the hold-out test set. I used RMSE as
the main comparison metric because Yield is a continuous
response variable, and RMSE measures prediction error in the original
units of yield.
Based on the test-set results, the optimal tree-based model is Cubist, because it has the lowest test RMSE among the models considered. The resampling results are useful for tuning the models, but the test-set results are the more important final comparison because they evaluate prediction performance on new data.
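Beyond the best-tune table, caret's resamples() function gives a side-by-side view of the full bootstrap distributions for the four fits, which is a useful sanity check that the Cubist advantage is not driven by a single resample (a small sketch using the models fit above, which share the same resampling indices because each train() call used the same seed).
# sketch: compare the bootstrap resampling distributions of the four tree-based models
tree_resamples <- resamples(list(
  CART = rpart_fit,
  RandomForest = rf_fit,
  GBM = gbm_fit,
  Cubist = cubist_fit
))
summary(tree_resamples)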
Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
# extract variable importance from the best tree-based model
best_varimp_raw <- varImp(best_tree_model, scale = FALSE)$importance
if (!"Overall" %in% names(best_varimp_raw)) {
numeric_cols <- names(best_varimp_raw)[sapply(best_varimp_raw, is.numeric)]
best_varimp_raw$Overall <- rowMeans(best_varimp_raw[, numeric_cols, drop = FALSE], na.rm = TRUE)
}
best_varimp <- best_varimp_raw %>%
rownames_to_column("Predictor") %>%
select(Predictor, Overall) %>%
arrange(desc(Overall))
top10_tree_predictors <- best_varimp %>%
slice(1:10) %>%
mutate(
Variable_Group = case_when(
str_detect(Predictor, "^BiologicalMaterial") ~ "Biological",
str_detect(Predictor, "^ManufacturingProcess") ~ "Process",
TRUE ~ "Other"
)
)
kable(
top10_tree_predictors,
digits = 3,
caption = "Top 10 important predictors from the best tree-based model"
)
| Predictor | Overall | Variable_Group |
|---|---|---|
| ManufacturingProcess32 | 48.0 | Process |
| ManufacturingProcess09 | 28.0 | Process |
| ManufacturingProcess33 | 24.0 | Process |
| ManufacturingProcess04 | 17.5 | Process |
| ManufacturingProcess45 | 15.0 | Process |
| ManufacturingProcess15 | 12.0 | Process |
| ManufacturingProcess13 | 11.5 | Process |
| BiologicalMaterial06 | 11.0 | Biological |
| BiologicalMaterial03 | 9.5 | Biological |
| ManufacturingProcess11 | 6.5 | Process |
# plot the top 10 predictors from the best tree-based model
ggplot(top10_tree_predictors,
aes(x = reorder(Predictor, Overall), y = Overall, fill = Variable_Group)) +
geom_col() +
coord_flip() +
labs(
title = paste("Top 10 Important Predictors from", best_tree_model_name),
x = "Predictor",
y = "Importance",
fill = "Variable Group"
)
# count how many top predictors are biological versus process variables
top10_group_summary <- top10_tree_predictors %>%
count(Variable_Group)
kable(
top10_group_summary,
caption = "Biological versus process variables in the top 10 tree-based predictors"
)
| Variable_Group | n |
|---|---|
| Biological | 2 |
| Process | 8 |
# compare the tree-based top predictors with the previous linear and nonlinear models
linear_top10 <- c(
"ManufacturingProcess32",
"ManufacturingProcess36",
"ManufacturingProcess13",
"ManufacturingProcess09",
"ManufacturingProcess17",
"BiologicalMaterial06",
"ManufacturingProcess33",
"BiologicalMaterial08",
"BiologicalMaterial01",
"BiologicalMaterial03"
)
nonlinear_top_predictors <- c(
"ManufacturingProcess32",
"ManufacturingProcess09"
)
predictor_comparison <- top10_tree_predictors %>%
mutate(
In_Top10_Linear_Model = Predictor %in% linear_top10,
In_Nonlinear_MARS_Model = Predictor %in% nonlinear_top_predictors
)
kable(
predictor_comparison,
digits = 3,
caption = "Comparison of top tree-based predictors with previous linear and nonlinear models"
)
| Predictor | Overall | Variable_Group | In_Top10_Linear_Model | In_Nonlinear_MARS_Model |
|---|---|---|---|---|
| ManufacturingProcess32 | 48.0 | Process | TRUE | TRUE |
| ManufacturingProcess09 | 28.0 | Process | TRUE | TRUE |
| ManufacturingProcess33 | 24.0 | Process | TRUE | FALSE |
| ManufacturingProcess04 | 17.5 | Process | FALSE | FALSE |
| ManufacturingProcess45 | 15.0 | Process | FALSE | FALSE |
| ManufacturingProcess15 | 12.0 | Process | FALSE | FALSE |
| ManufacturingProcess13 | 11.5 | Process | TRUE | FALSE |
| BiologicalMaterial06 | 11.0 | Biological | TRUE | FALSE |
| BiologicalMaterial03 | 9.5 | Biological | TRUE | FALSE |
| ManufacturingProcess11 | 6.5 | Process | FALSE | FALSE |
overlap_summary <- tibble(
Comparison = c(
"Tree top 10 and linear top 10",
"Tree top 10 and nonlinear MARS"
),
Number_Overlapping = c(
sum(top10_tree_predictors$Predictor %in% linear_top10),
sum(top10_tree_predictors$Predictor %in% nonlinear_top_predictors)
)
)
kable(
overlap_summary,
caption = "Overlap between tree-based predictors and previous model predictors"
)
| Comparison | Number_Overlapping |
|---|---|
| Tree top 10 and linear top 10 | 6 |
| Tree top 10 and nonlinear MARS | 2 |
The optimal tree-based model was Cubist. Its top predictors were ManufacturingProcess32, ManufacturingProcess09, ManufacturingProcess33, ManufacturingProcess04, ManufacturingProcess45, ManufacturingProcess15, ManufacturingProcess13, BiologicalMaterial06, BiologicalMaterial03, and ManufacturingProcess11. Among these top 10 predictors, 8 are manufacturing process variables and 2 are biological material variables. Therefore, the process variables clearly dominate the importance list, although BiologicalMaterial06 and BiologicalMaterial03 still show that raw-material measurements have some predictive value.
Compared with the optimal linear model from Exercise 6.3, the tree-based model has 6 overlapping predictors: ManufacturingProcess32, ManufacturingProcess09, ManufacturingProcess33, ManufacturingProcess13, BiologicalMaterial06, and BiologicalMaterial03. Compared with the optimal nonlinear MARS model from Exercise 7.5, the tree-based model overlaps on ManufacturingProcess32 and ManufacturingProcess09. This means ManufacturingProcess32 and ManufacturingProcess09 are especially stable predictors because they appear in the linear, nonlinear, and tree-based models.
Overall, the tree-based model agrees with the earlier models that manufacturing process variables are the main drivers of yield. The Cubist model spreads importance across more process variables than the MARS model, but the main conclusion is consistent: process-related predictors dominate the prediction of yield.
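For completeness, the overlapping predictor names themselves can be listed with base R's intersect(), using the vectors defined above.
# sketch: list the specific predictors shared with the earlier linear and nonlinear models
intersect(top10_tree_predictors$Predictor, linear_top10)
intersect(top10_tree_predictors$Predictor, nonlinear_top_predictors)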
Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
# plot the single regression tree for interpretation
rpart.plot(
rpart_fit$finalModel,
type = 2,
extra = 101,
fallen.leaves = TRUE,
main = "Optimal Single Regression Tree for Yield"
)
# use finalModel$where to get each training observation's terminal node
terminal_nodes <- rpart_fit$finalModel$where
# convert terminal-node row positions into readable node labels
terminal_node_labels <- rownames(rpart_fit$finalModel$frame)[terminal_nodes]
tree_node_data <- tibble(
Yield = y_train,
Terminal_Node = factor(terminal_node_labels)
)
head(tree_node_data)
# plot Yield by terminal node to check whether the tree separates low and high yield groups
ggplot(tree_node_data, aes(x = Terminal_Node, y = Yield)) +
geom_boxplot() +
geom_jitter(width = 0.15, alpha = 0.5) +
labs(
title = "Distribution of Yield in the Terminal Nodes",
x = "Terminal Node",
y = "Yield"
)
# summarize each terminal node by sample size, average yield, and spread
terminal_node_summary <- tree_node_data %>%
group_by(Terminal_Node) %>%
summarise(
n = n(),
mean_yield = mean(Yield),
median_yield = median(Yield),
sd_yield = sd(Yield),
min_yield = min(Yield),
max_yield = max(Yield),
.groups = "drop"
) %>%
arrange(mean_yield)
kable(
terminal_node_summary,
digits = 3,
caption = "Yield summary by terminal node"
)
| Terminal_Node | n | mean_yield | median_yield | sd_yield | min_yield | max_yield |
|---|---|---|---|---|---|---|
| 2 | 74 | 39.286 | 39.155 | 1.362 | 35.25 | 43.12 |
| 3 | 50 | 41.500 | 41.605 | 1.453 | 36.83 | 44.16 |
The optimal single regression tree uses ManufacturingProcess32 as the main splitting variable. This provides a simple interpretation: observations with lower preprocessed values of ManufacturingProcess32 are placed into the lower-yield terminal node, while observations with higher values are placed into the higher-yield terminal node.
The terminal node summary shows that node 2 has an average yield of 39.286, while node 3 has an average yield of 41.500. The boxplot also shows that node 3 has a higher median yield than node 2. This suggests that ManufacturingProcess32 is strongly related to yield and can separate the data into lower- and higher-yield groups.
This view provides additional knowledge because it gives a simple rule-based explanation of the yield pattern. However, the single tree is not the best predictive model because its test RMSE is higher than that of Cubist. I would use Cubist for prediction, but the single tree is useful for explaining the main relationship between ManufacturingProcess32 and yield.
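As a quick check on this interpretation (a sketch using the tree_node_data built above), a two-sample comparison of yield between the two terminal nodes confirms that the split on ManufacturingProcess32 separates groups with different average yields.
# sketch: compare mean yield between the two terminal nodes
t.test(Yield ~ Terminal_Node, data = tree_node_data)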