# Run this only if Posit/RStudio says a package is missing:
install.packages(c(
  "AppliedPredictiveModeling", "caret", "mlbench", "randomForest",
  "rpart", "gbm", "Cubist", "ipred", "dplyr", "tidyr", "tibble", "ggplot2",
  "knitr", "lattice", "RANN", "party", "rpart.plot"
))

1 Exercise 8.1

The purpose of this exercise is to study variable importance in tree-based models when some predictors are informative, some are noise, and some are highly correlated with informative predictors. The simulated data are recreated from the Friedman benchmark data used earlier in Kuhn and Johnson.

1.1 Part A: Random forest importance with informative and uninformative predictors

library(mlbench)

set.seed(200)
simulated_raw <- mlbench.friedman1(200, sd = 1)
simulated <- as.data.frame(simulated_raw$x)
simulated$y <- simulated_raw$y

str(simulated)
set.seed(200)
rf_original <- randomForest(
  y ~ ., 
  data = simulated, 
  importance = TRUE, 
  ntree = 500
)

rf_imp_original <- varImp(rf_original, scale = FALSE) %>%
  tibble::rownames_to_column("Predictor") %>%
  rename(Original_RF = Overall)

knitr::kable(rf_imp_original, digits = 3)
ggplot(rf_imp_original, aes(x = reorder(Predictor, Original_RF), y = Original_RF)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Random Forest Variable Importance: Original Friedman Data",
    x = "Predictor",
    y = "Importance"
  )

The random forest model should place most of its importance on the truly useful predictors, especially variables among V1 through V5. The noise variables V6 through V10 may still receive small nonzero importance, but they should be much less important than the meaningful variables.

1.2 Part B: Add predictors highly correlated with V1

set.seed(600)
simulated_dup1 <- simulated
simulated_dup1$duplicate1 <- simulated_dup1$V1 + rnorm(nrow(simulated_dup1), sd = 0.10)

set.seed(601)
simulated_dup2 <- simulated_dup1
simulated_dup2$duplicate2 <- simulated_dup2$V1 + rnorm(nrow(simulated_dup2), sd = 0.10)

correlations <- data.frame(
  Pair = c("V1 and duplicate1", "V1 and duplicate2"),
  Correlation = c(
    cor(simulated_dup1$V1, simulated_dup1$duplicate1),
    cor(simulated_dup2$V1, simulated_dup2$duplicate2)
  )
)

knitr::kable(correlations, digits = 3)
set.seed(201)
rf_dup1 <- randomForest(
  y ~ ., 
  data = simulated_dup1, 
  importance = TRUE, 
  ntree = 500
)

set.seed(202)
rf_dup2 <- randomForest(
  y ~ ., 
  data = simulated_dup2, 
  importance = TRUE, 
  ntree = 500
)

rf_imp_dup1 <- varImp(rf_dup1, scale = FALSE) %>%
  tibble::rownames_to_column("Predictor") %>%
  rename(RF_With_1_Duplicate = Overall)

rf_imp_dup2 <- varImp(rf_dup2, scale = FALSE) %>%
  tibble::rownames_to_column("Predictor") %>%
  rename(RF_With_2_Duplicates = Overall)

rf_compare <- rf_imp_original %>%
  full_join(rf_imp_dup1, by = "Predictor") %>%
  full_join(rf_imp_dup2, by = "Predictor") %>%
  arrange(Predictor)

knitr::kable(rf_compare, digits = 3)
rf_compare_long <- rf_compare %>%
  pivot_longer(-Predictor, names_to = "Model", values_to = "Importance")

ggplot(rf_compare_long, aes(x = reorder(Predictor, Importance), y = Importance, fill = Model)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "Effect of Highly Correlated Predictors on Random Forest Importance",
    x = "Predictor",
    y = "Importance"
  )

After adding a highly correlated copy of V1, the importance for V1 is usually split across V1 and the duplicate variable. This does not mean that V1 became less related to the response. Instead, the model now has multiple predictors carrying nearly the same information, so importance can be distributed across them.

1.3 Part C: Conditional inference random forest

if (requireNamespace("party", quietly = TRUE)) {
  library(party)
  
  set.seed(147)
  cforest_original <- party::cforest(
    y ~ ., 
    data = simulated,
    controls = party::cforest_control(ntree = 300, mtry = 3)
  )
  
  set.seed(147)
  cforest_dup1 <- party::cforest(
    y ~ ., 
    data = simulated_dup1,
    controls = party::cforest_control(ntree = 300, mtry = 3)
  )
  
  cf_imp_original <- data.frame(
    Predictor = names(party::varimp(cforest_original, conditional = FALSE)),
    CForest_Original = as.numeric(party::varimp(cforest_original, conditional = FALSE)),
    CForest_Original_Conditional = as.numeric(party::varimp(cforest_original, conditional = TRUE))
  )
  
  cf_imp_dup1 <- data.frame(
    Predictor = names(party::varimp(cforest_dup1, conditional = FALSE)),
    CForest_With_Duplicate = as.numeric(party::varimp(cforest_dup1, conditional = FALSE)),
    CForest_With_Duplicate_Conditional = as.numeric(party::varimp(cforest_dup1, conditional = TRUE))
  )
  
  cf_compare <- full_join(cf_imp_original, cf_imp_dup1, by = "Predictor") %>%
    arrange(Predictor)
  
  knitr::kable(cf_compare, digits = 3)
} else {
  cat("The `party` package is not installed, so the conditional inference forest section was skipped. Install `party` and re-knit to complete this part.")
}

Conditional inference forests are designed to reduce selection bias in trees. However, when two variables contain almost identical information, importance may still be shared between them. The conditional version usually gives a fairer view of importance when predictors have different scales or different numbers of possible split points.

1.4 Part D: Bagging, boosted trees, and Cubist

ctrl_small <- trainControl(method = "cv", number = 5, allowParallel = FALSE)

gbm_grid_small <- expand.grid(
  n.trees = c(50, 100, 150),
  interaction.depth = c(1, 3),
  shrinkage = c(0.05, 0.10),
  n.minobsinnode = 10
)

cubist_grid_small <- expand.grid(
  committees = c(1, 10, 50),
  neighbors = c(0, 1, 5)
)

set.seed(299)
bag_original <- train(
  y ~ .,
  data = simulated,
  method = "treebag",
  trControl = ctrl_small
)

set.seed(300)
bag_dup1 <- train(
  y ~ .,
  data = simulated_dup1,
  method = "treebag",
  trControl = ctrl_small
)

set.seed(301)
gbm_original <- train(
  y ~ ., 
  data = simulated,
  method = "gbm",
  trControl = ctrl_small,
  tuneGrid = gbm_grid_small,
  verbose = FALSE
)

set.seed(302)
gbm_dup1 <- train(
  y ~ ., 
  data = simulated_dup1,
  method = "gbm",
  trControl = ctrl_small,
  tuneGrid = gbm_grid_small,
  verbose = FALSE
)

set.seed(303)
cubist_original <- train(
  x = simulated %>% select(-y),
  y = simulated$y,
  method = "cubist",
  trControl = ctrl_small,
  tuneGrid = cubist_grid_small
)

set.seed(304)
cubist_dup1 <- train(
  x = simulated_dup1 %>% select(-y),
  y = simulated_dup1$y,
  method = "cubist",
  trControl = ctrl_small,
  tuneGrid = cubist_grid_small
)

bag_imp_compare <- varImp(bag_original, scale = FALSE)$importance %>%
  tibble::rownames_to_column("Predictor") %>%
  rename(Bagging_Original = Overall) %>%
  full_join(
    varImp(bag_dup1, scale = FALSE)$importance %>%
      tibble::rownames_to_column("Predictor") %>%
      rename(Bagging_With_Duplicate = Overall),
    by = "Predictor"
  )

gbm_imp_compare <- varImp(gbm_original, scale = FALSE)$importance %>%
  tibble::rownames_to_column("Predictor") %>%
  rename(GBM_Original = Overall) %>%
  full_join(
    varImp(gbm_dup1, scale = FALSE)$importance %>%
      tibble::rownames_to_column("Predictor") %>%
      rename(GBM_With_Duplicate = Overall),
    by = "Predictor"
  )

cubist_imp_compare <- varImp(cubist_original, scale = FALSE)$importance %>%
  tibble::rownames_to_column("Predictor") %>%
  rename(Cubist_Original = Overall) %>%
  full_join(
    varImp(cubist_dup1, scale = FALSE)$importance %>%
      tibble::rownames_to_column("Predictor") %>%
      rename(Cubist_With_Duplicate = Overall),
    by = "Predictor"
  )

knitr::kable(bag_imp_compare %>% arrange(desc(Bagging_Original)), digits = 3)
knitr::kable(gbm_imp_compare %>% arrange(desc(GBM_Original)), digits = 3)
knitr::kable(cubist_imp_compare %>% arrange(desc(Cubist_Original)), digits = 3)

The same general issue can occur with bagging, boosted trees, and Cubist: when a duplicate or highly correlated version of a useful predictor is added, importance may be shared between the original predictor and its copy. In this simulation, bagging and random forests usually show the importance of V1 being diluted by the duplicate variable. Cubist may be less affected because it uses rules and linear model adjustments, but the broader lesson is the same: variable importance should be interpreted carefully when predictors are highly correlated.

2 Exercise 8.2

This exercise asks us to use simulation to show tree bias caused by different predictor granularities. Tree-based models can prefer predictors with more possible split points, even when those predictors are not truly related to the response.

simulate_tree_bias <- function(n = 200, reps = 300, noise_sd = 4, granularity = "continuous") {
  selected <- character(reps)
  
  for (i in seq_len(reps)) {
    set.seed(1000 + i)
    
    # X_signal is truly related to the response but has only two groups.
    X_signal <- factor(rep(c("low", "high"), each = n / 2))
    y <- ifelse(X_signal == "high", 1, 0) + rnorm(n, sd = noise_sd)
    
    # X_noise is unrelated to the response but varies in granularity.
    X_noise <- switch(
      granularity,
      binary = factor(sample(c("A", "B"), n, replace = TRUE)),
      ten_level = factor(sample(letters[1:10], n, replace = TRUE)),
      continuous = rnorm(n),
      stop("Unknown granularity")
    )
    
    dat <- data.frame(y = y, X_signal = X_signal, X_noise = X_noise)
    
    fit <- rpart(
      y ~ X_signal + X_noise,
      data = dat,
      control = rpart.control(maxdepth = 1, cp = 0, minsplit = 10)
    )
    
    selected[i] <- ifelse(is.null(fit$splits), NA, rownames(fit$splits)[1])
  }
  
  data.frame(
    Granularity = granularity,
    Selected = selected
  )
}
set.seed(450)
bias_results <- bind_rows(
  simulate_tree_bias(granularity = "binary"),
  simulate_tree_bias(granularity = "ten_level"),
  simulate_tree_bias(granularity = "continuous")
)

bias_summary <- bias_results %>%
  filter(!is.na(Selected)) %>%
  count(Granularity, Selected) %>%
  group_by(Granularity) %>%
  mutate(Proportion = n / sum(n)) %>%
  ungroup()

knitr::kable(bias_summary, digits = 3)
ggplot(bias_summary, aes(x = Granularity, y = Proportion, fill = Selected)) +
  geom_col(position = "dodge") +
  labs(
    title = "First Split Chosen by a One-Split Tree",
    x = "Granularity of Noise Predictor",
    y = "Proportion of Simulations",
    fill = "First Split"
  )

The signal variable truly affects the response, while the noise variable does not. However, as the unrelated noise predictor becomes more granular, it has more possible split points. This gives it more chances to find an apparently useful split by chance. The simulation therefore illustrates why trees can be biased toward predictors with many possible cut points.

3 Exercise 8.3

This exercise studies how stochastic gradient boosting tuning parameters affect variable importance. The two parameters of focus are bagging fraction and learning rate, also called shrinkage.

data(solubility)

sol_train_x <- solTrainXtrans
sol_train_y <- solTrainY
gbm_ctrl <- trainControl(method = "cv", number = 5, allowParallel = FALSE)

gbm_low_grid <- expand.grid(
  n.trees = 500,
  interaction.depth = 3,
  shrinkage = 0.10,
  n.minobsinnode = 10
)

gbm_high_grid <- expand.grid(
  n.trees = 500,
  interaction.depth = 3,
  shrinkage = 0.90,
  n.minobsinnode = 10
)

set.seed(100)
gbm_low <- train(
  x = sol_train_x,
  y = sol_train_y,
  method = "gbm",
  trControl = gbm_ctrl,
  tuneGrid = gbm_low_grid,
  bag.fraction = 0.10,
  verbose = FALSE
)

set.seed(100)
gbm_high <- train(
  x = sol_train_x,
  y = sol_train_y,
  method = "gbm",
  trControl = gbm_ctrl,
  tuneGrid = gbm_high_grid,
  bag.fraction = 0.90,
  verbose = FALSE
)
gbm_low_imp <- varImp(gbm_low, scale = FALSE)$importance %>%
  tibble::rownames_to_column("Predictor") %>%
  rename(Low_Bag_Low_Shrinkage = Overall)

gbm_high_imp <- varImp(gbm_high, scale = FALSE)$importance %>%
  tibble::rownames_to_column("Predictor") %>%
  rename(High_Bag_High_Shrinkage = Overall)

gbm_imp_compare <- full_join(gbm_low_imp, gbm_high_imp, by = "Predictor") %>%
  mutate(
    Low_Bag_Low_Shrinkage = ifelse(is.na(Low_Bag_Low_Shrinkage), 0, Low_Bag_Low_Shrinkage),
    High_Bag_High_Shrinkage = ifelse(is.na(High_Bag_High_Shrinkage), 0, High_Bag_High_Shrinkage)
  )

gbm_top <- gbm_imp_compare %>%
  mutate(MaxImportance = pmax(Low_Bag_Low_Shrinkage, High_Bag_High_Shrinkage)) %>%
  arrange(desc(MaxImportance)) %>%
  slice(1:25)

knitr::kable(gbm_top, digits = 3)
gbm_top_long <- gbm_top %>%
  select(Predictor, Low_Bag_Low_Shrinkage, High_Bag_High_Shrinkage) %>%
  pivot_longer(-Predictor, names_to = "Model", values_to = "Importance")

ggplot(gbm_top_long, aes(x = reorder(Predictor, Importance), y = Importance, fill = Model)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(
    title = "GBM Variable Importance Under Different Bagging and Shrinkage Settings",
    x = "Predictor",
    y = "Importance"
  )

3.1 Answers

(a) The model with a high learning rate and high bagging fraction tends to concentrate importance on only a few predictors. A high learning rate makes each tree have a larger effect on the model, so the model becomes more aggressive and greedy. A high bagging fraction also reduces the stochastic element because each tree sees more of the training data. Together, these settings allow early strong predictors to dominate the model.

(b) The model with the smaller learning rate and smaller bagging fraction is usually more predictive for new samples. It builds the model more gradually and includes more randomness, which can reduce overfitting.

(c) Increasing interaction depth would allow each tree to model more complex interactions among predictors. This can spread importance across more predictors because variables can become useful in combination with other variables. The importance curve would likely become less steep. However, too much interaction depth can also increase overfitting.

4 Exercise 8.7

This exercise returns to the chemical manufacturing process data and compares several tree-based models. The goal is to predict yield and determine which tree or rule-based model performs best.

4.1 Data setup and preprocessing

data(ChemicalManufacturingProcess)

chem_x <- ChemicalManufacturingProcess %>% select(-Yield)
chem_y <- ChemicalManufacturingProcess$Yield

set.seed(517)
train_index <- createDataPartition(chem_y, p = 0.70, list = FALSE)

train_x <- chem_x[train_index, ]
test_x  <- chem_x[-train_index, ]
train_y <- chem_y[train_index]
test_y  <- chem_y[-train_index]

# Same general preprocessing idea used earlier: transform, center, scale, and impute.
pp <- preProcess(
  train_x,
  method = c("BoxCox", "center", "scale", "knnImpute")
)

train_x_pp <- predict(pp, train_x)
test_x_pp  <- predict(pp, test_x)

# Remove near-zero variance predictors.
nzv <- nearZeroVar(train_x_pp)
if (length(nzv) > 0) {
  train_x_pp <- train_x_pp[, -nzv]
  test_x_pp  <- test_x_pp[, -nzv]
}

# Remove highly correlated predictors.
corr_matrix <- cor(train_x_pp)
high_corr <- findCorrelation(corr_matrix, cutoff = 0.90)
if (length(high_corr) > 0) {
  train_x_pp <- train_x_pp[, -high_corr]
  test_x_pp  <- test_x_pp[, -high_corr]
}

data.frame(
  Training_Rows = nrow(train_x_pp),
  Testing_Rows = nrow(test_x_pp),
  Predictors_After_Preprocessing = ncol(train_x_pp)
) %>%
  knitr::kable()

4.2 Fit tree-based and rule-based models

chem_ctrl <- trainControl(
  method = "boot",
  number = 25,
  savePredictions = "final",
  allowParallel = FALSE
)

# 1. Simple CART tree
set.seed(614)
rpart_grid <- expand.grid(maxdepth = 1:10)
rpart_chem <- train(
  x = train_x_pp,
  y = train_y,
  method = "rpart2",
  metric = "Rsquared",
  tuneGrid = rpart_grid,
  trControl = chem_ctrl
)

# 2. Random forest
set.seed(614)
rf_grid <- expand.grid(
  mtry = unique(pmax(1, round(seq(2, ncol(train_x_pp), length.out = 5))))
)
rf_chem <- train(
  x = train_x_pp,
  y = train_y,
  method = "rf",
  metric = "Rsquared",
  tuneGrid = rf_grid,
  ntree = 500,
  importance = TRUE,
  trControl = chem_ctrl
)

# 3. Gradient boosted trees
set.seed(614)
gbm_grid <- expand.grid(
  n.trees = c(50, 100, 200),
  interaction.depth = c(1, 3, 5),
  shrinkage = c(0.01, 0.05, 0.10),
  n.minobsinnode = 10
)
gbm_chem <- train(
  x = train_x_pp,
  y = train_y,
  method = "gbm",
  metric = "Rsquared",
  tuneGrid = gbm_grid,
  trControl = chem_ctrl,
  verbose = FALSE
)

# 4. Cubist rule-based model
set.seed(614)
cubist_grid <- expand.grid(
  committees = c(1, 5, 10, 20, 50),
  neighbors = c(0, 1, 3, 5)
)
cubist_chem <- train(
  x = train_x_pp,
  y = train_y,
  method = "cubist",
  metric = "Rsquared",
  tuneGrid = cubist_grid,
  trControl = chem_ctrl
)
best_resample <- function(model, model_name) {
  best_row <- model$results[which.max(model$results$Rsquared), ]
  data.frame(
    Model = model_name,
    RMSE = best_row$RMSE,
    Rsquared = best_row$Rsquared,
    MAE = best_row$MAE
  )
}

resampling_results <- bind_rows(
  best_resample(rpart_chem, "CART"),
  best_resample(rf_chem, "Random Forest"),
  best_resample(gbm_chem, "GBM"),
  best_resample(cubist_chem, "Cubist")
) %>% arrange(desc(Rsquared))

knitr::kable(resampling_results, digits = 3)
test_perf <- function(model, model_name) {
  pred <- predict(model, test_x_pp)
  stats <- postResample(pred = pred, obs = test_y)
  data.frame(
    Model = model_name,
    RMSE = as.numeric(stats["RMSE"]),
    Rsquared = as.numeric(stats["Rsquared"]),
    MAE = as.numeric(stats["MAE"])
  )
}

test_results <- bind_rows(
  test_perf(rpart_chem, "CART"),
  test_perf(rf_chem, "Random Forest"),
  test_perf(gbm_chem, "GBM"),
  test_perf(cubist_chem, "Cubist")
) %>% arrange(desc(Rsquared))

knitr::kable(test_results, digits = 3)

4.3 Part A: Optimal tree-based regression model

model_list <- list(
  CART = rpart_chem,
  `Random Forest` = rf_chem,
  GBM = gbm_chem,
  Cubist = cubist_chem
)

best_model_name <- test_results$Model[which.max(test_results$Rsquared)]
best_model <- model_list[[best_model_name]]

best_model_name
best_model$bestTune

Based on the model-comparison approach, the model with the largest test-set R^2 and lowest RMSE should be selected as the optimal tree/rule-based model. In many runs of this exercise, Cubist performs especially well because it combines rule-based partitions with local linear adjustments. This is consistent with the textbook solution pattern, where Cubist is usually the strongest tree/rule-based method for the chemical manufacturing data.

4.4 Part B: Important predictors in the optimal model

chem_imp <- varImp(best_model, scale = FALSE)$importance

if (!"Overall" %in% names(chem_imp)) {
  chem_imp$Overall <- rowMeans(chem_imp)
}

chem_imp_top <- chem_imp %>%
  tibble::rownames_to_column("Predictor") %>%
  arrange(desc(Overall)) %>%
  slice(1:10) %>%
  mutate(
    Predictor_Type = case_when(
      grepl("Bio|Biological", Predictor, ignore.case = TRUE) ~ "Biological",
      grepl("Manufacturing|Process", Predictor, ignore.case = TRUE) ~ "Process",
      TRUE ~ "Other"
    )
  )

knitr::kable(chem_imp_top, digits = 3)
ggplot(chem_imp_top, aes(x = reorder(Predictor, Overall), y = Overall, fill = Predictor_Type)) +
  geom_col() +
  coord_flip() +
  labs(
    title = paste("Top 10 Important Predictors:", best_model_name),
    x = "Predictor",
    y = "Importance"
  )

The important predictors are mostly interpreted by whether they are biological material variables or manufacturing process variables. If the top 10 list is dominated by variables beginning with ManufacturingProcess, then process variables dominate the model. If BiologicalMaterial variables appear near the top, then the biological inputs are also important. Across the earlier linear and nonlinear models for this problem, manufacturing process variables such as process 32, process 09, and process 13 often appear as influential predictors. Therefore, the most useful conclusion is to compare whether these same variables continue to appear in the tree-based model’s importance list.

4.5 Part C: Plot the optimal single tree and terminal-node yield distributions

Even if the single CART tree is not the best predictive model, it is useful because it provides a simple visual explanation of how predictors split the yield values.

if (requireNamespace("rpart.plot", quietly = TRUE)) {
  rpart.plot::rpart.plot(
    rpart_chem$finalModel,
    type = 3,
    extra = 101,
    fallen.leaves = TRUE,
    main = "Optimal Single CART Tree for Chemical Manufacturing Yield"
  )
} else {
  plot(rpart_chem$finalModel, uniform = TRUE, margin = 0.1)
  text(rpart_chem$finalModel, use.n = TRUE, cex = 0.7)
}
terminal_node_data <- data.frame(
  Terminal_Node = factor(rpart_chem$finalModel$where),
  Yield = train_y
)

node_summary <- terminal_node_data %>%
  group_by(Terminal_Node) %>%
  summarize(
    n = n(),
    Mean_Yield = mean(Yield),
    Median_Yield = median(Yield),
    SD_Yield = sd(Yield),
    .groups = "drop"
  )

knitr::kable(node_summary, digits = 3)
ggplot(terminal_node_data, aes(x = Terminal_Node, y = Yield)) +
  geom_boxplot(outlier.alpha = 0.4) +
  geom_jitter(width = 0.12, alpha = 0.35) +
  labs(
    title = "Distribution of Yield Within Terminal Nodes",
    x = "Terminal Node",
    y = "Yield"
  )

The terminal-node plot adds interpretation beyond the numerical performance statistics. It shows whether the single tree creates groups with meaningfully different yield distributions. If one terminal node has a noticeably higher median yield and another has a lower median yield, the tree gives a practical way to describe how process or biological variables are associated with higher or lower manufacturing yield.

5 Overall conclusion

For Exercise 8.1, random forests correctly identify the main Friedman predictors, but correlated predictors can split or dilute importance. Exercise 8.2 shows that trees can be biased toward variables with more possible split points. Exercise 8.3 shows that high learning rates and high bagging fractions can concentrate GBM importance into only a few predictors and may reduce generalizability. Exercise 8.7 shows that Cubist, GBM, random forests, and CART can all be applied to the chemical manufacturing data, but Cubist often provides the strongest predictive performance while the single CART tree remains the easiest to explain.