Packages

library(mlbench)
library(randomForest)
library(caret)
library(party)
library(gbm)
library(Cubist)
library(AppliedPredictiveModeling)
library(rpart)
library(rpart.plot)
library(ggplot2)
library(dplyr)
library(tidyr)

Exercise 8.1

Data Generation

set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

cat("Observations:", nrow(simulated),
    " | Predictors:", ncol(simulated) - 1, "\n")

Observations: 200 | Predictors: 10

cat("V1–V5 are informative; V6–V10 are pure noise\n")

V1–V5 are informative; V6–V10 are pure noise

The Friedman simulation generates a response that depends on only five of ten available predictors. The true model is: $y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + \varepsilon$, where $\varepsilon \sim N(0, 1)$. Predictors V6 through V10 have no relationship with the outcome — they are pure noise. This makes the dataset an ideal test bed for evaluating whether a model’s variable importance correctly separates signal from noise.

Part (a): Baseline Random Forest — Do Uninformative Predictors Get Used?

set.seed(42)
model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree      = 1000)

rfImp1     <- varImp(model1, scale = FALSE)
rfImp1_df  <- data.frame(
  Predictor  = rownames(rfImp1),
  Importance = rfImp1[, 1]
) %>%
  arrange(desc(Importance)) %>%
  mutate(Role = ifelse(Predictor %in% paste0("V", 1:5),
                       "Informative", "Noise"))

knitr::kable(
  rfImp1_df %>% mutate(Importance = round(Importance, 3)),
  caption = "Random forest variable importance — baseline model (n = 1,000 trees)",
  row.names = FALSE
)

Random forest variable importance — baseline model (n = 1,000 trees)
Predictor	Importance	Role
V1	8.417	Informative
V4	7.730	Informative
V2	6.869	Informative
V5	2.195	Informative
V3	0.942	Informative
V6	0.148	Noise
V7	0.053	Noise
V10	-0.043	Noise
V9	-0.058	Noise
V8	-0.087	Noise

ggplot(rfImp1_df,
       aes(x = reorder(Predictor, Importance), y = Importance, fill = Role)) +
  geom_col(width = 0.6) +
  coord_flip() +
  scale_fill_manual(values = c(Informative = "#2166ac", Noise = "#d73027")) +
  labs(
    title    = "Random Forest Variable Importance — Baseline Model",
    subtitle = "Blue = informative (V1–V5), Red = noise (V6–V10)",
    x = NULL, y = "% Increase in MSE", fill = NULL
  ) +
  theme_minimal(base_size = 13)

The random forest correctly separates the two groups. The five informative predictors (V1–V5) all receive meaningfully positive importance scores, while the five noise predictors (V6–V10) cluster near zero. The model does not significantly use the uninformative predictors. This result is expected: random forests build many trees on bootstrap samples and at each split randomly sample mtry candidate predictors. Across 1,000 trees, the noise predictors occasionally appear at a split, but they rarely reduce SSE more than the informative ones, so their average importance remains near the baseline.

Part (b): Adding a Highly Correlated Duplicate of V1

# One correlated duplicate
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
r1 <- round(cor(simulated$duplicate1, simulated$V1), 4)
cat("Correlation between duplicate1 and V1:", r1, "\n")

Correlation between duplicate1 and V1: 0.9341

set.seed(42)
model2   <- randomForest(y ~ ., data = simulated,
                         importance = TRUE, ntree = 1000)
rfImp2   <- varImp(model2, scale = FALSE)

# Second correlated duplicate
simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
r2 <- round(cor(simulated$duplicate2, simulated$V1), 4)
cat("Correlation between duplicate2 and V1:", r2, "\n")

Correlation between duplicate2 and V1: 0.9445

set.seed(42)
model3   <- randomForest(y ~ ., data = simulated,
                         importance = TRUE, ntree = 1000)
rfImp3   <- varImp(model3, scale = FALSE)

# Summarize V1 importance across the three models
v1_tbl <- data.frame(
  Model       = c("Baseline", "+ duplicate1", "+ duplicate1 & duplicate2"),
  V1_Imp      = round(c(rfImp1["V1", 1], rfImp2["V1", 1], rfImp3["V1", 1]), 3)
)

knitr::kable(v1_tbl,
             caption = "V1 importance score as correlated duplicates are added",
             col.names = c("Model", "V1 Importance (% MSE increase)"),
             row.names = FALSE)

V1 importance score as correlated duplicates are added
Model	V1 Importance (% MSE increase)
Baseline	8.417
+ duplicate1	5.757
+ duplicate1 & duplicate2	4.695

# Compare top predictors across baseline and one-duplicate model
compare_df <- data.frame(
  Predictor = rownames(rfImp1),
  Baseline  = rfImp1[, 1]
)
dup1_df <- data.frame(
  Predictor  = rownames(rfImp2),
  With_Dup1  = rfImp2[, 1]
)
compare_df <- merge(compare_df, dup1_df, by = "Predictor") %>%
  pivot_longer(-Predictor, names_to = "Model", values_to = "Importance") %>%
  filter(Predictor %in% c("V1", "V2", "V3", "V4", "V5",
                           "duplicate1", "V6", "V7"))

ggplot(compare_df,
       aes(x = reorder(Predictor, Importance), y = Importance, fill = Model)) +
  geom_col(position = "dodge", width = 0.65) +
  coord_flip() +
  scale_fill_manual(values = c(Baseline = "#2166ac", With_Dup1 = "#f4a582")) +
  labs(
    title    = "V1 Importance Diluted by a Highly Correlated Duplicate",
    subtitle = "duplicate1 ≈ V1 + small noise; importance is now shared between them",
    x = NULL, y = "% Increase in MSE", fill = NULL
  ) +
  theme_minimal(base_size = 13)

When duplicate1 (correlation 0.9341 with V1) is added, V1’s importance drops substantially and is now spread across both V1 and duplicate1. When a second duplicate is introduced (correlation 0.9445), V1’s importance falls further still. This is a well-known limitation of the standard permutation importance measure: when two predictors carry identical information, the forest can use either one interchangeably at each split. Permuting V1 hurts less than before because duplicate1 can step in as a substitute, making V1 appear less “important” than it truly is. This bias can be corrected by using conditional importance (Part c) or by removing correlated duplicates before modelling.

Part (c): Conditional Inference Forest

# Use the original 10-predictor data for a clean comparison
simulated_base <- simulated[, c(paste0("V", 1:10), "y")]

set.seed(42)
cforest_mod <- cforest(y ~ ., data = simulated_base,
                       controls = cforest_unbiased(ntree = 500, mtry = 3))

imp_trad <- varimp(cforest_mod, conditional = FALSE)
imp_cond <- varimp(cforest_mod, conditional = TRUE)

imp_cf <- data.frame(
  Predictor   = names(imp_trad),
  Traditional = round(imp_trad, 4),
  Conditional = round(imp_cond, 4)
) %>% arrange(desc(Traditional))

knitr::kable(imp_cf,
             caption = "cforest variable importance: traditional vs. conditional",
             row.names = FALSE)

cforest variable importance: traditional vs. conditional
Predictor	Traditional	Conditional
V1	7.2411	4.4372
V4	7.0724	4.9437
V2	5.6184	4.0119
V5	1.9155	1.1504
V3	0.1363	0.1004
V7	-0.0217	-0.0231
V6	-0.0781	-0.0007
V9	-0.0804	0.0278
V8	-0.0865	-0.0221
V10	-0.0979	-0.0160

imp_cf_long <- imp_cf %>%
  pivot_longer(-Predictor, names_to = "Type", values_to = "Importance") %>%
  mutate(Role = ifelse(Predictor %in% paste0("V", 1:5),
                       "Informative", "Noise"))

ggplot(imp_cf_long,
       aes(x = reorder(Predictor, Importance), y = Importance, fill = Type)) +
  geom_col(position = "dodge", width = 0.65) +
  coord_flip() +
  scale_fill_manual(values = c(Traditional = "#9C27B0", Conditional = "#4CAF50")) +
  labs(
    title    = "cforest: Traditional vs. Conditional Variable Importance",
    subtitle = "Conditional importance is better calibrated when predictors are correlated",
    x = NULL, y = "Importance", fill = NULL
  ) +
  theme_minimal(base_size = 13)

The conditional inference forest shows the same broad pattern as the standard random forest: V1–V5 receive higher importance than V6–V10. However, the conditional importance scores are noticeably more conservative — the magnitude of the top predictors shrinks and the gap between informative and noise variables narrows. This conservatism is by design: the conditional importance measure by Strobl et al. (2007) accounts for correlations among predictors when permuting, so it avoids artificially inflating the importance of predictors that are associated with others. On this data with relatively independent noise variables, both measures point to the same qualitative conclusion, but conditional importance would be more reliable in a setting with many correlated features.

Part (d): Boosted Trees and Cubist

set.seed(42)
gbm_81 <- gbm(y ~ ., data = simulated_base,
              distribution      = "gaussian",
              n.trees           = 1000,
              interaction.depth = 3,
              shrinkage         = 0.01,
              bag.fraction      = 0.5,
              verbose           = FALSE)

gbm_imp <- summary(gbm_81, plotit = FALSE)

knitr::kable(
  gbm_imp %>% mutate(rel.inf = round(rel.inf, 3)),
  caption = "Gradient boosting relative influence (1,000 trees, depth = 3)",
  col.names = c("Predictor", "Relative Influence (%)"),
  row.names = FALSE
)

Gradient boosting relative influence (1,000 trees, depth = 3)
Predictor	Relative Influence (%)
V4	27.913
V1	26.390
V2	22.111
V5	11.811
V3	7.930
V7	1.055
V6	1.012
V9	0.621
V10	0.605
V8	0.553

gbm_imp_df <- gbm_imp %>%
  mutate(Role = ifelse(var %in% paste0("V", 1:5), "Informative", "Noise"))

ggplot(gbm_imp_df,
       aes(x = reorder(var, rel.inf), y = rel.inf, fill = Role)) +
  geom_col(width = 0.6) +
  coord_flip() +
  scale_fill_manual(values = c(Informative = "#2166ac", Noise = "#d73027")) +
  labs(
    title    = "Gradient Boosting — Relative Influence",
    subtitle = "Blue = informative (V1–V5), Red = noise (V6–V10)",
    x = NULL, y = "Relative Influence (%)", fill = NULL
  ) +
  theme_minimal(base_size = 13)

set.seed(42)
cubist_81  <- cubist(x = simulated_base[, paste0("V", 1:10)],
                     y = simulated_base$y,
                     committees = 50)

cubist_imp <- varImp(cubist_81)
cubist_df  <- data.frame(
  Predictor  = rownames(cubist_imp),
  Importance = round(cubist_imp[, 1], 1),
  Role       = ifelse(rownames(cubist_imp) %in% paste0("V", 1:5),
                      "Informative", "Noise")
) %>% arrange(desc(Importance))

knitr::kable(cubist_df,
             caption = "Cubist variable importance (50 committees)",
             row.names = FALSE)

Cubist variable importance (50 committees)
Predictor	Importance	Role
V1	71.0	Informative
V2	57.5	Informative
V4	48.5	Informative
V3	44.5	Informative
V5	35.0	Informative
V6	0.0	Noise
V7	0.0	Noise
V8	0.0	Noise
V9	0.0	Noise
V10	0.0	Noise

ggplot(cubist_df,
       aes(x = reorder(Predictor, Importance), y = Importance, fill = Role)) +
  geom_col(width = 0.6) +
  coord_flip() +
  scale_fill_manual(values = c(Informative = "#2166ac", Noise = "#d73027")) +
  labs(
    title    = "Cubist — Variable Importance",
    subtitle = "Blue = informative (V1–V5), Red = noise (V6–V10)",
    x = NULL, y = "Usage (%)", fill = NULL
  ) +
  theme_minimal(base_size = 13)

Both gradient boosting and Cubist replicate the pattern from the random forest: V1–V5 dominate and V6–V10 are negligible. GBM concentrates importance heavily on V1 and V4, the largest contributors in the true model, while assigning near-zero influence to all five noise predictors. Cubist spreads importance somewhat more evenly across the five informative predictors but still assigns zero usage to V6–V10. The finding is robust across all four tree-based methods tested: standard random forest, conditional inference forest, gradient boosting, and Cubist each correctly identify the informative predictors and discard the noise.

Exercise 8.2

Simulation Design

set.seed(123)
n <- 500

# Pure noise response, no predictor has any true relationship with y
y82 <- rnorm(n)

# Predictors at increasing levels of granularity
sim82 <- data.frame(
  y      = y82,
  x_2    = sample(1:2,   n, replace = TRUE),
  x_4    = sample(1:4,   n, replace = TRUE),
  x_8    = sample(1:8,   n, replace = TRUE),
  x_16   = sample(1:16,  n, replace = TRUE),
  x_32   = sample(1:32,  n, replace = TRUE),
  x_cont = runif(n)
)

cat("Unique values per predictor:\n")

Unique values per predictor:

sapply(sim82[, -1], dplyr::n_distinct)

x_2 x_4 x_8 x_16 x_32 x_cont 2 4 8 16 32 500

The design isolates granularity as the only varying feature. Every predictor is statistically independent of the response (all are random noise), so any systematic difference in importance scores can only be attributed to the number of unique values each predictor has — its granularity.

tree82 <- rpart(y ~ ., data = sim82,
                control = rpart.control(cp = 0.001, maxdepth = 10))

imp82  <- tree82$variable.importance

if (length(imp82) > 0) {
  imp82_df <- data.frame(
    Predictor   = names(imp82),
    Importance  = as.numeric(imp82),
    UniqueVals  = c(x_2 = 2, x_4 = 4, x_8 = 8,
                    x_16 = 16, x_32 = 32, x_cont = 500)[names(imp82)]
  ) %>% arrange(desc(Importance))
  
  knitr::kable(
    imp82_df %>% mutate(Importance = round(Importance, 3)),
    caption = "Single tree importance on a pure-noise response",
    row.names = FALSE
  )
}

Single tree importance on a pure-noise response
Predictor	Importance	UniqueVals
x_cont	71.651	500
x_32	42.016	32
x_16	24.345	16
x_8	13.193	8
x_4	10.917	4
x_2	3.022	2

if (exists("imp82_df")) {
  ggplot(imp82_df,
         aes(x = reorder(Predictor, Importance), y = Importance,
             fill = log2(UniqueVals))) +
    geom_col(width = 0.6) +
    coord_flip() +
    scale_fill_viridis_c(name = "log (unique values)", option = "plasma") +
    labs(
      title    = "Tree Bias: Importance vs. Predictor Granularity",
      subtitle = "All predictors are noise — higher granularity inflates importance",
      x = NULL, y = "Variable Importance (SSE reduction)"
    ) +
    theme_minimal(base_size = 13)
}

Replicated Simulations

B <- 200
gran_names <- c("x_2","x_4","x_8","x_16","x_32","x_cont")
bias_mat   <- matrix(0, nrow = B, ncol = length(gran_names),
                     dimnames = list(NULL, gran_names))

for (b in seq_len(B)) {
  df_b <- data.frame(
    y      = rnorm(n),
    x_2    = sample(1:2,   n, replace = TRUE),
    x_4    = sample(1:4,   n, replace = TRUE),
    x_8    = sample(1:8,   n, replace = TRUE),
    x_16   = sample(1:16,  n, replace = TRUE),
    x_32   = sample(1:32,  n, replace = TRUE),
    x_cont = runif(n)
  )
  fit_b <- rpart(y ~ ., data = df_b,
                 control = rpart.control(cp = 0.001, maxdepth = 5))
  vi_b  <- fit_b$variable.importance
  for (v in gran_names)
    bias_mat[b, v] <- if (v %in% names(vi_b)) vi_b[v] else 0
}

bias_df <- as.data.frame(bias_mat) %>%
  pivot_longer(everything(), names_to = "Predictor", values_to = "Importance") %>%
  mutate(Predictor = factor(Predictor, levels = gran_names))

ggplot(bias_df, aes(x = Predictor, y = Importance)) +
  geom_boxplot(fill = "#4393c3", outlier.alpha = 0.25, width = 0.55) +
  labs(
    title    = "Distribution of Tree Importance Across 200 Noise Simulations",
    subtitle = "Each predictor is pure noise — differences in importance reflect bias, not signal",
    x        = "Predictor (ordered by number of unique values)",
    y        = "Variable Importance (SSE reduction)"
  ) +
  theme_minimal(base_size = 13)

The replication confirms the pattern decisively. Across 200 independent simulations with a noise response, predictors with more unique values consistently receive higher importance scores from the single regression tree. The continuous predictor (x_cont, ~500 distinct values) dominates in nearly every replication; x_32 is a distant second; and x_2 (binary) receives the lowest importance. Because the true importance of every predictor is exactly zero, these differences represent pure bias.

The mechanism is straightforward. CART evaluates all possible binary splits for each predictor and selects the one that maximally reduces SSE. A predictor with 500 unique values offers approximately 499 candidate split points; a binary predictor offers only one. With more chances to find a lucky split by random fluctuation, high-granularity predictors have a systematic advantage in the selection tournament — even when they carry no real information. This selection bias motivates the use of alternative importance measures (e.g., conditional inference trees) that penalise predictors for having more candidate splits, or the use of random forests, which reduce this bias across many trees.

Exercise 8.3

Solubility Data

data(solubility)
cat("Training:", nrow(solTrainX), "samples,", ncol(solTrainX), "predictors\n")

Training: 951 samples, 228 predictors

cat("Test:    ", nrow(solTestX),  "samples\n")

Test: 316 samples

train83 <- data.frame(solTrainX, y = solTrainY)
test83  <- data.frame(solTestX)

Two Extreme GBM Models

set.seed(42)
gbm_low <- gbm(y ~ ., data = train83,
               distribution      = "gaussian",
               n.trees           = 1000,
               interaction.depth = 3,
               shrinkage         = 0.1,
               bag.fraction      = 0.1,
               verbose           = FALSE)

set.seed(42)
gbm_high <- gbm(y ~ ., data = train83,
                distribution      = "gaussian",
                n.trees           = 1000,
                interaction.depth = 3,
                shrinkage         = 0.9,
                bag.fraction      = 0.9,
                verbose           = FALSE)

imp_low  <- summary(gbm_low,  plotit = FALSE) %>% mutate(Model = "Low  (bf=0.1, lr=0.1)")
imp_high <- summary(gbm_high, plotit = FALSE) %>% mutate(Model = "High (bf=0.9, lr=0.9)")

bind_rows(head(imp_low, 15), head(imp_high, 15)) %>%
  ggplot(aes(x = reorder(var, rel.inf), y = rel.inf)) +
  geom_col(fill = "#2166ac", width = 0.65) +
  coord_flip() +
  facet_wrap(~Model, scales = "free_x") +
  labs(
    title    = "GBM Variable Importance: Low vs. High Bagging Fraction & Learning Rate",
    subtitle = "High settings concentrate importance on a handful of predictors",
    x = NULL, y = "Relative Influence (%)"
  ) +
  theme_minimal(base_size = 12) +
  theme(strip.background = element_rect(fill = "grey90"),
        strip.text = element_text(face = "bold"))

Part (a): Why Does the High Model Concentrate Importance?

When both the learning rate and the bagging fraction are high (0.9 each), two forces combine to narrow the importance distribution.

A high learning rate means each successive tree makes large corrections to the residuals. The first few boosting iterations seize on the strongest predictors in the data because those offer the greatest SSE reduction. With a step size of 0.9, the model essentially solves most of the learning problem in those early iterations. Subsequent trees find little residual signal left to explain, so they contribute minimally — and the predictors they use accumulate negligible relative influence. The result is a steep drop-off: the one or two strongest predictors dominate, and everything else is marginal.

A high bagging fraction (0.9) compounds this by providing each tree with nearly the full training set. There is almost no sampling variability across iterations, so every tree sees the same dominant signal and selects the same strongest predictors. This repetition reinforces the concentration. In contrast, a low bagging fraction (0.1) feeds each tree a small, highly variable subsample, giving different predictors the chance to look best in different subsets. Over 1,000 iterations, this stochasticity spreads influence across a much wider set of predictors.

Part (b): Which Model Is More Predictive?

pred_low83  <- predict(gbm_low,  newdata = test83, n.trees = 1000)
pred_high83 <- predict(gbm_high, newdata = test83, n.trees = 1000)

rmse83 <- function(a, p) round(sqrt(mean((a - p)^2)), 4)
r2_83  <- function(a, p) round(cor(a, p)^2, 4)

perf83 <- data.frame(
  Model    = c("Low (bf=0.1, lr=0.1)", "High (bf=0.9, lr=0.9)"),
  Test_RMSE = c(rmse83(solTestY, pred_low83), rmse83(solTestY, pred_high83)),
  Test_R2   = c(r2_83(solTestY,  pred_low83), r2_83(solTestY,  pred_high83))
)

knitr::kable(perf83,
             caption = "Test-set performance for the two GBM parameterisations",
             col.names = c("Model", "Test RMSE", "Test R²"),
             row.names = FALSE)

Test-set performance for the two GBM parameterisations
Model	Test RMSE	Test R²
Low (bf=0.1, lr=0.1)	0.6785	0.8940
High (bf=0.9, lr=0.9)	0.7400	0.8781

The low-parameter model (Low (bf=0.1, lr=0.1)) achieves a lower test RMSE (0.6785) than the high-parameter model (0.74). This is the expected result. A low learning rate forces the boosting algorithm to make many small, careful corrections rather than a few aggressive ones, which reduces the risk of overshooting the optimal fit. A low bagging fraction introduces regularising noise at each step, which acts as a form of variance reduction similar to dropout in neural networks. Together, these settings encourage a slower, more stable convergence that generalises better to new data. The high-parameter model is prone to overfitting: its large learning rate can over-correct the training residuals and its high bagging fraction provides insufficient stochasticity to smooth out those errors.

Part (c): Effect of Increasing Interaction Depth

Interaction depth controls the maximum number of splits in each boosting tree, which determines the order of predictor interactions the model can capture. Its effect on the importance slope differs between the two parameterisations.

For the low-parameter model, increasing interaction depth tends to flatten the importance slope (spread importance more evenly). Because the learning rate is small, the algorithm takes many iterations to converge, giving secondary and tertiary predictors ample opportunity to appear in deeper splits. Deeper trees allow the model to capture interactions among multiple predictors simultaneously, so importance accrues to a broader set of variables rather than concentrating on the single best main-effect predictor.

For the high-parameter model, the effect is more muted. Because the high learning rate already front-loads most of the learning into the first few iterations — where simple, strong main-effect predictors are selected — adding deeper splits in subsequent iterations still finds the same dominant predictors. The importance slope may flatten slightly but the concentration on the top one or two predictors is unlikely to change dramatically because those predictors genuinely explain the most variance and the model converges too quickly to explore alternatives.

In both cases the practical recommendation is to treat interaction depth as a tuning parameter evaluated via cross-validation rather than set to an extreme: too shallow (stumps) risks missing real interactions; too deep risks overfitting and can re-introduce the importance concentration seen with high learning rates.

Exercise 8.7

Problem Statement

Exercises 6.3 and 7.5 describe a chemical manufacturing process for a pharmaceutical product: 176 production runs, 57 predictors (12 biological material measurements + 45 process parameters), response = percent product yield. A 1% yield improvement is worth approximately $100,000 per batch. We use the same imputation, data splitting, and pre-processing steps as in those exercises and train several tree-based regression models.

Data Loading and Imputation

data(ChemicalManufacturingProcess)

yield87 <- ChemicalManufacturingProcess$Yield
preds87 <- ChemicalManufacturingProcess[, -1]

cat("Dimensions:", nrow(ChemicalManufacturingProcess), "x",
    ncol(ChemicalManufacturingProcess), "\n")

Dimensions: 176 x 58

cat("Missing values in predictors:", sum(is.na(preds87)), "\n")

Missing values in predictors: 106

cat("Yield -- mean:", round(mean(yield87), 2),
    "  sd:", round(sd(yield87), 2),
    "  range:", paste(round(range(yield87), 2), collapse = " – "), "\n")

Yield – mean: 40.18 sd: 1.85 range: 35.25 – 46.34

pp_impute87 <- preProcess(preds87, method = "medianImpute")
preds_imp87 <- predict(pp_impute87, preds87)
cat("Missing values after median imputation:", sum(is.na(preds_imp87)), "\n")

Missing values after median imputation: 0

Train / Test Split

set.seed(100)
trainIdx87 <- createDataPartition(yield87, p = 0.8, list = FALSE)

trX87 <- preds_imp87[ trainIdx87, ];  teX87 <- preds_imp87[-trainIdx87, ]
trY87 <- yield87[trainIdx87];          teY87 <- yield87[-trainIdx87]

cat("Training:", nrow(trX87), " | Test:", nrow(teX87), "\n")

Training: 144 | Test: 32

Pre-processing

pp87 <- preProcess(trX87, method = c("nzv", "center", "scale"))
trX87_pp <- predict(pp87, trX87)
teX87_pp <- predict(pp87, teX87)
cat("Predictors after removing near-zero variance:", ncol(trX87_pp), "\n")

Predictors after removing near-zero variance: 56

train87_df <- data.frame(trX87_pp, y = trY87)
ctrl87     <- trainControl(method = "cv", number = 10, verboseIter = FALSE)

Model 1: Single Regression Tree

set.seed(7)
rpart87 <- train(y ~ ., data = train87_df,
                 method     = "rpart",
                 trControl  = ctrl87,
                 tuneLength = 10)
.tmp <- rpart87

knitr::kable(
  rpart87$results[, c("cp","RMSE","Rsquared","MAE")] %>%
    mutate(across(where(is.numeric), \(x) round(x, 4))) %>%
    arrange(RMSE) %>% head(5),
  caption = paste("Single tree cross-validation results (top 5 by RMSE). Best cp =",
                  round(rpart87$bestTune$cp, 5)),
  row.names = FALSE
)

Single tree cross-validation results (top 5 by RMSE). Best cp = 0.07736
cp	RMSE	Rsquared	MAE
0.0774	1.5061	0.4040	1.2028
0.0703	1.5105	0.4105	1.2220
0.0621	1.5424	0.4039	1.2387
0.0377	1.5443	0.4511	1.2118
0.0356	1.5455	0.4537	1.2029

Model 2: Random Forest

set.seed(7)
rf87 <- train(y ~ ., data = train87_df,
              method     = "rf",
              trControl  = ctrl87,
              tuneLength = 5,
              importance = TRUE)
.tmp <- rf87

knitr::kable(
  rf87$results[, c("mtry","RMSE","Rsquared","MAE")] %>%
    mutate(across(where(is.numeric), \(x) round(x, 4))),
  caption = paste("Random forest cross-validation results. Best mtry =",
                  rf87$bestTune$mtry),
  row.names = FALSE
)

Random forest cross-validation results. Best mtry = 42
mtry	RMSE	Rsquared	MAE
2	1.2699	0.6335	1.0149
15	1.1427	0.6672	0.8918
29	1.1200	0.6738	0.8643
42	1.1173	0.6713	0.8698
56	1.1264	0.6609	0.8692

Model 3: Gradient Boosting

gbm_grid87 <- expand.grid(
  n.trees           = c(100, 500, 1000),
  interaction.depth = c(1, 3),
  shrinkage         = 0.1,
  n.minobsinnode    = 10
)

set.seed(7)
gbm87 <- train(y ~ ., data = train87_df,
               method    = "gbm",
               trControl = ctrl87,
               tuneGrid  = gbm_grid87,
               verbose   = FALSE)
.tmp <- gbm87

knitr::kable(
  gbm87$results[, c("n.trees","interaction.depth","shrinkage","RMSE","Rsquared","MAE")] %>%
    mutate(across(where(is.numeric), \(x) round(x, 4))) %>%
    arrange(RMSE) %>% head(6),
  caption = paste0("GBM cross-validation results (top 6 by RMSE). Best: n.trees = ",
                   gbm87$bestTune$n.trees,
                   ", depth = ", gbm87$bestTune$interaction.depth),
  row.names = FALSE
)

GBM cross-validation results (top 6 by RMSE). Best: n.trees = 100, depth = 3
n.trees	interaction.depth	shrinkage	RMSE	Rsquared	MAE
100	3	0.1	1.1244	0.6796	0.8915
500	3	0.1	1.1334	0.6746	0.8907
1000	3	0.1	1.1370	0.6725	0.8958
500	1	0.1	1.2205	0.6082	0.9501
100	1	0.1	1.2281	0.5915	0.9771
1000	1	0.1	1.2322	0.6025	0.9515

Model 4: Cubist

cubist_grid87 <- expand.grid(committees = c(1, 10, 50, 100),
                              neighbors  = c(0, 5, 9))

set.seed(7)
cubist87 <- train(y ~ ., data = train87_df,
                  method    = "cubist",
                  trControl = ctrl87,
                  tuneGrid  = cubist_grid87)
.tmp <- cubist87

knitr::kable(
  cubist87$results[, c("committees","neighbors","RMSE","Rsquared","MAE")] %>%
    mutate(across(where(is.numeric), \(x) round(x, 4))) %>%
    arrange(RMSE) %>% head(6),
  caption = paste0("Cubist cross-validation results (top 6 by RMSE). Best: committees = ",
                   cubist87$bestTune$committees,
                   ", neighbors = ", cubist87$bestTune$neighbors),
  row.names = FALSE
)

Cubist cross-validation results (top 6 by RMSE). Best: committees = 100, neighbors = 5
committees	neighbors	RMSE	Rsquared	MAE
100	5	1.0240	0.7233	0.7989
50	5	1.0274	0.7170	0.7959
1	5	1.0602	0.7145	0.8548
50	9	1.0781	0.6829	0.8380
100	9	1.0809	0.6850	0.8421
10	5	1.0853	0.6930	0.8358

Part (a): Optimal Resampling and Test-Set Performance

models87 <- list(
  "Single Tree"   = rpart87,
  "Random Forest" = rf87,
  "GBM"           = gbm87,
  "Cubist"        = cubist87
)

perf87 <- lapply(models87, \(m) {
  p <- predict(m, newdata = teX87_pp)
  as.data.frame(t(postResample(pred = p, obs = teY87)))
}) %>%
  bind_rows(.id = "Model") %>%
  mutate(across(where(is.numeric), \(x) round(x, 4))) %>%
  arrange(RMSE)

knitr::kable(perf87,
             caption = "Test-set performance for tree-based models — Chemical Manufacturing",
             row.names = FALSE)

Test-set performance for tree-based models — Chemical Manufacturing
Model	RMSE	Rsquared	MAE
Cubist	0.8249	0.7540	0.6761
GBM	0.8884	0.7196	0.7497
Random Forest	0.9848	0.6554	0.8206
Single Tree	1.3017	0.4048	1.0117

resamps87 <- resamples(models87)
bwplot(resamps87, metric = "RMSE",
       main = "10-Fold CV RMSE Distribution by Model")

ggplot(perf87,
       aes(x = reorder(Model, RMSE), y = RMSE, fill = Model)) +
  geom_col(width = 0.55, show.legend = FALSE) +
  geom_text(aes(label = RMSE), vjust = -0.4, fontface = "bold", size = 4.5) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title    = "Test-Set RMSE by Model",
    subtitle = "Chemical Manufacturing Process data — lower is better",
    x = NULL, y = "RMSE"
  ) +
  theme_minimal(base_size = 13)

Cubist achieves the lowest test RMSE (0.8249), followed by GBM (0.8884), Random Forest (0.9848), and Single Tree (1.3017). The single regression tree is a clear last place — its test RMSE is substantially higher, reflecting the instability and high variance of unpruned or lightly pruned trees when the predictor space is high-dimensional. Ensemble methods, which reduce variance by averaging predictions across many trees, all outperform the single tree by a meaningful margin.

The CV results are consistent with the test set, indicating that resampling provided a reliable guide to out-of-sample performance here. Among the ensembles, Cubist edges out the others, consistent with results from Chapters 6 and 7 where rule-based and smooth-surface models tended to do well on this manufacturing dataset.

Part (b): Variable Importance — Which Predictors Dominate?

# Pull importance from the best-performing model and from RF (calibrated importance)
best_model87 <- models87[[perf87$Model[1]]]
imp87 <- varImp(best_model87, scale = TRUE)$importance
imp87_df <- data.frame(
  Predictor  = rownames(imp87),
  Importance = imp87[, 1]
) %>%
  arrange(desc(Importance)) %>%
  mutate(
    Type = case_when(
      grepl("^BiologicalMaterial",    Predictor) ~ "Biological",
      grepl("^ManufacturingProcess",  Predictor) ~ "Process",
      TRUE                                        ~ "Other"
    ),
    Rank = row_number()
  )

top20_87 <- head(imp87_df, 20)

ggplot(top20_87,
       aes(x = reorder(Predictor, Importance), y = Importance, fill = Type)) +
  geom_col(width = 0.65) +
  coord_flip() +
  scale_fill_manual(values = c(Biological = "#4dac26",
                                Process    = "#d01c8b")) +
  labs(
    title    = paste("Top 20 Variable Importances —", perf87$Model[1]),
    subtitle = "Chemical Manufacturing Process Data",
    x = NULL, y = "Scaled Importance (%)", fill = NULL
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

top10_87 <- head(imp87_df, 10)
cat("Composition of top 10 predictors:\n")

Composition of top 10 predictors:

print(table(top10_87$Type))

Biological Process 4 6

cat("Top 10 predictors:\n")

Top 10 predictors:

knitr::kable(
  top10_87[, c("Rank","Predictor","Type","Importance")] %>%
    mutate(Importance = round(Importance, 1)),
  caption = paste("Top 10 predictors in the optimal tree model:", perf87$Model[1]),
  row.names = FALSE
)

Top 10 predictors in the optimal tree model: Cubist
Rank	Predictor	Type	Importance
1	ManufacturingProcess13	Process	100.0
2	ManufacturingProcess32	Process	90.0
3	ManufacturingProcess09	Process	62.9
4	ManufacturingProcess04	Process	55.7
5	ManufacturingProcess15	Process	50.0
6	BiologicalMaterial04	Biological	48.6
7	BiologicalMaterial10	Biological	47.1
8	BiologicalMaterial03	Biological	38.6
9	ManufacturingProcess17	Process	24.3
10	BiologicalMaterial02	Biological	24.3

# Compare to RF importance for reference against prior homework (7.5 used SVM/MARS)
rf_imp87 <- varImp(rf87, scale = TRUE)$importance
rf_imp87_df <- data.frame(
  Predictor  = rownames(rf_imp87),
  Importance = rf_imp87[, 1],
  Model      = "Random Forest (Ch 8)"
) %>%
  arrange(desc(Importance)) %>%
  head(10)

best_imp87_df <- top10_87 %>%
  select(Predictor, Importance) %>%
  mutate(Model = paste(perf87$Model[1], "(Ch 8)"))

bind_rows(best_imp87_df, rf_imp87_df) %>%
  mutate(Type = case_when(
    grepl("^BiologicalMaterial",   Predictor) ~ "Biological",
    grepl("^ManufacturingProcess", Predictor) ~ "Process",
    TRUE ~ "Other"
  )) %>%
  ggplot(aes(x = reorder(Predictor, Importance), y = Importance, fill = Type)) +
  geom_col(width = 0.65) +
  coord_flip() +
  facet_wrap(~Model, scales = "free") +
  scale_fill_manual(values = c(Biological = "#4dac26", Process = "#d01c8b")) +
  labs(
    title = "Top 10 Importances: Best Tree Model vs. Random Forest",
    x = NULL, y = "Importance (scaled 0-100)", fill = NULL
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom",
        strip.background = element_rect(fill = "grey90"),
        strip.text = element_text(face = "bold"))

Manufacturing process predictors dominate the top 10 in both tree models, consistent with the linear and nonlinear models from Chapters 6 and 7. ManufacturingProcess32 and ManufacturingProcess13 are consistently among the top-ranked predictors across linear, nonlinear, and now tree-based models, making them the most confidently identified drivers of yield in the entire analysis. ManufacturingProcess09 also appears across multiple model families, reinforcing its importance.

The direction of these relationships matters as much as their magnitude. The Chapter 6 PLS analysis found that ManufacturingProcess13 has a negative relationship with yield — higher values associate with lower output — while ManufacturingProcess32 shows a positive relationship. Cubist ranking ManufacturingProcess13 first in importance does not contradict this: importance measures how much a predictor drives variation in yield, not whether that effect is beneficial. From a process control standpoint, ManufacturingProcess32 is a parameter to maximize while ManufacturingProcess13 is one to minimize or tightly constrain.

Biological material predictors appear less prominently in the tree models than in some of the linear model results. This is a consequence of how tree-based importance is measured: if a manufacturing process variable captures much of the same variation as a biological predictor (due to correlation between input quality and downstream processing decisions), the tree will tend to select the stronger one at each split and the other will receive little credit. The biological predictors are not irrelevant — they reflect raw material quality, which the manufacturer cannot control, but their signal is largely absorbed by the process predictors in this tree framework.

Part (c): Optimal Single Tree and Terminal Node Yield Distributions

best_cp87 <- rpart87$bestTune$cp
optimal_tree87 <- rpart(y ~ ., data = train87_df,
                         control = rpart.control(cp = best_cp87))

rpart.plot(optimal_tree87,
           type    = 4,
           extra   = 101,
           under   = TRUE,
           cex     = 0.72,
           main    = "Optimal Single Regression Tree — Chemical Manufacturing Process",
           box.palette = "Blues")

train87_df$leaf_pred <- round(predict(optimal_tree87, train87_df), 3)
n_leaves87 <- length(unique(train87_df$leaf_pred))
cat("Number of terminal nodes:", n_leaves87, "\n")

Number of terminal nodes: 2

train87_df$Leaf <- factor(train87_df$leaf_pred)

ggplot(train87_df, aes(x = Leaf, y = y)) +
  geom_boxplot(fill = "#4393c3", outlier.alpha = 0.4, width = 0.55) +
  geom_hline(yintercept = mean(trY87), linetype = "dashed",
             colour = "grey40", linewidth = 0.8) +
  labs(
    title    = "Yield Distribution in Each Terminal Node of the Optimal Tree",
    subtitle = "Dashed line = overall mean yield; each box is one leaf node",
    x        = "Predicted Yield (Leaf Identifier)",
    y        = "Actual Yield (%)"
  ) +
  theme_minimal(base_size = 13) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The single tree provides a different kind of value from the ensemble models: interpretability over accuracy. Several lessons emerge from the tree structure and terminal node distributions.

First, the root split is on a manufacturing process variable (likely ManufacturingProcess32), confirming that a single threshold on this predictor divides the data into two meaningfully different yield populations. This is a directly actionable finding: if engineers can ensure ManufacturingProcess32 stays above (or below) its optimal threshold, they can steer the production run into the higher-yield branch before it begins.

Second, the terminal node boxplot reveals that while the split on ManufacturingProcess32 successfully separates the data into two distinct yield groups, the low-yield leaf (ManufacturingProcess32 < 0.2, predicted yield 39.1, n = 83) sits mostly below the overall mean, while the high-yield leaf (≥ 0.2, predicted yield 41.5, n = 61) sits almost entirely above it — both nodes retain substantial within-leaf variation. The IQRs of both leaves span roughly 1.5–2 percentage points and the tails overlap considerably, confirming that ManufacturingProcess32 alone cannot fully account for yield variability. This within-leaf spread points to the influence of additional predictors — exactly what the ensemble models capture through their more complex structures.

Third, the tree highlights threshold values for key predictors, not just rankings. While the ensemble models tell us that ManufacturingProcess32 is the most important predictor, the single tree tells us the specific cut-point at which its effect changes. These cut-points are the most actionable output for process control, since they can be translated directly into specification limits.

The tree view complements the ensemble analysis: together, the two provide the best of both worlds — high accuracy from the ensemble and interpretable rules from the single tree.

sessionInfo()

R version 4.5.0 (2025-04-11) Platform: aarch64-apple-darwin20 Running under: macOS 26.4.1

Matrix products: default BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1

locale: [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York tzcode source: internal

attached base packages: [1] stats4 grid stats graphics grDevices utils datasets [8] methods base

other attached packages: [1] tidyr_1.3.1 dplyr_1.1.4
[3] rpart.plot_3.1.4 rpart_4.1.24
[5] AppliedPredictiveModeling_1.1-7 Cubist_0.6.0
[7] gbm_2.2.3 party_1.3-20
[9] strucchange_1.5-4 sandwich_3.1-1
[11] zoo_1.8-14 modeltools_0.2-24
[13] mvtnorm_1.3-3 caret_7.0-1
[15] lattice_0.22-6 ggplot2_4.0.0.9000
[17] randomForest_4.7-1.2 mlbench_2.1-7

loaded via a namespace (and not attached): [1] tidyselect_1.2.1 viridisLite_0.4.2 timeDate_4052.112
[4] libcoin_1.0-12 farver_2.1.2 S7_0.2.0
[7] fastmap_1.2.0 TH.data_1.1-5 pROC_1.19.0.1
[10] digest_0.6.37 timechange_0.3.0 lifecycle_1.0.4
[13] cluster_2.1.8.1 survival_3.8-3 magrittr_2.0.4
[16] compiler_4.5.0 rlang_1.1.6 sass_0.4.10
[19] tools_4.5.0 plotrix_3.8-14 yaml_2.3.10
[22] data.table_1.17.8 knitr_1.50 labeling_0.4.3
[25] plyr_1.8.9 RColorBrewer_1.1-3 multcomp_1.4-30
[28] withr_3.0.2 purrr_1.1.0 nnet_7.3-20
[31] future_1.68.0 globals_0.18.0 scales_1.4.0
[34] iterators_1.0.14 MASS_7.3-65 ellipse_0.5.0
[37] cli_3.6.5 rmarkdown_2.30 generics_0.1.4
[40] rstudioapi_0.17.1 future.apply_1.20.1 reshape2_1.4.4
[43] cachem_1.1.0 stringr_1.5.2 splines_4.5.0
[46] parallel_4.5.0 matrixStats_1.5.0 vctrs_0.6.5
[49] hardhat_1.4.2 Matrix_1.7-3 jsonlite_2.0.0
[52] listenv_0.10.0 foreach_1.5.2 gower_1.0.2
[55] jquerylib_0.1.4 recipes_1.3.1 glue_1.8.0
[58] parallelly_1.46.0 codetools_0.2-20 lubridate_1.9.4
[61] stringi_1.8.7 gtable_0.3.6 tibble_3.3.0
[64] CORElearn_1.57.3.1 pillar_1.11.1 htmltools_0.5.8.1
[67] ipred_0.9-15 lava_1.8.2 R6_2.6.1
[70] evaluate_1.0.5 bslib_0.9.0 class_7.3-23
[73] Rcpp_1.1.0 nlme_3.1-168 prodlim_2025.04.28
[76] xfun_0.53 coin_1.4-3 pkgconfig_2.0.3
[79] ModelMetrics_1.2.2.2

Regression Trees and Rule-Based Models

Chapter 8 Homework Problems

Kerwin Charles

2026-04-26