library(mlbench)
library(randomForest)
library(caret)
library(party)
library(gbm)
library(Cubist)
library(AppliedPredictiveModeling)
library(rpart)
library(rpart.plot)
library(ggplot2)
library(dplyr)
library(tidyr)

set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
cat("Observations:", nrow(simulated),
" | Predictors:", ncol(simulated) - 1, "\n")Observations: 200 | Predictors: 10
V1–V5 are informative; V6–V10 are pure noise
The Friedman simulation generates a response that depends on only five of ten available predictors. The true model is: \(y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + \varepsilon\), where \(\varepsilon \sim N(0, 1)\). Predictors V6 through V10 have no relationship with the outcome — they are pure noise. This makes the dataset an ideal test bed for evaluating whether a model’s variable importance correctly separates signal from noise.
set.seed(42)
model1 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1_df <- data.frame(
Predictor = rownames(rfImp1),
Importance = rfImp1[, 1]
) %>%
arrange(desc(Importance)) %>%
mutate(Role = ifelse(Predictor %in% paste0("V", 1:5),
"Informative", "Noise"))
knitr::kable(
rfImp1_df %>% mutate(Importance = round(Importance, 3)),
caption = "Random forest variable importance — baseline model (n = 1,000 trees)",
row.names = FALSE
)

| Predictor | Importance | Role |
|---|---|---|
| V1 | 8.417 | Informative |
| V4 | 7.730 | Informative |
| V2 | 6.869 | Informative |
| V5 | 2.195 | Informative |
| V3 | 0.942 | Informative |
| V6 | 0.148 | Noise |
| V7 | 0.053 | Noise |
| V10 | -0.043 | Noise |
| V9 | -0.058 | Noise |
| V8 | -0.087 | Noise |
ggplot(rfImp1_df,
aes(x = reorder(Predictor, Importance), y = Importance, fill = Role)) +
geom_col(width = 0.6) +
coord_flip() +
scale_fill_manual(values = c(Informative = "#2166ac", Noise = "#d73027")) +
labs(
title = "Random Forest Variable Importance — Baseline Model",
subtitle = "Blue = informative (V1–V5), Red = noise (V6–V10)",
x = NULL, y = "% Increase in MSE", fill = NULL
) +
theme_minimal(base_size = 13)

The random forest correctly separates the two groups. The five
informative predictors (V1–V5) all receive meaningfully positive
importance scores, while the five noise predictors (V6–V10) cluster near
zero. The model makes essentially no use of the uninformative predictors.
This result is expected: random forests build many trees on bootstrap
samples and at each split randomly sample mtry candidate
predictors. Across 1,000 trees, the noise predictors occasionally appear
at a split, but they rarely reduce SSE more than the informative ones,
so their average importance remains near the baseline.
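As a rough illustration of the permutation idea behind these scores, the sketch below (an approximation: randomForest computes %IncMSE per tree on its out-of-bag samples, not on the full training data) permutes one predictor at a time and records how much the prediction error rises.

# Simplified permutation-importance sketch (assumption: predictions on the full
# training data, not the per-tree out-of-bag calculation randomForest performs)
perm_increase <- function(var, model, data) {
  base_mse <- mean((data$y - predict(model, data))^2)
  shuffled <- data
  shuffled[[var]] <- sample(shuffled[[var]])   # break the link between var and y
  mean((data$y - predict(model, shuffled))^2) - base_mse
}
set.seed(42)
sapply(c("V1", "V4", "V8"), perm_increase, model = model1, data = simulated)

Informative predictors such as V1 and V4 should show a clear increase in MSE when shuffled, while a noise predictor such as V8 should barely move.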
# Use the original 10-predictor data for a clean comparison
simulated_base <- simulated[, c(paste0("V", 1:10), "y")]
set.seed(42)
cforest_mod <- cforest(y ~ ., data = simulated_base,
controls = cforest_unbiased(ntree = 500, mtry = 3))
imp_trad <- varimp(cforest_mod, conditional = FALSE)
imp_cond <- varimp(cforest_mod, conditional = TRUE)
imp_cf <- data.frame(
Predictor = names(imp_trad),
Traditional = round(imp_trad, 4),
Conditional = round(imp_cond, 4)
) %>% arrange(desc(Traditional))
knitr::kable(imp_cf,
caption = "cforest variable importance: traditional vs. conditional",
row.names = FALSE)

| Predictor | Traditional | Conditional |
|---|---|---|
| V1 | 7.2411 | 4.4372 |
| V4 | 7.0724 | 4.9437 |
| V2 | 5.6184 | 4.0119 |
| V5 | 1.9155 | 1.1504 |
| V3 | 0.1363 | 0.1004 |
| V7 | -0.0217 | -0.0231 |
| V6 | -0.0781 | -0.0007 |
| V9 | -0.0804 | 0.0278 |
| V8 | -0.0865 | -0.0221 |
| V10 | -0.0979 | -0.0160 |
imp_cf_long <- imp_cf %>%
pivot_longer(-Predictor, names_to = "Type", values_to = "Importance") %>%
mutate(Role = ifelse(Predictor %in% paste0("V", 1:5),
"Informative", "Noise"))
ggplot(imp_cf_long,
aes(x = reorder(Predictor, Importance), y = Importance, fill = Type)) +
geom_col(position = "dodge", width = 0.65) +
coord_flip() +
scale_fill_manual(values = c(Traditional = "#9C27B0", Conditional = "#4CAF50")) +
labs(
title = "cforest: Traditional vs. Conditional Variable Importance",
subtitle = "Conditional importance is better calibrated when predictors are correlated",
x = NULL, y = "Importance", fill = NULL
) +
theme_minimal(base_size = 13)

The conditional inference forest shows the same broad pattern as the standard random forest: V1–V5 receive higher importance than V6–V10. However, the conditional importance scores are noticeably more conservative — the magnitude of the top predictors shrinks and the gap between informative and noise variables narrows. This conservatism is by design: the conditional importance measure by Strobl et al. (2007) accounts for correlations among predictors when permuting, so it avoids artificially inflating the importance of predictors that are associated with others. On this data with relatively independent noise variables, both measures point to the same qualitative conclusion, but conditional importance would be more reliable in a setting with many correlated features.
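To make the correlation argument concrete, a hedged aside (not part of the tables above) adds a near-duplicate of V1 and refits the standard random forest; V1's traditional importance is then diluted because the two correlated predictors share the credit, which is exactly the situation the conditional measure is designed to handle.

# Hedged aside: a predictor highly correlated with V1 splits V1's importance
set.seed(42)
sim_dup <- simulated
sim_dup$duplicate1 <- sim_dup$V1 + rnorm(nrow(sim_dup)) * 0.1
cor(sim_dup$duplicate1, sim_dup$V1)                      # correlation around 0.95

rf_dup <- randomForest(y ~ ., data = sim_dup, importance = TRUE, ntree = 1000)
varImp(rf_dup, scale = FALSE)[c("V1", "duplicate1"), , drop = FALSE]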
set.seed(42)
gbm_81 <- gbm(y ~ ., data = simulated_base,
distribution = "gaussian",
n.trees = 1000,
interaction.depth = 3,
shrinkage = 0.01,
bag.fraction = 0.5,
verbose = FALSE)
gbm_imp <- summary(gbm_81, plotit = FALSE)
knitr::kable(
gbm_imp %>% mutate(rel.inf = round(rel.inf, 3)),
caption = "Gradient boosting relative influence (1,000 trees, depth = 3)",
col.names = c("Predictor", "Relative Influence (%)"),
row.names = FALSE
)

| Predictor | Relative Influence (%) |
|---|---|
| V4 | 27.913 |
| V1 | 26.390 |
| V2 | 22.111 |
| V5 | 11.811 |
| V3 | 7.930 |
| V7 | 1.055 |
| V6 | 1.012 |
| V9 | 0.621 |
| V10 | 0.605 |
| V8 | 0.553 |
gbm_imp_df <- gbm_imp %>%
mutate(Role = ifelse(var %in% paste0("V", 1:5), "Informative", "Noise"))
ggplot(gbm_imp_df,
aes(x = reorder(var, rel.inf), y = rel.inf, fill = Role)) +
geom_col(width = 0.6) +
coord_flip() +
scale_fill_manual(values = c(Informative = "#2166ac", Noise = "#d73027")) +
labs(
title = "Gradient Boosting — Relative Influence",
subtitle = "Blue = informative (V1–V5), Red = noise (V6–V10)",
x = NULL, y = "Relative Influence (%)", fill = NULL
) +
theme_minimal(base_size = 13)

set.seed(42)
cubist_81 <- cubist(x = simulated_base[, paste0("V", 1:10)],
y = simulated_base$y,
committees = 50)
cubist_imp <- varImp(cubist_81)
cubist_df <- data.frame(
Predictor = rownames(cubist_imp),
Importance = round(cubist_imp[, 1], 1),
Role = ifelse(rownames(cubist_imp) %in% paste0("V", 1:5),
"Informative", "Noise")
) %>% arrange(desc(Importance))
knitr::kable(cubist_df,
caption = "Cubist variable importance (50 committees)",
row.names = FALSE)

| Predictor | Importance | Role |
|---|---|---|
| V1 | 71.0 | Informative |
| V2 | 57.5 | Informative |
| V4 | 48.5 | Informative |
| V3 | 44.5 | Informative |
| V5 | 35.0 | Informative |
| V6 | 0.0 | Noise |
| V7 | 0.0 | Noise |
| V8 | 0.0 | Noise |
| V9 | 0.0 | Noise |
| V10 | 0.0 | Noise |
ggplot(cubist_df,
aes(x = reorder(Predictor, Importance), y = Importance, fill = Role)) +
geom_col(width = 0.6) +
coord_flip() +
scale_fill_manual(values = c(Informative = "#2166ac", Noise = "#d73027")) +
labs(
title = "Cubist — Variable Importance",
subtitle = "Blue = informative (V1–V5), Red = noise (V6–V10)",
x = NULL, y = "Usage (%)", fill = NULL
) +
theme_minimal(base_size = 13)

Both gradient boosting and Cubist replicate the pattern from the random forest: V1–V5 dominate and V6–V10 are negligible. GBM concentrates importance heavily on V1 and V4, the largest contributors in the true model, while assigning near-zero influence to all five noise predictors. Cubist spreads importance somewhat more evenly across the five informative predictors but still assigns zero usage to V6–V10. The finding is robust across all four tree-based methods tested: standard random forest, conditional inference forest, gradient boosting, and Cubist each correctly identify the informative predictors and discard the noise.
set.seed(123)
n <- 500
# Pure noise response, no predictor has any true relationship with y
y82 <- rnorm(n)
# Predictors at increasing levels of granularity
sim82 <- data.frame(
y = y82,
x_2 = sample(1:2, n, replace = TRUE),
x_4 = sample(1:4, n, replace = TRUE),
x_8 = sample(1:8, n, replace = TRUE),
x_16 = sample(1:16, n, replace = TRUE),
x_32 = sample(1:32, n, replace = TRUE),
x_cont = runif(n)
)
cat("Unique values per predictor:\n")Unique values per predictor:
x_2 x_4 x_8 x_16 x_32 x_cont 2 4 8 16 32 500
The design isolates granularity as the only varying feature. Every predictor is statistically independent of the response (all are random noise), so any systematic difference in importance scores can only be attributed to the number of unique values each predictor has — its granularity.
tree82 <- rpart(y ~ ., data = sim82,
control = rpart.control(cp = 0.001, maxdepth = 10))
imp82 <- tree82$variable.importance
if (length(imp82) > 0) {
imp82_df <- data.frame(
Predictor = names(imp82),
Importance = as.numeric(imp82),
UniqueVals = c(x_2 = 2, x_4 = 4, x_8 = 8,
x_16 = 16, x_32 = 32, x_cont = 500)[names(imp82)]
) %>% arrange(desc(Importance))
knitr::kable(
imp82_df %>% mutate(Importance = round(Importance, 3)),
caption = "Single tree importance on a pure-noise response",
row.names = FALSE
)
}

| Predictor | Importance | UniqueVals |
|---|---|---|
| x_cont | 71.651 | 500 |
| x_32 | 42.016 | 32 |
| x_16 | 24.345 | 16 |
| x_8 | 13.193 | 8 |
| x_4 | 10.917 | 4 |
| x_2 | 3.022 | 2 |
if (exists("imp82_df")) {
ggplot(imp82_df,
aes(x = reorder(Predictor, Importance), y = Importance,
fill = log2(UniqueVals))) +
geom_col(width = 0.6) +
coord_flip() +
scale_fill_viridis_c(name = "log (unique values)", option = "plasma") +
labs(
title = "Tree Bias: Importance vs. Predictor Granularity",
subtitle = "All predictors are noise — higher granularity inflates importance",
x = NULL, y = "Variable Importance (SSE reduction)"
) +
theme_minimal(base_size = 13)
}

B <- 200
gran_names <- c("x_2","x_4","x_8","x_16","x_32","x_cont")
bias_mat <- matrix(0, nrow = B, ncol = length(gran_names),
dimnames = list(NULL, gran_names))
for (b in seq_len(B)) {
df_b <- data.frame(
y = rnorm(n),
x_2 = sample(1:2, n, replace = TRUE),
x_4 = sample(1:4, n, replace = TRUE),
x_8 = sample(1:8, n, replace = TRUE),
x_16 = sample(1:16, n, replace = TRUE),
x_32 = sample(1:32, n, replace = TRUE),
x_cont = runif(n)
)
fit_b <- rpart(y ~ ., data = df_b,
control = rpart.control(cp = 0.001, maxdepth = 5))
vi_b <- fit_b$variable.importance
for (v in gran_names)
bias_mat[b, v] <- if (v %in% names(vi_b)) vi_b[v] else 0
}
bias_df <- as.data.frame(bias_mat) %>%
pivot_longer(everything(), names_to = "Predictor", values_to = "Importance") %>%
mutate(Predictor = factor(Predictor, levels = gran_names))
ggplot(bias_df, aes(x = Predictor, y = Importance)) +
geom_boxplot(fill = "#4393c3", outlier.alpha = 0.25, width = 0.55) +
labs(
title = "Distribution of Tree Importance Across 200 Noise Simulations",
subtitle = "Each predictor is pure noise — differences in importance reflect bias, not signal",
x = "Predictor (ordered by number of unique values)",
y = "Variable Importance (SSE reduction)"
) +
theme_minimal(base_size = 13)

The replication confirms the pattern decisively. Across 200
independent simulations with a noise response, predictors with more
unique values consistently receive higher importance scores from the
single regression tree. The continuous predictor (x_cont,
~500 distinct values) dominates in nearly every replication;
x_32 is a distant second; and x_2 (binary)
receives the lowest importance. Because the true importance of every
predictor is exactly zero, these differences represent pure bias.
The mechanism is straightforward. CART evaluates all possible binary splits for each predictor and selects the one that maximally reduces SSE. A predictor with 500 unique values offers approximately 499 candidate split points; a binary predictor offers only one. With more chances to find a lucky split by random fluctuation, high-granularity predictors have a systematic advantage in the selection tournament — even when they carry no real information. This selection bias motivates the use of alternative importance measures (e.g., conditional inference trees) that penalise predictors for having more candidate splits, or the use of random forests, which reduce this bias across many trees.
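The split-count argument can be made concrete by tallying the candidate cut-points each predictor offers: a numeric predictor with k distinct values admits k - 1 binary splits. The one-liner below (a small check, not part of the original output) counts them for the simulated data.

# Candidate binary splits available to CART for each (pure-noise) predictor
sapply(sim82[, gran_names], function(x) length(unique(x)) - 1)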
Training: 951 samples, 228 predictors
Test: 316 samples
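Exercise 8.3 uses the solubility data from AppliedPredictiveModeling (the test-set evaluation below references solTestY); the train83/test83 data frames themselves are not shown in the extracted code, so the set-up below is an assumption consistent with the dimensions reported above.

# Assumed construction of train83 / test83 (not shown in the original chunk)
data(solubility)                                        # loads solTrainXtrans, solTrainY, solTestXtrans, solTestY
train83 <- data.frame(solTrainXtrans, y = solTrainY)    # 951 rows, 228 predictors + y
test83  <- data.frame(solTestXtrans,  y = solTestY)     # 316 rows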
set.seed(42)
gbm_low <- gbm(y ~ ., data = train83,
distribution = "gaussian",
n.trees = 1000,
interaction.depth = 3,
shrinkage = 0.1,
bag.fraction = 0.1,
verbose = FALSE)
set.seed(42)
gbm_high <- gbm(y ~ ., data = train83,
distribution = "gaussian",
n.trees = 1000,
interaction.depth = 3,
shrinkage = 0.9,
bag.fraction = 0.9,
verbose = FALSE)
imp_low <- summary(gbm_low, plotit = FALSE) %>% mutate(Model = "Low (bf=0.1, lr=0.1)")
imp_high <- summary(gbm_high, plotit = FALSE) %>% mutate(Model = "High (bf=0.9, lr=0.9)")

bind_rows(head(imp_low, 15), head(imp_high, 15)) %>%
ggplot(aes(x = reorder(var, rel.inf), y = rel.inf)) +
geom_col(fill = "#2166ac", width = 0.65) +
coord_flip() +
facet_wrap(~Model, scales = "free_x") +
labs(
title = "GBM Variable Importance: Low vs. High Bagging Fraction & Learning Rate",
subtitle = "High settings concentrate importance on a handful of predictors",
x = NULL, y = "Relative Influence (%)"
) +
theme_minimal(base_size = 12) +
theme(strip.background = element_rect(fill = "grey90"),
strip.text = element_text(face = "bold"))When both the learning rate and the bagging fraction are high (0.9 each), two forces combine to narrow the importance distribution.
A high learning rate means each successive tree makes large corrections to the residuals. The first few boosting iterations seize on the strongest predictors in the data because those offer the greatest SSE reduction. With a step size of 0.9, the model essentially solves most of the learning problem in those early iterations. Subsequent trees find little residual signal left to explain, so they contribute minimally — and the predictors they use accumulate negligible relative influence. The result is a steep drop-off: the one or two strongest predictors dominate, and everything else is marginal.
A high bagging fraction (0.9) compounds this by providing each tree with nearly the full training set. There is almost no sampling variability across iterations, so every tree sees the same dominant signal and selects the same strongest predictors. This repetition reinforces the concentration. In contrast, a low bagging fraction (0.1) feeds each tree a small, highly variable subsample, giving different predictors the chance to look best in different subsets. Over 1,000 iterations, this stochasticity spreads influence across a much wider set of predictors.
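One way to quantify the concentration described above (a hedged check, not part of the original output) is the share of total relative influence held by each model's top five predictors; the summary.gbm output is already sorted by relative influence, so the first five rows are the strongest predictors.

# Share of relative influence captured by the five strongest predictors
top5_share <- function(imp) sum(head(imp$rel.inf, 5)) / sum(imp$rel.inf)
c(Low = top5_share(imp_low), High = top5_share(imp_high))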
pred_low83 <- predict(gbm_low, newdata = test83, n.trees = 1000)
pred_high83 <- predict(gbm_high, newdata = test83, n.trees = 1000)
rmse83 <- function(a, p) round(sqrt(mean((a - p)^2)), 4)
r2_83 <- function(a, p) round(cor(a, p)^2, 4)
perf83 <- data.frame(
Model = c("Low (bf=0.1, lr=0.1)", "High (bf=0.9, lr=0.9)"),
Test_RMSE = c(rmse83(solTestY, pred_low83), rmse83(solTestY, pred_high83)),
Test_R2 = c(r2_83(solTestY, pred_low83), r2_83(solTestY, pred_high83))
)
knitr::kable(perf83,
caption = "Test-set performance for the two GBM parameterisations",
col.names = c("Model", "Test RMSE", "Test R²"),
row.names = FALSE)

| Model | Test RMSE | Test R² |
|---|---|---|
| Low (bf=0.1, lr=0.1) | 0.6785 | 0.8940 |
| High (bf=0.9, lr=0.9) | 0.7400 | 0.8781 |
The low-parameter model (bf = 0.1, lr = 0.1) achieves a lower test RMSE (0.6785) than the high-parameter model (0.7400). This is the expected result. A low learning rate forces the boosting algorithm to make many small, careful corrections rather than a few aggressive ones, which reduces the risk of overshooting the optimal fit. A low bagging fraction introduces regularising noise at each step, a stochastic form of variance reduction analogous to dropout in neural networks. Together, these settings encourage slower, more stable convergence that generalises better to new data. The high-parameter model is prone to overfitting: its large learning rate can over-correct the training residuals, and its high bagging fraction provides too little stochasticity to smooth out those errors.
Interaction depth controls the maximum number of splits in each boosting tree, which determines the order of predictor interactions the model can capture. Its effect on the importance slope differs between the two parameterisations.
For the low-parameter model, increasing interaction depth tends to flatten the importance slope (spread importance more evenly). Because the learning rate is small, the algorithm takes many iterations to converge, giving secondary and tertiary predictors ample opportunity to appear in deeper splits. Deeper trees allow the model to capture interactions among multiple predictors simultaneously, so importance accrues to a broader set of variables rather than concentrating on the single best main-effect predictor.
For the high-parameter model, the effect is more muted. Because the high learning rate already front-loads most of the learning into the first few iterations — where simple, strong main-effect predictors are selected — adding deeper splits in subsequent iterations still finds the same dominant predictors. The importance slope may flatten slightly but the concentration on the top one or two predictors is unlikely to change dramatically because those predictors genuinely explain the most variance and the model converges too quickly to explore alternatives.
In both cases the practical recommendation is to treat interaction depth as a tuning parameter evaluated via cross-validation rather than set to an extreme: too shallow (stumps) risks missing real interactions; too deep risks overfitting and can re-introduce the importance concentration seen with high learning rates.
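As a sketch of how the interaction-depth claim could be checked empirically (using the same solubility training data and the low-parameter settings; these are not results from the original analysis), one can refit at several depths and track how concentrated the resulting importance profile is.

# Hedged sketch: importance concentration (top-five share) vs. interaction depth
depth_share <- sapply(c(1, 5, 10), function(d) {
  set.seed(42)
  fit <- gbm(y ~ ., data = train83, distribution = "gaussian",
             n.trees = 1000, interaction.depth = d,
             shrinkage = 0.1, bag.fraction = 0.1, verbose = FALSE)
  imp <- summary(fit, plotit = FALSE)
  sum(head(imp$rel.inf, 5)) / sum(imp$rel.inf)
})
names(depth_share) <- paste0("depth_", c(1, 5, 10))
depth_share

A share that falls as depth increases would support the flattening argument for the low-parameter model.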
Exercises 6.3 and 7.5 describe a chemical manufacturing process for a pharmaceutical product: 176 production runs, 57 predictors (12 biological material measurements + 45 process parameters), response = percent product yield. A 1% yield improvement is worth approximately $100,000 per batch. We use the same imputation, data splitting, and pre-processing steps as in those exercises and train several tree-based regression models.
data(ChemicalManufacturingProcess)
yield87 <- ChemicalManufacturingProcess$Yield
preds87 <- ChemicalManufacturingProcess[, -1]
cat("Dimensions:", nrow(ChemicalManufacturingProcess), "x",
ncol(ChemicalManufacturingProcess), "\n")

Dimensions: 176 x 58
Missing values in predictors: 106
cat("Yield -- mean:", round(mean(yield87), 2),
" sd:", round(sd(yield87), 2),
" range:", paste(round(range(yield87), 2), collapse = " – "), "\n")Yield – mean: 40.18 sd: 1.85 range: 35.25 – 46.34
pp_impute87 <- preProcess(preds87, method = "medianImpute")
preds_imp87 <- predict(pp_impute87, preds87)
cat("Missing values after median imputation:", sum(is.na(preds_imp87)), "\n")Missing values after median imputation: 0
set.seed(100)
trainIdx87 <- createDataPartition(yield87, p = 0.8, list = FALSE)
trX87 <- preds_imp87[ trainIdx87, ]; teX87 <- preds_imp87[-trainIdx87, ]
trY87 <- yield87[trainIdx87]; teY87 <- yield87[-trainIdx87]
cat("Training:", nrow(trX87), " | Test:", nrow(teX87), "\n")Training: 144 | Test: 32
pp87 <- preProcess(trX87, method = c("nzv", "center", "scale"))
trX87_pp <- predict(pp87, trX87)
teX87_pp <- predict(pp87, teX87)
cat("Predictors after removing near-zero variance:", ncol(trX87_pp), "\n")Predictors after removing near-zero variance: 56
set.seed(7)
rpart87 <- train(y ~ ., data = train87_df,
method = "rpart",
trControl = ctrl87,
tuneLength = 10)
.tmp <- rpart87
knitr::kable(
rpart87$results[, c("cp","RMSE","Rsquared","MAE")] %>%
mutate(across(where(is.numeric), \(x) round(x, 4))) %>%
arrange(RMSE) %>% head(5),
caption = paste("Single tree cross-validation results (top 5 by RMSE). Best cp =",
round(rpart87$bestTune$cp, 5)),
row.names = FALSE
)

| cp | RMSE | Rsquared | MAE |
|---|---|---|---|
| 0.0774 | 1.5061 | 0.4040 | 1.2028 |
| 0.0703 | 1.5105 | 0.4105 | 1.2220 |
| 0.0621 | 1.5424 | 0.4039 | 1.2387 |
| 0.0377 | 1.5443 | 0.4511 | 1.2118 |
| 0.0356 | 1.5455 | 0.4537 | 1.2029 |
set.seed(7)
rf87 <- train(y ~ ., data = train87_df,
method = "rf",
trControl = ctrl87,
tuneLength = 5,
importance = TRUE)
.tmp <- rf87
knitr::kable(
rf87$results[, c("mtry","RMSE","Rsquared","MAE")] %>%
mutate(across(where(is.numeric), \(x) round(x, 4))),
caption = paste("Random forest cross-validation results. Best mtry =",
rf87$bestTune$mtry),
row.names = FALSE
)

| mtry | RMSE | Rsquared | MAE |
|---|---|---|---|
| 2 | 1.2699 | 0.6335 | 1.0149 |
| 15 | 1.1427 | 0.6672 | 0.8918 |
| 29 | 1.1200 | 0.6738 | 0.8643 |
| 42 | 1.1173 | 0.6713 | 0.8698 |
| 56 | 1.1264 | 0.6609 | 0.8692 |
gbm_grid87 <- expand.grid(
n.trees = c(100, 500, 1000),
interaction.depth = c(1, 3),
shrinkage = 0.1,
n.minobsinnode = 10
)
set.seed(7)
gbm87 <- train(y ~ ., data = train87_df,
method = "gbm",
trControl = ctrl87,
tuneGrid = gbm_grid87,
verbose = FALSE)
.tmp <- gbm87
knitr::kable(
gbm87$results[, c("n.trees","interaction.depth","shrinkage","RMSE","Rsquared","MAE")] %>%
mutate(across(where(is.numeric), \(x) round(x, 4))) %>%
arrange(RMSE) %>% head(6),
caption = paste0("GBM cross-validation results (top 6 by RMSE). Best: n.trees = ",
gbm87$bestTune$n.trees,
", depth = ", gbm87$bestTune$interaction.depth),
row.names = FALSE
)

| n.trees | interaction.depth | shrinkage | RMSE | Rsquared | MAE |
|---|---|---|---|---|---|
| 100 | 3 | 0.1 | 1.1244 | 0.6796 | 0.8915 |
| 500 | 3 | 0.1 | 1.1334 | 0.6746 | 0.8907 |
| 1000 | 3 | 0.1 | 1.1370 | 0.6725 | 0.8958 |
| 500 | 1 | 0.1 | 1.2205 | 0.6082 | 0.9501 |
| 100 | 1 | 0.1 | 1.2281 | 0.5915 | 0.9771 |
| 1000 | 1 | 0.1 | 1.2322 | 0.6025 | 0.9515 |
cubist_grid87 <- expand.grid(committees = c(1, 10, 50, 100),
neighbors = c(0, 5, 9))
set.seed(7)
cubist87 <- train(y ~ ., data = train87_df,
method = "cubist",
trControl = ctrl87,
tuneGrid = cubist_grid87)
.tmp <- cubist87
knitr::kable(
cubist87$results[, c("committees","neighbors","RMSE","Rsquared","MAE")] %>%
mutate(across(where(is.numeric), \(x) round(x, 4))) %>%
arrange(RMSE) %>% head(6),
caption = paste0("Cubist cross-validation results (top 6 by RMSE). Best: committees = ",
cubist87$bestTune$committees,
", neighbors = ", cubist87$bestTune$neighbors),
row.names = FALSE
)

| committees | neighbors | RMSE | Rsquared | MAE |
|---|---|---|---|---|
| 100 | 5 | 1.0240 | 0.7233 | 0.7989 |
| 50 | 5 | 1.0274 | 0.7170 | 0.7959 |
| 1 | 5 | 1.0602 | 0.7145 | 0.8548 |
| 50 | 9 | 1.0781 | 0.6829 | 0.8380 |
| 100 | 9 | 1.0809 | 0.6850 | 0.8421 |
| 10 | 5 | 1.0853 | 0.6930 | 0.8358 |
models87 <- list(
"Single Tree" = rpart87,
"Random Forest" = rf87,
"GBM" = gbm87,
"Cubist" = cubist87
)
perf87 <- lapply(models87, \(m) {
p <- predict(m, newdata = teX87_pp)
as.data.frame(t(postResample(pred = p, obs = teY87)))
}) %>%
bind_rows(.id = "Model") %>%
mutate(across(where(is.numeric), \(x) round(x, 4))) %>%
arrange(RMSE)
knitr::kable(perf87,
caption = "Test-set performance for tree-based models — Chemical Manufacturing",
row.names = FALSE)

| Model | RMSE | Rsquared | MAE |
|---|---|---|---|
| Cubist | 0.8249 | 0.7540 | 0.6761 |
| GBM | 0.8884 | 0.7196 | 0.7497 |
| Random Forest | 0.9848 | 0.6554 | 0.8206 |
| Single Tree | 1.3017 | 0.4048 | 1.0117 |
resamps87 <- resamples(models87)
bwplot(resamps87, metric = "RMSE",
main = "10-Fold CV RMSE Distribution by Model")ggplot(perf87,
aes(x = reorder(Model, RMSE), y = RMSE, fill = Model)) +
geom_col(width = 0.55, show.legend = FALSE) +
geom_text(aes(label = RMSE), vjust = -0.4, fontface = "bold", size = 4.5) +
scale_fill_brewer(palette = "Set2") +
scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
labs(
title = "Test-Set RMSE by Model",
subtitle = "Chemical Manufacturing Process data — lower is better",
x = NULL, y = "RMSE"
) +
theme_minimal(base_size = 13)

Cubist achieves the lowest test RMSE (0.8249), followed by GBM (0.8884), Random Forest (0.9848), and the Single Tree (1.3017). The single regression tree finishes a clear last: its test RMSE is substantially higher, reflecting both the instability of individual trees in a high-dimensional predictor space and the coarse fit that remains after cross-validation prunes the tree back to a very simple structure. Ensemble methods, which reduce variance by averaging predictions across many trees, all outperform the single tree by a meaningful margin.
The CV results are consistent with the test set, indicating that resampling provided a reliable guide to out-of-sample performance here. Among the ensembles, Cubist edges out the others, consistent with results from Chapters 6 and 7 where rule-based and smooth-surface models tended to do well on this manufacturing dataset.
# Pull importance from the best-performing model and from RF (calibrated importance)
best_model87 <- models87[[perf87$Model[1]]]
imp87 <- varImp(best_model87, scale = TRUE)$importance
imp87_df <- data.frame(
Predictor = rownames(imp87),
Importance = imp87[, 1]
) %>%
arrange(desc(Importance)) %>%
mutate(
Type = case_when(
grepl("^BiologicalMaterial", Predictor) ~ "Biological",
grepl("^ManufacturingProcess", Predictor) ~ "Process",
TRUE ~ "Other"
),
Rank = row_number()
)
top20_87 <- head(imp87_df, 20)
ggplot(top20_87,
aes(x = reorder(Predictor, Importance), y = Importance, fill = Type)) +
geom_col(width = 0.65) +
coord_flip() +
scale_fill_manual(values = c(Biological = "#4dac26",
Process = "#d01c8b")) +
labs(
title = paste("Top 20 Variable Importances —", perf87$Model[1]),
subtitle = "Chemical Manufacturing Process Data",
x = NULL, y = "Scaled Importance (%)", fill = NULL
) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")Composition of top 10 predictors:
Biological Process 4 6
Top 10 predictors:
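The top10_87 object used below is not defined in the extracted code; presumably it is the first ten rows of the importance ranking (an assumption):

# Assumed definition of top10_87 (not shown in the original chunk)
top10_87 <- head(imp87_df, 10)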
knitr::kable(
top10_87[, c("Rank","Predictor","Type","Importance")] %>%
mutate(Importance = round(Importance, 1)),
caption = paste("Top 10 predictors in the optimal tree model:", perf87$Model[1]),
row.names = FALSE
)

| Rank | Predictor | Type | Importance |
|---|---|---|---|
| 1 | ManufacturingProcess13 | Process | 100.0 |
| 2 | ManufacturingProcess32 | Process | 90.0 |
| 3 | ManufacturingProcess09 | Process | 62.9 |
| 4 | ManufacturingProcess04 | Process | 55.7 |
| 5 | ManufacturingProcess15 | Process | 50.0 |
| 6 | BiologicalMaterial04 | Biological | 48.6 |
| 7 | BiologicalMaterial10 | Biological | 47.1 |
| 8 | BiologicalMaterial03 | Biological | 38.6 |
| 9 | ManufacturingProcess17 | Process | 24.3 |
| 10 | BiologicalMaterial02 | Biological | 24.3 |
# Compare to RF importance for reference against prior homework (7.5 used SVM/MARS)
rf_imp87 <- varImp(rf87, scale = TRUE)$importance
rf_imp87_df <- data.frame(
Predictor = rownames(rf_imp87),
Importance = rf_imp87[, 1],
Model = "Random Forest (Ch 8)"
) %>%
arrange(desc(Importance)) %>%
head(10)
best_imp87_df <- top10_87 %>%
select(Predictor, Importance) %>%
mutate(Model = paste(perf87$Model[1], "(Ch 8)"))
bind_rows(best_imp87_df, rf_imp87_df) %>%
mutate(Type = case_when(
grepl("^BiologicalMaterial", Predictor) ~ "Biological",
grepl("^ManufacturingProcess", Predictor) ~ "Process",
TRUE ~ "Other"
)) %>%
ggplot(aes(x = reorder(Predictor, Importance), y = Importance, fill = Type)) +
geom_col(width = 0.65) +
coord_flip() +
facet_wrap(~Model, scales = "free") +
scale_fill_manual(values = c(Biological = "#4dac26", Process = "#d01c8b")) +
labs(
title = "Top 10 Importances: Best Tree Model vs. Random Forest",
x = NULL, y = "Importance (scaled 0-100)", fill = NULL
) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom",
strip.background = element_rect(fill = "grey90"),
strip.text = element_text(face = "bold"))Manufacturing process predictors dominate the top 10 in both tree models, consistent with the linear and nonlinear models from Chapters 6 and 7. ManufacturingProcess32 and ManufacturingProcess13 are consistently among the top-ranked predictors across linear, nonlinear, and now tree-based models, making them the most confidently identified drivers of yield in the entire analysis. ManufacturingProcess09 also appears across multiple model families, reinforcing its importance.
The direction of these relationships matters as much as their magnitude. The Chapter 6 PLS analysis found that ManufacturingProcess13 has a negative relationship with yield — higher values associate with lower output — while ManufacturingProcess32 shows a positive relationship. Cubist ranking ManufacturingProcess13 first in importance does not contradict this: importance measures how much a predictor drives variation in yield, not whether that effect is beneficial. From a process control standpoint, ManufacturingProcess32 is a parameter to maximize while ManufacturingProcess13 is one to minimize or tightly constrain.
Biological material predictors appear less prominently in the tree models than in some of the linear model results. This is a consequence of how tree-based importance is measured: if a manufacturing process variable captures much of the same variation as a biological predictor (due to correlation between input quality and downstream processing decisions), the tree will tend to select the stronger one at each split and the other will receive little credit. The biological predictors are not irrelevant — they reflect raw material quality, which the manufacturer cannot control, but their signal is largely absorbed by the process predictors in this tree framework.
best_cp87 <- rpart87$bestTune$cp
optimal_tree87 <- rpart(y ~ ., data = train87_df,
control = rpart.control(cp = best_cp87))
rpart.plot(optimal_tree87,
type = 4,
extra = 101,
under = TRUE,
cex = 0.72,
main = "Optimal Single Regression Tree — Chemical Manufacturing Process",
box.palette = "Blues")train87_df$leaf_pred <- round(predict(optimal_tree87, train87_df), 3)
n_leaves87 <- length(unique(train87_df$leaf_pred))
cat("Number of terminal nodes:", n_leaves87, "\n")Number of terminal nodes: 2
train87_df$Leaf <- factor(train87_df$leaf_pred)
ggplot(train87_df, aes(x = Leaf, y = y)) +
geom_boxplot(fill = "#4393c3", outlier.alpha = 0.4, width = 0.55) +
geom_hline(yintercept = mean(trY87), linetype = "dashed",
colour = "grey40", linewidth = 0.8) +
labs(
title = "Yield Distribution in Each Terminal Node of the Optimal Tree",
subtitle = "Dashed line = overall mean yield; each box is one leaf node",
x = "Predicted Yield (Leaf Identifier)",
y = "Actual Yield (%)"
) +
theme_minimal(base_size = 13) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

The single tree provides a different kind of value from the ensemble models: interpretability over accuracy. Several lessons emerge from the tree structure and terminal node distributions.
First, the root split is on a manufacturing process variable (likely ManufacturingProcess32), confirming that a single threshold on this predictor divides the data into two meaningfully different yield populations. This is a directly actionable finding: if engineers can ensure ManufacturingProcess32 stays above (or below) its optimal threshold, they can steer the production run into the higher-yield branch before it begins.
Second, the terminal-node boxplot shows that the split on ManufacturingProcess32 separates the data into two distinct yield groups: the low-yield leaf (ManufacturingProcess32 < 0.2, predicted yield 39.1, n = 83) sits mostly below the overall mean, while the high-yield leaf (ManufacturingProcess32 ≥ 0.2, predicted yield 41.5, n = 61) sits almost entirely above it. Both leaves nonetheless retain substantial within-leaf variation: their IQRs span roughly 1.5–2 percentage points and their tails overlap considerably, confirming that ManufacturingProcess32 alone cannot fully account for yield variability. This within-leaf spread points to the influence of additional predictors, which is exactly what the ensemble models capture through their more complex structures.
Third, the tree highlights threshold values for key predictors, not just rankings. While the ensemble models tell us that ManufacturingProcess32 is the most important predictor, the single tree tells us the specific cut-point at which its effect changes. These cut-points are the most actionable output for process control, since they can be translated directly into specification limits.
The tree view complements the ensemble analysis: together, the two provide the best of both worlds — high accuracy from the ensemble and interpretable rules from the single tree.
R version 4.5.0 (2025-04-11) Platform: aarch64-apple-darwin20 Running under: macOS 26.4.1
Matrix products: default BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
locale: [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York tzcode source: internal
attached base packages: [1] stats4 grid stats graphics grDevices utils datasets [8] methods base
other attached packages: [1] tidyr_1.3.1 dplyr_1.1.4
[3] rpart.plot_3.1.4 rpart_4.1.24
[5] AppliedPredictiveModeling_1.1-7 Cubist_0.6.0
[7] gbm_2.2.3 party_1.3-20
[9] strucchange_1.5-4 sandwich_3.1-1
[11] zoo_1.8-14 modeltools_0.2-24
[13] mvtnorm_1.3-3 caret_7.0-1
[15] lattice_0.22-6 ggplot2_4.0.0.9000
[17] randomForest_4.7-1.2 mlbench_2.1-7
loaded via a namespace (and not attached): [1] tidyselect_1.2.1
viridisLite_0.4.2 timeDate_4052.112
[4] libcoin_1.0-12 farver_2.1.2 S7_0.2.0
[7] fastmap_1.2.0 TH.data_1.1-5 pROC_1.19.0.1
[10] digest_0.6.37 timechange_0.3.0 lifecycle_1.0.4
[13] cluster_2.1.8.1 survival_3.8-3 magrittr_2.0.4
[16] compiler_4.5.0 rlang_1.1.6 sass_0.4.10
[19] tools_4.5.0 plotrix_3.8-14 yaml_2.3.10
[22] data.table_1.17.8 knitr_1.50 labeling_0.4.3
[25] plyr_1.8.9 RColorBrewer_1.1-3 multcomp_1.4-30
[28] withr_3.0.2 purrr_1.1.0 nnet_7.3-20
[31] future_1.68.0 globals_0.18.0 scales_1.4.0
[34] iterators_1.0.14 MASS_7.3-65 ellipse_0.5.0
[37] cli_3.6.5 rmarkdown_2.30 generics_0.1.4
[40] rstudioapi_0.17.1 future.apply_1.20.1 reshape2_1.4.4
[43] cachem_1.1.0 stringr_1.5.2 splines_4.5.0
[46] parallel_4.5.0 matrixStats_1.5.0 vctrs_0.6.5
[49] hardhat_1.4.2 Matrix_1.7-3 jsonlite_2.0.0
[52] listenv_0.10.0 foreach_1.5.2 gower_1.0.2
[55] jquerylib_0.1.4 recipes_1.3.1 glue_1.8.0
[58] parallelly_1.46.0 codetools_0.2-20 lubridate_1.9.4
[61] stringi_1.8.7 gtable_0.3.6 tibble_3.3.0
[64] CORElearn_1.57.3.1 pillar_1.11.1 htmltools_0.5.8.1
[67] ipred_0.9-15 lava_1.8.2 R6_2.6.1
[70] evaluate_1.0.5 bslib_0.9.0 class_7.3-23
[73] Rcpp_1.1.0 nlme_3.1-168 prodlim_2025.04.28
[76] xfun_0.53 coin_1.4-3 pkgconfig_2.0.3
[79] ModelMetrics_1.2.2.2