# Run this only if Posit/RStudio says a package is missing:
install.packages(c(
"AppliedPredictiveModeling", "caret", "mlbench", "randomForest",
"rpart", "gbm", "Cubist", "ipred", "dplyr", "tidyr", "tibble", "ggplot2",
"knitr", "lattice", "RANN", "party", "rpart.plot"
))
The purpose of this exercise is to study variable importance in tree-based models when some predictors are informative, some are noise, and some are highly correlated with informative predictors. The simulated data are recreated from the Friedman benchmark data used earlier in Kuhn and Johnson.
# Load the packages used throughout this document.
library(mlbench)
library(randomForest)
library(caret)     # train(), varImp(), preProcess(), trainControl()
library(rpart)
library(dplyr)
library(tidyr)
library(tibble)
library(ggplot2)
set.seed(200)
simulated_raw <- mlbench.friedman1(200, sd = 1)
simulated <- as.data.frame(simulated_raw$x)
simulated$y <- simulated_raw$y
str(simulated)
set.seed(200)
rf_original <- randomForest(
y ~ .,
data = simulated,
importance = TRUE,
ntree = 500
)
rf_imp_original <- varImp(rf_original, scale = FALSE) %>%
tibble::rownames_to_column("Predictor") %>%
rename(Original_RF = Overall)
knitr::kable(rf_imp_original, digits = 3)
ggplot(rf_imp_original, aes(x = reorder(Predictor, Original_RF), y = Original_RF)) +
geom_col() +
coord_flip() +
labs(
title = "Random Forest Variable Importance: Original Friedman Data",
x = "Predictor",
y = "Importance"
)
The random forest model should place most of its importance on the truly informative predictors, V1 through V5. The noise variables V6 through V10 may still receive small nonzero importance scores, but they should be clearly separated from, and much smaller than, the scores for the informative variables.
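The code below and in later chunks also fits models to a second data set, simulated_dup1, which contains a near-duplicate of V1 but is never constructed above. A minimal sketch of that step is shown here, assuming the standard construction for this exercise (V1 plus a small amount of noise); the seed and the follow-up random forest refit are illustrative choices, not part of the original code.
# Add a highly correlated copy of V1; the small noise term keeps the two
# columns from being identical.
simulated_dup1 <- simulated
set.seed(201)
simulated_dup1$duplicate1 <- simulated_dup1$V1 + rnorm(200) * 0.1
cor(simulated_dup1$duplicate1, simulated_dup1$V1)
# Refit the random forest on the duplicated data to see how V1's importance
# is shared with the new column.
set.seed(200)
rf_dup1 <- randomForest(
  y ~ .,
  data = simulated_dup1,
  importance = TRUE,
  ntree = 500
)
rf_imp_dup1 <- varImp(rf_dup1, scale = FALSE) %>%
  tibble::rownames_to_column("Predictor") %>%
  rename(With_Duplicate_RF = Overall)
knitr::kable(
  full_join(rf_imp_original, rf_imp_dup1, by = "Predictor"),
  digits = 3
)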
if (requireNamespace("party", quietly = TRUE)) {
library(party)
set.seed(147)
cforest_original <- party::cforest(
y ~ .,
data = simulated,
controls = party::cforest_control(ntree = 300, mtry = 3)
)
set.seed(147)
cforest_dup1 <- party::cforest(
y ~ .,
data = simulated_dup1,
controls = party::cforest_control(ntree = 300, mtry = 3)
)
cf_imp_original <- data.frame(
Predictor = names(party::varimp(cforest_original, conditional = FALSE)),
CForest_Original = as.numeric(party::varimp(cforest_original, conditional = FALSE)),
CForest_Original_Conditional = as.numeric(party::varimp(cforest_original, conditional = TRUE))
)
cf_imp_dup1 <- data.frame(
Predictor = names(party::varimp(cforest_dup1, conditional = FALSE)),
CForest_With_Duplicate = as.numeric(party::varimp(cforest_dup1, conditional = FALSE)),
CForest_With_Duplicate_Conditional = as.numeric(party::varimp(cforest_dup1, conditional = TRUE))
)
cf_compare <- full_join(cf_imp_original, cf_imp_dup1, by = "Predictor") %>%
arrange(Predictor)
knitr::kable(cf_compare, digits = 3)
} else {
cat("The `party` package is not installed, so the conditional inference forest section was skipped. Install `party` and re-knit to complete this part.")
}
Conditional inference forests are designed to reduce selection bias in trees: the unbiased split selection keeps predictors with different scales or different numbers of possible split points on an even footing. When two variables carry almost identical information, however, importance can still be shared between them. The conditional permutation importance, which adjusts for correlations among predictors, usually gives the fairer view in that situation.
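To see that sharing directly, the two correlated columns can be pulled out of the comparison table built above; a small sketch, assuming the party block ran and cf_compare exists:
# Unconditional importance tends to be split between V1 and duplicate1,
# while conditional importance down-weights the redundant copy.
cf_compare %>%
  dplyr::filter(Predictor %in% c("V1", "duplicate1"))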
ctrl_small <- trainControl(method = "cv", number = 5, allowParallel = FALSE)
gbm_grid_small <- expand.grid(
n.trees = c(50, 100, 150),
interaction.depth = c(1, 3),
shrinkage = c(0.05, 0.10),
n.minobsinnode = 10
)
cubist_grid_small <- expand.grid(
committees = c(1, 10, 50),
neighbors = c(0, 1, 5)
)
set.seed(299)
bag_original <- train(
y ~ .,
data = simulated,
method = "treebag",
trControl = ctrl_small
)
set.seed(300)
bag_dup1 <- train(
y ~ .,
data = simulated_dup1,
method = "treebag",
trControl = ctrl_small
)
set.seed(301)
gbm_original <- train(
y ~ .,
data = simulated,
method = "gbm",
trControl = ctrl_small,
tuneGrid = gbm_grid_small,
verbose = FALSE
)
set.seed(302)
gbm_dup1 <- train(
y ~ .,
data = simulated_dup1,
method = "gbm",
trControl = ctrl_small,
tuneGrid = gbm_grid_small,
verbose = FALSE
)
set.seed(303)
cubist_original <- train(
x = simulated %>% select(-y),
y = simulated$y,
method = "cubist",
trControl = ctrl_small,
tuneGrid = cubist_grid_small
)
set.seed(304)
cubist_dup1 <- train(
x = simulated_dup1 %>% select(-y),
y = simulated_dup1$y,
method = "cubist",
trControl = ctrl_small,
tuneGrid = cubist_grid_small
)
bag_imp_compare <- varImp(bag_original, scale = FALSE)$importance %>%
tibble::rownames_to_column("Predictor") %>%
rename(Bagging_Original = Overall) %>%
full_join(
varImp(bag_dup1, scale = FALSE)$importance %>%
tibble::rownames_to_column("Predictor") %>%
rename(Bagging_With_Duplicate = Overall),
by = "Predictor"
)
gbm_imp_compare <- varImp(gbm_original, scale = FALSE)$importance %>%
tibble::rownames_to_column("Predictor") %>%
rename(GBM_Original = Overall) %>%
full_join(
varImp(gbm_dup1, scale = FALSE)$importance %>%
tibble::rownames_to_column("Predictor") %>%
rename(GBM_With_Duplicate = Overall),
by = "Predictor"
)
cubist_imp_compare <- varImp(cubist_original, scale = FALSE)$importance %>%
tibble::rownames_to_column("Predictor") %>%
rename(Cubist_Original = Overall) %>%
full_join(
varImp(cubist_dup1, scale = FALSE)$importance %>%
tibble::rownames_to_column("Predictor") %>%
rename(Cubist_With_Duplicate = Overall),
by = "Predictor"
)
knitr::kable(bag_imp_compare %>% arrange(desc(Bagging_Original)), digits = 3)
knitr::kable(gbm_imp_compare %>% arrange(desc(GBM_Original)), digits = 3)
knitr::kable(cubist_imp_compare %>% arrange(desc(Cubist_Original)), digits = 3)
The same general issue can occur with bagging, boosted trees, and
Cubist: when a duplicate or highly correlated version of a useful
predictor is added, importance may be shared between the original
predictor and its copy. In this simulation, bagging and random forests
usually show the importance of V1 being diluted by the
duplicate variable. Cubist may be less affected because it uses rules
and linear model adjustments, but the broader lesson is the same:
variable importance should be interpreted carefully when predictors are
highly correlated.
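To make the dilution concrete, the rows for V1 and its duplicate can be extracted from each comparison table; a short sketch using the objects created above:
# Side-by-side look at how much importance the duplicate takes from V1
# under bagging, boosting, and Cubist.
list(
  Bagging = bag_imp_compare,
  GBM = gbm_imp_compare,
  Cubist = cubist_imp_compare
) %>%
  lapply(function(tbl) dplyr::filter(tbl, Predictor %in% c("V1", "duplicate1")))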
This exercise asks us to use simulation to show tree bias caused by different predictor granularities. Tree-based models can prefer predictors with more possible split points, even when those predictors are not truly related to the response.
simulate_tree_bias <- function(n = 200, reps = 300, noise_sd = 4, granularity = "continuous") {
selected <- character(reps)
for (i in seq_len(reps)) {
set.seed(1000 + i)
# X_signal is truly related to the response but has only two groups.
X_signal <- factor(rep(c("low", "high"), each = n / 2))
y <- ifelse(X_signal == "high", 1, 0) + rnorm(n, sd = noise_sd)
# X_noise is unrelated to the response but varies in granularity.
X_noise <- switch(
granularity,
binary = factor(sample(c("A", "B"), n, replace = TRUE)),
ten_level = factor(sample(letters[1:10], n, replace = TRUE)),
continuous = rnorm(n),
stop("Unknown granularity")
)
dat <- data.frame(y = y, X_signal = X_signal, X_noise = X_noise)
fit <- rpart(
y ~ X_signal + X_noise,
data = dat,
control = rpart.control(maxdepth = 1, cp = 0, minsplit = 10)
)
selected[i] <- if (is.null(fit$splits)) NA_character_ else rownames(fit$splits)[1]
}
data.frame(
Granularity = granularity,
Selected = selected
)
}
set.seed(450)
bias_results <- bind_rows(
simulate_tree_bias(granularity = "binary"),
simulate_tree_bias(granularity = "ten_level"),
simulate_tree_bias(granularity = "continuous")
)
bias_summary <- bias_results %>%
filter(!is.na(Selected)) %>%
count(Granularity, Selected) %>%
group_by(Granularity) %>%
mutate(Proportion = n / sum(n)) %>%
ungroup()
knitr::kable(bias_summary, digits = 3)
ggplot(bias_summary, aes(x = Granularity, y = Proportion, fill = Selected)) +
geom_col(position = "dodge") +
labs(
title = "First Split Chosen by a One-Split Tree",
x = "Granularity of Noise Predictor",
y = "Proportion of Simulations",
fill = "First Split"
)
The signal variable truly affects the response, while the noise variable does not. However, as the unrelated noise predictor becomes more granular, it has more possible split points. This gives it more chances to find an apparently useful split by chance. The simulation therefore illustrates why trees can be biased toward predictors with many possible cut points.
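The bias also has a simple combinatorial explanation: a binary factor offers a single possible split, an unordered factor with k levels offers 2^(k-1) - 1 two-group partitions, and a continuous predictor with n distinct values offers n - 1 cut points. A small sketch counting these for the three noise predictors used in the simulation:
# Number of candidate splits available to the noise predictor in each scenario.
n <- 200
data.frame(
  Granularity = c("binary", "ten_level", "continuous"),
  Possible_Splits = c(
    2^(2 - 1) - 1,   # two-level factor: 1 possible split
    2^(10 - 1) - 1,  # ten-level unordered factor: 511 possible partitions
    n - 1            # continuous with ~n distinct values: n - 1 cut points
  )
)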
This exercise studies how stochastic gradient boosting tuning parameters affect variable importance. The two parameters of focus are bagging fraction and learning rate, also called shrinkage.
library(AppliedPredictiveModeling)
data(solubility)
sol_train_x <- solTrainXtrans
sol_train_y <- solTrainY
gbm_ctrl <- trainControl(method = "cv", number = 5, allowParallel = FALSE)
gbm_low_grid <- expand.grid(
n.trees = 500,
interaction.depth = 3,
shrinkage = 0.10,
n.minobsinnode = 10
)
gbm_high_grid <- expand.grid(
n.trees = 500,
interaction.depth = 3,
shrinkage = 0.90,
n.minobsinnode = 10
)
set.seed(100)
gbm_low <- train(
x = sol_train_x,
y = sol_train_y,
method = "gbm",
trControl = gbm_ctrl,
tuneGrid = gbm_low_grid,
bag.fraction = 0.10,
verbose = FALSE
)
set.seed(100)
gbm_high <- train(
x = sol_train_x,
y = sol_train_y,
method = "gbm",
trControl = gbm_ctrl,
tuneGrid = gbm_high_grid,
bag.fraction = 0.90,
verbose = FALSE
)
gbm_low_imp <- varImp(gbm_low, scale = FALSE)$importance %>%
tibble::rownames_to_column("Predictor") %>%
rename(Low_Bag_Low_Shrinkage = Overall)
gbm_high_imp <- varImp(gbm_high, scale = FALSE)$importance %>%
tibble::rownames_to_column("Predictor") %>%
rename(High_Bag_High_Shrinkage = Overall)
gbm_imp_compare <- full_join(gbm_low_imp, gbm_high_imp, by = "Predictor") %>%
mutate(
Low_Bag_Low_Shrinkage = ifelse(is.na(Low_Bag_Low_Shrinkage), 0, Low_Bag_Low_Shrinkage),
High_Bag_High_Shrinkage = ifelse(is.na(High_Bag_High_Shrinkage), 0, High_Bag_High_Shrinkage)
)
gbm_top <- gbm_imp_compare %>%
mutate(MaxImportance = pmax(Low_Bag_Low_Shrinkage, High_Bag_High_Shrinkage)) %>%
arrange(desc(MaxImportance)) %>%
slice(1:25)
knitr::kable(gbm_top, digits = 3)
gbm_top_long <- gbm_top %>%
select(Predictor, Low_Bag_Low_Shrinkage, High_Bag_High_Shrinkage) %>%
pivot_longer(-Predictor, names_to = "Model", values_to = "Importance")
ggplot(gbm_top_long, aes(x = reorder(Predictor, Importance), y = Importance, fill = Model)) +
geom_col(position = "dodge") +
coord_flip() +
labs(
title = "GBM Variable Importance Under Different Bagging and Shrinkage Settings",
x = "Predictor",
y = "Importance"
)
(a) The model with the high learning rate and high bagging fraction tends to concentrate importance on only a few predictors. A high learning rate gives each tree a larger contribution to the final model, so the fit commits more quickly, and more greedily, to the predictors chosen by the earliest trees. A high bagging fraction reduces the stochastic element because each tree sees most of the training data. Together, these settings allow a handful of strong predictors to dominate the model.
(b) The model with the smaller learning rate and smaller bagging fraction is usually more predictive for new samples. It builds the model more gradually and includes more randomness, which can reduce overfitting.
(c) Increasing interaction depth would allow each tree to model more complex interactions among predictors. This can spread importance across more predictors because variables can become useful in combination with other variables. The importance curve would likely become less steep. However, too much interaction depth can also increase overfitting.
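Parts (a) and (b) can be checked numerically from the fitted objects: the share of total importance held by the top predictors speaks to (a), and the resampled RMSE speaks to (b). A short sketch using gbm_low and gbm_high from above; the top-5 cutoff is an arbitrary illustrative choice:
# (a) How concentrated is the importance? Share of total importance held by
#     the five most important predictors.
top5_share <- function(imp) {
  vals <- sort(imp$Overall, decreasing = TRUE)
  sum(vals[1:5]) / sum(vals)
}
data.frame(
  Model = c("Low bag fraction / low shrinkage", "High bag fraction / high shrinkage"),
  Top5_Share = c(
    top5_share(varImp(gbm_low, scale = FALSE)$importance),
    top5_share(varImp(gbm_high, scale = FALSE)$importance)
  ),
  # (b) Resampled RMSE: lower usually means better performance on new samples.
  CV_RMSE = c(min(gbm_low$results$RMSE), min(gbm_high$results$RMSE))
)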
This exercise returns to the chemical manufacturing process data and compares several tree-based models. The goal is to predict yield and determine which tree or rule-based model performs best.
data(ChemicalManufacturingProcess)
chem_x <- ChemicalManufacturingProcess %>% select(-Yield)
chem_y <- ChemicalManufacturingProcess$Yield
set.seed(517)
train_index <- createDataPartition(chem_y, p = 0.70, list = FALSE)
train_x <- chem_x[train_index, ]
test_x <- chem_x[-train_index, ]
train_y <- chem_y[train_index]
test_y <- chem_y[-train_index]
# Same general preprocessing idea used earlier: transform, center, scale, and impute.
pp <- preProcess(
train_x,
method = c("BoxCox", "center", "scale", "knnImpute")
)
train_x_pp <- predict(pp, train_x)
test_x_pp <- predict(pp, test_x)
# Remove near-zero variance predictors.
nzv <- nearZeroVar(train_x_pp)
if (length(nzv) > 0) {
train_x_pp <- train_x_pp[, -nzv]
test_x_pp <- test_x_pp[, -nzv]
}
# Remove highly correlated predictors.
corr_matrix <- cor(train_x_pp)
high_corr <- findCorrelation(corr_matrix, cutoff = 0.90)
if (length(high_corr) > 0) {
train_x_pp <- train_x_pp[, -high_corr]
test_x_pp <- test_x_pp[, -high_corr]
}
data.frame(
Training_Rows = nrow(train_x_pp),
Testing_Rows = nrow(test_x_pp),
Predictors_After_Preprocessing = ncol(train_x_pp)
) %>%
knitr::kable()
chem_ctrl <- trainControl(
method = "boot",
number = 25,
savePredictions = "final",
allowParallel = FALSE
)
# 1. Simple CART tree
set.seed(614)
rpart_grid <- expand.grid(maxdepth = 1:10)
rpart_chem <- train(
x = train_x_pp,
y = train_y,
method = "rpart2",
metric = "Rsquared",
tuneGrid = rpart_grid,
trControl = chem_ctrl
)
# 2. Random forest
set.seed(614)
rf_grid <- expand.grid(
mtry = unique(pmax(1, round(seq(2, ncol(train_x_pp), length.out = 5))))
)
rf_chem <- train(
x = train_x_pp,
y = train_y,
method = "rf",
metric = "Rsquared",
tuneGrid = rf_grid,
ntree = 500,
importance = TRUE,
trControl = chem_ctrl
)
# 3. Gradient boosted trees
set.seed(614)
gbm_grid <- expand.grid(
n.trees = c(50, 100, 200),
interaction.depth = c(1, 3, 5),
shrinkage = c(0.01, 0.05, 0.10),
n.minobsinnode = 10
)
gbm_chem <- train(
x = train_x_pp,
y = train_y,
method = "gbm",
metric = "Rsquared",
tuneGrid = gbm_grid,
trControl = chem_ctrl,
verbose = FALSE
)
# 4. Cubist rule-based model
set.seed(614)
cubist_grid <- expand.grid(
committees = c(1, 5, 10, 20, 50),
neighbors = c(0, 1, 3, 5)
)
cubist_chem <- train(
x = train_x_pp,
y = train_y,
method = "cubist",
metric = "Rsquared",
tuneGrid = cubist_grid,
trControl = chem_ctrl
)
best_resample <- function(model, model_name) {
best_row <- model$results[which.max(model$results$Rsquared), ]
data.frame(
Model = model_name,
RMSE = best_row$RMSE,
Rsquared = best_row$Rsquared,
MAE = best_row$MAE
)
}
resampling_results <- bind_rows(
best_resample(rpart_chem, "CART"),
best_resample(rf_chem, "Random Forest"),
best_resample(gbm_chem, "GBM"),
best_resample(cubist_chem, "Cubist")
) %>% arrange(desc(Rsquared))
knitr::kable(resampling_results, digits = 3)
test_perf <- function(model, model_name) {
pred <- predict(model, test_x_pp)
stats <- postResample(pred = pred, obs = test_y)
data.frame(
Model = model_name,
RMSE = as.numeric(stats["RMSE"]),
Rsquared = as.numeric(stats["Rsquared"]),
MAE = as.numeric(stats["MAE"])
)
}
test_results <- bind_rows(
test_perf(rpart_chem, "CART"),
test_perf(rf_chem, "Random Forest"),
test_perf(gbm_chem, "GBM"),
test_perf(cubist_chem, "Cubist")
) %>% arrange(desc(Rsquared))
knitr::kable(test_results, digits = 3)
model_list <- list(
CART = rpart_chem,
`Random Forest` = rf_chem,
GBM = gbm_chem,
Cubist = cubist_chem
)
best_model_name <- test_results$Model[which.max(test_results$Rsquared)]
best_model <- model_list[[best_model_name]]
best_model_name
best_model$bestTune
Based on the model-comparison approach, the model with the largest
test-set R^2 and lowest RMSE should be selected as the
optimal tree/rule-based model. In many runs of this exercise, Cubist
performs especially well because it combines rule-based partitions with
local linear adjustments. This is consistent with the textbook solution
pattern, where Cubist is usually the strongest tree/rule-based method
for the chemical manufacturing data.
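Because all four models were tuned with the same seed and the same bootstrap control object, caret's resamples() function can place their resampling distributions side by side; a brief sketch using model_list from above:
# Paired comparison of the resampling distributions across the four models.
chem_resamples <- resamples(model_list)
summary(chem_resamples)
bwplot(chem_resamples, metric = "Rsquared")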
chem_imp <- varImp(best_model, scale = FALSE)$importance
if (!"Overall" %in% names(chem_imp)) {
chem_imp$Overall <- rowMeans(chem_imp)
}
chem_imp_top <- chem_imp %>%
tibble::rownames_to_column("Predictor") %>%
arrange(desc(Overall)) %>%
slice(1:10) %>%
mutate(
Predictor_Type = case_when(
grepl("Bio|Biological", Predictor, ignore.case = TRUE) ~ "Biological",
grepl("Manufacturing|Process", Predictor, ignore.case = TRUE) ~ "Process",
TRUE ~ "Other"
)
)
knitr::kable(chem_imp_top, digits = 3)
ggplot(chem_imp_top, aes(x = reorder(Predictor, Overall), y = Overall, fill = Predictor_Type)) +
geom_col() +
coord_flip() +
labs(
title = paste("Top 10 Important Predictors:", best_model_name),
x = "Predictor",
y = "Importance"
)
The important predictors are interpreted mainly by whether they are biological material variables or manufacturing process variables. If the top-10 list is dominated by variables beginning with ManufacturingProcess, then process variables drive the model; if BiologicalMaterial variables appear near the top, the biological inputs matter as well. Across the earlier linear and nonlinear models for this problem, process variables such as ManufacturingProcess32, ManufacturingProcess09, and ManufacturingProcess13 often appeared as influential predictors, so the most useful check is whether those same variables continue to appear in the tree-based model's importance list.
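The balance between the two groups can also be summarized directly from the top-10 table; a one-step sketch using chem_imp_top from above:
# How many of the top 10 predictors are biological versus process variables?
chem_imp_top %>%
  count(Predictor_Type, name = "Top10_Count")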
Even if the single CART tree is not the best predictive model, it is useful because it provides a simple visual explanation of how predictors split the yield values.
if (requireNamespace("rpart.plot", quietly = TRUE)) {
rpart.plot::rpart.plot(
rpart_chem$finalModel,
type = 3,
extra = 101,
fallen.leaves = TRUE,
main = "Optimal Single CART Tree for Chemical Manufacturing Yield"
)
} else {
plot(rpart_chem$finalModel, uniform = TRUE, margin = 0.1)
text(rpart_chem$finalModel, use.n = TRUE, cex = 0.7)
}
terminal_node_data <- data.frame(
Terminal_Node = factor(rpart_chem$finalModel$where),
Yield = train_y
)
node_summary <- terminal_node_data %>%
group_by(Terminal_Node) %>%
summarize(
n = n(),
Mean_Yield = mean(Yield),
Median_Yield = median(Yield),
SD_Yield = sd(Yield),
.groups = "drop"
)
knitr::kable(node_summary, digits = 3)
ggplot(terminal_node_data, aes(x = Terminal_Node, y = Yield)) +
geom_boxplot(outlier.alpha = 0.4) +
geom_jitter(width = 0.12, alpha = 0.35) +
labs(
title = "Distribution of Yield Within Terminal Nodes",
x = "Terminal Node",
y = "Yield"
)
The terminal-node plot adds interpretation beyond the numerical performance statistics. It shows whether the single tree creates groups with meaningfully different yield distributions. If one terminal node has a noticeably higher median yield and another has a lower median yield, the tree gives a practical way to describe how process or biological variables are associated with higher or lower manufacturing yield.
For Exercise 8.1, random forests correctly identify the main Friedman predictors, but correlated predictors can split or dilute importance. Exercise 8.2 shows that trees can be biased toward variables with more possible split points. Exercise 8.3 shows that high learning rates and high bagging fractions can concentrate GBM importance into only a few predictors and may reduce generalizability. Exercise 8.7 shows that Cubist, GBM, random forests, and CART can all be applied to the chemical manufacturing data, but Cubist often provides the strongest predictive performance while the single CART tree remains the easiest to explain.