library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
library(mlbench)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(party)
## Warning: package 'party' was built under R version 4.5.3
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.5.3
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.5.3
## 
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
## 
##     where
library(dplyr)
library(knitr)
library(rpart)
library(gbm)
## Warning: package 'gbm' was built under R version 4.5.3
## Loaded gbm 2.2.3
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(Cubist)
## Warning: package 'Cubist' was built under R version 4.5.3
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.5.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.5.3

8.1

library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

8.1a

model1 <- randomForest(y ~ ., 
                       data = simulated,
                       importance = TRUE,
                       ntree = 1000)

rfImp1 <- varImp(model1, scale = FALSE)

rfImp1 |>
  arrange(desc(Overall)) |>
  knitr::kable()
|    |    Overall|
|:---|----------:|
|V1  |  8.7322354|
|V4  |  7.6151188|
|V2  |  6.4153694|
|V5  |  2.0235246|
|V3  |  0.7635918|
|V6  |  0.1651112|
|V7  | -0.0059617|
|V10 | -0.0749448|
|V9  | -0.0952927|
|V8  | -0.1663626|

No. The importance scores for the uninformative predictors V6-V10 are negligible (near zero or negative) compared to V1-V5, so the random forest did not make significant use of them.
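As a quick visual check, the sketch below is my own addition (not part of the original exercise code) and uses only the model1 object fitted above to plot the permutation importances, making the gap between V1-V5 and V6-V10 easy to see.

# Minimal sketch: plot permutation importance (type = 1) for model1; the
# uninformative predictors V6-V10 should cluster near zero.
varImpPlot(model1, type = 1, scale = FALSE,
           main = "Permutation Importance: Model 1")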

8.1b)

simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206
# Fit the second model
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)

# Compare V1 importance from Model 1 vs Model 2
print(rfImp2)
##                Overall
## V1          5.69119973
## V2          6.06896061
## V3          0.62970218
## V4          7.04752238
## V5          1.87238438
## V6          0.13569065
## V7         -0.01345645
## V8         -0.04370565
## V9          0.00840438
## V10         0.02894814
## duplicate1  4.28331581

The importance score for V1 has decreased significantly because the highly correlated duplicate1 now absorbs part of its share of the splits. The scores for the other informative variables have also decreased slightly.
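To make the drop explicit, this small sketch (my addition, using only the rfImp1 and rfImp2 objects above) lines the two importance tables up side by side:

# Minimal sketch: merge the model 1 and model 2 importance tables by predictor
# name (duplicate1 is NA for model 1 because it did not exist yet).
comparison <- merge(rfImp1, rfImp2, by = "row.names", all = TRUE,
                    suffixes = c("_model1", "_model2"))
knitr::kable(comparison, digits = 3)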

# Testing what happens when a second highly correlated copy of V1 is added
simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1

# Fit the third model
model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp3 <- varImp(model3, scale = FALSE)

print(rfImp3)
##                Overall
## V1          4.91687329
## V2          6.52816504
## V3          0.58711552
## V4          7.04870917
## V5          2.03115561
## V6          0.14213148
## V7          0.10991985
## V8         -0.08405687
## V9         -0.01075028
## V10         0.09230576
## duplicate1  3.80068234
## duplicate2  1.87721959

When a second highly correlated predictor is added, the importance of V1 drops even further (and duplicate1 drops as well), since the shared signal is now split three ways.
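A small sketch (assumed helper code, using the rfImp1/rfImp2/rfImp3 objects above) tracks V1's score as each correlated copy is added:

# Minimal sketch: V1's importance shrinks as correlated copies are introduced.
data.frame(Model = c("No duplicates", "One duplicate", "Two duplicates"),
           V1_importance = c(rfImp1["V1", "Overall"],
                             rfImp2["V1", "Overall"],
                             rfImp3["V1", "Overall"]))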

8.1c Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?

model_cforest <- cforest(y ~ ., data = simulated, 
                         control = cforest_unbiased(ntree = 1000))

imp_traditional <- varimp(model_cforest)
imp_conditional <- varimp(model_cforest, conditional = TRUE)

imp_traditional %>% 
  as.data.frame() %>% 
  rename(Importance = ".") %>% 
  arrange(desc(Importance)) %>% 
  kable(caption = "Traditional Variable Importance (Unconditional)", digits = 4)
Traditional Variable Importance (Unconditional)

|           | Importance|
|:----------|----------:|
|V4         |     7.1483|
|V2         |     5.8960|
|V1         |     4.3898|
|duplicate1 |     4.2258|
|V5         |     1.6395|
|duplicate2 |     1.1599|
|V7         |     0.0377|
|V3         |     0.0188|
|V9         |    -0.0057|
|V6         |    -0.0197|
|V10        |    -0.0227|
|V8         |    -0.0269|
imp_conditional %>% 
  as.data.frame() %>% 
  rename(Importance = ".") %>% 
  arrange(desc(Importance)) %>% 
  kable(caption = "Conditional Variable Importance (Strobl et al. 2007)", digits = 4)
Conditional Variable Importance (Strobl et al. 2007)

|           | Importance|
|:----------|----------:|
|V4         |     5.7366|
|V2         |     4.7151|
|V1         |     1.7548|
|duplicate1 |     1.4843|
|V5         |     0.9881|
|duplicate2 |     0.3221|
|V3         |     0.0106|
|V9         |     0.0080|
|V10        |     0.0050|
|V8         |    -0.0013|
|V7         |    -0.0080|
|V6         |    -0.0095|

The overall ranking is broadly the same as the traditional random forest, but V3 is much less significant here than in the original model. The conditional measure also shrinks the scores of V1 and its duplicates much more than those of V4 and V2, since it penalizes importance that is shared with correlated predictors; everything else remains about the same.
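For a direct comparison, the sketch below (my addition, using the imp_traditional and imp_conditional vectors computed above) places the two cforest measures side by side:

# Minimal sketch: traditional vs. conditional importance for the same cforest.
data.frame(Variable    = names(imp_traditional),
           Traditional = as.numeric(imp_traditional),
           Conditional = as.numeric(imp_conditional[names(imp_traditional)])) %>%
  arrange(desc(Traditional)) %>%
  knitr::kable(digits = 4)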

8.1d Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?

# Boosted Trees (GBM)
gbmModel <- train(y ~ ., data = simulated,
                  method = "gbm",
                  verbose = FALSE,
                  trControl = trainControl(method = "cv"))

# Estimate importance
gbmImp <- varImp(gbmModel, scale = FALSE)

# Print table
gbmImp$importance %>%
  arrange(desc(Overall)) %>%
  knitr::kable(caption = "Variable Importance: Boosted Trees (GBM)", digits = 4)
Variable Importance: Boosted Trees (GBM)

|           |   Overall|
|:----------|---------:|
|V4         | 4165.6156|
|V2         | 3526.2674|
|duplicate1 | 2437.6338|
|V1         | 2099.3391|
|V5         | 1611.6175|
|V3         | 1501.8632|
|duplicate2 |  191.2389|
|V6         |  159.3757|
|V7         |  109.9608|
|V10        |  106.8173|
|V8         |   78.8578|
|V9         |   69.6918|
cubistModel <- cubist(x = simulated[, names(simulated) != "y"], 
                      y = simulated$y)

# Estimate importance
cubistImp <- varImp(cubistModel, scale = FALSE)

cubistImp %>%
  arrange(desc(Overall)) %>%
  knitr::kable(caption = "Variable Importance: Cubist", digits = 4)
Variable Importance: Cubist

|           | Overall|
|:----------|-------:|
|V1         |      50|
|V2         |      50|
|V4         |      50|
|V5         |      50|
|V3         |       0|
|V6         |       0|
|V7         |       0|
|V8         |       0|
|V9         |       0|
|V10        |       0|
|duplicate1 |       0|
|duplicate2 |       0|

The boosted model still shows some dilution: duplicate1 receives substantial importance alongside V1. The Cubist model, however, shows no dilution at all. In a random forest, V1 and its duplicates end up with similar, lower scores because they "steal" splits from each other across independent trees. Cubist is extremely parsimonious: if V1 can explain a rule, the algorithm has no reason to include a noisy copy of V1 in that same rule, so it picks the "cleanest" version and ignores the rest.
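This can be confirmed by inspecting the fitted rules directly; the sketch below is my addition, but it uses only documented parts of the cubistModel object and lists how often each predictor is actually used in rule conditions and in the rule-level linear models:

# Minimal sketch: Cubist reports per-predictor usage percentages; the duplicate
# predictors should be absent from both the conditions and the models.
cubistModel$usage
summary(cubistModel)   # full rule listing, for reference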

8.2 Use a simulation to show tree bias with different granularities.

n <- 2000
# y is just random noise
y <- rnorm(n)

# Predictors with different granularities (all are random noise)
x_binary  <- factor(sample(1:2,  n, replace = TRUE))      
x_five    <- factor(sample(1:5,  n, replace = TRUE))      
x_ten     <- factor(sample(1:10, n, replace = TRUE))     
x_twenty  <- factor(sample(1:20, n, replace = TRUE))     
x_cont    <- rnorm(n)                              

sim_data <- data.frame(y, x_binary, x_five, x_ten, x_twenty, x_cont)
tree_model <- rpart(y ~ ., data = sim_data, control = rpart.control(cp = 0))

# Calculate importance
tree_imp <- varImp(tree_model, scale = FALSE)

# Display the results
tree_imp %>%
  arrange(desc(Overall)) %>%
  knitr::kable(caption = "CART Variable Importance (Forced Tree)")
CART Variable Importance (Forced Tree)

|         |  Overall|
|:--------|--------:|
|x_cont   | 8.881308|
|x_five   | 6.415502|
|x_twenty | 5.132555|
|x_ten    | 5.030555|
|x_binary | 2.901237|

Even though every predictor is pure noise, the tree assigns the highest importance to the continuous predictor and the lowest to the binary factor: predictors with more distinct values offer more candidate split points and are therefore favored, which is the granularity bias this simulation is meant to demonstrate.
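To show that this is a systematic bias rather than a quirk of a single draw, the following sketch (my own addition, not part of the exercise code) repeats a smaller noise-only simulation many times and tallies which predictor rpart picks for the very first split:

# Minimal sketch: over repeated noise-only simulations, the first split should
# land on the high-granularity predictors far more often than on the binary one.
set.seed(624)
first_split <- replicate(200, {
  dat <- data.frame(y        = rnorm(200),
                    x_binary = factor(sample(1:2,  200, replace = TRUE)),
                    x_twenty = factor(sample(1:20, 200, replace = TRUE)),
                    x_cont   = rnorm(200))
  fit <- rpart(y ~ ., data = dat,
               control = rpart.control(cp = 0, maxdepth = 1))
  as.character(fit$frame$var[1])   # splitting variable at the root node
})
table(first_split)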

8.3 In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:

8.3a) Why does the model on the right focus its importance on just the first few of predictors, whereas the model on the left spreads importance across more predictors?

The model on the right (learning rate = 0.9, bagging fraction = 0.9) is aggressive. The first few trees "use up" the predictive power of the best variables; because the learning rate is so high, very little residual error is left for later trees to explain, so the importance scores are heavily concentrated on the "winners" identified by those first trees. The model on the left is at the opposite end of the spectrum (learning rate = 0.1, bagging fraction = 0.1). Because each tree contributes only a little (low learning rate) and sees a different small subsample of the data (low bagging fraction), the model is forced to look deeper into the feature set. It cannot rely solely on the top predictors because they are not always available or influential in every small subsample, which spreads the importance across a much wider array of variables.
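A rough way to see this behavior is to refit the two extreme settings ourselves. The sketch below is illustrative only, not a reproduction of the book's exact tuning, and assumes the solubility data (solTrainXtrans, solTrainY) from the AppliedPredictiveModeling package, which is the data behind Fig. 8.24.

# Minimal sketch: fit the "left" (0.1/0.1) and "right" (0.9/0.9) settings and
# compare how steeply the relative influence drops off.
data(solubility)
gbm_left  <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                     n.trees = 100, interaction.depth = 1,
                     shrinkage = 0.1, bag.fraction = 0.1, verbose = FALSE)
gbm_right <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                     n.trees = 100, interaction.depth = 1,
                     shrinkage = 0.9, bag.fraction = 0.9, verbose = FALSE)
head(summary(gbm_left,  plotit = FALSE), 10)   # importance spread more evenly
head(summary(gbm_right, plotit = FALSE), 10)   # importance concentrated at the top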

8.3b)Which model do you think would be more predictive of other samples?

In this context I would favor the left model (learning rate = 0.1, bagging fraction = 0.1). The right-hand model tends to overfit to the top predictors, which makes it vulnerable to noise; the slower-learning model is less sensitive to noise and should generalize better to new samples.

8.3c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?

Increasing the interaction depth is likely to spread the variable importance over more predictors, raising the scores of some of the less prominent variables and flattening the slope of the importance profile in both models.
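Under the same assumed solubility setup as above, a quick sketch (my addition) of the effect of a deeper interaction depth on the right-hand model:

# Minimal sketch: with deeper trees, more predictors get a chance to enter
# splits, so the importance profile should flatten somewhat.
gbm_deep <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                    n.trees = 100, interaction.depth = 7,
                    shrinkage = 0.9, bag.fraction = 0.9, verbose = FALSE)
head(summary(gbm_deep, plotit = FALSE), 10)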

8.7) Refer to Exercises 6.3 and 7.5, which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:

(a) Which tree-based regression model gives the optimal resampling and test set performance?

data(ChemicalManufacturingProcess)
imputed_data <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
chem_data <- predict(imputed_data, ChemicalManufacturingProcess)

nzv <- nearZeroVar(chem_data)
if (length(nzv) > 0) chem_data <- chem_data[, -nzv]  # guard against an empty index

inTrain <- createDataPartition(chem_data$Yield, p = 0.8, list = FALSE)
train_x <- chem_data[inTrain, -1] # Yield is column 1
train_y <- chem_data[inTrain, 1]
test_x  <- chem_data[-inTrain, -1]
test_y  <- chem_data[-inTrain, 1]

ctrl <- trainControl(method = "cv", number = 10)

# Random Forest
set.seed(2026)
rf_model <- train(train_x, train_y, method = "rf", 
                  tuneLength = 10, trControl = ctrl, importance = TRUE)

# GBM
set.seed(2026)
gbm_grid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                        n.trees = seq(100, 1000, by = 100),
                        shrinkage = c(0.01, 0.1),
                        n.minobsinnode = 10)
gbm_model <- train(train_x, train_y, method = "gbm", 
                   tuneGrid = gbm_grid, trControl = ctrl, verbose = FALSE)

# Cubist
set.seed(2026)
cubist_model <- train(train_x, train_y, method = "cubist", trControl = ctrl)
#Test
rf_preds     <- predict(rf_model, test_x)
gbm_preds    <- predict(gbm_model, test_x)
cubist_preds <- predict(cubist_model, test_x)

rf_results     <- postResample(pred = rf_preds, obs = test_y)
gbm_results    <- postResample(pred = gbm_preds, obs = test_y)
cubist_results <- postResample(pred = cubist_preds, obs = test_y)

performance_table <- data.frame(
  RandomForest = rf_results,
  GBM = gbm_results,
  Cubist = cubist_results
)

print(performance_table)
##          RandomForest       GBM    Cubist
## RMSE        0.4767133 0.4533820 0.4256417
## Rsquared    0.7689087 0.7553879 0.7848917
## MAE         0.3795808 0.3584693 0.3402779
plot(test_y, cubist_preds, 
     main = "Cubist: Predicted vs. Actual Yield",
     xlab = "Actual Yield", ylab = "Predicted Yield",
     pch = 19, col = "steelblue")
abline(0, 1, col = "red", lwd = 2) # Add a 1-to-1 line

All the models performed similarly, but on the test set Cubist had the best RMSE, R-squared, and MAE by a small margin, with GBM a close second.
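The table above covers only the test set; as a supplementary sketch (my addition), caret's resamples() can also compare the cross-validated fold metrics of the three fitted models:

# Minimal sketch: collect the 10-fold CV results from each train object.
resamps <- resamples(list(RandomForest = rf_model,
                          GBM          = gbm_model,
                          Cubist       = cubist_model))
summary(resamps)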

8.7b)

gbm_imp <- varImp(gbm_model, scale = FALSE)

# Print the top 10
print(gbm_imp, 10)
## gbm variable importance
## 
##   only 10 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess32  1392.0
## ManufacturingProcess06   368.6
## ManufacturingProcess13   294.1
## BiologicalMaterial12     291.5
## BiologicalMaterial03     275.0
## ManufacturingProcess09   245.8
## ManufacturingProcess31   233.2
## ManufacturingProcess17   191.6
## BiologicalMaterial11     142.4
## BiologicalMaterial09     128.8
# Optional: Visualize it
plot(gbm_imp, top = 10, main = "Top 10 Predictors - GBM")

The model is primarily driven by the manufacturing process predictors. The GBM importances also highlight the disproportionate influence of ManufacturingProcess32, which dominates every other predictor.
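To quantify the process-versus-biological split, a small sketch (my addition, using the gbm_imp object computed above) sums the importances by predictor type:

# Minimal sketch: total importance by predictor type.
imp_df <- gbm_imp$importance
imp_df$Type <- ifelse(grepl("^Manufacturing", rownames(imp_df)),
                      "Process", "Biological")
aggregate(Overall ~ Type, data = imp_df, FUN = sum)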

8.7c) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?

single_tree <- rpart(Yield ~ ., data = chem_data[inTrain, ], 
                     control = rpart.control(cp = 0.01))

# Plot the tree with yield distributions in terminal nodes
rpart.plot(single_tree, 
           type = 4, 
           extra = 1, # Shows the number of observations in the node
           under = TRUE, 
           cex = 0.8, 
           box.palette = "RdYlGn", # Red for low yield, Green for high yield
           main = "Single Tree: Chemical Manufacturing Yield")