library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
library(mlbench)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(party)
## Warning: package 'party' was built under R version 4.5.3
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.5.3
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.5.3
##
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
##
## where
library(knitr)
library(rpart)
library(gbm)
## Warning: package 'gbm' was built under R version 4.5.3
## Loaded gbm 2.2.3
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(Cubist)
## Warning: package 'Cubist' was built under R version 4.5.3
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.5.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.5.3
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1 <- randomForest(y ~ .,
data = simulated,
importance = TRUE,
ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1 |>
arrange(desc(Overall)) |>
knitr::kable()
| Variable | Overall |
|---|---|
| V1 | 8.7322354 |
| V4 | 7.6151188 |
| V2 | 6.4153694 |
| V5 | 2.0235246 |
| V3 | 0.7635918 |
| V6 | 0.1651112 |
| V7 | -0.0059617 |
| V10 | -0.0749448 |
| V9 | -0.0952927 |
| V8 | -0.1663626 |
No. The importance scores for V6-V10 are negligible compared to those of V1-V5, so the random forest did not meaningfully use the uninformative predictors.
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206
# Fit the second model
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
# Compare V1 importance from Model 1 vs Model 2
print(rfImp2)
## Overall
## V1 5.69119973
## V2 6.06896061
## V3 0.62970218
## V4 7.04752238
## V5 1.87238438
## V6 0.13569065
## V7 -0.01345645
## V8 -0.04370565
## V9 0.00840438
## V10 0.02894814
## duplicate1 4.28331581
The importance score for V1 has decreased substantially because the highly correlated duplicate now splits the credit with it. The scores for the other informative variables have also decreased slightly.
# Test what happens when a second highly correlated duplicate is added
simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
# Fit the third model
model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp3 <- varImp(model3, scale = FALSE)
print(rfImp3)
## Overall
## V1 4.91687329
## V2 6.52816504
## V3 0.58711552
## V4 7.04870917
## V5 2.03115561
## V6 0.14213148
## V7 0.10991985
## V8 -0.08405687
## V9 -0.01075028
## V10 0.09230576
## duplicate1 3.80068234
## duplicate2 1.87721959
When another highly correlated predictor is added, the importance of V1 and duplicate1 decreases even further, as the three correlated predictors split the credit among themselves.
model_cforest <- cforest(y ~ ., data = simulated,
control = cforest_unbiased(ntree = 1000))
imp_traditional <- varimp(model_cforest)
imp_conditional <- varimp(model_cforest, conditional = TRUE)
imp_traditional %>%
as.data.frame() %>%
rename(Importance = ".") %>%
arrange(desc(Importance)) %>%
kable(caption = "Traditional Variable Importance (Unconditional)", digits = 4)
| Variable | Importance |
|---|---|
| V4 | 7.1483 |
| V2 | 5.8960 |
| V1 | 4.3898 |
| duplicate1 | 4.2258 |
| V5 | 1.6395 |
| duplicate2 | 1.1599 |
| V7 | 0.0377 |
| V3 | 0.0188 |
| V9 | -0.0057 |
| V6 | -0.0197 |
| V10 | -0.0227 |
| V8 | -0.0269 |
imp_conditional %>%
as.data.frame() %>%
rename(Importance = ".") %>%
arrange(desc(Importance)) %>%
kable(caption = "Conditional Variable Importance (Strobl et al. 2007)", digits = 4)
| Variable | Importance |
|---|---|
| V4 | 5.7366 |
| V2 | 4.7151 |
| V1 | 1.7548 |
| duplicate1 | 1.4843 |
| V5 | 0.9881 |
| duplicate2 | 0.3221 |
| V3 | 0.0106 |
| V9 | 0.0080 |
| V10 | 0.0050 |
| V8 | -0.0013 |
| V7 | -0.0080 |
| V6 | -0.0095 |
Under the conditional measure, the correlated predictors (V1 and its duplicates) receive noticeably smaller scores than under the traditional measure, and V3 is much less significant than in the original random forest model; everything else remains about the same.
# Boosted Trees (GBM)
gbmModel <- train(y ~ ., data = simulated,
method = "gbm",
verbose = FALSE,
trControl = trainControl(method = "cv"))
# Estimate importance
gbmImp <- varImp(gbmModel, scale = FALSE)
# Print table
gbmImp$importance %>%
arrange(desc(Overall)) %>%
knitr::kable(caption = "Variable Importance: Boosted Trees (GBM)", digits = 4)
| Variable | Overall |
|---|---|
| V4 | 4165.6156 |
| V2 | 3526.2674 |
| duplicate1 | 2437.6338 |
| V1 | 2099.3391 |
| V5 | 1611.6175 |
| V3 | 1501.8632 |
| duplicate2 | 191.2389 |
| V6 | 159.3757 |
| V7 | 109.9608 |
| V10 | 106.8173 |
| V8 | 78.8578 |
| V9 | 69.6918 |
cubistModel <- cubist(x = simulated[, names(simulated) != "y"],
y = simulated$y)
# Estimate importance
cubistImp <- varImp(cubistModel, scale = FALSE)
cubistImp %>%
arrange(desc(Overall)) %>%
knitr::kable(caption = "Variable Importance: Cubist", digits = 4)
| Variable | Overall |
|---|---|
| V1 | 50 |
| V2 | 50 |
| V4 | 50 |
| V5 | 50 |
| V3 | 0 |
| V6 | 0 |
| V7 | 0 |
| V8 | 0 |
| V9 | 0 |
| V10 | 0 |
| duplicate1 | 0 |
| duplicate2 | 0 |
The “dilution” pattern did not occur here nearly as much as it did in the random forest. In a random forest, V1 and its duplicates often end up with very similar, lower scores because they “steal” splits from each other across independent trees. The Cubist model is far more parsimonious: if V1 can explain a rule, the algorithm has no reason to include a noisy copy of V1 in that same rule. It picks the “cleanest” version and ignores the rest.
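To confirm this, one can inspect which predictors actually appear in the fitted rules (a quick check against the cubistModel object fit above; if Cubist preferred the clean V1, the duplicates should be absent):
# Print the rule conditions and the per-rule linear models
summary(cubistModel)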
n <- 2000
# y is just random noise
y <- rnorm(n)
# Predictors with different granularities (all are random noise)
x_binary <- factor(sample(1:2, n, replace = TRUE))
x_five <- factor(sample(1:5, n, replace = TRUE))
x_ten <- factor(sample(1:10, n, replace = TRUE))
x_twenty <- factor(sample(1:20, n, replace = TRUE))
x_cont <- rnorm(n)
sim_data <- data.frame(y, x_binary, x_five, x_ten, x_twenty, x_cont)
tree_model <- rpart(y ~ ., data = sim_data, control = rpart.control(cp = 0))
# Calculate importance
tree_imp <- varImp(tree_model, scale = FALSE)
# Display the results
tree_imp %>%
arrange(desc(Overall)) %>%
knitr::kable(caption = "CART Variable Importance (Forced Tree)")
| Variable | Overall |
|---|---|
| x_cont | 8.881308 |
| x_five | 6.415502 |
| x_twenty | 5.132555 |
| x_ten | 5.030555 |
| x_binary | 2.901237 |

Even though every predictor is pure noise, the continuous predictor receives the highest importance and the binary predictor the lowest, illustrating CART's selection bias toward predictors with more potential split points.
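For contrast, a conditional inference tree from the party package, which uses permutation tests to select split variables, should refuse to split on any of these pure-noise predictors (a minimal sketch, assuming the sim_data frame from above):
# ctree guards against selection bias via statistical split tests;
# with noise-only predictors we expect a single root node with no splits
ctree_model <- ctree(y ~ ., data = sim_data)
ctree_model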
In stochastic gradient boosting, the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
The model on the right (learning rate = 0.9, bagging fraction = 0.9) is aggressive. The first few trees “use up” the predictive power of the best variables; because the learning rate is so high, very little residual error is left for later trees to explain, so the importance scores are heavily concentrated on the “winners” that the first few trees identified. The model on the left (learning rate = 0.1, bagging fraction = 0.1) sits at the opposite end of the spectrum. Because each tree contributes little (low learning rate) and sees different data (low bagging fraction), the model is forced to look deeper into the feature set. It can’t rely solely on the “top” predictors because they aren’t always available or influential in every small subset of the data. This spreads the importance across a much wider array of variables.
In this context I’d prefer the left model (learning rate = 0.1, bagging fraction = 0.1). The other model would overfit on the top predictors, which makes it vulnerable to noise; the slow-learning model is more robust to noise and unpredictable changes.
Increasing the interaction depth is likely to spread the variable importance over more predictors, increasing the importance of some of the less significant variables.
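As a rough illustration of how these settings shift importance, one can refit boosted models at the two extremes on the simulated data from above (a minimal sketch; the parameter values mirror the discussion and are not tuned, and n.minobsinnode is lowered so the small 10% bags can still be split):
# Conservative vs. aggressive stochastic gradient boosting
fit_slow <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 1000, shrinkage = 0.1, bag.fraction = 0.1,
                interaction.depth = 1, n.minobsinnode = 5)
fit_fast <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 1000, shrinkage = 0.9, bag.fraction = 0.9,
                interaction.depth = 1, n.minobsinnode = 5)
# Relative influence: the aggressive fit should concentrate importance
# on a few winners, while the slow fit spreads it across more predictors
summary(fit_slow, plotit = FALSE)
summary(fit_fast, plotit = FALSE)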
Refer to Exercises 6.3 and 7.5, which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models. Which tree-based regression model gives the optimal resampling and test set performance?
data(ChemicalManufacturingProcess)
imputed_data <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
chem_data <- predict(imputed_data, ChemicalManufacturingProcess)
nzv <- nearZeroVar(chem_data)
chem_data <- chem_data[, -nzv]
inTrain <- createDataPartition(chem_data$Yield, p = 0.8, list = FALSE)
train_x <- chem_data[inTrain, -1] # Yield is column 1
train_y <- chem_data[inTrain, 1]
test_x <- chem_data[-inTrain, -1]
test_y <- chem_data[-inTrain, 1]
ctrl <- trainControl(method = "cv", number = 10)
# Random Forest
set.seed(2026)
rf_model <- train(train_x, train_y, method = "rf",
tuneLength = 10, trControl = ctrl, importance = TRUE)
# GBM
set.seed(2026)
gbm_grid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
n.trees = seq(100, 1000, by = 100),
shrinkage = c(0.01, 0.1),
n.minobsinnode = 10)
gbm_model <- train(train_x, train_y, method = "gbm",
tuneGrid = gbm_grid, trControl = ctrl, verbose = FALSE)
# Cubist
set.seed(2026)
cubist_model <- train(train_x, train_y, method = "cubist", trControl = ctrl)
#Test
rf_preds <- predict(rf_model, test_x)
gbm_preds <- predict(gbm_model, test_x)
cubist_preds <- predict(cubist_model, test_x)
rf_results <- postResample(pred = rf_preds, obs = test_y)
gbm_results <- postResample(pred = gbm_preds, obs = test_y)
cubist_results <- postResample(pred = cubist_preds, obs = test_y)
performance_table <- data.frame(
RandomForest = rf_results,
GBM = gbm_results,
Cubist = cubist_results
)
print(performance_table)
## RandomForest GBM Cubist
## RMSE 0.4767133 0.4533820 0.4256417
## Rsquared 0.7689087 0.7553879 0.7848917
## MAE 0.3795808 0.3584693 0.3402779
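The question also asks about resampling performance; caret's resamples() can compare the cross-validation results of the three fitted train objects directly (a quick check):
# Compare 10-fold CV performance across the three models
cv_results <- resamples(list(RF = rf_model, GBM = gbm_model, Cubist = cubist_model))
summary(cv_results)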
plot(test_y, cubist_preds,
main = "Cubist: Predicted vs. Actual Yield",
xlab = "Actual Yield", ylab = "Predicted Yield",
pch = 19, col = "steelblue")
abline(0, 1, col = "red", lwd = 2) # Add a 1-to-1 line
All the models were close, but the best by a small margin was Cubist, which posted the lowest RMSE and MAE and the highest R-squared on the test set; GBM was a close second.
gbm_imp <- varImp(gbm_model, scale = FALSE)
# Print the top 10
print(gbm_imp, 10)
## gbm variable importance
##
## only 10 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 1392.0
## ManufacturingProcess06 368.6
## ManufacturingProcess13 294.1
## BiologicalMaterial12 291.5
## BiologicalMaterial03 275.0
## ManufacturingProcess09 245.8
## ManufacturingProcess31 233.2
## ManufacturingProcess17 191.6
## BiologicalMaterial11 142.4
## BiologicalMaterial09 128.8
# Optional: Visualize it
plot(gbm_imp, top = 10, main = "Top 10 Predictors - GBM")
The model is driven primarily by manufacturing process variables, and the GBM importance profile highlights the disproportionate dominance of ManufacturingProcess32.
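Since Cubist edged out GBM on the test set, it is worth cross-checking whether its importance ranking tells the same story (a quick sketch using the cubist_model train object from above):
# Importance from the Cubist model for comparison
cubist_imp <- varImp(cubist_model, scale = FALSE)
plot(cubist_imp, top = 10, main = "Top 10 Predictors - Cubist")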
Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
single_tree <- rpart(Yield ~ ., data = chem_data[inTrain, ],
control = rpart.control(cp = 0.01))
# Plot the tree with yield distributions in terminal nodes
rpart.plot(single_tree,
type = 4,
extra = 1, # Shows the number of observations in the node
under = TRUE,
cex = 0.8,
box.palette = "RdYlGn", # Red for low yield, Green for high yield
main = "Single Tree: Chemical Manufacturing Yield")
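To complement the plot, the yield distribution in the terminal nodes can be tabulated directly (a small sketch; node assignments come from rpart's where component):
# Mean yield and observation count in each terminal node
train_df <- chem_data[inTrain, ]
train_df$node <- factor(single_tree$where)
aggregate(Yield ~ node, data = train_df, FUN = mean)
table(train_df$node)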