library(AppliedPredictiveModeling)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlbench)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
##
## The following object is masked from 'package:stringr':
##
## boundary
##
##
## Attaching package: 'party'
##
## The following object is masked from 'package:dplyr':
##
## where
library(partykit)
## Loading required package: libcoin
##
## Attaching package: 'partykit'
##
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
## node_terminal, varimp
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(Cubist)
library(rpart)
library(rpart.plot)
8.1. Recreate the simulated data from Exercise 7.2:
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
set.seed(0410)
model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE) %>%
  arrange(-Overall)
rfImp1
## Overall
## V1 8.551056372
## V4 7.799418018
## V2 6.514643576
## V5 2.301813439
## V3 0.745659304
## V6 0.129285870
## V7 0.005098382
## V9 -0.067323427
## V10 -0.134239796
## V8 -0.148212214
Did the random forest model significantly use the uninformative predictors (V6–V10)?
No: the uninformative predictors (V6–V10) received very low or even negative importance scores, showing that the model largely ignored them.
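As a quick visual check, the built-in importance plot makes the gap obvious (a minimal sketch; type = 1 selects the permutation-based measure):
# Permutation importance for model1; V1-V5 should stand well clear of V6-V10
varImpPlot(model1, type = 1, main = "Permutation importance")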
set.seed(0410)
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9438394
set.seed(0410)
rf_model2 <- randomForest(y ~ ., data = simulated, importance = TRUE)
rfImp2 <- varImp(rf_model2, scale = FALSE)
rfImp2
## Overall
## V1 6.018981645
## V2 6.338888849
## V3 0.673317091
## V4 7.301621642
## V5 2.222766362
## V6 0.195311963
## V7 -0.004850964
## V8 -0.048441942
## V9 -0.050808237
## V10 0.027125391
## duplicate1 3.999965000
Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
After adding duplicate1, which is highly correlated with V1 (r ≈ 0.94), the importance score for V1 dropped from about 8.55 to 6.02. The model split the importance between V1 and duplicate1, showing that random forests dilute importance across highly correlated predictors.
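To push this further, here is a minimal sketch that adds a second near-copy of V1 on a copy of the data frame (sim2 and duplicate2 are illustrative names; working on a copy keeps the later code unaffected). V1's importance should dilute across all three:
set.seed(0411)
sim2 <- simulated
sim2$duplicate2 <- sim2$V1 + rnorm(200) * .1   # second predictor correlated with V1
rf_check <- randomForest(y ~ ., data = sim2, importance = TRUE)
varImp(rf_check, scale = FALSE)                # V1's share should shrink again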
set.seed(0410)
model3 <- cforest(y ~ ., data = simulated)
varimp_traditional <- varimp(model3, conditional = FALSE)
varimp_conditional <- varimp(model3, conditional = TRUE)
varimp_traditional
## V1 V2 V3 V4 V5 V6
## 6.78645298 5.92794998 0.04372512 6.25306762 2.12673146 0.01547017
## V7 V8 V9 V10 duplicate1
## 0.05447678 -0.18064994 0.16265115 -0.27969056 4.89945073
varimp_conditional
## V1 V2 V3 V4 V5 V6
## 3.80641081 4.98601200 0.01476454 5.47395087 1.36437539 -0.07285770
## V7 V8 V9 V10 duplicate1
## -0.14575448 -0.22370870 -0.12147383 -0.15853887 2.18397906
The traditional cforest importance showed high scores for both V1 and duplicate1, similar to the regular random forest. The conditional importance cut duplicate1’s score roughly in half (from about 4.90 to 2.18) and produced more balanced values, showing that conditional importance adjusts for correlation between predictors.
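For easier comparison, a small sketch that lines up the two measures (both vectors carry the same predictor names):
# Traditional vs. conditional permutation importance, side by side
round(data.frame(traditional = varimp_traditional,
                 conditional = varimp_conditional), 3)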
# gbm model
set.seed(0410)
gbm_model <- gbm(y ~ .,
                 data = simulated,
                 distribution = "gaussian",
                 n.trees = 100)
# Cubist model
set.seed(0410)
cubist_model <- cubist(x = simulated[, !names(simulated) %in% "y"],
                       y = simulated$y)
summary(gbm_model)
## var rel.inf
## V4 V4 30.0618454
## V2 V2 22.5933974
## V1 V1 19.5209667
## V5 V5 10.3029130
## duplicate1 duplicate1 8.8671071
## V3 V3 7.9276390
## V6 V6 0.6204190
## V7 V7 0.1057124
## V8 V8 0.0000000
## V9 V9 0.0000000
## V10 V10 0.0000000
summary(cubist_model)
##
## Call:
## cubist.default(x = simulated[, !names(simulated) %in% "y"], y = simulated$y)
##
##
## Cubist [Release 2.07 GPL Edition] Sun May 11 20:37:35 2025
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 200 cases (12 attributes) from undefined.data
##
## Model:
##
## Rule 1: [200 cases, mean 14.416183, range 3.55596 to 28.38167, est err 1.944664]
##
## outcome = 0.183529 + 8.9 V4 + 7.9 V1 + 7.1 V2 + 5.3 V5
##
##
## Evaluation on training data (200 cases):
##
## Average |error| 2.211979
## Relative |error| 0.55
## Correlation coefficient 0.84
##
##
## Attribute usage:
## Conds Model
##
## 100% V1
## 100% V2
## 100% V4
## 100% V5
##
##
## Time: 0.0 secs
varImp_cubist <- varImp(cubist_model)
print(varImp_cubist)
## Overall
## V1 50
## V2 50
## V4 50
## V5 50
## V3 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
## duplicate1 0
The boosted tree model (gbm) concentrated importance on a few strong predictors, although duplicate1 still received a modest relative influence (about 8.9%). The Cubist model focused entirely on V1, V2, V4, and V5, giving zero importance to duplicate1 and the uninformative predictors. Both methods were less affected by the correlated predictor than the random forest, with Cubist ignoring it completely.
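To inspect the gbm influences without the default bar plot, a small sketch (plotit = FALSE is a documented argument of summary.gbm):
# Tabulate relative influence, keeping only predictors gbm actually used
gbm_inf <- summary(gbm_model, plotit = FALSE)
gbm_inf[gbm_inf$rel.inf > 0, ]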
8.2. Use a simulation to show tree bias with different granularities.
set.seed(0410)
# Simulated data
n <- 500
x1 <- runif(n) # Continuous variable
x2 <- as.factor(cut(runif(n), 4)) # Factor with 4 levels
x3 <- as.factor(cut(runif(n), 10)) # Factor with 10 levels
y <- rnorm(n) # Random noise (no real relationship)
sim_data <- data.frame(x1, x2, x3, y)
rf_model <- randomForest(y ~ ., data = sim_data, importance = TRUE)
print(rf_model)
##
## Call:
## randomForest(formula = y ~ ., data = sim_data, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 1.111734
## % Var explained: -5.54
varImpPlot(rf_model)
I created a dataset with one continuous variable (x1), a factor with 4 levels (x2), and a factor with 10 levels (x3). None of them actually predicts the outcome, since y is just random noise. After fitting a random forest, the variable importance plot showed x3 had the highest importance, followed by x1 and then x2. This confirms that tree models tend to favor variables with more levels or categories, even when they are not truly informative. It’s a clear example of how trees are biased toward predictors with higher granularity.
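For contrast, a minimal sketch with partykit’s conditional inference forest, whose split selection is designed to be unbiased with respect to the number of candidate split points; on the same noise data, none of the three predictors should dominate:
set.seed(0410)
cif <- partykit::cforest(y ~ ., data = sim_data, ntree = 100)
partykit::varimp(cif)   # all importances should hover near zero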
8.3. In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
The model on the right focused all the importance on just a few predictors because a high learning rate and high bagging fraction made it fit the data very quickly and aggressively. It found the strongest variables fast and didn’t explore much else. The left model, with both values at 0.1, moved slower and tested more predictors along the way, so the importance was spread across more variables.
The left model would probably perform better on new samples. Since it doesn’t grab onto just a few variables, it’s less likely to overfit and more likely to generalize to new data. The right model looks too optimized for the training set.
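As a hedged sketch, the two extremes could be reproduced directly with gbm.fit on the solubility data (solTrainXtrans and solTrainY ship with AppliedPredictiveModeling; the shrinkage and bag.fraction settings mirror Figure 8.24, while the tree count here is arbitrary and everything else is left at its default):
data(solubility)
set.seed(0410)
gbm_low  <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                    n.trees = 100, shrinkage = 0.1, bag.fraction = 0.1,
                    verbose = FALSE)
gbm_high <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                    n.trees = 100, shrinkage = 0.9, bag.fraction = 0.9,
                    verbose = FALSE)
# Influence should be spread out for gbm_low and concentrated for gbm_high
head(summary(gbm_low, plotit = FALSE), 10)
head(summary(gbm_high, plotit = FALSE), 10)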
Increasing the interaction depth would let the model split on more variables and bring more predictors into the mix. That would flatten the importance slope, since no single variable would dominate as much.
8.7. Refer to Exercises 6.3 and 7.5, which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
data(ChemicalManufacturingProcess)
# pre-processing from 6.3 exercise
set.seed(0506)
cmp <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
cmp_imputed <- predict(cmp, ChemicalManufacturingProcess)
any(is.na(cmp_imputed))
## [1] FALSE
# remove near-zero variance predictors
cmp_filtered <- cmp_imputed[, -nearZeroVar(cmp_imputed)]
set.seed(0507)
cmp_train_index <- createDataPartition(cmp_filtered$Yield, p = 0.8, list = FALSE)
cmp_train <- cmp_filtered[cmp_train_index,]
cmp_test <- cmp_filtered[-cmp_train_index,]
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# Single Tree
set.seed(0410)
model_rpart <- train(Yield ~ ., data = cmp_train, method = "rpart", trControl = control)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
rpartPred <- predict(model_rpart, newdata = cmp_test)
rpartResults <- postResample(pred = rpartPred, obs = cmp_test$Yield)
# Random Forest
set.seed(0410)
model_rf <- train(Yield ~ ., data = cmp_train, method = "rf", trControl = control)
rfPred <- predict(model_rf, newdata = cmp_test)
rfResults <- postResample(pred = rfPred, obs = cmp_test$Yield)
# Gradient Boosting
set.seed(0410)
model_gbm <- train(Yield ~ ., data = cmp_train, method = "gbm", trControl = control, verbose = FALSE)
gbmPred <- predict(model_gbm, newdata = cmp_test)
gbmResults <- postResample(pred = gbmPred, obs = cmp_test$Yield)
# Cubist
set.seed(0410)
model_cubist <- train(Yield ~ ., data = cmp_train, method = "cubist", trControl = control)
cubistPred <- predict(model_cubist, newdata = cmp_test)
cubistResults <- postResample(pred = cubistPred, obs = cmp_test$Yield)
#combining models
ranking_results <- data.frame(Model = c("Single Tree", "Random Forest",
                                        "Boosted Tree", "Cubist Tree"),
                              rbind(rpartResults, rfResults, gbmResults, cubistResults)) %>%
  arrange(RMSE)
ranking_results
## Model RMSE Rsquared MAE
## cubistResults Cubist Tree 0.6292192 0.5946574 0.4664106
## rfResults Random Forest 0.6498299 0.5574889 0.4948898
## gbmResults Boosted Tree 0.6720605 0.5295593 0.5206975
## rpartResults Single Tree 0.7784772 0.3717932 0.6581676
The Cubist model had the best test set performance (RMSE = 0.63, Rsquared = 0.59). Random forest and boosted trees followed. The single tree had the worst performance with the highest error and lowest Rsquared.
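Because all four models were trained with the same seed and trainControl, caret’s resamples() can also compare their cross-validated performance (a sketch):
cv_results <- resamples(list(rpart = model_rpart, rf = model_rf,
                             gbm = model_gbm, cubist = model_cubist))
summary(cv_results)   # CV distributions of RMSE, Rsquared, and MAE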
varImp_cubist <- varImp(model_cubist)
print(varImp_cubist)
## cubist variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial03 59.81
## ManufacturingProcess09 57.01
## ManufacturingProcess13 48.60
## ManufacturingProcess17 46.73
## BiologicalMaterial02 43.93
## ManufacturingProcess04 26.17
## ManufacturingProcess34 25.23
## ManufacturingProcess29 24.30
## ManufacturingProcess26 19.63
## ManufacturingProcess27 18.69
## BiologicalMaterial12 18.69
## ManufacturingProcess10 14.95
## ManufacturingProcess14 14.02
## ManufacturingProcess33 14.02
## BiologicalMaterial06 14.02
## ManufacturingProcess20 12.15
## BiologicalMaterial05 12.15
## ManufacturingProcess18 12.15
## ManufacturingProcess24 11.21
The Cubist model identified ManufacturingProcess32 as the most important predictor, followed by a mix of process and biological variables (BiologicalMaterial03, ManufacturingProcess09, ManufacturingProcess13). Process variables still dominated overall, similar to the nonlinear models from Exercise 7.5.
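A quick tally of process versus biological variables among the top 20 (a sketch in base R, since dplyr::arrange would drop the row names that hold the predictor labels):
imp <- varImp_cubist$importance
top20 <- head(imp[order(-imp$Overall), , drop = FALSE], 20)
table(ifelse(grepl("^Manufacturing", rownames(top20)), "Process", "Biological"))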
# part (c): plot the final single tree
rpart_model <- model_rpart$finalModel
rpart.plot(rpart_model, extra = 101)
The single tree showed that ManufacturingProcess32 was the main splitter, with additional influence from BiologicalMaterial11.
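To confirm which predictors the tree actually uses, a one-line sketch over the rpart frame:
# Variables appearing as split nodes in the final tree (leaves excluded)
unique(as.character(rpart_model$frame$var[rpart_model$frame$var != "<leaf>"]))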