# Load required libraries
library(mlbench)
library(randomForest)
library(caret)
library(tidyverse)
library(party)
library(gbm)
library(Cubist)
8.1. Recreate the simulated data from Exercise 7.2:
# Simulate the data
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
set.seed(1234)
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
# Compute variable importance
rfImp1 <- varImp(model1, scale = FALSE)
# Print variable importance
print(rfImp1)
## Overall
## V1 8.87315401
## V2 6.64432958
## V3 0.75512916
## V4 7.65645259
## V5 2.26750244
## V6 0.05986796
## V7 0.12614683
## V8 -0.13392719
## V9 -0.07299790
## V10 -0.11477312
# Visualize variable importance
varImpPlot(model1, main = "Variable Importance - Model 1")
# Add a duplicate predictor highly correlated with V1
set.seed(12345)
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cat("Correlation between V1 and duplicate1:", cor(simulated$duplicate1, simulated$V1), "\n")
## Correlation between V1 and duplicate1: 0.9276736
# Fit a new Random Forest model
set.seed(12346)
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
# Compute and visualize variable importance
rfImp2 <- varImp(model2, scale = FALSE)
print(rfImp2)
## Overall
## V1 5.98979308
## V2 6.44811988
## V3 0.54234824
## V4 7.15946036
## V5 2.13532498
## V6 0.16984306
## V7 0.03657633
## V8 -0.04909462
## V9 -0.06767017
## V10 0.03781025
## duplicate1 4.09360177
varImpPlot(model2, main = "Variable Importance - Model 2")
# Add another duplicate predictor
set.seed(12347)
simulated$duplicate2 <- simulated$V1 + rnorm(200) * 0.1
# Fit a third Random Forest model
set.seed(12348)
model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
# Compute and visualize variable importance
rfImp3 <- varImp(model3, scale = FALSE)
print(rfImp3)
## Overall
## V1 4.3946945331
## V2 6.2734809213
## V3 0.6333111666
## V4 7.1586549867
## V5 2.0627857967
## V6 0.1147837929
## V7 -0.0007655926
## V8 -0.0112786817
## V9 -0.0285221488
## V10 -0.0101958452
## duplicate1 3.1492973037
## duplicate2 2.5214862794
varImpPlot(model3, main = "Variable Importance - Model 3")
# Fit a conditional inference tree forest model
set.seed(12349)
cforest_model <- cforest(y ~ ., data = simulated, controls = cforest_unbiased(ntree = 1000))
# Compute conditional importance
cond_imp <- varimp(cforest_model, conditional = TRUE)
print(cond_imp)
## V1 V2 V3 V4 V5 V6
## 1.829373642 4.673530555 0.006200107 5.810570841 1.241287562 -0.003403790
## V7 V8 V9 V10 duplicate1 duplicate2
## -0.005306408 -0.004077366 -0.006234777 -0.006530365 1.034077374 0.931977983
# Comparison to traditional importance
trad_imp <- varimp(cforest_model, conditional = FALSE)
print(trad_imp)
## V1 V2 V3 V4 V5 V6
## 4.53020990 5.82256141 0.02660196 6.95868444 1.73418971 -0.01128858
## V7 V8 V9 V10 duplicate1 duplicate2
## 0.01687006 -0.02486989 -0.02028303 -0.01914958 2.21375707 2.84800626
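To make the contrast easier to read, the two importance vectors computed above can be placed side by side; a minimal sketch (output not shown):
# Combine traditional and conditional importance for a side-by-side comparison
imp_compare <- data.frame(
  traditional = round(trad_imp, 3),
  conditional = round(cond_imp, 3)
)
imp_compare[order(-imp_compare$traditional), ]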
8.2. Use a simulation to show tree bias with different granularities.
set.seed(12349)
X1 <- rep(1:2,each=100)
Y <- X1 + rnorm(200,mean=0,sd=4)
set.seed(12349)
X2 <- rnorm(200,mean=0,sd=2)
simData <- data.frame(Y=Y,X1=X1,X2=X2)
# Preview the first rows of the simulated data
head(simData)
##              Y X1          X2
## 1  2.809795868  1  0.90489793
## 2 -1.527790862  1 -1.26389543
## 3  2.427260883  1  0.71363044
## 4  0.013685283  1 -0.49315736
## 5  2.002072357  1  0.50103618
## 6 -0.073175073  1 -0.53658754
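Before moving to the loop below, a single regression tree fit to simData shows directly which predictor the splits favor; a minimal sketch assuming the rpart package is available (any single-tree implementation would do):
# Fit one regression tree to simData and count how often each predictor is split on
library(rpart)
single_tree <- rpart(Y ~ X1 + X2, data = simData)
table(single_tree$frame$var[single_tree$frame$var != "<leaf>"])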
# Generate data with different levels of granularity
set.seed(12350)
granularity_data <- function(granularity) {
x <- runif(500, 0, 1)
y <- ifelse(x > 0.5, 1, 0) + rnorm(500, 0, 0.1) # Binary outcome with noise
x <- round(x * granularity) / granularity # Adjust granularity
data.frame(x, y)
}
# Test different granularities
granularities <- c(2, 10, 100, 1000)
models <- list()
for (g in granularities) {
data <- granularity_data(g)
models[[as.character(g)]] <- randomForest(y ~ x, data = data, ntree = 500)
}
# Plot variable importance for different granularities
par(mfrow = c(2, 2))
for (g in granularities) {
varImpPlot(models[[as.character(g)]], main = paste("Granularity:", g))
}
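Because each model in the loop above sees only one predictor, the importances come from separate fits and are not directly comparable. A complementary sketch (variable names and the seed are illustrative) puts pure-noise predictors of different granularity into a single forest, so any systematic preference reflects granularity rather than information:
# Predictors unrelated to y, differing only in the number of distinct values;
# impurity-based importance (IncNodePurity) tends to favor the more granular ones
set.seed(12351)
noise_df <- data.frame(
  y     = rnorm(500),
  x2    = sample(1:2, 500, replace = TRUE),   # 2 distinct values
  x10   = sample(1:10, 500, replace = TRUE),  # 10 distinct values
  xcont = runif(500)                          # essentially continuous
)
noise_forest <- randomForest(y ~ ., data = noise_df, importance = TRUE, ntree = 500)
importance(noise_forest)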
8.3. In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect the magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
(a) Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?
(b) Which model do you think would be more predictive of other samples?
# Generate simulation data
set.seed(12352)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
# Fit two boosted tree models with extreme parameters
set.seed(12353)
# Lower shrinkage (0.1) with a moderate bagging fraction (0.5)
gbm_model_low <- gbm(
y ~ .,
data = simulated,
distribution = "gaussian",
n.trees = 500,
interaction.depth = 3,
shrinkage = 0.1,
bag.fraction = 0.5,
n.minobsinnode = 5
)
# Higher shrinkage (0.9) and bagging fraction (0.9)
gbm_model_high <- gbm(
y ~ .,
data = simulated,
distribution = "gaussian",
n.trees = 500,
interaction.depth = 3,
shrinkage = 0.9,
bag.fraction = 0.9,
n.minobsinnode = 5
)
# Variable importance for low shrinkage
imp_low <- summary(gbm_model_low, plotit = FALSE)
# Variable importance for high shrinkage
imp_high <- summary(gbm_model_high, plotit = FALSE)
# Visualize variable importance
par(mfrow = c(1, 2), mar = c(10, 4, 4, 2))
barplot(
imp_low$rel.inf,
names.arg = imp_low$var,
las = 2, # Rotate axis labels
main = "Shrinkage=0.1, Bagging=0.5",
col = "skyblue",
ylim = c(0, max(imp_low$rel.inf, imp_high$rel.inf))
)
barplot(
imp_high$rel.inf,
names.arg = imp_high$var,
las = 2, # Rotate axis labels
main = "Shrinkage=0.9, Bagging=0.9",
col = "orange",
ylim = c(0, max(imp_low$rel.inf, imp_high$rel.inf))
)
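One way to quantify what the two plots show is the share of total relative influence absorbed by the top few predictors in each model; a minimal sketch using the imp_low and imp_high objects above:
# Proportion of total relative influence captured by the top 3 predictors
top_share <- function(imp, k = 3) {
  sum(head(sort(imp$rel.inf, decreasing = TRUE), k)) / sum(imp$rel.inf)
}
c(low_settings = top_share(imp_low), high_settings = top_share(imp_high))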
# Increase interaction depth with adjusted parameters
set.seed(12355)
gbm_model_depth <- gbm(
y ~ .,
data = simulated,
distribution = "gaussian",
n.trees = 500,
interaction.depth = 10,
shrinkage = 0.1,
bag.fraction = 0.5,
n.minobsinnode = 5
)
# Extract and visualize variable importance
imp_depth <- summary(gbm_model_depth, plotit = FALSE)
# Improved barplot for variable importance
par(mfrow = c(1, 1), mar = c(10, 4, 4, 2)) # Adjust margins for display
barplot(
imp_depth$rel.inf,
names.arg = imp_depth$var,
las = 2,
main = "Increased Interaction Depth",
col = "lightgreen",
ylim = c(0, max(imp_depth$rel.inf))
)
8.7. Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
(a) Which tree-based regression model gives the optimal resampling and test set performance?
(b) Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
(c) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
At face value the Cubist model reports by far the lowest test RMSE below, but this turns out to be an artifact of a data leak rather than genuine performance: Yield is the first column of the data frame, so dropping the last column with -ncol(train_df) leaves the response in the predictor set, and the fitted Cubist rule is simply outcome = 0 + 1 Yield (see the model summary at the end). Among the correctly specified models, the boosted trees (GBM) edge out the random forest on test-set RMSE. A corrected Cubist fit is sketched after the output.
set.seed(12359)
# Load the ChemicalManufacturingProcess dataset
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
# Using knn imputation to handle missing values
knn_model <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
df <- predict(knn_model, ChemicalManufacturingProcess)
# Remove near-zero variance predictors
df <- df %>% select(-one_of(nearZeroVar(df, names = TRUE)))
# Partition data into training and testing sets
set.seed(12359)
in_train <- createDataPartition(df$Yield, times = 1, p = 0.8, list = FALSE)
train_df <- df[in_train, ]
test_df <- df[-in_train, ]
# Fit models
# 1. Random Forest
set.seed(12356)
rf_model <- randomForest(Yield ~ ., data = train_df, ntree = 500)
# 2. Gradient Boosted Trees
set.seed(12356)
gbm_model <- gbm(
Yield ~ ., data = train_df, distribution = "gaussian",
n.trees = 500, interaction.depth = 3, shrinkage = 0.1, bag.fraction = 0.75
)
# 3. Cubist
set.seed(12356)
cubist_model <- cubist(x = train_df[, -ncol(train_df)], y = train_df$Yield)
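The exercise also asks about resampling performance, which the fits above do not report; a minimal sketch using caret::train with 10-fold cross-validation for the random forest (the fold count and tuning length are assumptions):
# Resampled (10-fold CV) random forest for comparison with the test-set RMSE below
set.seed(12356)
rf_cv <- train(
  Yield ~ ., data = train_df,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10),
  tuneLength = 5
)
rf_cv$results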
# Evaluate models
# Predictions
rf_pred <- predict(rf_model, test_df)
gbm_pred <- predict(gbm_model, test_df, n.trees = 500)
cubist_pred <- predict(cubist_model, test_df[, -ncol(test_df)])
# Calculate RMSE for each model
rf_rmse <- RMSE(rf_pred, test_df$Yield)
gbm_rmse <- RMSE(gbm_pred, test_df$Yield)
cubist_rmse <- RMSE(cubist_pred, test_df$Yield)
# Print results
cat("Random Forest RMSE:", rf_rmse, "\n")
## Random Forest RMSE: 0.5718748
cat("GBM RMSE:", gbm_rmse, "\n")
## GBM RMSE: 0.5703229
cat("Cubist RMSE:", cubist_rmse, "\n")
## Cubist RMSE: 1.902987e-08
# Determine the best model based on RMSE
best_model <- ifelse(
rf_rmse < gbm_rmse & rf_rmse < cubist_rmse, "Random Forest",
ifelse(gbm_rmse < cubist_rmse, "GBM", "Cubist")
)
cat("Best Model:", best_model, "\n")
## Best Model: Cubist
# Feature importance visualization for the best model
if (best_model == "Random Forest") {
importance <- importance(rf_model)
varImpPlot(rf_model, main = "Variable Importance - Random Forest")
} else if (best_model == "GBM") {
gbm_imp <- summary(gbm_model)
barplot(gbm_imp$rel.inf, names.arg = gbm_imp$var, main = "Variable Importance - GBM")
} else if (best_model == "Cubist") {
cubist_summary <- summary(cubist_model)
print(cubist_summary)
}
##
## Call:
## cubist.default(x = train_df[, -ncol(train_df)], y = train_df$Yield)
##
##
## Cubist [Release 2.07 GPL Edition] Mon Nov 18 18:25:27 2024
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 144 cases (57 attributes) from undefined.data
##
## Model:
##
## Rule 1: [144 cases, mean 0.0155360, range -2.669244 to 3.339426, est err 0.0000000]
##
## outcome = 0 + 1 Yield
##
##
## Evaluation on training data (144 cases):
##
## Average |error| 0.0000000
## Relative |error| 0.00
## Correlation coefficient 1.00
##
##
## Attribute usage:
## Conds Model
##
## 100% Yield
##
##
## Time: 0.0 secs
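As noted above, the near-zero Cubist RMSE and the rule outcome = 0 + 1 Yield come from the response leaking into the predictor matrix. A corrected fit drops Yield by name rather than by position; a minimal sketch (the RMSE reported above would of course change on re-running):
# Corrected Cubist fit: exclude the response column by name
predictor_cols <- setdiff(names(train_df), "Yield")
set.seed(12356)
cubist_fixed <- cubist(x = train_df[, predictor_cols], y = train_df$Yield)
cubist_fixed_pred <- predict(cubist_fixed, test_df[, predictor_cols])
RMSE(cubist_fixed_pred, test_df$Yield)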