Recreate the simulated data from Exercise 7.2:
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
Did the random forest model significantly use the uninformative predictors (V6–V10)?
The random forest model did not significantly use the uninformative predictors V6–V10. Their %IncMSE scores are very low or negative compared with V1–V5, indicating that permuting them has a negligible effect on prediction accuracy. Their IncNodePurity scores show they are occasionally used for splits, but these values are also far lower than those of V1–V5, so their contribution is minimal.
model1 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
importance(model1)
## %IncMSE IncNodePurity
## V1 54.3218221 1110.2060
## V2 47.2162587 923.7456
## V3 10.5672819 283.4993
## V4 52.0195361 1033.8895
## V5 22.4915260 493.2582
## V6 2.3596677 190.7885
## V7 0.5127168 201.2613
## V8 -2.2381558 151.4477
## V9 -2.0886434 151.6630
## V10 0.7905945 180.6207
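As a quick check of this claim, the permutation importances can be sorted directly; a minimal sketch assuming model1 from the chunk above is still in the workspace.
# Sort %IncMSE from model1; V6-V10 should sit at the bottom of the ranking
sort(importance(model1)[, "%IncMSE"], decreasing = TRUE)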
Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
Yes. V1’s %IncMSE dropped once duplicate1 was added, because the importance is now shared between V1 and its highly correlated copy; adding duplicate2 reduced V1’s %IncMSE further, with the two duplicates absorbing a similar share. V6–V10 remain negligible throughout.
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9485201
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
imp2 <- importance(model2)
cat("\nDuplicate1 Importance scores:\n")
##
## Duplicate1 Importance scores:
print(round(imp2, 4))
## %IncMSE IncNodePurity
## V1 37.0334 851.7258
## V2 47.2817 856.9051
## V3 10.1513 259.2217
## V4 49.4699 963.8986
## V5 24.7132 458.7097
## V6 3.3767 169.6796
## V7 2.6899 181.1686
## V8 -1.6045 134.0665
## V9 -2.0954 137.8951
## V10 -0.1935 163.4766
## duplicate1 24.2144 518.3576
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
# check the correlation of the second duplicate with V1
cor(simulated$duplicate2, simulated$V1)
model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
imp3 <- importance(model3)
cat("\nDuplicate2 Importance scores:\n")
##
## Duplicate2 Importance scores:
print(round(imp3, 4))
## %IncMSE IncNodePurity
## V1 29.8556 727.0983
## V2 50.1120 912.7822
## V3 10.7160 204.5294
## V4 54.5700 988.7498
## V5 23.2715 412.7455
## V6 3.4260 152.6467
## V7 0.8300 145.5530
## V8 0.1908 105.0298
## V9 -0.0488 110.9331
## V10 0.0988 127.8587
## duplicate1 19.1635 416.1454
## duplicate2 19.1208 422.2160
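To quantify the dilution, V1's importance before and after adding the correlated copies can be pulled out side by side; a minimal sketch assuming model1, imp2, and imp3 from the chunks above.
# %IncMSE of V1 alone versus V1 plus its correlated copies
importance(model1)["V1", "%IncMSE"]
imp2[c("V1", "duplicate1"), "%IncMSE"]
imp3[c("V1", "duplicate1", "duplicate2"), "%IncMSE"]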
The cforest importances show broadly the same pattern as the traditional random forest: V1, V2, V4, and V5 dominate while V6–V10 are negligible. The conditional importances, however, shrink the scores of V1 and its duplicates relative to the unconditional ones, because the conditional permutation scheme adjusts for the correlation among these predictors. V3 is the notable difference, receiving essentially no importance from cforest.
set.seed(813)
cf_model <- cforest(y ~ ., data = simulated,
ntree = 1000,
control = ctree_control(mtry = floor(sqrt(ncol(simulated) - 1))))
imp_uncond <- varimp(cf_model, conditional = FALSE)
cat("\nUnconditional Importance (cforest):\n")
##
## Unconditional Importance (cforest):
print(round(imp_uncond, 4))
## V1 V2 V3 V4 V5 V6 V7
## 7.8433 6.3239 -0.0127 6.5320 2.1304 0.3538 -0.2250
## V8 V9 duplicate1 duplicate2
## -1.8842 -1.3631 6.0192 5.0056
imp_cond <- varimp(cf_model, conditional = TRUE)
cat("\nConditional Importance (cforest):\n")
##
## Conditional Importance (cforest):
print(round(imp_cond, 4))
## V1 V2 V3 V4 V5 V6 V7
## 6.7760 6.1564 -0.7331 6.0850 1.9197 -0.4245 -0.5762
## V8 V9 duplicate1 duplicate2
## 0.5166 -1.4553 4.1935 2.9421
cat("\nTraditional Random Forest Importance (%IncMSE):\n")
##
## Traditional Random Forest Importance (%IncMSE):
print(round(importance(model3)[, "%IncMSE"], 4))
## V1 V2 V3 V4 V5 V6 V7
## 29.8556 50.1120 10.7160 54.5700 23.2715 3.4260 0.8300
## V8 V9 V10 duplicate1 duplicate2
## 0.1908 -0.0488 0.0988 19.1635 19.1208
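The similarity of the patterns can also be checked numerically with a rank correlation; a minimal sketch assuming imp_uncond, imp_cond, and model3 from the chunks above.
# Spearman correlation between the traditional RF importances and the cforest
# importances, matched by predictor name
rf_imp <- importance(model3)[, "%IncMSE"]
common <- intersect(names(rf_imp), names(imp_uncond))
cor(rf_imp[common], imp_uncond[common], method = "spearman")
cor(rf_imp[common], imp_cond[common], method = "spearman")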
Not entirely. The boosted model concentrates its relative influence on the informative predictors, with V4, V1, and V2 at the top, and gives the two correlated copies of V1 only a small share, so the dilution of V1 seen in the random forest is much weaker. The uninformative predictors V6–V10 remain unimportant in every model.
str(simulated)
## 'data.frame': 200 obs. of 13 variables:
## $ V1 : num 0.534 0.584 0.59 0.691 0.667 ...
## $ V2 : num 0.648 0.438 0.588 0.226 0.819 ...
## $ V3 : num 0.8508 0.6727 0.4097 0.0334 0.7168 ...
## $ V4 : num 0.1816 0.6692 0.3381 0.0669 0.8032 ...
## $ V5 : num 0.929 0.1638 0.8941 0.6374 0.0831 ...
## $ V6 : num 0.3618 0.4531 0.0268 0.525 0.2234 ...
## $ V7 : num 0.827 0.649 0.179 0.513 0.664 ...
## $ V8 : num 0.421 0.845 0.35 0.797 0.904 ...
## $ V9 : num 0.5911 0.9282 0.0176 0.6899 0.397 ...
## $ V10 : num 0.589 0.758 0.444 0.445 0.55 ...
## $ y : num 18.5 16.1 17.8 13.8 18.4 ...
## $ duplicate1: num 0.64 0.572 0.512 0.647 0.729 ...
## $ duplicate2: num 0.505 0.598 0.627 0.802 0.628 ...
simulated <- simulated[, c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "duplicate1", "duplicate2", "y")]
gbm_model <- gbm(y ~ ., data = simulated,
distribution = "gaussian",
n.trees = 1000,
interaction.depth = 4,
shrinkage = 0.01,
n.minobsinnode = 10)
gbm_imp <- summary(gbm_model, plotit = FALSE)
cat("\nBoosted Trees (GBM) Importance (Relative Influence):\n")
##
## Boosted Trees (GBM) Importance (Relative Influence):
gbm_imp_values <- setNames(gbm_imp$rel.inf, gbm_imp$var)
print(round(gbm_imp_values, 4))
## V4 V1 V2 V5 V3 duplicate2 duplicate1
## 27.5585 22.9478 21.5843 11.8763 6.9886 2.2606 2.0692
## V7 V6 V9 V8 V10
## 1.3116 1.2080 0.8305 0.7250 0.6396
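As a quick check of how little influence the correlated copies of V1 receive in the boosted model, their relative influence can be compared with V1 itself, using gbm_imp_values from above.
# Relative influence of V1 versus its two duplicates in the GBM fit
gbm_imp_values[c("V1", "duplicate1", "duplicate2")]
sum(gbm_imp_values[c("duplicate1", "duplicate2")])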
cubist_model <- cubist(x = simulated[, c("V1", "V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "duplicate1", "duplicate2")],
y = simulated$y,
committees = 1)
cubist_imp <- varImp(cubist_model)  # importance derived from the model's native usage statistics
cat("\nCubist Importance (Native Usage):\n")
##
## Cubist Importance (Native Usage):
cubist_imp_values <- setNames(cubist_imp$Overall, rownames(cubist_imp))
print(round(cubist_imp_values, 4))
cat("\nTraditional Random Forest Importance (%IncMSE):\n")
##
## Traditional Random Forest Importance (%IncMSE):
print(round(importance(model3)[, "%IncMSE"], 4))
## V1 V2 V3 V4 V5 V6 V7
## 29.8556 50.1120 10.7160 54.5700 23.2715 3.4260 0.8300
## V8 V9 V10 duplicate1 duplicate2
## 0.1908 -0.0488 0.0988 19.1635 19.1208
cat("\ncforest Unconditional Importance:\n")
##
## cforest Unconditional Importance:
# importance values hard-coded from a separate cforest run for this side-by-side comparison
cforest_uncond <- c(V1 = 7.7334, V2 = 6.2832, V3 = -0.7097, V4 = 6.2206, V5 = 2.0060,
V6 = 0.2801, V7 = -0.1460, V8 = -0.3577, V9 = -0.2171, V10 = -0.1620,
duplicate1 = 5.8819, duplicate2 = 4.9147)
print(round(cforest_uncond, 4))
## V1 V2 V3 V4 V5 V6 V7
## 7.7334 6.2832 -0.7097 6.2206 2.0060 0.2801 -0.1460
## V8 V9 V10 duplicate1 duplicate2
## -0.3577 -0.2171 -0.1620 5.8819 4.9147
cat("\ncforest Conditional Importance:\n")
##
## cforest Conditional Importance:
cforest_cond <- c(V1 = 6.7897, V2 = 5.8769, V3 = -0.4854, V4 = 5.7434, V5 = 1.7481,
V6 = -0.0739, V7 = -0.8561, V8 = -0.3778, V9 = -0.5021, V10 = -0.4619,
duplicate1 = 4.1735, duplicate2 = 3.2061)
print(round(cforest_cond, 4))
## V1 V2 V3 V4 V5 V6 V7
## 6.7897 5.8769 -0.4854 5.7434 1.7481 -0.0739 -0.8561
## V8 V9 V10 duplicate1 duplicate2
## -0.3778 -0.5021 -0.4619 4.1735 3.2061
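To make the side-by-side comparison easier to read, the importance vectors can be aligned by predictor name in a single table; a minimal sketch assuming imp3, gbm_imp_values, cforest_uncond, and cforest_cond from the chunks above.
# Align all importance measures by predictor name
vars <- rownames(imp3)
importance_table <- data.frame(
  rf_IncMSE  = imp3[vars, "%IncMSE"],
  gbm_relinf = gbm_imp_values[vars],
  cf_uncond  = cforest_uncond[vars],
  cf_cond    = cforest_cond[vars],
  row.names  = vars
)
round(importance_table, 2)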
Fig. 8.24: A comparison of variable importance magnitudes for differing values of the bagging fraction and shrinkage parameters. Both tuning parameters are set to 0.1 in the left figure; both are set to 0.9 in the right figure.
Use a simulation to show tree bias with different granularities.
set.seed(82)
n_samples <- 1000
data <- data.frame(
  Continuous = rnorm(n_samples, 0, 1),                                    # ~1000 distinct values
  Categorical_High = as.factor(sample(1:50, n_samples, replace = TRUE)),  # 50 levels: high granularity
  Categorical_Low = as.factor(sample(0:1, n_samples, replace = TRUE)),    # 2 levels: low granularity
  Binary = as.factor(sample(0:1, n_samples, replace = TRUE))              # 2 levels
)
linear_combination <- 0.3 * data$Continuous +
0.2 * as.numeric(data$Categorical_High) +
0.3 * as.numeric(data$Categorical_Low) +
0.2 * as.numeric(data$Binary) +
rnorm(n_samples, 0, 0.5)
target <- as.numeric(linear_combination > median(linear_combination))
data$Target <- as.factor(target)
print(table(data$Target))
##
## 0 1
## 500 500
dt_model <- rpart(Target ~ .,
data = data,
method = "class",
control = rpart.control(minsplit = 20, minbucket = 7, cp = 0.01))
rf_model <- randomForest(Target ~ ., data = data, importance = TRUE)
dt_importances <- data.frame(
Feature = names(dt_model$variable.importance),
Importance = dt_model$variable.importance
)
rf_importances <- data.frame(
Feature = rownames(importance(rf_model)),
Importance = importance(rf_model)[, "MeanDecreaseGini"]
)
par(mfrow = c(1, 2))
barplot(dt_importances$Importance, names.arg = dt_importances$Feature,
horiz = TRUE, las = 1, main = "Decision Tree Feature Importances",
xlab = "Importance")
barplot(rf_importances$Importance, names.arg = rf_importances$Feature,
horiz = TRUE, las = 1, main = "Random Forest Feature Importances",
xlab = "Importance")
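An additional minimal sketch of the same bias, using two predictors that are both unrelated to the response and differ only in granularity (number of distinct values); the variable names here are illustrative.
# Fit many depth-1 trees to pure noise and record which predictor is chosen
# for the first split. X_fine (many distinct values) is expected to be chosen
# far more often than X_coarse, illustrating the selection bias.
set.seed(101)
n <- 500
sim <- data.frame(
  X_fine = rnorm(n),                         # ~500 distinct values
  X_coarse = sample(1:4, n, replace = TRUE), # only 4 distinct values
  Y = rnorm(n)                               # response is pure noise
)
first_splits <- replicate(200, {
  idx <- sample(n, n, replace = TRUE)
  fit <- rpart(Y ~ ., data = sim[idx, ],
               control = rpart.control(cp = 0, maxdepth = 1))
  if (nrow(fit$frame) > 1) as.character(fit$frame$var[1]) else NA
})
table(first_splits)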
In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
The model on the left is likely to be more predictive of other samples: the small bagging fraction and learning rate slow the fit, reduce overfitting, and spread importance across more predictors. The right-hand model, with both parameters at 0.9, concentrates importance on a few predictors and is more prone to overfit the training data.
Increasing the interaction depth lets each tree use more predictors, which spreads importance across additional variables and flattens the slope of the importance profile for both models; the change would be most visible in the right-hand model, where importance is currently concentrated in only a few predictors.
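A sketch of how such a comparison could be reproduced is shown below, using the simulated Friedman data from above rather than the solubility data of Fig. 8.24; n.minobsinnode is lowered to 5 so that a 0.1 bagging fraction is feasible with only 200 samples, so the importance profiles will only qualitatively resemble the figure.
# Refit gbm with both tuning parameters at the two extremes (0.1 vs. 0.9)
# and compare the resulting relative influences
set.seed(300)
gbm_low <- gbm(y ~ ., data = simulated, distribution = "gaussian",
               n.trees = 1000, interaction.depth = 4,
               shrinkage = 0.1, bag.fraction = 0.1, n.minobsinnode = 5)
gbm_high <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 1000, interaction.depth = 4,
                shrinkage = 0.9, bag.fraction = 0.9, n.minobsinnode = 5)
summary(gbm_low, plotit = FALSE)   # importance expected to spread over more predictors
summary(gbm_high, plotit = FALSE)  # importance expected to concentrate on a few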
Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
Random forest and GBM are clearly the strongest candidates in the resampling results; the random forest has the lowest RMSE both in cross-validation and on the test set, with GBM a close second, and both outperform the single-tree models (rpart and ctree).
data(ChemicalManufacturingProcess)
dim(ChemicalManufacturingProcess)
## [1] 176 58
sum(is.na(ChemicalManufacturingProcess))
## [1] 106
processPredictors <- ChemicalManufacturingProcess[, 2:58]
imputation_model <- preProcess(processPredictors, method = "knnImpute")
processPredictors_imputed <- predict(imputation_model, processPredictors)
ChemicalManufacturingProcess_imputed <- data.frame(Yield = ChemicalManufacturingProcess[, 1], processPredictors_imputed)
sum(is.na(processPredictors_imputed))
## [1] 0
trainIndex <- createDataPartition(ChemicalManufacturingProcess_imputed$Yield, p = 0.8, list = FALSE)
trainData <- ChemicalManufacturingProcess_imputed[trainIndex, ]
testData <- ChemicalManufacturingProcess_imputed[-trainIndex, ]
ctrl <- trainControl(
method = "cv",
number = 10,
verboseIter = FALSE
)
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)
rpart_model <- train(
Yield ~ ., data = trainData,
method = "rpart",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneLength = 10
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# rf
rf_model <- train(
Yield ~ ., data = trainData,
method = "rf",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneLength = 5,
ntree = 500
)
# gbm
gbm_model <- train(
Yield ~ ., data = trainData,
method = "gbm",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneLength = 5,
verbose = FALSE
)
# ctree
ctree_model <- train(
Yield ~ ., data = trainData,
method = "ctree",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneLength = 10
)
# Stop parallel processing
stopCluster(cl)
registerDoSEQ()
# Compare
models <- list(rpart = rpart_model, rf = rf_model, gbm = gbm_model, ctree = ctree_model)
resampling_results <- resamples(models)
summary(resampling_results, metric = "RMSE")
##
## Call:
## summary.resamples(object = resampling_results, metric = "RMSE")
##
## Models: rpart, rf, gbm, ctree
## Number of resamples: 10
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rpart 1.1470603 1.3297824 1.487376 1.526828 1.674802 2.095569 0
## rf 0.7866595 0.9135677 1.178639 1.130550 1.245551 1.580664 0
## gbm 0.7996734 0.9981984 1.126424 1.198320 1.460664 1.699674 0
## ctree 1.2525348 1.2857131 1.457298 1.516877 1.698954 2.066014 0
summary(resampling_results, metric = "Rsquared")
##
## Call:
## summary.resamples(object = resampling_results, metric = "Rsquared")
##
## Models: rpart, rf, gbm, ctree
## Number of resamples: 10
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rpart 0.06757515 0.2750895 0.3966782 0.3808113 0.5184416 0.5889651 0
## rf 0.47480977 0.5693867 0.6430451 0.6659897 0.7395179 0.8897084 0
## gbm 0.30359424 0.5135800 0.5822410 0.5785446 0.6805577 0.7768549 0
## ctree 0.03745068 0.2948024 0.3989964 0.4016670 0.5671825 0.6336733 0
# Evaluate
test_results <- lapply(models, function(model) {
pred <- predict(model, testData)
postResample(pred, testData$Yield)
})
test_results
## $rpart
## RMSE Rsquared MAE
## 1.2566666 0.5224153 0.9578819
##
## $rf
## RMSE Rsquared MAE
## 0.9403990 0.7765332 0.7000666
##
## $gbm
## RMSE Rsquared MAE
## 0.9738536 0.7014637 0.7996932
##
## $ctree
## RMSE Rsquared MAE
## 1.1323046 0.6032607 0.9188145
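The best test-set model can also be identified programmatically; a small sketch using test_results from above.
# Model with the lowest test-set RMSE
test_rmse <- sapply(test_results, function(x) x["RMSE"])
test_rmse[which.min(test_rmse)]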
# best model
best_model_name <- names(which.min(sapply(models, function(x) min(x$results$RMSE))))
best_model <- models[[best_model_name]]
print(best_model)
## Random Forest
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 129, 130, 130, 129, 130, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.238991 0.6540291 1.0196196
## 15 1.130550 0.6659897 0.9046235
## 29 1.135314 0.6541104 0.8946245
## 43 1.148028 0.6403487 0.9065242
## 57 1.156996 0.6373080 0.9122298
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 15.
The top 10 most important predictors in the GBM model are a mix of process and biological variables, with process variables dominating (7 process vs. 3 biological). The most influential predictor by a wide margin is ManufacturingProcess32, followed by BiologicalMaterial12.
How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
The GBM top 10 overlaps heavily with the top 10 predictors from the optimal linear (Exercise 6.3) and nonlinear (Exercise 7.5) models: process variables dominate in each case, with ManufacturingProcess32 the most influential, although the ordering of the remaining predictors differs between model families. A sketch of how this comparison can be scripted is shown below.
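The comparison can be scripted if the earlier fits are still available in the session; pls_model and svm_model below are hypothetical placeholders for the optimal linear (Exercise 6.3) and nonlinear (Exercise 7.5) models and are not defined here.
# Helper to extract the top 10 predictors from any caret fit via varImp
top10 <- function(fit) {
  imp <- varImp(fit)$importance
  rownames(imp)[order(-imp$Overall)][1:10]
}
# intersect(top10(gbm_model), top10(pls_model))  # pls_model: hypothetical, from Exercise 6.3
# intersect(top10(gbm_model), top10(svm_model))  # svm_model: hypothetical, from Exercise 7.5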
var_imp <- varImp(gbm_model, scale = TRUE)
print(var_imp)
## gbm variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.000
## BiologicalMaterial12 36.192
## ManufacturingProcess13 24.625
## ManufacturingProcess17 21.934
## ManufacturingProcess31 20.985
## ManufacturingProcess15 19.436
## BiologicalMaterial03 19.342
## ManufacturingProcess06 18.431
## BiologicalMaterial11 13.812
## ManufacturingProcess28 13.546
## BiologicalMaterial05 13.466
## ManufacturingProcess27 11.100
## ManufacturingProcess09 10.793
## ManufacturingProcess24 10.431
## ManufacturingProcess04 10.290
## BiologicalMaterial09 9.335
## BiologicalMaterial08 9.309
## ManufacturingProcess39 9.304
## ManufacturingProcess18 9.081
## ManufacturingProcess01 8.122
plot(var_imp, top = 10, main = "Top 10 Important Predictors in GBM Model")
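The process/biological split in the top 10 can be verified directly from the varImp object above.
# Count process vs. biological predictors among the GBM top 10
gbm_top10 <- rownames(var_imp$importance)[order(-var_imp$importance$Overall)][1:10]
table(ifelse(grepl("^Manufacturing", gbm_top10), "Process", "Biological"))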
Yes. The optimal single tree splits first on ManufacturingProcess32: samples with high ManufacturingProcess32 and high ManufacturingProcess06 fall in the highest-yield terminal node (mean yield about 42.4), while samples with low ManufacturingProcess32 and low BiologicalMaterial12 fall in the lowest-yield node (mean about 38.3). The boxplots and histograms of yield in the terminal nodes confirm this segmentation, tying higher yields to specific process-variable thresholds.
optimal_cp <- rpart_model$bestTune$cp
rpart_final <- rpart_model$finalModel
print(optimal_cp)
## [1] 0.05526131
print(rpart_final)
## n= 144
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 144 495.69370 40.18285
## 2) ManufacturingProcess32< 0.1799 83 154.97180 39.22181
## 4) BiologicalMaterial12< -0.6537166 29 41.50194 38.30138 *
## 5) BiologicalMaterial12>=-0.6537166 54 75.70728 39.71611 *
## 3) ManufacturingProcess32>=0.1799 61 159.75730 41.49049
## 6) ManufacturingProcess06< 0.3132991 34 71.91640 40.74000 *
## 7) ManufacturingProcess06>=0.3132991 27 44.57587 42.43556 *
rpart.plot(rpart_final,
main = "Optimal Single Decision Tree for ChemicalManufacturingProcess",
extra = 101,
fallen.leaves = TRUE,
varlen = 0,
faclen = 0,
roundint = FALSE)
# Assign each training sample to its terminal node. The tree's stored
# assignments ($where) are used because the splits were learned on the
# centered/scaled predictors, so predicting on the raw trainData columns
# would misassign nodes.
trainData$node <- rpart_final$where
yield_dist_data <- data.frame(
Yield = trainData$Yield,
Node = as.factor(trainData$node)
)
ggplot(yield_dist_data, aes(x = Node, y = Yield)) +
geom_boxplot(fill = "lightblue", color = "black") +
labs(title = "Distribution of Yield in Terminal Nodes",
x = "Terminal Node",
y = "Yield") +
theme_minimal()
ggplot(yield_dist_data, aes(x = Yield, fill = Node)) +
geom_histogram(binwidth = 1, alpha = 0.7, position = "identity") +
facet_wrap(~Node, scales = "free_y") +
labs(title = "Histogram of Yield in Terminal Nodes",
x = "Yield",
y = "Count") +
theme_minimal()
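A small numerical summary of the same segmentation, using yield_dist_data from above.
# Mean yield and sample count per terminal node
aggregate(Yield ~ Node, data = yield_dist_data, FUN = mean)
table(yield_dist_data$Node)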