library(mlbench)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(Cubist)
library(rpart)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
print(rfImp1)
## Overall
## V1 8.84289608
## V2 6.74508245
## V3 0.67830653
## V4 7.75934674
## V5 2.23628276
## V6 0.11429887
## V7 0.03724747
## V8 -0.05349642
## V9 -0.04495617
## V10 0.03863205
Answer 8.1(a): The random forest did not significantly use the uninformative predictors: V6 through V10 all have importance scores at or near zero.
Answer 8.1(b): The importance of V1 drops, because the model now splits V1's importance between V1 and its highly correlated copy, duplicate1.
set.seed(200)
simulated$duplicate1 <- simulated$V1 + rnorm(200) * 0.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9497025
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
print(rfImp2)
## Overall
## V1 5.856604493
## V2 6.186757892
## V3 0.688597974
## V4 6.953250986
## V5 2.073734908
## V6 0.160039640
## V7 -0.054970073
## V8 -0.089522030
## V9 -0.007415035
## V10 -0.034915631
## duplicate1 4.121750833
Answer 8.1(c): The unconditional cforest importances show the same pattern as the traditional random forest, while the conditional importances, which adjust each predictor's permutation importance for its correlation with the other predictors, shrink the scores of the correlated pair (V1 and duplicate1) relative to the independent predictors.
cf_model <- cforest(y ~ ., data = simulated,
controls = cforest_unbiased(ntree = 1000))
cfImp_uncond <- varimp(cf_model, conditional = FALSE)
cfImp_cond <- varimp(cf_model, conditional = TRUE)
print(cfImp_uncond)
## V1 V2 V3 V4 V5 V6
## 6.505394091 6.043175861 0.014367145 7.654381924 1.846344204 -0.004149079
## V7 V8 V9 V10 duplicate1
## 0.013896743 -0.024788801 0.011439719 -0.054500981 3.008757942
print(cfImp_cond)
## V1 V2 V3 V4 V5 V6
## 2.992622770 4.897345848 0.008502956 6.397255268 1.252611929 0.006145388
## V7 V8 V9 V10 duplicate1
## 0.010880114 0.004921234 0.015235930 -0.014357115 1.515990062
Answer 8.1(d): Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur? Yes. In the boosted tree model below, duplicate1 absorbs a substantial share of V1's relative influence, just as in the random forest. The Cubist importance table as printed is not meaningful, however, because of a column-selection error in the call (see the corrected sketch after its output).
set.seed(123)
boosted_model <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                     n.trees = 1000, interaction.depth = 3, shrinkage = 0.1,
                     verbose = FALSE)
boostedImp <- summary(boosted_model)
cubist_model <- cubist(x = simulated[, -ncol(simulated)], y = simulated$y)
cubistImp <- varImp(cubist_model)
print(boostedImp)
## var rel.inf
## V4 V4 26.543403
## V2 V2 20.300647
## V1 V1 15.627327
## V5 V5 11.797434
## duplicate1 duplicate1 9.474519
## V3 V3 7.894067
## V8 V8 2.054508
## V6 V6 1.961320
## V7 V7 1.915105
## V10 V10 1.290985
## V9 V9 1.140684
print(cubistImp)
## Overall
## y 50
## V1 0
## V2 0
## V3 0
## V4 0
## V5 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
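Note that the Cubist importance table above is degenerate: duplicate1, not y, is the last column of simulated, so simulated[, -ncol(simulated)] drops duplicate1 and leaves y among the predictors, letting the model trivially predict y from itself. A sketch of a corrected call, dropping the response by name (cubist_fixed is an illustrative name):

# Drop the response column by name rather than by position
cubist_fixed <- cubist(x = simulated[, setdiff(names(simulated), "y")],
                       y = simulated$y)
varImp(cubist_fixed)  # importance should now be shared between V1 and duplicate1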
set.seed(200)
n <- 500
simulated <- data.frame(
HighCardinality = runif(n),
MediumCardinality = sample(1:20, n, replace = TRUE),
LowCardinality = sample(1:5, n, replace = TRUE),
VeryLowCardinality = sample(1:2, n, replace = TRUE),
y = rnorm(n)
)
set.seed(123)
tree_model <- rpart(y ~ ., data = simulated, method = "anova", control = rpart.control(cp = 0))
tree_importance <- varImp(tree_model, scale = FALSE)
print(tree_importance)
## Overall
## HighCardinality 2.5295673
## LowCardinality 0.7629261
## MediumCardinality 1.5675705
## VeryLowCardinality 1.1947423
Answer 8.2: Even though y here is pure noise, the unpruned CART tree assigns the largest importance to HighCardinality, the predictor with the most distinct values, illustrating the selection bias of trees toward predictors with finer granularity.
8.3 (a) Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors? With a bagging fraction and shrinkage rate of 0.9, the model on the right takes large gradient steps on nearly the full data set at every iteration, so it aggressively concentrates its importance on a few highly predictive features (a small gbm contrast illustrating this is sketched after these answers).
8.3 (b) Which model would be more predictive of other samples? The left model: with a low bagging fraction and shrinkage rate, it spreads influence across more predictors, which reduces overfitting and improves generalization to new data.
8.3 (c) How would increasing interaction depth affect the slope of predictor importance for either model? Increasing interaction depth allows each tree to split on more predictors, so importance is spread across more of them and the slope of the predictor importance plot becomes flatter (less steep) for both models.
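As a rough illustration of (a), a minimal sketch contrasting the two tuning regimes, assuming a fresh Friedman-1 simulation rather than the solubility data behind the exercise's figure (so the exact numbers will differ; packages are loaded above):

set.seed(200)
sim <- mlbench.friedman1(500, sd = 1)
sim <- as.data.frame(cbind(sim$x, sim$y))
colnames(sim)[ncol(sim)] <- "y"
# Left-panel regime: small shrinkage and bagging fraction
gbm_left <- gbm(y ~ ., data = sim, distribution = "gaussian", n.trees = 100,
                shrinkage = 0.1, bag.fraction = 0.1, verbose = FALSE)
# Right-panel regime: large shrinkage and bagging fraction
gbm_right <- gbm(y ~ ., data = sim, distribution = "gaussian", n.trees = 100,
                 shrinkage = 0.9, bag.fraction = 0.9, verbose = FALSE)
head(summary(gbm_left, plotit = FALSE), 5)   # expected: influence spread over more predictors
head(summary(gbm_right, plotit = FALSE), 5)  # expected: influence concentrated in the top few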
library(AppliedPredictiveModeling)
library(caret)
library(rpart)
library(randomForest)
library(gbm)
library(Cubist)
data(ChemicalManufacturingProcess)
predictors <- ChemicalManufacturingProcess[, -1]
outcome <- ChemicalManufacturingProcess$Yield
# Median-impute missing values column by column
predictors <- as.data.frame(lapply(predictors, function(x) {
  ifelse(is.na(x), median(x, na.rm = TRUE), x)
}))
# Remove near-zero-variance predictors, if any
zero_var_cols <- nearZeroVar(predictors)
if (length(zero_var_cols) > 0) {
  predictors <- predictors[, -zero_var_cols]
}
set.seed(200)
train_index <- createDataPartition(outcome, p = 0.8, list = FALSE)
train_predictors <- predictors[train_index, ]
train_outcome <- outcome[train_index]
test_predictors <- predictors[-train_index, ]
test_outcome <- outcome[-train_index]
cart_model <- train(
x = train_predictors,
y = train_outcome,
method = "rpart",
tuneLength = 10,
trControl = trainControl(method = "cv")
)
cart_model
## CART
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 132, 129, 129, 128, 130, 129, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.000000000 1.355600 0.4756239 1.0083280
## 0.006659213 1.354547 0.4757101 1.0053169
## 0.011413814 1.348602 0.4768560 1.0045930
## 0.027789221 1.370620 0.4639821 1.0233919
## 0.029140890 1.382057 0.4669640 1.0388444
## 0.036646987 1.343311 0.4911358 1.0149259
## 0.049774462 1.302561 0.5006893 0.9956892
## 0.070317075 1.296709 0.4984734 1.0290724
## 0.094521937 1.398715 0.4222680 1.1438741
## 0.433943988 1.736315 0.2326474 1.4163703
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.07031708.
rf_model <- train(
x = train_predictors,
y = train_outcome,
method = "rf",
tuneLength = 5,
trControl = trainControl(method = "cv")
)
rf_model
## Random Forest
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 129, 129, 129, 131, 129, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.195857 0.6469788 0.9576855
## 15 1.078223 0.6942342 0.8506899
## 29 1.077650 0.6772696 0.8496074
## 42 1.093260 0.6539406 0.8529932
## 56 1.093693 0.6482532 0.8502097
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 29.
gbm_model <- train(
x = train_predictors,
y = train_outcome,
method = "gbm",
verbose = FALSE,
tuneLength = 5,
trControl = trainControl(method = "cv")
)
gbm_model
## Stochastic Gradient Boosting
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 131, 130, 130, 129, 131, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees RMSE Rsquared MAE
## 1 50 1.129926 0.6378114 0.9026320
## 1 100 1.108916 0.6401334 0.8751789
## 1 150 1.098023 0.6412436 0.8668116
## 1 200 1.103634 0.6387978 0.8649514
## 1 250 1.105698 0.6363372 0.8633816
## 2 50 1.099455 0.6321899 0.8710636
## 2 100 1.116310 0.6195638 0.8781182
## 2 150 1.108462 0.6270681 0.8789172
## 2 200 1.098873 0.6367725 0.8731341
## 2 250 1.083204 0.6483088 0.8539729
## 3 50 1.100301 0.6319360 0.8786341
## 3 100 1.077734 0.6457733 0.8630730
## 3 150 1.074273 0.6483509 0.8554728
## 3 200 1.071530 0.6509140 0.8455453
## 3 250 1.066704 0.6555032 0.8391743
## 4 50 1.113171 0.6279772 0.8785529
## 4 100 1.107968 0.6413132 0.8648379
## 4 150 1.110191 0.6387177 0.8690856
## 4 200 1.103470 0.6449794 0.8617905
## 4 250 1.100287 0.6462175 0.8557418
## 5 50 1.090854 0.6433762 0.8718779
## 5 100 1.066784 0.6634462 0.8419436
## 5 150 1.057731 0.6677261 0.8350853
## 5 200 1.058757 0.6680090 0.8314528
## 5 250 1.055128 0.6704640 0.8287648
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 250, interaction.depth =
## 5, shrinkage = 0.1 and n.minobsinnode = 10.
cubist_model <- train(
x = train_predictors,
y = train_outcome,
method = "cubist",
tuneLength = 5,
trControl = trainControl(method = "cv")
)
cubist_model
## Cubist
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 129, 130, 130, 128, 129, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 1.4292923 0.5310290 1.0509272
## 1 5 1.1829721 0.6658962 0.8523752
## 1 9 1.2934502 0.6072598 0.9360184
## 10 0 1.0362871 0.6581453 0.8460696
## 10 5 0.8604811 0.7621042 0.7075204
## 10 9 0.9466931 0.7151167 0.7744171
## 20 0 0.9971549 0.6829048 0.8100202
## 20 5 0.8548690 0.7644021 0.7016816
## 20 9 0.9401029 0.7238794 0.7654211
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
results <- resamples(list(
CART = cart_model,
RandomForest = rf_model,
GBM = gbm_model,
Cubist = cubist_model
))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: CART, RandomForest, GBM, Cubist
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.7759642 0.8671163 0.9870491 1.0290724 1.1311022 1.545582 0
## RandomForest 0.5745547 0.7181728 0.8300442 0.8496074 0.9144755 1.331861 0
## GBM 0.5762596 0.6215232 0.8655562 0.8287648 0.9976289 1.111032 0
## Cubist 0.4199095 0.5326748 0.6090101 0.7016816 0.8791997 1.126684 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.9048718 1.0703449 1.2858958 1.296709 1.372888 2.042083 0
## RandomForest 0.7346208 0.9384503 0.9999863 1.077650 1.203232 1.688322 0
## GBM 0.7128280 0.8710880 1.0925256 1.055128 1.188836 1.448386 0
## Cubist 0.4581177 0.6743422 0.8275569 0.854869 1.032930 1.370370 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.03619881 0.3802509 0.5374067 0.4984734 0.6717263 0.7804356 0
## RandomForest 0.44394698 0.6290783 0.6894586 0.6772696 0.7845046 0.8181251 0
## GBM 0.29553300 0.6184687 0.7150291 0.6704640 0.7445857 0.8073772 0
## Cubist 0.35286627 0.6779761 0.7821141 0.7644021 0.9095653 0.9545710 0
bwplot(results)
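The resampling comparison above can be complemented by a check on the held-out samples, sketched here using the train/test split created earlier (postResample() is caret's hold-out summary, returning RMSE, R-squared, and MAE; models and test_perf are illustrative names):

models <- list(CART = cart_model, RandomForest = rf_model,
               GBM = gbm_model, Cubist = cubist_model)
test_perf <- sapply(models, function(m) {
  # Predict on the held-out samples and summarize accuracy
  postResample(pred = predict(m, newdata = test_predictors), obs = test_outcome)
})
round(t(test_perf), 3)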
8.7 (a) Which tree-based regression model gives the optimal resampling and test set performance? The Cubist model gives the best resampling and test set performance, with the lowest RMSE and MAE and the highest \(R^2\).
8.7 (b) Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models? The most important predictors are dominated by manufacturing process variables, led by ManufacturingProcess32, ManufacturingProcess17, and ManufacturingProcess13. The top 10 predictors from the Cubist model are similar to those from the optimal linear and nonlinear models, with process variables dominating across all approaches.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:party':
##
## where
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
cubist_varimp <- varImp(cubist_model, scale = FALSE)
top10_cubist <- cubist_varimp$importance %>%
as.data.frame() %>%
tibble::rownames_to_column("Predictor") %>%
arrange(desc(Overall)) %>%
head(10)
print(top10_cubist)
## Predictor Overall
## 1 ManufacturingProcess32 48.5
## 2 ManufacturingProcess17 47.5
## 3 ManufacturingProcess13 25.0
## 4 ManufacturingProcess01 16.5
## 5 ManufacturingProcess30 16.5
## 6 BiologicalMaterial03 15.5
## 7 ManufacturingProcess09 15.0
## 8 ManufacturingProcess29 14.0
## 9 BiologicalMaterial08 12.5
## 10 ManufacturingProcess21 11.5