library(AppliedPredictiveModeling)
library(caret)
library(corrplot)
library(Cubist)
library(dplyr)
library(earth)
library(gbm)
library(ggplot2)
library(gridExtra)
library(ipred)
library(kernlab)
library(knitr)
library(MASS)
library(mice)
library(mlbench)
library(ModelMetrics)
library(party)
library(psych)
library(randomForest)
library(rpart)
library(tidyr)
library(utils)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
simulated.orig = simulated
# Fit a random forest model to all of the predictors, then estimate the
# variable importance scores:
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
model1
##
## Call:
## randomForest(formula = y ~ ., data = simulated, importance = TRUE, ntree = 1000)
## Type of random forest: regression
## Number of trees: 1000
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 6.754258
## % Var explained: 72.3
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1
## Overall
## V1 8.732235404
## V2 6.415369387
## V3 0.763591825
## V4 7.615118809
## V5 2.023524577
## V6 0.165111172
## V7 -0.005961659
## V8 -0.166362581
## V9 -0.095292651
## V10 -0.074944788
importance(model1)
## %IncMSE IncNodePurity
## V1 57.1035713 1063.5046
## V2 47.0088894 941.2445
## V3 10.8342807 299.2105
## V4 51.6960809 1077.4011
## V5 22.0568333 478.4097
## V6 2.9419111 182.3768
## V7 -0.1001392 204.3141
## V8 -3.2180039 147.7224
## V9 -1.8067946 154.3783
## V10 -1.3228315 184.4402
Did the random forest model significantly use the uninformative predictors (V6 - V10)?
We see that \(V6 - V10\) are the least important predictors, as shown by the first column (\(\%IncMSE\)) of the \(importance()\) output. \(\%IncMSE\) measures the increase in out-of-bag MSE when a predictor is randomly permuted; for \(V6 - V10\) this increase is negligible (and in some cases negative), so the random forest made essentially no use of the uninformative predictors.
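As a quick illustration of what the permutation-based \(\%IncMSE\) measure is doing, the sketch below shuffles one predictor and compares the model's MSE before and after. This is an in-sample approximation for illustration only; randomForest computes the measure per tree on out-of-bag samples, so the numbers will not match the table above.
# Permutation-importance sketch: shuffle V1, re-predict, and compare MSE.
set.seed(1)
baseline_mse <- mean((simulated$y - predict(model1, simulated))^2)
permuted <- simulated
permuted$V1 <- sample(permuted$V1)
permuted_mse <- mean((simulated$y - predict(model1, permuted))^2)
permuted_mse - baseline_mse  # a large increase indicates V1 is informative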
Now add an additional predictor that is highly correlated with one of the informative predictors. Fit another random forest model to these data.
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206
model2 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
model2
##
## Call:
## randomForest(formula = y ~ ., data = simulated, importance = TRUE, ntree = 1000)
## Type of random forest: regression
## Number of trees: 1000
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 6.922537
## % Var explained: 71.61
importance(model2)
## %IncMSE IncNodePurity
## V1 32.9767755 798.1209
## V2 47.1898096 833.3881
## V3 10.4550932 253.4221
## V4 51.0810910 937.5096
## V5 22.5698231 416.2464
## V6 2.8046502 164.9521
## V7 -0.2664751 170.5892
## V8 -1.0005274 126.4463
## V9 0.1925953 136.1965
## V10 0.5977125 161.6598
## duplicate1 28.4077023 701.3079
Add another predictor correlated with V1:
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate2, simulated$V1)
## [1] 0.9408631
model3 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
model3
##
## Call:
## randomForest(formula = y ~ ., data = simulated, importance = TRUE, ntree = 1000)
## Type of random forest: regression
## Number of trees: 1000
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 6.784205
## % Var explained: 72.18
importance(model3)
## %IncMSE IncNodePurity
## V1 28.4726297 689.7017
## V2 50.7990567 859.5530
## V3 10.8410623 219.7327
## V4 54.8742896 952.4116
## V5 24.0380225 397.1578
## V6 3.3237982 147.4934
## V7 2.3928835 137.1524
## V8 -2.1256429 107.8981
## V9 -0.2832118 109.2677
## V10 2.1062647 126.0633
## duplicate1 23.2238354 604.6521
## duplicate2 17.0728309 370.5212
\(Q.\) Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
The importance score for V1 is lowered when another predictor highly correlated with V1 is added to the input dataset, and it decreases further when a second such predictor is added.
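To make this concrete, we can pull the rows for V1 and its two correlated copies out of the \(importance()\) output for model3; the importance that was concentrated in V1 is now spread across the three correlated predictors.
# Importance is shared among V1 and its highly correlated copies.
imp3 <- importance(model3)
imp3[c("V1", "duplicate1", "duplicate2"), ]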
Use the \(cforest()\) function in the \(party\) package to fit a random forest model using conditional inference trees. The \(party\) package function \(varimp()\) can calculate predictor importance. The \(conditional\) argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?
simulated = simulated.orig
# To compare cforest() with the randomForest() fit above, set mtry to the default
# that randomForest() uses for regression: the number of predictors (the number
# of columns minus 1 for the outcome) divided by 3.
num_predictors = floor((ncol(simulated) - 1) / 3)
rf_cforest_ctrl <- cforest_control(mtry = num_predictors)
rf_cforest <- party::cforest(y ~ ., data = simulated, controls = rf_cforest_ctrl)
rf_cforest
##
## Random Forest using Conditional Inference Trees
##
## Number of trees: 500
##
## Response: y
## Inputs: V1, V2, V3, V4, V5, V6, V7, V8, V9, V10
## Number of observations: 200
varimp(rf_cforest, conditional=FALSE)
## V1 V2 V3 V4 V5 V6
## 7.21884777 4.82031067 0.10689702 6.32456337 2.13566587 0.03967575
## V7 V8 V9 V10
## 0.06735770 -0.08614242 -0.04728777 -0.08579815
varimp(rf_cforest, conditional=TRUE)
## V1 V2 V3 V4 V5
## 3.076320455 3.299142480 0.099842679 4.152675630 0.867081262
## V6 V7 V8 V9 V10
## 0.017381991 0.051367105 -0.048744193 -0.006197914 -0.017429174
Although the numerical importance scores differ, both the conditional and unconditional \(cforest\) importances show the same overall pattern as the traditional random forest model: the informative predictors \(V1\), \(V2\), \(V4\), and \(V5\) dominate, while the uninformative \(V6 - V10\) have importance near zero.
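To check this, the short sketch below lines up the predictor rankings from the traditional random forest (model1) and the conditional inference forest. The exact scores are stochastic, but both rankings place the informative predictors at the top and \(V6 - V10\) near the bottom.
# Compare predictor rankings from randomForest and cforest.
rf_rank      <- names(sort(importance(model1)[, "%IncMSE"], decreasing = TRUE))
cforest_rank <- names(sort(varimp(rf_cforest, conditional = FALSE), decreasing = TRUE))
data.frame(rank = 1:10, randomForest = rf_rank, cforest = cforest_rank)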
Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?
set.seed(123)
gbmModel <- gbm(y ~ ., data = simulated, distribution = "gaussian", cv.folds=5)
print(gbmModel)
## gbm(formula = y ~ ., distribution = "gaussian", data = simulated,
## cv.folds = 5)
## A gradient boosted model with gaussian loss function.
## 100 iterations were performed.
## The best cross-validation iteration was 100.
## There were 10 predictors of which 5 had non-zero influence.
summary(gbmModel, method = relative.influence, cBars=10, las=2)
## var rel.inf
## V4 V4 29.036562
## V1 V1 28.938702
## V2 V2 23.090319
## V5 V5 10.505565
## V3 V3 8.428852
## V6 V6 0.000000
## V7 V7 0.000000
## V8 V8 0.000000
## V9 V9 0.000000
## V10 V10 0.000000
simulated_x = simulated %>% dplyr::select(-y)
simulated_y = simulated$y
cubistModel = cubist(x = simulated_x, y = simulated_y)
varImp(cubistModel)
## Overall
## V1 50
## V2 50
## V4 50
## V5 50
## V3 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
We see the same pattern of predictor importance with both the boosted tree and Cubist models: the uninformative predictors \(V6 - V10\) receive zero importance in both.
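The zero-importance predictors can also be extracted from each model programmatically; per the output above, \(V6 - V10\) drop out of both models, and V3 additionally drops out of the Cubist model.
# Predictors receiving zero importance in each model.
gbm_imp    <- summary(gbmModel, plotit = FALSE)
as.character(gbm_imp$var[gbm_imp$rel.inf == 0])
cubist_imp <- varImp(cubistModel)
rownames(cubist_imp)[cubist_imp$Overall == 0]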
Use a simulation to show tree bias with different granularities.
To show tree bias related to predictor granularity and variance, we create a simulated data set (the data frame example_data below). It contains a predictor P1 (the integers 1 through 100) and a response that depends on P1: the response is P1 scaled by random noise, so its spread grows with P1. We then add a completely random, high-variance predictor (rand1) that has no relationship to the response. Because of selection bias, we expect the regression tree to assign the noisy, high-variance rand1 a higher relative importance than P1, even though only P1 carries information about the response. This illustrates tree bias in favor of high-variance predictors.
set.seed(123)
example_data = data.frame(rand1 = rnorm(100, sd = 10, mean = 0), P1 = 1:100)
# Add a response that depends on P1: its spread grows with P1
example_data$response = example_data$P1 * rnorm(100)
tree_model <- rpart(response ~ ., data = example_data)
varImp(tree_model)
## Overall
## P1 0.1737314
## rand1 0.3386625
The simulation confirms the bias: the regression tree assigns the purely random, high-variance predictor rand1 a higher importance than P1, even though only P1 is related to the response.
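As a further check that this is not an artifact of a single random draw, we can repeat the simulation and record which predictor the tree chooses for its first (root) split. This is a minimal sketch using the same rpart defaults as above.
# Repeat the simulation 50 times and tabulate the root split variable.
set.seed(123)
root_var <- replicate(50, {
  d <- data.frame(rand1 = rnorm(100, sd = 10), P1 = 1:100)
  d$response <- d$P1 * rnorm(100)
  as.character(rpart(response ~ ., data = d)$frame$var[1])
})
table(root_var)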
Why does the model on the right focus its importance on just the first few of predictors, whereas the model on the left spreads importance across more predictors?
In his well-known 1999 paper on stochastic gradient boosting, Friedman recommended small values for the sampling (bagging) fraction and the learning rate. The model on the left follows these recommendations; the one on the right does not. With a high learning rate, each tree contributes a large fraction of the final fit, so the predictors chosen by the earliest trees absorb most of the importance; with a high bagging fraction, every tree sees nearly the same data and therefore tends to select the same dominant predictors over and over. Together these settings make the right-hand model greedier, concentrating importance on just the first few predictors, whereas the smaller values used on the left give other predictors more opportunities to enter the model and spread the importance across more of them.
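The data behind Fig. 8.24 are not available here, but the qualitative effect can be sketched on the Friedman simulation data used earlier. The parameter values below (0.1 versus 0.9 for both the learning rate and the bagging fraction) are assumed to mirror the settings contrasted in the figure, and n.minobsinnode is lowered only so that trees built on the small 10% subsamples can still be split.
set.seed(123)
gbm_left  <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                 n.trees = 100, shrinkage = 0.1, bag.fraction = 0.1,
                 n.minobsinnode = 5)
gbm_right <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                 n.trees = 100, shrinkage = 0.9, bag.fraction = 0.9,
                 n.minobsinnode = 5)
# Relative influence should be more concentrated under the "greedy" settings.
summary(gbm_left,  plotit = FALSE)
summary(gbm_right, plotit = FALSE)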
Which model do you think would be more predictive of other samples?
The left model uses a lower bagging fraction and learning rate, so each tree sees a different subsample of the data and the ensemble is built up more gradually; it is therefore less prone to overfitting the training data, and one would expect it to be more predictive of other samples.
How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?
In the context of boosted trees, tree depth is also known as interaction depth. Increasing the size of the trees is generally expected to degrade performance through overfitting. One would expect that increasing the interaction depth would, for both the left and right models, steepen the slope of predictor importance: the top predictors would likely become even more important and the bottom ones even less so.
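This expectation can be explored on the simulation data by refitting the boosted model with different interaction depths and comparing the relative influence profiles; the depth values 1 and 5 below are chosen only for illustration.
set.seed(123)
gbm_shallow <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                   n.trees = 100, interaction.depth = 1)
gbm_deep    <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                   n.trees = 100, interaction.depth = 5)
# Compare how relative influence is distributed for shallow vs. deeper trees.
summary(gbm_shallow, plotit = FALSE)
summary(gbm_deep,    plotit = FALSE)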
Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:
Which tree-based regression model gives the optimal resampling and test set performance?
data(ChemicalManufacturingProcess)
chem.full = ChemicalManufacturingProcess
# Use MICE library for imputation of missing data.
imputed.1 <- mice(chem.full, m=5, maxit = 5, seed = 500, printFlag=F)
chem = mice::complete(imputed.1, 2)
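A quick sanity check that the imputation left no missing values:
sum(is.na(chem.full))  # missing cells before imputation
sum(is.na(chem))       # should be 0 after imputation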
set.seed(100)
index = sample(1:nrow(chem), size = floor(0.7 * nrow(chem)))
chem.train = chem[index,] %>% dplyr::select(-Yield)
chem.train.Y = chem[index,]$Yield
chem.test = chem[-index,] %>% dplyr::select(-Yield)
chem.test.Y = chem[-index,]$Yield
chem.train.df = chem.train
chem.train.df$y = chem.train.Y
chem.test.df = chem.test
chem.test.df$y = chem.test.Y
set.seed(100)
rf.fit = randomForest(y ~ ., data = chem.train.df)
rf.pred = predict(rf.fit, chem.test)
rmse.rf = rmse(chem.test.Y, rf.pred)
cat("For Random Forest, Resampled RMSE on training data =", rmse.rf)
## For Random Forest, Resampled RMSE on training data = 1.577754
set.seed(100)
ctrl <- trainControl(method = "cv", number = 10)
# CV bagged model
baggedTree.fit <- train(y ~ .,
data = chem.train.df,
method = "treebag",
trControl = ctrl,
importance = TRUE)
baggedTree.pred <- predict(baggedTree.fit, chem.test)
rmse.baggedTree = rmse(chem.test.Y, baggedTree.pred)
cat("For Bagged Tree, Resampled RMSE on training data =", rmse.baggedTree)
## For Bagged Tree, Resampled RMSE on training data = 1.52044
set.seed(100)
gbm.fit = gbm(formula = y ~ .,
distribution = "gaussian",
data = chem.train.df,
n.trees = 6000,
verbose = FALSE)
gbm.pred = predict(gbm.fit, n.trees = 6000, chem.test)
rmse.gbm = rmse(chem.test.Y, gbm.pred)
cat("For GBM, Resampled RMSE on training data =", rmse.gbm)
## For GBM, Resampled RMSE on training data = 2.165997
Of the three tree-based models, the bagged tree produces the lowest (best) RMSE on the test data.
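For reference, the three test set RMSEs computed above can be gathered into a single table:
data.frame(model = c("Random Forest", "Bagged Tree", "GBM"),
           test_RMSE = c(rmse.rf, rmse.baggedTree, rmse.gbm))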
Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
imp1 = varImp(baggedTree.fit)
plot(imp1, top = 20)
We find that the top predictors selected by the bagged tree model differ from those selected by the optimal linear and nonlinear models.
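To answer whether the biological or process variables dominate, the top 10 predictors can be tabulated directly from the \(varImp\) object; this short sketch relies only on the fact that predictor names in this data set begin with either BiologicalMaterial or ManufacturingProcess.
# Split the top 10 predictors into biological vs. process variables.
imp1.df <- imp1$importance
top10   <- rownames(imp1.df)[order(imp1.df$Overall, decreasing = TRUE)][1:10]
top10
table(ifelse(grepl("^Biological", top10), "Biological", "Process"))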