Packages:

library(tidyverse)
library(mlbench)
library(randomForest)
library(caret)
library(AppliedPredictiveModeling)
library(knitr)
library(party)
library(gbm)
library(Cubist)
library(rpart)

Exercise 8.1:

Recreate the simulated data from Exercise 7.2:

set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)

Did the random forest model significantly use the uninformative predictors (V6 to V10)?

cols <- c("Predictor", "Importance")
rfImp1 <- rfImp1 |>
    rownames_to_column()
colnames(rfImp1) <- cols
rfImp1 <- rfImp1 |>
    arrange(desc(Importance))
knitr::kable(rfImp1, format = "simple")
Predictor     Importance
----------  ------------
V1             8.7322354
V4             7.6151188
V2             6.4153694
V5             2.0235246
V3             0.7635918
V6             0.1651112
V7            -0.0059617
V10           -0.0749448
V9            -0.0952927
V8            -0.1663626

No, the estimated variable importance scores are near zero for the uninformative predictors (V6 to V10).
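
The same conclusion can be read directly from randomForest's own permutation importance, which caret's varImp essentially wraps for random forest models; a minimal cross-check:

# Unscaled permutation importance (type = 1 is %IncMSE for regression),
# matching the unscaled scores reported above.
rf_perm <- randomForest::importance(model1, type = 1, scale = FALSE)
sort(rf_perm[, 1], decreasing = TRUE)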

simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206

Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?

model2 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2 <- rfImp2 |>
    rownames_to_column()
colnames(rfImp2) <- cols
rfImp2 <- rfImp2 |>
    arrange(desc(Importance))
knitr::kable(rfImp2, format = "simple")
Predictor      Importance
-----------  ------------
V4              7.0475224
V2              6.0689606
V1              5.6911997
duplicate1      4.2833158
V5              1.8723844
V3              0.6297022
V6              0.1356906
V10             0.0289481
V9              0.0084044
V7             -0.0134564
V8             -0.0437056

As a result of adding duplicate1, a predictor that was highly correlated with V1, the estimated importance score for V1 decreased from \(8.73\) to \(5.69\). V1 was originally the most important variable, but now V4 and V2 are both considered more important than V1. The estimated importance score of \(4.28\) that duplicate1 received is a little less than 50 percent of the original estimated importance score for V1.

We add a second predictor that is highly correlated with V1 to see what happens.

e <- seq(-0.5, 0.5, 0.01)
simulated$duplicate2 <- simulated$V1 * (2 - sample(e, 200, replace = TRUE))
cor(simulated$duplicate2, simulated$V1)
## [1] 0.964375
model3 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp3 <- varImp(model3, scale = FALSE)
rfImp3 <- rfImp3 |>
    rownames_to_column()
colnames(rfImp3) <- cols
rfImp3 <- rfImp3 |>
    arrange(desc(Importance))
knitr::kable(rfImp3, format = "simple")
Predictor      Importance
-----------  ------------
V4              7.1901547
V2              6.7210069
V1              3.9176362
duplicate2      3.5380604
duplicate1      3.1250315
V5              1.8034213
V3              0.4269919
V6              0.0889672
V7              0.0758502
V10             0.0416157
V9             -0.0317967
V8             -0.1025550

Adding duplicate2, another predictor highly correlated with V1, decreased the estimated importance score for V1 even further, to \(3.92\). With three highly correlated variables in the model, duplicate1 and duplicate2 each received roughly 35 to 40 percent of V1's original importance score. In other words, the importance that V1 earned on its own is now shared among the correlated copies, so each copy's individual score shrinks roughly in inverse proportion to the number of correlated predictors.
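
As a rough check on that argument, the combined importance of V1 and its two copies in the third model can be compared with V1's original score; a small sketch using the rfImp1 and rfImp3 tables built above:

# Compare V1's original importance with the combined importance of V1,
# duplicate1, and duplicate2 after both correlated copies are added.
v1_original <- rfImp1$Importance[rfImp1$Predictor == "V1"]
v1_group <- rfImp3 |>
    filter(Predictor %in% c("V1", "duplicate1", "duplicate2")) |>
    summarise(combined = sum(Importance))
c(original_V1 = v1_original, combined_group = v1_group$combined)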

First, we remove the collinear variables we created so that we’re only comparing the estimated importance scores of the original predictors V1 to V10.

duplicates <- c("duplicate1", "duplicate2")
simulated <- simulated |>
    select(-all_of(duplicates))

Then we estimate the unconditional importance scores.

model4 <- cforest(y ~ ., data = simulated,
                  controls = cforest_unbiased(ntree = 500))
cfImp4_trad <- varimp(model4, conditional = FALSE)
cfImp4_trad <- as.data.frame(cfImp4_trad) |>
    rownames_to_column()
cols <- c("Predictor", "Unconditional_Importance")
colnames(cfImp4_trad) <- cols
cfImp4_trad <- cfImp4_trad |>
    arrange(desc(Unconditional_Importance))
knitr::kable(cfImp4_trad, format = "simple")
Predictor    Unconditional_Importance
----------  -------------------------
V1                          8.8718345
V4                          8.3130761
V2                          6.6952094
V5                          1.8527520
V3                          0.0424266
V7                          0.0119454
V9                         -0.0034614
V6                         -0.0141549
V10                        -0.0365760
V8                         -0.0582395

Next we estimate the conditional importance scores.

cfImp4_mod <- varimp(model4, conditional = TRUE)
cfImp4_mod <- as.data.frame(cfImp4_mod) |>
    rownames_to_column()
cols <- c("Predictor", "Conditional_Importance")
colnames(cfImp4_mod) <- cols
cfImp4_mod <- cfImp4_mod |>
    arrange(desc(Conditional_Importance))
knitr::kable(cfImp4_mod, format = "simple")
Predictor    Conditional_Importance
----------  -----------------------
V4                        6.4224045
V1                        5.3349757
V2                        5.1917815
V5                        1.1309958
V6                        0.0131644
V7                        0.0111674
V9                        0.0057002
V3                        0.0023189
V10                      -0.0079181
V8                       -0.0240279

The unconditional cforest importance scores preserve the ordering of the informative predictors from the traditional random forest model, although V3's score is now much closer to zero than it was under the random forest.

The estimated conditional importance scores shift the order of importance such that V4 is now the most important variable, and V3 no longer makes the top five most important predictors. V6, V7, and V9, which we know to be irrelevant, all outrank V3 in estimated conditional importance.
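
To make this comparison easier to read, the two sets of cforest scores can be joined into a single table; a small sketch using the data frames built above:

# Side-by-side view of the unconditional and conditional cforest importances.
cfImp4_trad |>
    inner_join(cfImp4_mod, by = "Predictor") |>
    arrange(desc(Conditional_Importance)) |>
    knitr::kable(format = "simple")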

We create a boosted tree model using the gbm function from the gbm library, and we extract the estimated relative importance scores using the summary.gbm function from the same library.

model5 <- gbm(y ~ ., data = simulated,
                  distribution = "gaussian", n.trees = 500)
gbmImp5 <- summary.gbm(model5, n.trees = 500, order = TRUE, plotit = FALSE)
rownames(gbmImp5) <- NULL
cols <- c("Predictor", "Relative_Importance")
colnames(gbmImp5) <- cols
knitr::kable(gbmImp5, format = "simple")
Predictor    Relative_Importance
----------  --------------------
V4                     27.792368
V1                     23.356956
V2                     21.078367
V5                     10.787111
V3                      9.390918
V7                      2.293095
V6                      1.720987
V10                     1.412284
V9                      1.163810
V8                      1.004104

The top five most important predictors are V1 to V5 in the boosted tree model, but unlike the rankings in the traditional random forest model, V4 is ranked higher than V1.

We create a Cubist model using the cubist function from the Cubist library and extract the estimated importance scores using the varImp function from the caret library.

model6 <- cubist(simulated |> select(-y), simulated$y)
cubImp6 <- varImp(model6)
cubImp6 <- cubImp6 |>
    rownames_to_column()
cols <- c("Predictor", "Importance")
colnames(cubImp6) <- cols
cubImp6 <- cubImp6 |>
    arrange(desc(Importance))
knitr::kable(cubImp6, format = "simple")
Predictor    Importance
----------  -----------
V1                   50
V2                   50
V4                   50
V5                   50
V3                    0
V6                    0
V7                    0
V8                    0
V9                    0
V10                   0

V1, V2, V4, and V5 are estimated to be the only important predictors here, and they all have equal scores. V3 has been lumped in with the unimportant predictors again.
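
The all-or-nothing pattern appears to follow from how varImp summarizes a Cubist fit: as we understand it, the score averages how often each predictor is used in rule conditions and in the rule-level linear models, so if the fit consists of a single rule with no conditions, the predictors in its linear model land at 50 and everything else at 0, which matches the scores above. The usage statistics and the fitted rules can be inspected directly (usage is an element of the Cubist model object, as far as we can tell):

# Percentage of rules in which each predictor appears in a condition and in a
# linear model, as reported by Cubist itself.
model6$usage
# The rule conditions and per-rule linear models can also be printed in full.
summary(model6)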

Exercise 8.2:

Use a simulation to show tree bias with different granularities.

We create six predictor variables, a through f, whose standard deviations increase by a factor of 10 at each step, from 0.001 for a up to 100 for f. The response y is the sum of a through f plus a single offset drawn at random from a grid of values (note that the same constant is added to every observation, so y is otherwise a deterministic function of the predictors). We then fit a random forest and estimate the importance of each predictor.

a <- as.data.frame(rnorm(200, mean = 1000, sd = 0.001))
b <- as.data.frame(rnorm(200, mean = 1000, sd = 0.01))
c <- as.data.frame(rnorm(200, mean = 1000, sd = 0.1))
d <- as.data.frame(rnorm(200, mean = 1000, sd = 1))
e <- as.data.frame(rnorm(200, mean = 1000, sd = 10))
f <- as.data.frame(rnorm(200, mean = 1000, sd = 100))
error <- seq(-100, 100, 10)
z <- a |>
    bind_cols(b, c, d, e, f)
colnames(z) <- c("a", "b", "c", "d", "e", "f")
z <- z |>
    mutate(y = a + b + c + d + e + f + sample(error, size = 1))
model7 <- randomForest(y ~ ., data = z,
                       importance = TRUE,
                       ntree = 1000)
rfImp7 <- varImp(model7, scale = FALSE)
rfImp7 <- rfImp7 |>
    rownames_to_column()
colnames(rfImp7) <- cols
rfImp7 <- rfImp7 |>
    arrange(desc(Importance))
knitr::kable(rfImp7, format = "simple")
Predictor       Importance
----------  --------------
f            17531.7710190
a               26.8862698
b               -0.1192625
e              -20.8888066
d             -115.4973783
c             -124.2298652

The highest-variance predictor, f, is estimated to be far more important than any of the other predictors, consistent with the trees' splits being dominated by the variable that drives almost all of the variation in y. The remaining scores are all small or negative, so the ordering among a through e (for example, a and b edging out the higher-variance e) is essentially noise rather than a meaningful ranking.
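
Split-selection counts give another view of the same effect. randomForest's varUsed function reports how many times each predictor is chosen for a split across the forest; a short sketch on the model fit above:

# Count how many times each predictor was used in a split across the 1000
# trees; f, which drives most of the variation in y, should dominate.
split_counts <- randomForest::varUsed(model7, by.tree = FALSE, count = TRUE)
names(split_counts) <- rownames(model7$importance)
sort(split_counts, decreasing = TRUE)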

Exercise 8.3:

In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:

The model on the right uses both a high learning rate and a high bagging fraction. With a learning rate of 0.9, nearly all of each tree's contribution is added to the ensemble, so the fit is dominated by the first few trees and by the strongest predictors within them. With a bagging fraction of 0.9, each tree is grown on almost the same data, so there is little randomness to push different predictors into the splits. Together these settings concentrate relative importance on a small number of predictors, while the model on the left, with both parameters at 0.1, learns slowly from many different subsamples and therefore spreads importance across many more predictors.

We would expect the model on the left to be more predictive of other samples: the small learning rate and bagging fraction act as a form of regularization, whereas the aggressive settings on the right make that model more prone to overfitting the training data.

Increasing the interaction depth should flatten the slope of the predictor importance plot for either model: deeper trees use more predictors in their splits, so importance is spread across more of them and the drop from the most important predictor to the least important one becomes more gradual.
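
As a rough illustration, the two extreme fits could be reproduced on the solubility data along the following lines. This is a sketch only: it assumes the solTrainXtrans and solTrainY objects from the AppliedPredictiveModeling package, and the tree count and interaction depth are guesses rather than the settings used for Figure 8.24.

# Fit gbm with both tuning parameters at 0.1 and again at 0.9, then compare
# how concentrated the relative influence is in each model.
data(solubility)  # provides solTrainXtrans and solTrainY
solTrain <- cbind(solTrainXtrans, Solubility = solTrainY)
gbm_low <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
               n.trees = 100, interaction.depth = 3,
               shrinkage = 0.1, bag.fraction = 0.1)
gbm_high <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
                n.trees = 100, interaction.depth = 3,
                shrinkage = 0.9, bag.fraction = 0.9)
# Expect the low-parameter model to spread influence over more predictors and
# the high-parameter model to concentrate it on a few.
head(summary(gbm_low, plotit = FALSE), 10)
head(summary(gbm_high, plotit = FALSE), 10)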

Exercise 8.7:

Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:

data(ChemicalManufacturingProcess)
x <- colSums(is.na(ChemicalManufacturingProcess))
missing_val_cols <- names(x[x > 0])
ChemicalManufacturingProcess <- ChemicalManufacturingProcess |>
    VIM::kNN(variable = missing_val_cols, k = 15, numFun = weighted.mean,
             weightDist = TRUE, imp_var = FALSE)
nzv_predictors <- nearZeroVar(ChemicalManufacturingProcess |> select(-Yield),
                              names = TRUE, saveMetrics = FALSE)
ChemicalManufacturingProcess <- ChemicalManufacturingProcess |>
    select(-all_of(nzv_predictors))
rows <- sample(nrow(ChemicalManufacturingProcess))
ChemicalManufacturingProcess <- ChemicalManufacturingProcess[rows, ]
sample <- sample(c(TRUE, FALSE), nrow(ChemicalManufacturingProcess),
                 replace=TRUE, prob=c(0.7,0.3))
train_CMP <- ChemicalManufacturingProcess[sample, ]
train_CMP_x <- train_CMP |>
    select(-Yield)
train_CMP_y <- train_CMP$Yield
train_CMP_y <- as.numeric(train_CMP_y)
test_CMP <- ChemicalManufacturingProcess[!sample, ]
test_CMP_x <- test_CMP |>
    select(-Yield)
test_CMP_y <- test_CMP$Yield
test_CMP_y <- as.numeric(test_CMP_y)
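
As a quick sanity check (optional), we confirm that the kNN imputation left no missing values and that the split is roughly 70/30:

# No NAs should remain after imputation; the split should be roughly 70/30.
anyNA(ChemicalManufacturingProcess)
c(train = nrow(train_CMP), test = nrow(test_CMP))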

We train a single regression tree, a boosted tree, and a Cubist model.

rpartTune <- train(train_CMP_x, train_CMP_y,
                   method = "rpart2",
                   tuneLength = 10,
                   trControl = trainControl(method = "cv"))
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                       n.trees = seq(100, 1000, by = 50),
                       shrinkage = c(0.01, 0.1),
                       n.minobsinnode = 10)
gbmTune <- train(train_CMP_x, train_CMP_y,
                 method = "gbm",
                 tuneGrid = gbmGrid,
                 verbose = FALSE)
cubistTuned <- train(train_CMP_x, train_CMP_y,
                    method = "cubist")
test_pred1 <- predict(rpartTune, test_CMP_x)
test_pred2 <- predict(gbmTune, test_CMP_x)
test_pred3 <- predict(cubistTuned, test_CMP_x)
test_rsq1 <- as.numeric(R2(test_pred1, test_CMP_y, form = "traditional"))
test_rsq2 <- as.numeric(R2(test_pred2, test_CMP_y, form = "traditional"))
test_rsq3 <- as.numeric(R2(test_pred3, test_CMP_y, form = "traditional"))
test_rmse1 <- as.numeric(RMSE(test_pred1, test_CMP_y))
test_rmse2 <- as.numeric(RMSE(test_pred2, test_CMP_y))
test_rmse3 <- as.numeric(RMSE(test_pred3, test_CMP_y))
models <- c("Single Regression Tree", "Boosted Tree", "Cubist")
rsqs <- round(c(test_rsq1, test_rsq2, test_rsq3), 4)
rmses <- round(c(test_rmse1, test_rmse2, test_rmse3), 4)
tbl <- as.data.frame(cbind(models, rsqs, rmses))
cols <- c("Model", "Predictive_RSquared", "RMSE")
colnames(tbl) <- cols
tbl <- tbl |>
    arrange(desc(Predictive_RSquared))
knitr::kable(tbl, format = "simple")
Model                     Predictive_RSquared     RMSE
-----------------------  --------------------  -------
Cubist                                 0.7872   0.9133
Boosted Tree                           0.6722   1.1334
Single Regression Tree                 0.2088   1.7608

The Cubist model has the highest predictive \(R^2\) and the lowest \(RMSE\).
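
For reference, the tuning parameter values that caret selected for the winning Cubist model (the committees and neighbors it tunes for this method) can be pulled from the train object:

# Tuning parameters chosen by resampling for the final Cubist model.
cubistTuned$bestTune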

The top 20 most important predictors in the Cubist model are:

cubistImportance <- varImp(cubistTuned)
cubistImportance <- cubistImportance$importance |>
    rownames_to_column()
cols <- c("Predictor", "Importance")
colnames(cubistImportance) <- cols
cubistImportance <- cubistImportance |>
    arrange(desc(Importance)) |>
    top_n(n = 20)
knitr::kable(cubistImportance, format = "simple")
Predictor                  Importance
-----------------------  -----------
ManufacturingProcess32     100.000000
ManufacturingProcess29      50.000000
ManufacturingProcess09      48.837209
ManufacturingProcess17      45.348837
BiologicalMaterial03        45.348837
ManufacturingProcess28      45.348837
ManufacturingProcess13      38.372093
ManufacturingProcess25      33.720930
BiologicalMaterial02        32.558140
ManufacturingProcess02      29.069767
ManufacturingProcess39      20.930233
ManufacturingProcess01      16.279070
ManufacturingProcess31      16.279070
ManufacturingProcess33      16.279070
BiologicalMaterial01        10.465116
BiologicalMaterial06        10.465116
ManufacturingProcess26      10.465116
ManufacturingProcess27      10.465116
ManufacturingProcess37      10.465116
BiologicalMaterial04         6.976744

The manufacturing process variables dominate the list.
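
A quick tally of the top 20 (a small sketch using the table above) makes the split explicit:

# Count manufacturing process vs. biological material predictors in the top 20.
cubistImportance |>
    mutate(Type = if_else(str_starts(Predictor, "Manufacturing"),
                          "Process", "Biological")) |>
    count(Type)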

The top 10 most important predictors in the optimal linear and nonlinear models were:

top_10_linear <- c("ManufacturingProcess09", "ManufacturingProcess32",
                   "ManufacturingProcess34", "ManufacturingProcess45",
                   "ManufacturingProcess29", "BiologicalMaterial05",
                   "BiologicalMaterial03", "ManufacturingProcess06",
                   "ManufacturingProcess04", "ManufacturingProcess01")
top_10_nonlinear <- c("ManufacturingProcess32", "ManufacturingProcess36",
                      "ManufacturingProcess37", "ManufacturingProcess06",
                      "ManufacturingProcess09", "BiologicalMaterial09",
                      "BiologicalMaterial05", "ManufacturingProcess04",
                      "ManufacturingProcess17", "ManufacturingProcess20")
top_10 <- as.data.frame(top_10_linear) |>
    bind_cols(top_10_nonlinear)
colnames(top_10) <- c("Linear Top 10", "Nonlinear Top 10")
knitr::kable(top_10, format = "simple")
Linear Top 10            Nonlinear Top 10
-----------------------  -----------------------
ManufacturingProcess09   ManufacturingProcess32
ManufacturingProcess32   ManufacturingProcess36
ManufacturingProcess34   ManufacturingProcess37
ManufacturingProcess45   ManufacturingProcess06
ManufacturingProcess29   ManufacturingProcess09
BiologicalMaterial05     BiologicalMaterial09
BiologicalMaterial03     BiologicalMaterial05
ManufacturingProcess06   ManufacturingProcess04
ManufacturingProcess04   ManufacturingProcess17
ManufacturingProcess01   ManufacturingProcess20

The predictors that are in the top 10 of the optimal tree-based regression model, but not in the top 10 of either of the optimal linear or nonlinear regression models, are:

x <- cubistImportance[1:10, 1]
x <- x[!x %in% unique(c(top_10_linear, top_10_nonlinear))]
x <- as.data.frame(x)
colnames(x) <- "Predictor"
knitr::kable(x, format = "simple")
Predictor
----------------------
ManufacturingProcess28
ManufacturingProcess13
ManufacturingProcess25
BiologicalMaterial02
ManufacturingProcess02

The predictors that are in the top 10 of the optimal linear or nonlinear regression models, but not in the top 10 of the optimal tree-based regression model, are:

x <- unique(c(top_10_linear, top_10_nonlinear))
x <- x[!x %in% cubistImportance[1:10, 1]]
x <- as.data.frame(x)
colnames(x) <- "Predictor"
knitr::kable(x, format = "simple")
Predictor
----------------------
ManufacturingProcess34
ManufacturingProcess45
BiologicalMaterial05
ManufacturingProcess06
ManufacturingProcess04
ManufacturingProcess01
ManufacturingProcess36
ManufacturingProcess37
BiologicalMaterial09
ManufacturingProcess20

p1 <- dotplot(cubistTuned$finalModel, what = "splits")
p1

We unfortunately aren’t quite sure how to interpret the only plot we’re able to produce for Cubist models.