library(mlbench)
library(caret)
library(randomForest)
library(dplyr)
library(party)
library(gbm)
library(Cubist)
library(AppliedPredictiveModeling)
library(rpart)
library(tibble)
library(kableExtra)
library(mice)
library(vip)

Problem 8.1

Recreate the simulated data from Exercise 7.2:

set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

Random Forest

Fit a random forest model to all of the predictors, then estimate the variable importance scores:

model1 <- randomForest(y ~ ., 
                       data = simulated,
                       importance = TRUE,
                       ntree = 1000)

rfImp1 <- varImp(model1, scale = FALSE)
rfImp1 %>%
  kable() %>%
    kable_styling()
Variable      Overall
V1          8.7322354
V2          6.4153694
V3          0.7635918
V4          7.6151188
V5          2.0235246
V6          0.1651112
V7         -0.0059617
V8         -0.1663626
V9         -0.0952927
V10        -0.0749448
lv <- rownames(rfImp1)

rfImp1 %>% rownames_to_column(var="Variable") %>%
  mutate(Variable = factor(Variable, levels=lv, ordered=T)) %>%
  ggplot(aes(x=Overall, y=factor(Variable))) + geom_col() +
  labs(title="Variable Importance", subtitle="Random Forest Model",
       y="Variable", x="Overall Importance")

Fit a random forest model to all of the predictors, then estimate the variable importance scores. Did the random forest model significantly use the uninformative predictors (V6 – V10)?

Compared to the other predictors, the model made little use of the uninformative predictors V6-V10: each has an importance score close to zero.
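As a quick check (a small sketch re-using the rfImp1 object from above), we can pull out just the scores for the uninformative predictors:

# Sketch: isolate the permutation importance of the uninformative predictors
rfImp1 %>%
  rownames_to_column(var = "Variable") %>%
  filter(Variable %in% paste0("V", 6:10))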

Additional Predictor

Now add an additional predictor that is highly correlated with one of the informative predictors.

set.seed(200)
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9497025
rf2 <- randomForest(y ~ ., 
                    data = simulated,
                    importance = TRUE,
                    ntree = 1000)
rfImp2 <- varImp(rf2, scale = FALSE)
rfImp2 %>%
  kable() %>%
    kable_styling()
Variable       Overall
V1           6.0070978
V2           6.0593790
V3           0.5846529
V4           6.8636329
V5           2.1993989
V6           0.1089804
V7           0.0610421
V8          -0.0405920
V9           0.0612366
V10          0.0999934
duplicate1   4.4332317
lv <- rownames(rfImp2)

rfImp2 %>% rownames_to_column(var="Variable") %>%
  mutate(Variable = factor(Variable, levels=lv, ordered=T)) %>%
  ggplot(aes(x=Overall, y=factor(Variable))) + geom_col() +
  labs(title="Variable Importance", subtitle="Random Forest Model w/ 1 duplicate",
       y="Variable", x="Overall Importance")

Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?

The importance score for V1 did change: it is noticeably lower. Because duplicate1 is highly correlated with V1, the importance is roughly split between the two predictors.
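To make the change concrete, a small sketch (re-using rfImp1 and rfImp2 from above) compares V1 before and after adding duplicate1:

# Sketch: V1's importance before vs. after the correlated predictor was added
data.frame(before     = rfImp1["V1", "Overall"],
           after      = rfImp2["V1", "Overall"],
           duplicate1 = rfImp2["duplicate1", "Overall"])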

cForest

cforestModel <- cforest(y ~ ., data=simulated)

# Unconditional
varimp(cforestModel) %>% 
  sort(decreasing = T) %>%
    kable() %>%
      kable_styling() 
Variable    Importance
V4           7.3425229
V1           6.8132534
V2           6.2300356
duplicate1   2.5367822
V5           1.8964155
V3           0.0304965
V9          -0.0038388
V7          -0.0144267
V10         -0.0168416
V8          -0.0293477
V6          -0.0313029
varimp(cforestModel, conditional=T) %>% sort(decreasing = T) # Conditional (new) importance measure 
##           V4           V2           V1   duplicate1           V5           V3 
##  6.454461879  4.925594442  3.451196767  1.193686871  1.161453114  0.012921429 
##          V10           V6           V7           V9           V8 
##  0.010784008  0.008566096  0.008551612  0.002857419 -0.011831100

The conditional = TRUE measure broadly agrees with the traditional (unconditional) measure: both rank V4 as the most important predictor, and V6-V10 are still of low importance. The main difference is that V1 and its highly correlated copy, duplicate1, are the predictors whose scores are reduced the most under the conditional measure.
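To view the two measures side by side, here is a sketch that re-uses cforestModel; since varimp() is permutation-based, the exact values will differ slightly from those printed above:

# Sketch: unconditional vs. conditional importance in one table
imp_uncond <- varimp(cforestModel)
imp_cond   <- varimp(cforestModel, conditional = TRUE)
tibble(Variable      = names(imp_uncond),
       Unconditional = as.numeric(imp_uncond),
       Conditional   = as.numeric(imp_cond[names(imp_uncond)])) %>%
  arrange(desc(Unconditional)) %>%
  kable() %>%
    kable_styling()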

Boosted Trees and Cubist

Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?

The same pattern largely holds: V6-V10 receive low scores, and V4 is the top predictor followed by V2. For both models the gap between V1 and duplicate1 is wider than in the random forest, suggesting V1 is treated as the more important of the two highly correlated predictors.

gbmModel <- gbm(y ~ ., data=simulated, distribution='gaussian')
summary(gbmModel)

##                   var    rel.inf
## V4                 V4 29.2878965
## V1                 V1 22.7111395
## V2                 V2 21.9712536
## V5                 V5 11.7719972
## V3                 V3  7.5956111
## duplicate1 duplicate1  6.2765877
## V6                 V6  0.2526191
## V9                 V9  0.1328954
## V7                 V7  0.0000000
## V8                 V8  0.0000000
## V10               V10  0.0000000
# committees = 100, to roughly match the 100 trees used by the GBM model
cubistModel <- cubist(x=simulated[,-(ncol(simulated)-1)], y=simulated$y, committees=100)
varImp(cubistModel) %>%
    kable() %>%
      kable_styling() 
Variable     Overall
V1              71.0
V3              46.5
V2              59.0
V4              48.0
V5              32.5
V6              12.0
V8               1.0
V7               0.0
V9               0.0
V10              0.0
duplicate1       0.0

8.2 Simulation

As the text notes, "trees suffer from selection bias: predictors with a higher number of distinct values are favored over more granular predictors." We can demonstrate this by simulating data in which a granular predictor (X1, with only two distinct values) drives the response while a continuous noise predictor (X2) does not, then fitting a tree and checking which variable it favors.

set.seed(200)
X1 <- rep(1:2,each=100)
X2 <- rnorm(200,mean=0,sd=2)
Y <- X1 + rnorm(200,mean=0,sd=4)

df1 <- data.frame(Y=Y, X1=X1, X2=X2)

mod <- rpart(Y ~ ., data = df1)
varImp(mod) %>%
    kable() %>%
      kable_styling()
Variable    Overall
X1        0.1390440
X2        0.4393341

Even though X2 has no direct relationship with the outcome Y, the tree assigns it the higher importance score. Because X2 has many more distinct values than the granular X1, the tree's selection bias favors it.
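The granularity difference is easy to confirm with a quick sketch on df1:

# Sketch: X1 is granular (2 distinct values) while X2 is effectively continuous
sapply(df1[, c("X1", "X2")], n_distinct)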

8.3 Stochastic Gradient Boosting

In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:

Predictors

Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?

As the learning rate increases, each tree's contribution is larger, so the model on the right quickly commits its importance to the first few strong predictors. The bagging fraction works in the same direction: the higher the fraction, the more similar the samples each tree sees, so fewer predictors end up being identified as important.
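As a rough illustration (a sketch run on the simulated data from 8.1 rather than the solubility data behind Fig. 8.24, with settings chosen only to show the contrast), we can fit the same boosted model at the two extremes and compare the spread of relative influence:

# Sketch: extreme shrinkage / bagging fraction settings (illustrative only)
set.seed(200)
gbm_low  <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 100, shrinkage = 0.1, bag.fraction = 0.1)
gbm_high <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 100, shrinkage = 0.9, bag.fraction = 0.9)
summary(gbm_low,  plotit = FALSE)   # expected: influence spread across more predictors
summary(gbm_high, plotit = FALSE)   # expected: influence concentrated on the top few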

Predictive

Which model do you think would be more predictive of other samples?

The left-hand model is likely to do better on data it has not seen. I would suspect that having many dissimilar weak learners contribute, rather than fewer similar trees (in terms of variable selection), gives the final model more flexibility on out-of-sample observations.

Interaction Depth

How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?

Variable importance for boosting is a function of the reduction in squared error. I would expect that the slopes would decrease as the interaction depth was increased because the model allows more variables to be utilized in the trees.
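A quick sketch (again on the simulated data, not the solubility data) to see how interaction depth changes the importance profile:

# Sketch: stumps vs. deeper trees; deeper trees let more predictors enter
set.seed(200)
gbm_d1  <- gbm(y ~ ., data = simulated, distribution = "gaussian",
               n.trees = 100, interaction.depth = 1)
gbm_d10 <- gbm(y ~ ., data = simulated, distribution = "gaussian",
               n.trees = 100, interaction.depth = 10)
summary(gbm_d1,  plotit = FALSE)
summary(gbm_d10, plotit = FALSE)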

8.7 Chemical Manufacturing

Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:

data(ChemicalManufacturingProcess)
tmp.data <- mice(ChemicalManufacturingProcess,m=2,maxit=5,meth='pmm',seed=500)
ChemicalManufacturingProcess = complete(tmp.data)

# train test split
set.seed(100)
rows = nrow(ChemicalManufacturingProcess)
t.index <- sample(1:rows, size = round(0.75*rows), replace=FALSE)
df.train <- ChemicalManufacturingProcess[t.index ,]
df.test <- ChemicalManufacturingProcess[-t.index ,]
df.train.x = df.train[,-1]
df.train.y = df.train[,1]
df.test.x = df.test[,-1]
df.test.y = df.test[,1]
model.eval = function(modelmethod, gridSearch = NULL)
{
  Model = train(x = df.train.x, y = df.train.y, method = modelmethod,
                tuneGrid = gridSearch, preProcess = c('center', 'scale'),
                trControl = trainControl(method = 'cv'))
  Pred = predict(Model, newdata = df.test.x)
  modelperf = postResample(Pred, df.test.y)
  print(modelperf)
}

Question A

Which tree-based regression model gives the optimal resampling and test set performance?

Simple Tree

perftree = model.eval('rpart')
##     RMSE Rsquared      MAE 
## 1.486780 0.227814 1.210479

Random Forest

perfrf = model.eval('rf')
##      RMSE  Rsquared       MAE 
## 1.1522583 0.4554709 0.9178730

Boosted Trees

perfgbm = model.eval('gbm')

Cubist

perfcubist = model.eval('cubist')
##      RMSE  Rsquared       MAE 
## 1.0886085 0.5405831 0.8232466
df.perf = rbind(data.frame(Name = 'SimpleTree',   RMSE = perftree[1]),
                data.frame(Name = 'RandomForest', RMSE = perfrf[1]),
                data.frame(Name = 'BoostingTree', RMSE = perfgbm[1]),
                data.frame(Name = 'Cubist',       RMSE = perfcubist[1]))

ggplot(data = df.perf, aes(x = Name, y = RMSE, fill=Name)) +
  geom_bar(stat="identity", position=position_dodge()) +
  geom_text(aes(label=RMSE), vjust=1, color="white",
            position = position_dodge(0.9), size=3.5)

As the performance chart shows, the Cubist model gives the lowest RMSE on the test set, making it the best-performing tree-based model for this dataset.
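Question A also asks about resampling performance, which model.eval() does not return. One option (a sketch; model.fit below is a hypothetical variant of that helper, and ideally the models would share the same CV folds via trainControl(index = ...)) is to keep the fitted train objects and compare them with caret::resamples():

# Sketch: hypothetical helper that returns the fitted model instead of test metrics
model.fit = function(modelmethod, gridSearch = NULL)
{
  train(x = df.train.x, y = df.train.y, method = modelmethod, tuneGrid = gridSearch,
        preProcess = c('center', 'scale'), trControl = trainControl(method = 'cv'))
}

fits <- list(rpart = model.fit('rpart'), rf = model.fit('rf'),
             gbm = model.fit('gbm'), cubist = model.fit('cubist'))
summary(resamples(fits))   # cross-validated RMSE / R-squared for each model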

Question B

Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?

cModel <- train(x = df.train.x,
                     y = df.train.y,
                     method = 'cubist')
vip(cModel, color = 'red', fill='purple')
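To answer the dominance question numerically, a sketch (re-using cModel from the chunk above, via vip::vi()) tallies how many of the top 10 predictors are process versus biological variables:

# Sketch: top 10 Cubist predictors and a biological vs. process tally
top10 <- vi(cModel) %>%
  slice_head(n = 10) %>%
  mutate(Type = ifelse(grepl("^Manufacturing", Variable), "Process", "Biological"))
top10
count(top10, Type)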

Question C

Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?

library(rpart.plot)
multi.class.model  = rpart(Yield~., data=df.train)
rpart.plot(multi.class.model)

As the tree plot shows, the manufacturing process variables drive most of the splits and are associated with higher Yield.
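One way to check that reading of the tree is to count the split variables stored in the rpart frame (a small sketch re-using multi.class.model):

# Sketch: tally which predictor types appear as split variables in the tree
splits <- as.character(multi.class.model$frame$var)
splits <- splits[splits != "<leaf>"]
table(ifelse(grepl("^Manufacturing", splits), "Process", "Biological"))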