Libraries

library(tidyverse)
library(caret)
library(earth)
library(knitr)
library(pls)
library(glmnet)
library(Amelia)
library(mice)
library(psych)
library(rattle)

Exercise 8.1

Recreate the simulated data from Exercise 7.2:

library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

(a) Fit a random forest model to all of the predictors, then estimate the variable importance scores:

library(randomForest)
library(caret)
model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)

Did the random forest model significantly use the uninformative predictors (V6 – V10)?

rfImp1
##          Overall
## V1   8.732235404
## V2   6.415369387
## V3   0.763591825
## V4   7.615118809
## V5   2.023524577
## V6   0.165111172
## V7  -0.005961659
## V8  -0.166362581
## V9  -0.095292651
## V10 -0.074944788

No, the random forest model did not significantly use the uninformative predictors (V6-V10): as the output above shows, their importance scores are close to zero, much lower than those of the informative predictors V1-V5.
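
As an optional check (a small sketch reusing rfImp1 from above), sorting the unscaled importances makes the gap between V1-V5 and V6-V10 explicit:

rfImp1 %>% rownames_to_column("Variable") %>% arrange(desc(Overall))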

(b) Now add an additional predictor that is highly correlated with one of the informative predictors. For example:

simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206

Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?

model2 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
##                Overall
## V1          5.69119973
## V2          6.06896061
## V3          0.62970218
## V4          7.04752238
## V5          1.87238438
## V6          0.13569065
## V7         -0.01345645
## V8         -0.04370565
## V9          0.00840438
## V10         0.02894814
## duplicate1  4.28331581

Yes, the importance score for V1 changed after we added a predictor that is highly correlated with it. Adding duplicate1 changed the overall variable importance of the model: the importance score of V1 dropped from 8.73 to 5.69, with duplicate1 absorbing much of the difference (4.28), because the random forest splits credit between correlated predictors.
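
The exercise also asks what happens when yet another predictor highly correlated with V1 is added. Below is a hedged sketch of that step (not part of the original run; it works on a copy of the data so the later chunks are unaffected). The expectation is that the importance credited to V1 gets diluted even further as it is shared across the correlated copies:

set.seed(200)
sim_extra <- simulated
sim_extra$duplicate2 <- sim_extra$V1 + rnorm(200) * .1
model2b <- randomForest(y ~ ., data = sim_extra,
                        importance = TRUE,
                        ntree = 1000)
# V1, duplicate1, and duplicate2 are expected to split the importance among themselves
varImp(model2b, scale = FALSE)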

(c) Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?

library(party)
model3 <- cforest(y ~ ., data=simulated)
varimp(model3) %>% sort(decreasing = T)
##            V4            V2    duplicate1            V1            V5 
##  7.6223892727  6.0579730772  5.0941897280  4.6171158805  1.7161194047 
##            V7            V9            V3            V6           V10 
##  0.0465374951  0.0046062409  0.0003116115 -0.0289427183 -0.0310326410 
##            V8 
## -0.0380965511
# Conditional importance measure
varimp(model3, conditional = T) %>% sort(decreasing = T)
##           V4           V2   duplicate1           V1           V5           V6 
##  6.190578351  4.688980360  1.926751729  1.807953145  1.051666850  0.028174759 
##           V9           V3           V8           V7          V10 
##  0.015118505  0.012878752 -0.004356587 -0.011709437 -0.022190587

The unconditional cforest importances show roughly the same pattern as the traditional random forest: V6-V10 are unimportant and the correlated pair V1/duplicate1 share importance. With conditional = TRUE the pattern changes somewhat: the importances of the correlated predictors V1 and duplicate1 shrink noticeably (from about 4.6 and 5.1 to about 1.8 and 1.9) because the conditional measure adjusts for correlation between predictors.
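
An optional side-by-side view (a sketch only; varimp is permutation-based, so the exact numbers vary between runs, and conditional = TRUE is noticeably slower to compute):

# Traditional vs. conditional importances from the same cforest fit
imp_compare <- data.frame(traditional = varimp(model3),
                          conditional = varimp(model3, conditional = TRUE))
imp_compare %>% rownames_to_column("Variable") %>% arrange(desc(traditional))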

(d) Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?

library(gbm)
## Loaded gbm 2.1.8
gbm_model <- gbm(y ~ ., data=simulated, distribution='gaussian')
summary(gbm_model)

##                   var    rel.inf
## V4                 V4 29.9389549
## V2                 V2 21.6012918
## V1                 V1 15.6771729
## duplicate1 duplicate1 13.2189845
## V5                 V5 10.0440211
## V3                 V3  9.0948053
## V6                 V6  0.4247697
## V7                 V7  0.0000000
## V8                 V8  0.0000000
## V9                 V9  0.0000000
## V10               V10  0.0000000
library(Cubist)
cubist_model <- cubist(x=simulated[,-(ncol(simulated)-1)], y=simulated$y, committees=100)
varImp(cubist_model) %>% as.data.frame() %>% rownames_to_column() %>%   rename(Variable = rowname) %>%
  arrange(desc(Overall))
##      Variable Overall
## 1          V2    59.5
## 2          V1    52.5
## 3          V4    46.0
## 4          V3    43.5
## 5  duplicate1    27.5
## 6          V5    27.0
## 7          V6    10.0
## 8          V8     4.0
## 9         V10     1.0
## 10         V7     0.0
## 11         V9     0.0

The same pattern occurred: the uninformative predictors V6-V10 were ranked low by both the boosted tree and Cubist models.

Exercise 8.2

Use a simulation to show tree bias with different granularities

I will create a simulated dataset with four predictors of different granularities (different numbers of distinct values) and a response y that is the sum of x1 and x2.

x1 <- sample(0:10000 / 10000, 200, replace = T)
x2 <- sample(0:1000 / 1000, 200, replace = T)
x3 <- sample(0:100 / 100, 200, replace = T)
x4 <- sample(0:10 / 10, 200, replace = T)
y <- x1 + x2

df <- data.frame(x1, x2, x3, x4, y)
str(df)
## 'data.frame':    200 obs. of  5 variables:
##  $ x1: num  0.674 0.79 0.839 0.105 0.05 ...
##  $ x2: num  0.809 0.087 0.37 0.468 0.419 0.637 0.932 0.59 0.504 0.881 ...
##  $ x3: num  0.28 0.36 0.47 0.03 0.59 0.61 0.48 0.12 0.11 0.73 ...
##  $ x4: num  0.7 0.4 0.6 0.8 0.7 0.8 0.2 0.5 0.4 0.4 ...
##  $ y : num  1.483 0.877 1.209 0.573 0.469 ...
library(rpart)

rpart_tree <- rpart(y ~ ., data=df)
varImp(rpart_tree) %>% as.data.frame() %>% rownames_to_column() %>%   rename(Variable = rowname) %>%
  arrange(desc(Overall))
##   Variable   Overall
## 1       x2 5.0772481
## 2       x1 2.9888327
## 3       x4 0.4711458
## 4       x3 0.3622335

We can see that the importance scores differ with the granularity of the variables. The model also assigns some importance to x3 and x4 even though they were not used to compute y, which illustrates the selection bias of single regression trees.
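
As a hedged follow-up (my own illustration, not required by the exercise), the granularity bias can be made more explicit by pairing a coarse but informative predictor with a fine-grained predictor that is pure noise, and checking how much importance the noise variable still picks up:

set.seed(624)
x_signal <- sample(0:1, 200, replace = TRUE)   # coarse (two distinct values), drives y2
x_noise  <- runif(200)                         # fine-grained, unrelated to y2
y2 <- x_signal + rnorm(200, sd = 0.5)
bias_tree <- rpart(y2 ~ ., data = data.frame(x_signal, x_noise, y2))
varImp(bias_tree)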

Exercise 8.3

In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:

(a) Why does the model on the right focus its importance on just the first few of predictors, whereas the model on the left spreads importance across more predictors?

Learning rate determines the contribution of each tree on the final outcome and controls how quickly the algorithm proceeds down the gradient descent (learns); Values range from 0–1 with typical values between 0.001–0.3. Smaller values make the model robust to the specific characteristics of each individual tree, thus allowing it to generalize well. Smaller values also make it easier to stop prior to over-fitting; however, they increase the risk of not reaching the optimum with a fixed number of trees and are more computationally demanding. This hyperparameter is also called shrinkage. Generally, the smaller this value, the more accurate the model can be but also will require more trees in the sequence.

Source: https://bradleyboehmke.github.io/HOML/gbm.html

In our example the left-hand plot has both the learning rate and the bagging fraction set to 0.1. A small learning rate means each tree contributes only a small correction, so many trees (and therefore many different predictors) share the importance, and a small bagging fraction means each tree is grown on a different random 10% of the data, which spreads the splits across even more predictors. With both parameters set to 0.9 (right-hand plot), each tree contributes heavily and sees nearly the same data, so the model quickly commits to a few strong predictors and concentrates the importance on them.
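
A rough illustration of this effect, sketched on the simulated data from Exercise 8.1 rather than the solubility data used in Fig. 8.24 (so only the qualitative pattern is expected to carry over):

set.seed(100)
gbm_low  <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 100, shrinkage = 0.1, bag.fraction = 0.1, n.minobsinnode = 5)
set.seed(100)
gbm_high <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 100, shrinkage = 0.9, bag.fraction = 0.9, n.minobsinnode = 5)
summary(gbm_low, plotit = FALSE)   # expected: relative influence spread over more predictors
summary(gbm_high, plotit = FALSE)  # expected: relative influence concentrated on a few predictors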

(b) Which model do you think would be more predictive of other samples?

The one on the left-hand side would be more predictive of other samples: smaller values of the learning rate make the model robust to the specific characteristics of each individual tree, allowing it to generalize better, whereas the model on the right is more likely to over-fit.

(c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?

Increasing the interaction depth allows each tree to use more predictors in its splits, so predictor importance would be spread across more predictors and the importance slope would flatten for both models.
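
A similar hedged sketch for interaction depth (again on the simulated data, so only the direction of the change is expected to carry over):

set.seed(100)
gbm_shallow <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                   n.trees = 100, interaction.depth = 1)
set.seed(100)
gbm_deep <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 100, interaction.depth = 10)
summary(gbm_shallow, plotit = FALSE)  # expected: steeper drop-off in relative influence
summary(gbm_deep, plotit = FALSE)     # expected: influence spread across more predictors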

Exercise 8.7

Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:

I will use the same data, data imputation, data splitting, and pre-processing steps as in homework 7 (Exercise 6.3), per the instructions above.

library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
dim(ChemicalManufacturingProcess)
## [1] 176  58

The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

Imputation

table(is.na(ChemicalManufacturingProcess))
## 
## FALSE  TRUE 
## 10102   106
missmap(ChemicalManufacturingProcess)

We can see there are 106 missing values (around 1% of the cells) in our dataset. I will impute them using the mice package, which tends to give better imputations than the other techniques mentioned in the book.

chem_imputed <- mice(ChemicalManufacturingProcess, print = F, m = 1) %>% complete()
## Warning: Number of logged events: 135
missmap(chem_imputed)

There are no more NAs in our dataset.

In these data, a significant number of predictors had pair-wise absolute correlations that were larger than 0.90. Because of this, a high-correlation filter was used on the predictor set to remove these highly redundant predictors from the data. In the end, 10 predictors were eliminated from the data for this reason. I will skip the histogram shown in homework 7 here.

tooHighCor <- findCorrelation(cor(chem_imputed), 0.90)
chem_filtered <- chem_imputed[, -tooHighCor]
dim(chem_filtered)
## [1] 176  48

Split the data

# Set seed
set.seed(43)
train <- createDataPartition(chem_filtered$Yield, times = 1, p = 0.8, list = FALSE)
train_df <- chem_filtered[train, ]
test_df <- chem_filtered[-train, ]
set.seed(43)
rpart_model <- train(Yield ~ ., 
                    data = train_df,
                    method = "rpart",
                    tuneLength = 10,
                    control = rpart.control(maxdepth=2))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
rpart_pred <- predict(rpart_model, newdata = test_df)
result_rpart <- postResample(pred = rpart_pred, obs = test_df$Yield)
set.seed(43)
randomforest_model <- train(Yield ~ ., 
                    data = train_df,
                    method = "rf",
                    tuneLength = 10)

randomforest_pred <- predict(randomforest_model, newdata = test_df)
result_randomforest <- postResample(pred = randomforest_pred, obs = test_df$Yield)
set.seed(43)

grid <- expand.grid(n.trees=c(50, 100, 150, 200), 
                    interaction.depth=c(1, 5, 10, 15), 
                    shrinkage=c(0.01, 0.1, 0.5), 
                    n.minobsinnode=c(5, 10, 15))

gmb_model <- train(Yield ~ ., 
                    data = train_df,
                    method = "gbm",
                    tuneGrid = grid, 
                    verbose = FALSE)

gmb_pred <- predict(gmb_model, newdata = test_df)
result_gmb <- postResample(pred = gmb_pred, obs = test_df$Yield)
set.seed(43)
cubist_model <- train(Yield ~ ., 
                    data = train_df,
                    method = "cubist")

cubist_pred <- predict(cubist_model, newdata = test_df)
result_cubist <- postResample(pred = cubist_pred, obs = test_df$Yield)

(a) Which tree-based regression model gives the optimal resampling and test set performance?

Resampling

resamp <- resamples(list(SingleTree=rpart_model, RandomForest=randomforest_model, GradientBoosting=gmb_model, Cubist=cubist_model))

summary(resamp)
## 
## Call:
## summary.resamples(object = resamp)
## 
## Models: SingleTree, RandomForest, GradientBoosting, Cubist 
## Number of resamples: 25 
## 
## MAE 
##                       Min.   1st Qu.    Median      Mean   3rd Qu.     Max.
## SingleTree       0.9612766 1.0510472 1.1846275 1.1766635 1.2370985 1.544472
## RandomForest     0.6983362 0.8709610 0.9363437 0.9260900 0.9625233 1.146091
## GradientBoosting 0.6725964 0.8398536 0.8811127 0.8947369 0.9521960 1.123289
## Cubist           0.7049279 0.8186761 0.8613634 0.8763843 0.9389505 1.165735
##                  NA's
## SingleTree          0
## RandomForest        0
## GradientBoosting    0
## Cubist              0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## SingleTree       1.2297846 1.385090 1.490308 1.513643 1.657671 1.924963    0
## RandomForest     0.8771677 1.094546 1.131262 1.168080 1.282329 1.491319    0
## GradientBoosting 0.8866369 1.056447 1.143043 1.149389 1.239570 1.403641    0
## Cubist           0.9347982 1.041831 1.108204 1.128553 1.196898 1.502999    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## SingleTree       0.05649142 0.2959391 0.3373893 0.3540496 0.4367346 0.5492778
## RandomForest     0.47544621 0.5802525 0.6111513 0.6126937 0.6614501 0.7210773
## GradientBoosting 0.42944177 0.5534641 0.6057520 0.6035067 0.6473071 0.7537708
## Cubist           0.47552467 0.5428975 0.6339935 0.6141241 0.6685025 0.7615181
##                  NA's
## SingleTree          0
## RandomForest        0
## GradientBoosting    0
## Cubist              0

Test set performance

rpart <- data.frame(t(result_rpart)) %>% mutate("Model"= "rpart")
randomforest <- data.frame(t(result_randomforest)) %>% mutate("Model"= "Random Forest")
gmb <- data.frame(t(result_gmb)) %>% mutate("Model"= "gmb")
cubist <- data.frame(t(result_cubist)) %>% mutate("Model"= "cubist")
 
all_result <- rbind(rpart,randomforest,gmb,cubist) %>% select(Model, RMSE, Rsquared, MAE) %>% arrange(RMSE) %>% kable()

all_result
Model            RMSE       Rsquared    MAE
cubist           1.057992   0.6617236   0.8199423
gmb              1.152897   0.6151057   0.8925818
Random Forest    1.187794   0.5831935   0.8669127
rpart            1.457282   0.3672256   1.1265150

We can see from the resampling summary and the test set results above that the Cubist model gives the optimal resampling and test set performance (lowest RMSE and MAE, highest R-squared).
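
As an optional follow-up, the tuning parameters caret selected for the winning Cubist model can be inspected with:

cubist_model$bestTune   # committees / neighbors chosen by resampling
plot(cubist_model)      # resampled RMSE profile across the tuning grid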

(b) Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?

Let’s check most important predictors in the optimal tree-based regression model which is Cubist in our case:

varImp(cubist_model)
## cubist variable importance
## 
##   only 20 most important variables shown (out of 47)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess09   55.10
## ManufacturingProcess17   53.06
## BiologicalMaterial06     47.96
## ManufacturingProcess45   37.76
## BiologicalMaterial03     29.59
## ManufacturingProcess37   28.57
## ManufacturingProcess28   22.45
## BiologicalMaterial08     22.45
## ManufacturingProcess13   21.43
## ManufacturingProcess33   21.43
## ManufacturingProcess04   19.39
## ManufacturingProcess39   17.35
## ManufacturingProcess10   16.33
## ManufacturingProcess14   16.33
## ManufacturingProcess01   15.31
## BiologicalMaterial01     14.29
## BiologicalMaterial05     14.29
## ManufacturingProcess11   14.29
## BiologicalMaterial09     12.24

Process variables dominate the list: 7 of the top 10 important predictors are manufacturing process variables.
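
A quick tally (an optional sketch reusing cubist_model from above) confirms the split between process and biological predictors in the top 10:

top10 <- varImp(cubist_model)$importance %>%
  rownames_to_column("Variable") %>%
  arrange(desc(Overall)) %>%
  head(10)
table(ifelse(grepl("^Manufacturing", top10$Variable), "Process", "Biological"))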

Let’s get the important predictors from the optimal linear and nonlinear models from homeworks 7 and 8.

Optimal linear model: PLS model

pls_model <- train(
  Yield ~ ., data = train_df, method = "pls",
  preProc = c("center", "scale"),
  trControl = trainControl("cv", number = 10),
  tuneLength = 25
)
# predictions
pls_predictions <- predict(pls_model, test_df)
# Performance metrics
results <- data.frame(RMSE = caret::RMSE(pls_predictions, test_df$Yield),
           Rsquared = caret::R2(pls_predictions, test_df$Yield))
results 
##       RMSE  Rsquared
## 1 2.626721 0.1976182
varImp(pls_model)
## pls variable importance
## 
##   only 20 most important variables shown (out of 47)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess36   83.58
## ManufacturingProcess09   80.47
## ManufacturingProcess13   79.40
## BiologicalMaterial06     75.42
## ManufacturingProcess17   71.52
## ManufacturingProcess33   67.98
## BiologicalMaterial03     67.87
## BiologicalMaterial08     64.54
## BiologicalMaterial01     60.34
## BiologicalMaterial11     58.58
## ManufacturingProcess06   57.34
## ManufacturingProcess11   54.01
## ManufacturingProcess12   39.57
## ManufacturingProcess28   38.69
## ManufacturingProcess04   37.61
## ManufacturingProcess30   37.04
## BiologicalMaterial10     36.32
## ManufacturingProcess02   33.93
## ManufacturingProcess20   30.34

Optimal non-linear model: SVM model

SVM_model <- train(
  Yield ~ ., data = train_df, method = "svmRadial",
  preProcess = c("center", "scale"),
  trControl = trainControl(method = "cv"),
  tuneLength = 25
)

SVM_pred <- predict(SVM_model, newdata = test_df)
result_SVM <- postResample(pred = SVM_pred, obs = test_df$Yield)
result_SVM
##      RMSE  Rsquared       MAE 
## 1.1492248 0.6030577 0.8224003
varImp(SVM_model)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 47)
## 
##                        Overall
## ManufacturingProcess32  100.00
## BiologicalMaterial06     86.37
## ManufacturingProcess13   81.44
## ManufacturingProcess36   73.95
## BiologicalMaterial03     71.18
## ManufacturingProcess17   66.65
## ManufacturingProcess09   64.70
## ManufacturingProcess06   53.38
## BiologicalMaterial11     51.54
## ManufacturingProcess33   50.61
## BiologicalMaterial08     45.54
## ManufacturingProcess11   45.30
## ManufacturingProcess30   41.03
## BiologicalMaterial01     39.21
## BiologicalMaterial09     29.91
## ManufacturingProcess20   28.06
## ManufacturingProcess35   25.70
## ManufacturingProcess15   19.78
## ManufacturingProcess16   19.76
## ManufacturingProcess12   19.70

We can see from above that all three models agree that ManufacturingProcess32 is the most important variable. Also, process variables dominate the top 10 important predictors for all three optimal models.
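
To make the comparison concrete, here is a small sketch that pulls the top-10 predictor names from each of the three fitted models and lists them side by side (reusing cubist_model, pls_model, and SVM_model from above):

top10_names <- function(model) {
  vi <- varImp(model)$importance
  rownames(vi)[order(vi$Overall, decreasing = TRUE)][1:10]
}
data.frame(Cubist = top10_names(cubist_model),
           PLS = top10_names(pls_model),
           SVM = top10_names(SVM_model))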

(c) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?

rpart_model$finalModel
## n= 144 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 144 491.55800 40.18333  
##   2) ManufacturingProcess32< 159.5 84 160.27350 39.23988  
##     4) BiologicalMaterial11< 145.075 39  44.65496 38.45564 *
##     5) BiologicalMaterial11>=145.075 45  70.84419 39.91956 *
##   3) ManufacturingProcess32>=159.5 60 151.83990 41.50417  
##     6) BiologicalMaterial03< 72 46  85.57719 41.01043 *
##     7) BiologicalMaterial03>=72 14  18.20492 43.12643 *
fancyRpartPlot(rpart_model$finalModel, sub="")

The tree splits first on ManufacturingProcess32: runs with ManufacturingProcess32 >= 159.5 have a higher mean yield (about 41.5) than runs below the cutoff (about 39.2). Within each branch a biological predictor refines the prediction, and the highest yields (about 43.1) occur when ManufacturingProcess32 >= 159.5 and BiologicalMaterial03 >= 72. So this view suggests that higher values of ManufacturingProcess32 are associated with higher yield, and that the biological material predictors mainly matter after conditioning on that process variable.
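
The exercise also asks about the distribution of yield in the terminal nodes, which fancyRpartPlot does not show directly. A hedged sketch of one way to view it (assuming, as caret does by default, that the final rpart fit preserves the row order of train_df):

# Terminal-node assignment for each training observation in the final single tree
terminal_node <- factor(rpart_model$finalModel$where)
boxplot(train_df$Yield ~ terminal_node,
        xlab = "Terminal node", ylab = "Yield",
        main = "Distribution of Yield by terminal node")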