library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(base)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(AppliedPredictiveModeling)

8.1. Recreate the simulated data from Exercise 7.2:

library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.2
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

(a) Fit a random forest model to all of the predictors, then estimate the variable importance scores: Did the random forest model significantly use the uninformative predictors (V6 – V10)?

Based on the plot below, the random forest model assigned little to no importance to the uninformative predictors V6 through V10.

library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.1
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)

rfImp1 %>% 
  mutate(var = rownames(rfImp1)) %>%
  ggplot(aes(Overall, reorder(var, Overall, sum))) + 
  geom_col(fill = 'purple') + 
  labs(title = 'Variable Importance', y = 'Variable')

(b) Now add an additional predictor that is highly correlated with one of the informative predictors. For example:

simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206

Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?

Yes, V1's importance changes: in the plot below it drops from the most important predictor to roughly the fourth most important, because the highly correlated duplicate1 now shares the splits (and thus the importance) that V1 alone received before. Adding another predictor that is also highly correlated with V1 would be expected to dilute V1's importance even further (see the sketch after the following plot).

model2 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)

rfImp2 <- varImp(model2, scale = FALSE)

rfImp2 %>% 
  mutate(var = rownames(rfImp2)) %>%
  ggplot(aes(Overall, reorder(var, Overall, sum))) + 
  geom_col(fill = 'purple') + 
  labs(title = 'Variable Importance', y = 'Variable')
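A hedged sketch (not part of the original run) of the second part of the question: adding yet another predictor that is highly correlated with V1 splits the importance a third way. A copy of the data is used here so the later parts of the exercise are unaffected.

# Sketch (assumption): a second near-copy of V1, fit on a copy of the data
sim_extra <- simulated
sim_extra$duplicate2 <- sim_extra$V1 + rnorm(200) * .1
model2b <- randomForest(y ~ ., data = sim_extra, importance = TRUE, ntree = 1000)
varImp(model2b, scale = FALSE)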

(c) Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?

library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
model3 <- cforest(y ~ ., data = simulated)
rfImp3 <- varimp(model3, conditional = TRUE) %>% as.data.frame()
rfImp3 %>% 
  rename(Overall = '.') %>%
  mutate(var = rownames(rfImp3)) %>%
  ggplot(aes(Overall, reorder(var, Overall, sum))) + 
  geom_col(fill = 'purple') + 
  labs(title = 'Variable Importance', y = 'Variable')
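For reference, the code above uses partykit, the re-implementation of the party package named in the exercise; a hedged sketch of the equivalent party call (not run here) would be:

# Sketch (assumption): the same fit with the party package
library(party)
model3_party <- party::cforest(y ~ ., data = simulated,
                               controls = party::cforest_unbiased(ntree = 1000))
party::varimp(model3_party, conditional = TRUE)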

(d) Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?

library(Cubist)
## Warning: package 'Cubist' was built under R version 4.3.3
model4 <- cubist(simulated[, colnames(simulated)[colnames(simulated) != 'y']], 
                 simulated$y)
rfImp4 <- varImp(model4, scale = FALSE)
rfImp4 %>% 
  mutate(var = rownames(rfImp4)) %>%
  ggplot(aes(Overall, reorder(var, Overall, sum))) + 
  geom_col(fill = 'purple') + 
  labs(title = 'Variable Importance', y = 'Variable')
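Part (d) also mentions boosted trees; a hedged sketch (not included in the original output) of a gbm fit on the same simulated data, whose relative influence ranking can be compared with the random forest and Cubist results above:

# Sketch (assumption): boosted trees on the simulated data; settings are illustrative
library(gbm)
model5 <- gbm(y ~ ., data = simulated, distribution = "gaussian",
              n.trees = 1000, interaction.depth = 2, shrinkage = 0.1)
summary(model5, plotit = FALSE)   # relative influence of each predictor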

8.2. Use a simulation to show tree bias with different granularities.

In the importance plot below, V3 dominates: y = V2 + V3, and V3 (sd = 1000) contributes far more of the variance in y than V2 (sd = 10), while V1 is unrelated to y and receives essentially no importance.

set.seed(88)
V1 <- runif(500, 2,500)
V2 <- rnorm(500, 2,10)
V3 <- rnorm(500, 1,1000)
y <- V2 + V3
df <- data.frame(V1, V2, V3, y)
test_model <- cforest(y ~ ., data = df, ntree = 10)
test_model_imp <- varimp(test_model, conditional = FALSE)
barplot(sort(test_model_imp),horiz = TRUE, col = rainbow(5))
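As a further, hedged check of the granularity bias itself (not part of the original run): when the response is pure noise, a single CART tree still picks the fine-grained predictor for its first split far more often than the coarse one, which is the selection bias the exercise asks about.

# Sketch (assumption): count the root-split variable over repeated noise-only data sets
library(rpart)
set.seed(99)
first_split <- replicate(200, {
  x_coarse <- sample(1:4, 100, replace = TRUE)   # coarse: only 4 distinct values
  x_fine   <- runif(100)                         # fine: ~100 distinct values, still pure noise
  y_noise  <- rnorm(100)                         # response unrelated to either predictor
  fit <- rpart(y_noise ~ x_coarse + x_fine,
               data = data.frame(x_coarse, x_fine, y_noise),
               control = rpart.control(maxdepth = 1, cp = 0))
  as.character(fit$frame$var[1])
})
table(first_split)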

8.3. In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:

(a) Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?

The model on the right concentrates its importance on a few predictors because both the learning rate and the bagging fraction are high. With a bagging fraction of 0.9, each tree sees nearly the same data, so the trees are highly correlated and keep splitting on the same strong predictors; with a learning rate of 0.9, each of those trees contributes a large share of the final fit. As a result, only a few variables end up with high importance.

(b) Which model do you think would be more predictive of other samples?

The model with the higher learning rate and bagging fraction is more likely to overfit because it relies on only a few predictors. The model on the left, which spreads importance across more predictors, is therefore more likely to be predictive of other samples.

(c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?

Increasing the interaction depth lets more predictors enter each tree, so importance is spread across more variables. For either model this would flatten the slope of the predictor-importance curve: the drop-off from the top predictors to the rest becomes less steep.
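Figure 8.24 itself is not reproduced here; as a hedged sketch (not part of the original output, with illustrative settings), the two extremes can be refit directly with gbm on the solubility data and their relative influence rankings compared:

# Sketch (assumption): 0.1/0.1 vs. 0.9/0.9 for shrinkage and bag.fraction;
# n.trees and interaction.depth are illustrative choices, not the book's exact values
library(gbm)
library(AppliedPredictiveModeling)
data(solubility)
sol <- cbind(solTrainXtrans, Solubility = solTrainY)

set.seed(100)
gbm_low  <- gbm(Solubility ~ ., data = sol, distribution = "gaussian",
                n.trees = 100, interaction.depth = 7,
                shrinkage = 0.1, bag.fraction = 0.1)
set.seed(100)
gbm_high <- gbm(Solubility ~ ., data = sol, distribution = "gaussian",
                n.trees = 100, interaction.depth = 7,
                shrinkage = 0.9, bag.fraction = 0.9)

# The 0.9/0.9 fit concentrates its relative influence on far fewer predictors
head(summary(gbm_low,  plotit = FALSE), 10)
head(summary(gbm_high, plotit = FALSE), 10)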

8.7. Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:

data(ChemicalManufacturingProcess)
cmp_impute <- preProcess(ChemicalManufacturingProcess[,-c(1)], method=c('bagImpute'))
cmp <- predict(cmp_impute, ChemicalManufacturingProcess[,-c(1)])
set.seed(480)
train_r <- createDataPartition(ChemicalManufacturingProcess$Yield, p=0.8, list=FALSE)
X_train <- cmp[train_r,]
y_train <- ChemicalManufacturingProcess$Yield[train_r]
X_test <- cmp[-train_r,]
y_test <- ChemicalManufacturingProcess$Yield[-train_r]

GRADIENT BOOSTING

set.seed(44)
grid <- expand.grid(n.trees=c(50, 100, 150, 200), 
                    interaction.depth=c(1, 5, 10, 15), 
                    shrinkage=c(0.01, 0.1, 0.5), 
                    n.minobsinnode=c(5, 10, 15))
model_gbm1 <- train(x = X_train,y = y_train, method = 'gbm',tuneGrid = grid, verbose = FALSE)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
## (the warning above is repeated for each resampling iteration; duplicates omitted)
model_gbm1$bestTune
##    n.trees interaction.depth shrinkage n.minobsinnode
## 76     200                10       0.1              5

RANDOM FOREST

set.seed(77)
model_rf3<- train(X_train, y_train, method='rf', tuneLength = 10)
model_rf3
## Random Forest 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE      
##    2    1.334811  0.5276895  1.0719915
##    8    1.254487  0.5561364  0.9842596
##   14    1.237085  0.5602251  0.9599156
##   20    1.238068  0.5549427  0.9565511
##   26    1.236422  0.5521014  0.9517747
##   32    1.240043  0.5464730  0.9510833
##   38    1.244768  0.5418790  0.9518807
##   44    1.251081  0.5360229  0.9552270
##   50    1.258535  0.5292140  0.9582833
##   57    1.268117  0.5218864  0.9645965
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 26.

SINGLE TREE

library(rpart)
set.seed(66)
model_rpart <- train(x= X_train, y= y_train, method="rpart", tuneLength=10, control= rpart.control(maxdepth=2))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
model_rpart
## CART 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ... 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared   MAE     
##   0.01684157  1.516456  0.3578700  1.162605
##   0.02137409  1.516456  0.3578700  1.162605
##   0.02341864  1.516456  0.3578700  1.162605
##   0.02532733  1.516456  0.3578700  1.162605
##   0.02890208  1.516456  0.3578700  1.162605
##   0.03111746  1.516456  0.3578700  1.162605
##   0.04559790  1.517792  0.3567906  1.165215
##   0.06391370  1.532447  0.3473686  1.174197
##   0.09164355  1.543158  0.3396923  1.183760
##   0.39332469  1.687773  0.2730084  1.329875
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.03111746.

Which tree-based regression model gives the optimal resampling and test set performance?

Based on the resampling summary and the test-set results below, gradient boosting gives the best performance: it has the lowest RMSE (and the highest R-squared) both across the resamples and on the held-out test set.

summary(resamples(list(Single_Tree = model_rpart, Random_Forest = model_rf3, 
                       Gradient_Boosting = model_gbm1)))
## 
## Call:
## summary.resamples(object = resamples(list(Single_Tree =
##  model_rpart, Random_Forest = model_rf3, Gradient_Boosting = model_gbm1)))
## 
## Models: Single_Tree, Random_Forest, Gradient_Boosting 
## Number of resamples: 25 
## 
## MAE 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.     Max.
## Single_Tree       0.9721591 1.0916421 1.1544688 1.1626050 1.2438397 1.411900
## Random_Forest     0.8413057 0.9003406 0.9357733 0.9517747 0.9861940 1.187571
## Gradient_Boosting 0.8318969 0.8904796 0.9198531 0.9398576 0.9738667 1.112387
##                   NA's
## Single_Tree          0
## Random_Forest        0
## Gradient_Boosting    0
## 
## RMSE 
##                       Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Single_Tree       1.271720 1.426333 1.524448 1.516456 1.614609 1.732767    0
## Random_Forest     1.066881 1.148314 1.237786 1.236422 1.328218 1.504047    0
## Gradient_Boosting 1.066207 1.164416 1.230343 1.231950 1.269784 1.430956    0
## 
## Rsquared 
##                        Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Single_Tree       0.1885859 0.2973369 0.3602365 0.3578700 0.4259184 0.5371485
## Random_Forest     0.3948908 0.5083560 0.5547292 0.5521014 0.6138288 0.6830667
## Gradient_Boosting 0.3111637 0.5177063 0.5533567 0.5660252 0.6416362 0.7151902
##                   NA's
## Single_Tree          0
## Random_Forest        0
## Gradient_Boosting    0
test_performance <- function(models, testData, testTarget) {
  method <- c()
  res <- data.frame()
  for(model in models){
    method <- c(method, model$method)
    pred <- predict(model, newdata=testData)
    res <- rbind(res, t(postResample(pred=pred, obs=testTarget)))
  }
  row.names(res) <- method
  return(res)
}
# List the models
models <- list(model_rpart, model_rf3, model_gbm1)
# Run the function
performance <- test_performance(models, X_test, y_test)
performance
##            RMSE  Rsquared       MAE
## rpart 1.5647270 0.3765960 1.2801338
## rf    1.1313936 0.7267739 0.8847341
## gbm   0.8727583 0.8195122 0.6368033

Which predictors are most important in the optimal tree-based regression model?

Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?

The importance lists below show that both the gradient boosting and SVM models rely on a mix of biological and manufacturing predictors, whereas the partial least squares (linear) model is dominated primarily by manufacturing predictors. Notably, ManufacturingProcess32 appears as an important variable across all three models.

library(gbm)
## Warning: package 'gbm' was built under R version 4.3.3
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
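The importance ranking for the tuned boosted-tree model itself is not printed below; a hedged one-liner to obtain it for comparison with the PLS and SVM lists:

# Sketch (assumption: this call is not part of the original output)
varImp(model_gbm1)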
model_pls <- train(x = X_train,y = y_train, method='pls', metric='RMSE',
                   tuneLength=20, trControl = trainControl(method='cv'))
(pls_imp = varImp(model_pls))
## Warning: package 'pls' was built under R version 4.3.3
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
## pls variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess14   38.56
## BiologicalMaterial03     37.72
## ManufacturingProcess27   35.46
## ManufacturingProcess35   35.22
## ManufacturingProcess24   34.14
## ManufacturingProcess15   32.40
## ManufacturingProcess28   31.96
## ManufacturingProcess25   30.59
## ManufacturingProcess26   29.40
## ManufacturingProcess06   29.10
## BiologicalMaterial06     28.36
## BiologicalMaterial02     27.71
## ManufacturingProcess04   27.69
## ManufacturingProcess02   21.97
## ManufacturingProcess05   18.61
## ManufacturingProcess33   17.73
## ManufacturingProcess19   15.84
## ManufacturingProcess17   14.69
## ManufacturingProcess09   13.91
set.seed(222)
svm_model <- train(x = X_train,y = y_train,
                        method = "svmRadial",
                        tuneLength=10,
                        preProc = c("center", "scale"))
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## (the two warnings above are repeated for each resampling iteration; duplicates omitted)
(svm_imp = varImp(svm_model))
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## BiologicalMaterial06    100.00
## ManufacturingProcess13   98.34
## ManufacturingProcess32   96.95
## BiologicalMaterial03     81.49
## ManufacturingProcess17   75.31
## BiologicalMaterial02     73.06
## BiologicalMaterial12     71.59
## ManufacturingProcess09   70.96
## ManufacturingProcess31   67.04
## ManufacturingProcess36   66.24
## ManufacturingProcess06   55.48
## BiologicalMaterial04     54.37
## BiologicalMaterial11     53.94
## ManufacturingProcess33   49.60
## BiologicalMaterial08     48.82
## BiologicalMaterial01     46.53
## ManufacturingProcess02   44.43
## ManufacturingProcess29   40.72
## ManufacturingProcess11   36.51
## ManufacturingProcess12   32.98

Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?

Range and impact: the distribution of yield within each terminal node shows the range of outcomes under a given set of conditions, which helps identify which conditions lead to higher or lower yields. Non-linearity: the tree structure can expose threshold-type, non-linear relationships between the predictors and yield that would not be apparent from a linear model. In the tree printed below, the first split is on ManufacturingProcess32, followed by splits on BiologicalMaterial11 and BiologicalMaterial06, and the mean yield rises from about 38.6 in the lowest terminal node to about 42.4 in the highest, so both process and biological predictors shape the yield distribution.

model_rpart$finalModel
## n= 144 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 144 472.70180 40.17111  
##   2) ManufacturingProcess32< 159.5 83 142.65450 39.19699  
##     4) BiologicalMaterial11< 145.075 39  42.89152 38.55615 *
##     5) BiologicalMaterial11>=145.075 44  69.55090 39.76500 *
##   3) ManufacturingProcess32>=159.5 61 144.12200 41.49656  
##     6) BiologicalMaterial06< 51.61 34  60.34664 40.74559 *
##     7) BiologicalMaterial06>=51.61 27  40.45527 42.44222 *
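The plot requested in the question is not shown above; a hedged sketch using partykit (loaded earlier), which draws boxplots of the yield distribution in each terminal node:

# Sketch (assumption: plot not included in the original output)
library(partykit)
plot(as.party(model_rpart$finalModel))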