library(tidyverse)
library(randomForest)
library(caret)
library(mlbench)
library(rpart)
library(rpart.plot)
library(party)
library(gbm)
library(Cubist)
library(ipred)
library(partykit)
library(kableExtra)

8.1. Recreate the simulated data from Exercise 7.2:

library(mlbench)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"

(a) Fit a random forest model to all of the predictors, then estimate the variable importance scores:

library(randomForest)
library(caret)
model1 <- randomForest(y ~ ., data = simulated,
                       importance = TRUE,
                       ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)

Did the random forest model significantly use the uninformative predictors (V6 – V10)?

rfImp1
##         Overall
## V1   8.62743275
## V2   6.27437240
## V3   0.72305459
## V4   7.50258584
## V5   2.13575650
## V6   0.12395003
## V7   0.02927888
## V8  -0.11724317
## V9  -0.10344797
## V10  0.04312556

It doesn't appear that the random forest model made significant use of the uninformative predictors V6-V10: their importance scores are near zero (two are slightly negative), well below the scores of the informative predictors V1-V5.

(b) Now add an additional predictor that is highly correlated with one of the informative predictors. For example:

set.seed(1234)
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.941183

Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?

set.seed(1234)
model2 <- randomForest(y ~ ., data=simulated, importance=TRUE, ntree=1000)
rfImp2 <- varImp(model2, scale=FALSE)
rfImp2
##                 Overall
## V1          6.010787780
## V2          6.245656504
## V3          0.696884334
## V4          6.780480810
## V5          2.044165983
## V6          0.144665983
## V7          0.052455477
## V8         -0.044711728
## V9         -0.019943688
## V10        -0.007454706
## duplicate1  3.954858477

The importance score for V1 decreased from about 8.63 in the first model to about 6.01 in the second, so adding duplicate1, which is highly correlated with V1, diluted V1's importance. duplicate1 itself received a relatively high score, ranking fourth at about 3.95.
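
The exercise also asks what happens when yet another predictor that is highly correlated with V1 is added. Here is a minimal sketch of that follow-up, reusing the objects above (duplicate2, model2b, and rfImp2b are my own names); the expectation is that V1's importance is diluted even further as the correlated predictors split the credit among themselves.

# Add a second predictor that is also highly correlated with V1
set.seed(4321)
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate2, simulated$V1)

# Refit the random forest and inspect V1 and its two correlated copies
set.seed(1234)
model2b <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp2b <- varImp(model2b, scale = FALSE)
rfImp2b[c("V1", "duplicate1", "duplicate2"), , drop = FALSE]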

(c) Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?

set.seed(1234)
model_cforest <- cforest(y ~ ., data=simulated[, c(1:11)])


set.seed(1234)
# Without conditional
cfimp3 <- varimp(model_cforest) |> sort(decreasing=TRUE) 

set.seed(1234)
# With conditional
cfimp4 <- varimp(model_cforest, conditional = TRUE) |> sort(decreasing=TRUE)  

set.seed(1234)
# Putting it all together: cfimp3 is the unconditional (traditional) importance,
# cfimp4 the conditional version
as.data.frame(cbind(Random_Forest = rfImp1, Unconditional = cfimp3, Conditional = cfimp4))
##         Overall Unconditional Conditional
## V1   8.62743275  8.62393856    6.20618365
## V2   6.27437240  7.43159970    6.19715850
## V3   0.72305459  6.19784031    5.19915216
## V4   7.50258584  2.09293290    1.42316671
## V5   2.13575650  0.17857580    0.02138927
## V6   0.12395003  0.16618590   -0.07948875
## V7   0.02927888  0.13835424   -0.14755542
## V8  -0.11724317  0.07919501   -0.18758703
## V9  -0.10344797 -0.15857574   -0.19893301
## V10  0.04312556 -0.16615550   -0.21970304

For the uninformative predictors V6-V10, the cforest importances (both unconditional and conditional) follow the same pattern as the traditional random forest: their scores are much lower than those of V1-V5. One apparent difference is V3, which had a low importance score in the traditional random forest but looks relatively high in the cforest columns above; note, however, that cfimp3 and cfimp4 were sorted before being combined, so the name-aligned table sketched below is a safer basis for that comparison.
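
Here is a minimal sketch of a name-aligned version of the same comparison (imp_compare is my own name; it only reindexes the vectors already computed above, so each row refers to the same variable).

# Align the cforest importances to the predictor order of rfImp1 so that
# every row of the comparison table refers to the same variable
imp_compare <- data.frame(
  Random_Forest = rfImp1$Overall,
  Unconditional = cfimp3[rownames(rfImp1)],
  Conditional   = cfimp4[rownames(rfImp1)],
  row.names     = rownames(rfImp1)
)
imp_compare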

(d) Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?

Boosted Trees

set.seed(1234)
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                       n.trees = seq(100, 1000, by = 50),
                       shrinkage = c(0.01, 0.1),
                       n.minobsinnode = 10)
set.seed(100)

gbm_model_sim <- train(y ~ ., data = simulated[, c(1:11)],
                 method = "gbm",
                 tuneGrid = gbmGrid,
                 verbose = FALSE)

rfImp3 <- varImp(gbm_model_sim$finalModel, scale = FALSE)

rfImp3
##       Overall
## V1  4634.0234
## V2  4316.6044
## V3  1310.7178
## V4  4287.0793
## V5  1844.2147
## V6   416.3966
## V7   368.9383
## V8   224.4769
## V9   246.1400
## V10  250.6263

Cubist

set.seed(1234)
cubist_model_sim <- train(y ~ ., data = simulated[, c(1:11)], method = "cubist")
rfImp4 <- varImp(cubist_model_sim$finalModel, scale = FALSE)
rfImp4
##     Overall
## V1     72.0
## V3     42.0
## V2     54.5
## V4     49.0
## V5     40.0
## V6     11.0
## V7      0.0
## V8      0.0
## V9      0.0
## V10     0.0
set.seed(1234)
# Combine the three importance tables, matching rows by predictor name
df <- data.frame(Boosted = rfImp3[rownames(rfImp1), "Overall"],
                 Cubist = rfImp4[rownames(rfImp1), "Overall"],
                 "Random Forest" = rfImp1$Overall,
                 row.names = rownames(rfImp1), check.names = FALSE)
df
##       Boosted Cubist Random Forest
## V1  4634.0234   72.0    8.62743275
## V2  4316.6044   54.5    6.27437240
## V3  1310.7178   42.0    0.72305459
## V4  4287.0793   49.0    7.50258584
## V5  1844.2147   40.0    2.13575650
## V6   416.3966   11.0    0.12395003
## V7   368.9383    0.0    0.02927888
## V8   224.4769    0.0   -0.11724317
## V9   246.1400    0.0   -0.10344797
## V10  250.6263    0.0    0.04312556

The Cubist model comes close to following the same pattern: V6-V10 receive lower importance scores than V1-V5, although V6 gets a small nonzero score (11) where the other uninformative predictors are zero. The boosted model reports importance on a different scale (the total reduction in squared error attributed to each predictor), so its scores are much larger in absolute terms than those of the Cubist and random forest models and are not directly comparable. Within the boosted model, however, the pattern holds: the uninformative predictors V6-V10 receive much lower importance than V1-V5.
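
Since the three methods report importance on different scales, one option before comparing them is to rescale each column to a common 0-100 range. Here is a minimal sketch of that presentation step, applied to the df built above (rescale_0_100 is my own helper; it changes nothing about the models, only the display).

# Rescale each importance column to 0-100 so the three methods can be
# compared on a common footing
rescale_0_100 <- function(x) 100 * (x - min(x)) / (max(x) - min(x))
df_scaled <- data.frame(lapply(df, rescale_0_100),
                        row.names = rownames(df), check.names = FALSE)
round(df_scaled, 1)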

8.2. Use a simulation to show tree bias with different granularities.

set.seed(1234)
X1 <- sample(1:10000 / 10000, 500, replace = TRUE) # 10,000 distinct values
X2 <- sample(1:1000 / 1000, 500, replace = TRUE)   # 1,000 distinct values
X3 <- sample(1:100 / 100, 500, replace = TRUE)     # 100 distinct values
X4 <- sample(1:10 / 10, 500, replace = TRUE)       # 10 distinct values
X5 <- sample(1:5 / 5, 500, replace = TRUE)         # 5 distinct values
X6 <- rep(1, 500)                                  # constant: only 1 distinct value

Y <- X1 + X2 + X3 + X4 + X5 + X6

sim_df <- data.frame(X1, X2, X3, X4, X5, X6, Y)

# Rpart Model
rpart_model <- rpart(Y ~., data=sim_df)
varImp(rpart_model)
##     Overall
## X1 3.487031
## X2 1.861743
## X3 4.506291
## X4 2.723413
## X5 4.221764
## X6 0.000000

X3 had the highest importance score (about 4.51), followed by X5 and X1, so in this run the most finely grained predictor did not come out on top. X6 had an importance score of zero, which is expected: it is constant, so the tree has no possible split points for it. A cleaner way to isolate the granularity bias is to remove the signal entirely, as in the sketch after the tree plot below.

rpart.plot::rpart.plot(rpart_model, box.palette = "RdBu", shadow.col = "blue", nn = TRUE)
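
To isolate the granularity bias from the signal itself, here is a complementary sketch (my own choice of two predictors, sample size, and a noise-only response): when the response is pure noise, any preference the tree shows for one predictor can only come from its number of distinct values.

# Record which predictor rpart picks for the very first split across many
# runs in which the response is independent of both predictors
set.seed(4321)
first_split <- replicate(200, {
  dat <- data.frame(
    X1 = sample(1:10000 / 10000, 100, replace = TRUE), # fine-grained
    X2 = sample(1:10 / 10, 100, replace = TRUE),       # coarse
    Y  = rnorm(100)                                    # pure noise
  )
  fit <- rpart(Y ~ ., data = dat,
               control = rpart.control(maxdepth = 1, cp = 0, xval = 0))
  as.character(fit$frame$var[1])  # "<leaf>" would mean no split was made
})
table(first_split)  # expected to favor the fine-grained X1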

8.3 In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:

knitr::include_graphics("/Users/mohamedhassan/Documents/HW9_Image.png")

(a) Why does the model on the right focus its importance on just the first few of predictors, whereas the model on the left spreads importance across more predictors?

With the bagging fraction and learning rate both set to 0.9, each tree is fit to nearly the whole training set and each tree's contribution is only lightly shrunk, so the ensemble commits quickly to the strongest predictors and their splits dominate; importance is therefore concentrated on the first few predictors while the rest are largely ignored. With both parameters at 0.1, each tree sees a different small subsample and contributes only a small step toward the minimum of the gradient, so many more trees, and many more predictors, participate before the fit is complete, spreading importance across more predictors.
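
The sketch below illustrates this contrast on the solubility data by fitting gbm directly at the two extremes and measuring how concentrated the relative influence is. The number of trees, the interaction depth, and the top10_share helper are my own choices, not the exact setup behind Figure 8.24.

library(gbm)
library(AppliedPredictiveModeling)
data(solubility) # provides solTrainXtrans (predictors) and solTrainY (outcome)
solTrain <- cbind(solTrainXtrans, Solubility = solTrainY)

set.seed(100)
gbm_low  <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
                n.trees = 500, interaction.depth = 2,
                shrinkage = 0.1, bag.fraction = 0.1, verbose = FALSE)
set.seed(100)
gbm_high <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
                n.trees = 500, interaction.depth = 2,
                shrinkage = 0.9, bag.fraction = 0.9, verbose = FALSE)

# Share of the total relative influence captured by the top 10 predictors;
# a value near 1 means importance is concentrated on a few predictors
top10_share <- function(fit) {
  ri <- summary(fit, plotit = FALSE)$rel.inf
  sum(sort(ri, decreasing = TRUE)[1:10]) / sum(ri)
}
c(low_0.1 = top10_share(gbm_low), high_0.9 = top10_share(gbm_high))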

(b) Which model do you think would be more predictive of other samples?

I think the model on the left would be more predictive of other samples, since smaller values of the bagging fraction and learning rate reduce the chance of overfitting. The model on the left also spreads weight across more variables rather than concentrating it on only a few, as the model on the right does. With its low learning rate and bagging fraction, the model on the left is less sensitive to the idiosyncrasies of any individual tree, allowing it to generalize more effectively.

(c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?

Increasing the interaction depth would give more weight to variables that initially had low importance: deeper trees involve more predictors in each boosting iteration, so importance is spread over a larger set of variables, at the cost of more computation and a higher risk of overfitting. As a result, the slope of the predictor importance plot would flatten, that is, decrease, for both models.
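
A quick way to check this (again a sketch with my own parameter choices rather than the book's setup) is to compare how many predictors receive a non-trivial share of the relative influence at a shallow versus a deep interaction depth on the solubility data.

library(gbm)
library(AppliedPredictiveModeling)
data(solubility)
solTrain <- cbind(solTrainXtrans, Solubility = solTrainY)

set.seed(100)
gbm_shallow <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
                   n.trees = 500, interaction.depth = 1,
                   shrinkage = 0.1, bag.fraction = 0.5, verbose = FALSE)
set.seed(100)
gbm_deep    <- gbm(Solubility ~ ., data = solTrain, distribution = "gaussian",
                   n.trees = 500, interaction.depth = 10,
                   shrinkage = 0.1, bag.fraction = 0.5, verbose = FALSE)

# Count how many predictors hold at least 1% of the total relative influence;
# a flatter importance profile means more predictors clear this bar
count_used <- function(fit) sum(summary(fit, plotit = FALSE)$rel.inf >= 1)
c(depth_1 = count_used(gbm_shallow), depth_10 = count_used(gbm_deep))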

Source Consulted: https://bradleyboehmke.github.io/HOML/gbm.html

8.7 Refer to Exercises 6.3 and 7.5 which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models:

library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)

Preprocessing

set.seed(123)

bag_model <- preProcess(ChemicalManufacturingProcess, "bagImpute")
bag_df_model <- predict(bag_model, ChemicalManufacturingProcess)
set.seed(123)
bag_df_model2 <- bag_df_model[,-nearZeroVar(bag_df_model)]
dim(bag_df_model2)
## [1] 176  57
set.seed(123)
train_data <- createDataPartition(bag_df_model2$Yield, times = 1, p = 0.8, list = FALSE)

x_train <- bag_df_model2[train_data, -1]
y_train <- bag_df_model2[train_data, 1]

x_test <- bag_df_model2[-train_data, -1]
y_test <- bag_df_model2[-train_data, 1]

Single Decision Tree

set.seed(1234)

cart_model <- train(x_train, y_train,
                  method = "rpart",
                  tuneLength = 10,
                  trControl = trainControl(method = "cv"))

cartPred <- predict(cart_model, x_test)

CART_Model <- postResample(cartPred, y_test)
CART_Model
##     RMSE Rsquared      MAE 
## 1.777762 0.207447 1.310021

Random Forest

set.seed(1234)
rf_model <- randomForest(x_train, y_train, 
                        importance = TRUE,
                        ntree = 1000)


rfPred <- predict(rf_model, x_test)

RF_Model <- postResample(rfPred, y_test)
RF_Model
##      RMSE  Rsquared       MAE 
## 1.2372766 0.5832409 0.9556687

Gradient Boosting

set.seed(1234)
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                       n.trees = seq(100, 1000, by = 50),
                       shrinkage = c(0.01, 0.1),
                       n.minobsinnode = 10)
set.seed(1234)

gb_model <- train(x_train, y_train,
                 method = "gbm",
                 tuneGrid = gbmGrid,
                 verbose = FALSE)

gb_Pred <- predict(gb_model, x_test)

Grad_Boost_Model <- postResample(gb_Pred, y_test)
Grad_Boost_Model
##      RMSE  Rsquared       MAE 
## 1.1378764 0.6190091 0.8366554

Bagged Trees

set.seed(1234)
bagged_tree_model <- ipredbagg(y_train, x_train)
baggedPred <- predict(bagged_tree_model, x_test)

Bagged_Model <- postResample(baggedPred, y_test)
Bagged_Model
##      RMSE  Rsquared       MAE 
## 1.3830582 0.4308972 1.0359350

Cubist

set.seed(1234)
cubist_model <- train(x_train, y_train, 
                     method = "cubist")

cubistPred <- predict(cubist_model, x_test)

Cubist_Model <- postResample(cubistPred, y_test)
Cubist_Model
##      RMSE  Rsquared       MAE 
## 0.9338337 0.7469012 0.7573570
set.seed(1234)
df_models <- as.data.frame(rbind(CART_Model, RF_Model, Grad_Boost_Model, Bagged_Model, Cubist_Model))

df_models %>%
  arrange(desc(Rsquared))
##                       RMSE  Rsquared       MAE
## Cubist_Model     0.9338337 0.7469012 0.7573570
## Grad_Boost_Model 1.1378764 0.6190091 0.8366554
## RF_Model         1.2372766 0.5832409 0.9556687
## Bagged_Model     1.3830582 0.4308972 1.0359350
## CART_Model       1.7777623 0.2074470 1.3100214

(a) Which tree-based regression model gives the optimal resampling and test set performance?

The Cubist model gives the best test set performance, with the highest R-squared (0.747) and the lowest RMSE (0.934) and MAE (0.757) of the five models. Its resampled performance can also be compared with the other caret-trained models, as sketched below.
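
Part (a) also asks about resampling performance, and the table above is test-set only. Here is a minimal sketch that pulls the resampled performance at each model's selected tuning parameters for the caret-trained fits (the randomForest and ipredbagg models were not fit through train(), so they are left out).

# Resampled performance (bootstrap or cross-validation, as used by each
# train() call) at the selected tuning parameters
rbind(
  CART   = getTrainPerf(cart_model),
  GBM    = getTrainPerf(gb_model),
  Cubist = getTrainPerf(cubist_model)
)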

(b) Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?

varImp(cubist_model)
## cubist variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess17   85.71
## ManufacturingProcess13   36.73
## ManufacturingProcess09   36.73
## BiologicalMaterial06     35.71
## BiologicalMaterial02     30.61
## ManufacturingProcess04   26.53
## BiologicalMaterial12     24.49
## ManufacturingProcess11   23.47
## ManufacturingProcess33   22.45
## BiologicalMaterial04     20.41
## ManufacturingProcess39   19.39
## ManufacturingProcess26   19.39
## ManufacturingProcess37   17.35
## ManufacturingProcess14   16.33
## ManufacturingProcess25   15.31
## ManufacturingProcess28   13.27
## ManufacturingProcess45   12.24
## ManufacturingProcess29   12.24
## ManufacturingProcess27   12.24
set.seed(1234)
cubist_model_importance <- varImp(cubist_model)$importance |>
  as.data.frame() |>
  rownames_to_column("Variable") |>
  top_n(10) |>
  arrange(desc(Overall)) |>
  mutate(importance = row_number())

set.seed(1234)
varImp(cubist_model) %>%
  plot(., top = max(cubist_model_importance$importance), main = "Top Ten Informative Predictors of Cubist Model")

set.seed(1234)
cubist_model_importance |>
  mutate(Variable = str_replace_all(Variable, "[0-9]+", "")) |>
  group_by(Variable) |>
  count() |>
  arrange(desc(n)) |>
  kbl() |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE, latex_options = "scale_down")
Variable             n
ManufacturingProcess 7
BiologicalMaterial   3

knitr::include_graphics("/Users/mohamedhassan/Downloads/Linear_Model.png")

knitr::include_graphics("/Users/mohamedhassan/Downloads/SVM_Model_Nonlinear.png")


The top 10 predictors in the Cubist model differ from the top 10 predictors in the linear and nonlinear models. The top ten predictors in the linear and nonlinear models each have importance scores of at least 70, while the Cubist model has only two predictors with scores above 70. In all three models, ManufacturingProcess32 has the highest importance score. The Cubist model has 7 process variables among its top ten predictors, while the linear and nonlinear models each have 6 process variables among theirs.

(c) Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?

cart_model$finalModel
## n= 144 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 144 490.338100 40.19250  
##    2) ManufacturingProcess32< 159.5 87 156.136500 39.22115  
##      4) ManufacturingProcess17>=33.85 59  64.242560 38.73797  
##        8) BiologicalMaterial03< 66.375 31  34.696570 38.20548  
##         16) BiologicalMaterial05>=19.795 7   5.925543 36.97714 *
##         17) BiologicalMaterial05< 19.795 24  15.128760 38.56375 *
##        9) BiologicalMaterial03>=66.375 28  11.024920 39.32750 *
##      5) ManufacturingProcess17< 33.85 28  49.094590 40.23929  
##       10) BiologicalMaterial11< 145.84 18  16.821630 39.54611 *
##       11) BiologicalMaterial11>=145.84 10   8.056210 41.48700 *
##    3) ManufacturingProcess32>=159.5 57 126.825400 41.67509  
##      6) BiologicalMaterial06< 51.61 31  56.585800 40.97000  
##       12) ManufacturingProcess23>=2.5 20  16.146380 40.35900 *
##       13) ManufacturingProcess23< 2.5 11  19.397690 42.08091 *
##      7) BiologicalMaterial06>=51.61 26  36.452630 42.51577  
##       14) ManufacturingProcess09< 47.155 19  12.897720 42.11789 *
##       15) ManufacturingProcess09>=47.155 7  12.383170 43.59571 *
set.seed(1234)
rpart.plot::rpart.plot(cart_model$finalModel, box.palette = "RdBu", shadow.col = "blue", nn=TRUE)

This visualization shows which type of predictor (biological or process) drives each split, the cutoff value for each split, and the mean yield in each resulting node. For example, the tree splits first on ManufacturingProcess32 at about 160: below that value the predicted yield is about 39, and at or above it the predicted yield is about 42. Within the right-hand branch, the next split is on BiologicalMaterial06 at about 52: below 52 the predicted yield is about 41, and above it about 42.5. Reading the splits down to the terminal nodes in this way helps explain how the biological and process variables relate to the amount of yield produced; the boxplot version sketched below adds the full distribution of yield in each terminal node rather than just the mean.
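
rpart.plot shows the mean yield in each terminal node but not the distribution that the question asks about. Here is a minimal sketch of one way to get it (best_cp and cart_refit are my own names; the refit uses the cp value selected by train(), so it should reproduce the same tree): converting the fit to a partykit object draws a boxplot of Yield in every terminal node.

# Refit the selected CART model with the data stored in the object, then
# convert it to a party object; plot() shows boxplots of Yield in the
# terminal nodes
best_cp <- cart_model$bestTune$cp
cart_refit <- rpart(y_train ~ ., data = data.frame(x_train, y_train),
                    control = rpart.control(cp = best_cp), model = TRUE)
plot(partykit::as.party(cart_refit),
     main = "CART terminal-node distributions of Yield")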