library(tidyverse)
library(mlbench)
library(randomForest)
library(caret)
library(rpart)
library(rpart.plot)
library(party)
library(partykit)
library(gbm)
library(AppliedPredictiveModeling)
library(imager)
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
Fit a random forest model to all of the predictors, then estimate the variable importance scores:
model1 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
Did the random forest model significantly use the uninformative predictors (V6 – V10)?
rfImp1
## Overall
## V1 8.732235404
## V2 6.415369387
## V3 0.763591825
## V4 7.615118809
## V5 2.023524577
## V6 0.165111172
## V7 -0.005961659
## V8 -0.166362581
## V9 -0.095292651
## V10 -0.074944788
Now add an additional predictor that is highly correlated with one of the informative predictors. For example:
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206
Fit another random forest model to these data. Did the importance score for V1 change? What happens when you add another predictor that is also highly correlated with V1?
model2 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
## Overall
## V1 5.69119973
## V2 6.06896061
## V3 0.62970218
## V4 7.04752238
## V5 1.87238438
## V6 0.13569065
## V7 -0.01345645
## V8 -0.04370565
## V9 0.00840438
## V10 0.02894814
## duplicate1 4.28331581
Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?
set.seed(1227)
m_cforest <- cforest(y ~ ., data=simulated[, c(1:11)])
set.seed(1227)
# unconditional
cfimp3 <- varimp(m_cforest) |> sort(decreasing=TRUE)
set.seed(1227)
# conditional
cfimp4 <- varimp(m_cforest, conditional = TRUE) |> sort(decreasing=TRUE)
set.seed(1227)
# Putting It Altogether
as.data.frame(cbind(Random_Forest2 = rfImp1, Conditional = cfimp3, Unconditional = cfimp4))
## Overall Conditional Unconditional
## V1 8.732235404 7.95164878 6.385045404
## V2 6.415369387 7.53775032 6.027948668
## V3 0.763591825 6.08973017 5.042993958
## V4 7.615118809 2.08084399 1.371959664
## V5 2.023524577 0.17051232 0.002682479
## V6 0.165111172 0.07162336 -0.181682023
## V7 -0.005961659 0.03530489 -0.252321774
## V8 -0.166362581 -0.10150647 -0.286870080
## V9 -0.095292651 -0.14927734 -0.375866134
## V10 -0.074944788 -0.16181598 -0.429541068
Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?
# Boosted Trees
set.seed(1227)
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
n.trees = seq(100, 1000, by = 50),
shrinkage = c(0.01, 0.1),
n.minobsinnode = 10)
set.seed(101)
gbm_model_sim <- train(y ~ ., data = simulated[, c(1:11)],
method = "gbm",
tuneGrid = gbmGrid,
verbose = FALSE)
rfImp3 <- varImp(gbm_model_sim$finalModel, scale = FALSE)
rfImp3
## Overall
## V1 40589.8688
## V2 34914.4092
## V3 11629.5115
## V4 42462.4546
## V5 17658.3155
## V6 1535.5005
## V7 1633.9422
## V8 767.1982
## V9 975.7857
## V10 699.5149
# Cubist
set.seed(1227)
cubist_model_sim <- train(y ~ ., data = simulated[, c(1:11)], method = "cubist")
rfImp4 <- varImp(cubist_model_sim$finalModel, scale = FALSE)
rfImp4
## Overall
## V1 72.0
## V3 42.0
## V2 54.5
## V4 49.0
## V5 40.0
## V6 11.0
## V7 0.0
## V8 0.0
## V9 0.0
## V10 0.0
set.seed(1234)
hw9df = as.data.frame(cbind(rfImp3, rfImp4, rfImp1))
names(hw9df) = c("Boosted", "Cubist", "Random Forest")
hw9df
## Boosted Cubist Random Forest
## V1 40589.8688 72.0 8.732235404
## V2 34914.4092 42.0 6.415369387
## V3 11629.5115 54.5 0.763591825
## V4 42462.4546 49.0 7.615118809
## V5 17658.3155 40.0 2.023524577
## V6 1535.5005 11.0 0.165111172
## V7 1633.9422 0.0 -0.005961659
## V8 767.1982 0.0 -0.166362581
## V9 975.7857 0.0 -0.095292651
## V10 699.5149 0.0 -0.074944788
set.seed(227)
x1 <- sample(1:10 / 10, 500, replace = TRUE)
x2 <- sample(1:100 / 100, 500, replace = TRUE)
x3 <- sample(1:1000 / 1000, 500, replace = TRUE)
x4 <- sample(1:10000 / 10000, 500, replace = TRUE)
x5 <- sample(1:100000 / 100000, 500, replace = TRUE)
y <- x1 + x2 + x3 + x4 + x5
dfsim <- data.frame(x1,x2,x3,x4,x5,y)
rpartTree <- rpart(y ~ ., data = dfsim)
plot(as.party(rpartTree), gp = gpar(fontsize = 9))
im = load.image("C:/Users/vitug/OneDrive/Desktop/CUNY Masters/DATA_624/8.3.png")
plot(im)
Why does the model on the right focus its importance on just the first few of predictors, whereas the model on the left spreads importance across more predictors?
Which model do you think would be more predictive of other samples?
How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?
# Pre-Processing
data(ChemicalManufacturingProcess)
set.seed(246)
model_87 <- preProcess(ChemicalManufacturingProcess, "bagImpute")
df_model87 <- predict(model_87, ChemicalManufacturingProcess)
set.seed(246)
df2_model87 <- df_model87[,-nearZeroVar(df_model87)]
dim(df2_model87)
## [1] 176 57
# Training
set.seed(246)
train_data <- createDataPartition(df2_model87$Yield, times = 1, p = 0.8, list = FALSE)
x_train <- df2_model87[train_data, -1]
y_train <- df2_model87[train_data, 1]
x_test <- df2_model87[-train_data, -1]
y_test <- df2_model87[-train_data, 1]
set.seed(1227)
random_model <- randomForest(x_train, y_train,
importance = TRUE,
ntree = 1000)
randomPred <- predict(random_model, x_test)
forest_Model <- postResample(randomPred, y_test)
forest_Model
## RMSE Rsquared MAE
## 1.2265782 0.6827148 0.8937010
set.seed(1227)
cube_model <- train(x_train, y_train,
method = "cubist")
cubepred <- predict(cube_model, x_test)
cubist_model <- postResample(cubepred, y_test)
cubist_model
## RMSE Rsquared MAE
## 1.1144185 0.7241123 0.7404627
set.seed(1227)
single_model <- train(x_train, y_train,
method = "rpart",
tuneLength = 10,
trControl = trainControl(method = "cv"))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
singlepred <- predict(single_model, x_test)
tree_model <- postResample(singlepred, y_test)
tree_model
## RMSE Rsquared MAE
## 1.499437 0.498557 1.126403
Which tree-based regression model gives the optimal resampling and test set performance?
Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
varImp(cube_model)
## cubist variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess33 49.091
## BiologicalMaterial03 42.727
## ManufacturingProcess17 39.091
## ManufacturingProcess09 38.182
## ManufacturingProcess31 29.091
## ManufacturingProcess04 28.182
## BiologicalMaterial02 20.909
## BiologicalMaterial06 15.455
## ManufacturingProcess29 15.455
## BiologicalMaterial09 14.545
## BiologicalMaterial08 13.636
## ManufacturingProcess11 12.727
## BiologicalMaterial04 11.818
## ManufacturingProcess14 10.909
## ManufacturingProcess13 10.909
## ManufacturingProcess10 10.000
## ManufacturingProcess39 10.000
## ManufacturingProcess16 9.091
## BiologicalMaterial05 9.091
set.seed(1234)
cubist_model_importance <- varImp(cube_model)$importance |>
as.data.frame() |>
rownames_to_column("Variable") |>
top_n(10) |>
arrange(desc(Overall)) |>
mutate(importance = row_number())
## Selecting by Overall
set.seed(1234)
varImp(cube_model) %>%
plot(., top = max(cubist_model_importance$importance), main = "Top Ten Informative Predictors of Cubist Model")
Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
single_model$finalModel
## n= 144
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 144 457.677200 40.17465
## 2) ManufacturingProcess32< 159.5 83 142.953900 39.22349
## 4) BiologicalMaterial11< 145.565 43 44.811320 38.54860
## 8) ManufacturingProcess34< 2.509021 36 29.626320 38.29722 *
## 9) ManufacturingProcess34>=2.509021 7 1.210286 39.84143 *
## 5) BiologicalMaterial11>=145.565 40 57.502760 39.94900
## 10) ManufacturingProcess11< 9.95 29 17.689100 39.40034 *
## 11) ManufacturingProcess11>=9.95 11 8.069473 41.39545 *
## 3) ManufacturingProcess32>=159.5 61 137.460800 41.46885
## 6) ManufacturingProcess09< 44.93 12 19.518090 40.04417 *
## 7) ManufacturingProcess09>=44.93 49 87.621050 41.81776
## 14) ManufacturingProcess06< 208.1 29 50.181820 41.32690
## 28) ManufacturingProcess04< 933.5 21 28.509320 40.90190 *
## 29) ManufacturingProcess04>=933.5 8 7.922950 42.44250 *
## 15) ManufacturingProcess06>=208.1 20 20.320290 42.52950 *
set.seed(1234)
rpart.plot::rpart.plot(single_model$finalModel, box.palette = "RdBu", shadow.col = "orange", nn=TRUE)