library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.3.3
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(party)
## Warning: package 'party' was built under R version 4.3.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.3.3
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.3.3
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.3
##
## Attaching package: 'strucchange'
##
## The following object is masked from 'package:stringr':
##
## boundary
##
##
## Attaching package: 'party'
##
## The following object is masked from 'package:dplyr':
##
## where
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.3.3
##
## Attaching package: 'kernlab'
##
## The following object is masked from 'package:modeltools':
##
## prior
##
## The following object is masked from 'package:purrr':
##
## cross
##
## The following object is masked from 'package:ggplot2':
##
## alpha
library(gbm)
## Warning: package 'gbm' was built under R version 4.3.3
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(Cubist)
## Warning: package 'Cubist' was built under R version 4.3.3
library(rpart)
library(ranger)
## Warning: package 'ranger' was built under R version 4.3.3
##
## Attaching package: 'ranger'
##
## The following object is masked from 'package:randomForest':
##
## importance
library(partykit)
## Warning: package 'partykit' was built under R version 4.3.3
## Loading required package: libcoin
## Warning: package 'libcoin' was built under R version 4.3.3
##
## Attaching package: 'partykit'
##
## The following objects are masked from 'package:party':
##
## cforest, ctree, ctree_control, edge_simple, mob, mob_control,
## node_barplot, node_bivplot, node_boxplot, node_inner, node_surv,
## node_terminal, varimp
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree =1000)
rfimp1 <- varImp(model1, scale = FALSE)
rfimp1
simulated$duplicate1 <- simulated$V1 +rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206
set.seed(334)
model2 <- randomForest(y ~ ., data = simulated,
importance = TRUE,
ntree = 1000)
rfImp2 <- varImp(model2, scale = FALSE)
rfImp2
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate2, simulated$V1)
## [1] 0.93512
There is a slight decline in the importance values of all predictors, creating a more balanced distribution. Interestingly, however, the duplicate of V1 currently ranks as the 4th most significant predictor.
model3 <- cforest(y ~ ., data = simulated[, c(1:11)])
rfImp3 <- varimp(model3, conditional = TRUE)
rfImp3
## V1 V2 V3 V4 V5 V6 V7
## 6.2077667 5.2073432 0.1852956 6.1055620 1.3268631 -0.1120681 -0.1247807
## V8 V9 V10
## -0.3175298 -0.3357481 -0.2707247
The importance values follow a similar pattern to those observed in a traditional random forest model, with V1, V2, and V4 emerging as the strongest predictors. Meanwhile, V6 through V10 display lower importance values and would likely be excluded during parameter tuning.
Boosted Trees
gbmGrid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
n.trees = seq(100, 1000, by = 50),
shrinkage = c(0.01, 0.1),
n.minobsinnode = 10)
set.seed(334)
gbmTune <- train(y ~ ., data = simulated[, c(1:11)],
method = "gbm",
tuneGrid = gbmGrid,
verbose = FALSE)
rfImp4 <- varImp(gbmTune$finalModel, scale = FALSE)
rfImp4
Cubist
set.seed(334)
cubistTuned <- train(y ~ ., data = simulated[, c(1:11)], method = "cubist")
rfImp5 <- varImp(cubistTuned$finalModel, scale = FALSE)
rfImp5
A similar pattern appears in both Boosted Trees and Cubist models, where V1, V3, and V4 consistently stand out as the most important predictors.
set.seed(21)
a <- sample(1:10 / 10, 500, replace = TRUE)
b <- sample(1:100 / 100, 500, replace = TRUE)
c <- sample(1:1000 / 1000, 500, replace = TRUE)
d <- sample(1:10000 / 10000, 500, replace = TRUE)
e <- sample(1:100000 / 100000, 500, replace = TRUE)
y <- a + b + c + d + e
simData <- data.frame(a, b, c, d, e, y)
# Fit Decision tree model
rpartTree <- rpart(y ~ ., data = simData)
tree_party <- as.party(rpartTree)
plot(tree_party, gp = gpar(fontsize = 7))
The model on the right appears to place greater emphasis on a small number of predictors, likely due to overfitting. This behavior is associated with its higher learning rate and bagging fraction. The learning rate determines the proportion of the current predicted value added to the previous iteration’s prediction, influencing the model’s speed of learning. The bagging fraction represents the percentage of training data utilized by the model.
With a higher learning rate, the model on the right increases the weight of each predictor, resulting in concentrated importance. In contrast, the model on the left has a lower learning rate and bagging fraction, leading to slower learning and better performance. By using a smaller subset of data, the model distributes importance more evenly across predictors rather than heavily weighting a select few.
The model on the left is likely to perform better at predicting new samples because it undergoes more iterations. With more iterations, the weight assigned to each individual predictor is reduced, allowing the model to generalize more effectively. As a result, the model on the left achieves greater overall accuracy.
Interaction depth refers to the number of splits made in a tree or the maximum number of nodes in each tree. As the interaction depth increases, the importance of each predictor also rises, causing less significant predictors to have more evenly distributed weights. Consequently, the slope of predictor importance for either model becomes steeper or shows a greater increase.
data(ChemicalManufacturingProcess)
# Utilizing Knn imputation
impute <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
imputed <- predict(impute, ChemicalManufacturingProcess)
imputed <- imputed %>% select_at(vars(-one_of(nearZeroVar(., names = TRUE))))
set.seed(22)
X <- dplyr::select(imputed, -Yield)
y <- imputed$Yield
index <- createDataPartition(y, p = .8, list = FALSE)
train_X <- X[index, ] %>% as.matrix()
test_X <- X[-index, ] %>% as.matrix()
train_y <- y[index]
test_y <- y[-index]
Bagged Model
set.seed(17)
bag_setup = bagControl(fit = ctreeBag$fit, predict = ctreeBag$pred, aggregate = ctreeBag$aggregate)
bag_model_fit <- train(x = train_X, y = train_y, method="bag", bagControl = bag_setup,
center = TRUE,
scale = TRUE,
trControl = trainControl("cv", number = 10),importance = "permutation",
tuneLength = 25)
## Warning: executing %dopar% sequentially: no parallel backend registered
bag_pred <- predict(bag_model_fit, test_X)
postResample(bag_pred, test_y)
## RMSE Rsquared MAE
## 0.6213009 0.4955786 0.4969667
Random Forest
set.seed(17)
rf_model_fit <- train(x = train_X, y = train_y, method = "ranger",
preProcess = c("center", "scale"),
trControl = trainControl(method = "cv", number = 10),
importance = "permutation",
tuneLength = 25)
rf_pred <- predict(rf_model_fit, test_X)
postResample(rf_pred, test_y)
## RMSE Rsquared MAE
## 0.5387580 0.6069437 0.4343783
GBM Model
grid_params <- expand.grid(n.trees=c(50, 100),
interaction.depth=c(1, 5, 10),
shrinkage=c(0.01, 0.1, 0.2),
n.minobsinnode=c(5, 10))
gmb_model_fit<- train(x = train_X, y = train_y,
method = 'gbm',
tuneGrid = grid_params,
verbose = FALSE)
gmb_pred <- predict(gmb_model_fit, test_X)
postResample(gmb_pred, test_y)
## RMSE Rsquared MAE
## 0.4912689 0.6838583 0.4018570
plot(varImp(gmb_model_fit),
top=10,
main="Top 10 Important Features with GBM Model")
plot(varImp(rf_model_fit),
top=10,
main="Top 10 Important Features with Random Forest Model")
plot(varImp(bag_model_fit),
top=10,
main="Top 10 Important Features with Bagging Model")
Interestingly, GBM, which was the best-performing model, shows the highest ratio of Manufacturing Process predictors to Biological Materials at 7:3. Random Forest and Bagging both have an even split of 5:5 predictors. Since the optimal linear and non-linear models previously highlighted more Manufacturing Process predictors in their top 10, a clear trend emerges: relying on Manufacturing Process predictors tends to result in better-performing models. Additionally, it’s worth noting that across all models in this dataset, Manufacturing Process 32 consistently ranks as the top predictor.
model <- train(x = train_X, y = train_y, method = "rpart",
trControl = trainControl("cv", number = 10),
tuneLength = 10)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
finalRpartModel <- model$finalModel
tree_party <- as.party(finalRpartModel)
plot(tree_party, main = "Optimal Single Tree Plot", gp = gpar(fontsize = 7))
This perspective helps illustrate how Manufacturing Process predictors continue to dominate after MP32. Specifically, when the value is either below or above 0.192, it diverges: values under 0.192 lean toward Biological Material 11, while those above 0.192 align with Manufacturing Process 06. Notably, the best outcomes seem to occur when the model incorporates three Manufacturing Process predictors.