library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(base)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
library(AppliedPredictiveModeling)
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.2
set.seed(200)
simulated <- mlbench.friedman1(200, sd = 1)
simulated <- cbind(simulated$x, simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
The model did not assign importance to the uninformative predictors V6 through V10, as shown in the variable importance graph below.
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.1
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
model1 <- randomForest(y ~ ., data = simulated, importance = TRUE, ntree = 1000)
rfImp1 <- varImp(model1, scale = FALSE)
rfImp1 %>%
  mutate(var = rownames(rfImp1)) %>%
  ggplot(aes(Overall, reorder(var, Overall, sum))) +
  geom_col(fill = 'purple') +
  labs(title = 'Variable Importance', y = 'Variable')
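To back up the visual impression with numbers, here is a minimal sketch (my own addition) that simply sorts the importance scores computed above; V6 through V10 should sit near zero at the bottom of the table.

# Sort the random forest importance scores from highest to lowest;
# the uninformative predictors V6-V10 should cluster near zero.
rfImp1 %>%
  rownames_to_column("var") %>%
  arrange(desc(Overall))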
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
model3 <- cforest(y ~ ., data = simulated)
rfImp3 <- varimp(model3, conditional = TRUE) %>% as.data.frame()
rfImp3 %>%
  rename(Overall = '.') %>%
  mutate(var = rownames(rfImp3)) %>%
  ggplot(aes(Overall, reorder(var, Overall, sum))) +
  geom_col(fill = 'purple') +
  labs(title = 'Variable Importance', y = 'Variable')
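For comparison, a minimal sketch (my own addition) recomputes the traditional, unconditional permutation importance from the same cforest fit and places it next to the conditional values already stored in rfImp3; conditional importance typically down-weights predictors that are merely correlated with the truly informative ones.

# Unconditional vs. conditional permutation importance for model3.
imp_uncond <- varimp(model3, conditional = FALSE)
data.frame(var = names(imp_uncond),
           unconditional = imp_uncond,
           conditional = rfImp3[names(imp_uncond), 1])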
## (d) Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?
library(Cubist)
## Warning: package 'Cubist' was built under R version 4.3.3
model4 <- cubist(x = simulated[, colnames(simulated) != 'y'],
                 y = simulated$y)
rfImp4 <- varImp(model4, scale = FALSE)
rfImp4 %>%
  mutate(var = rownames(rfImp4)) %>%
  ggplot(aes(Overall, reorder(var, Overall, sum))) +
  geom_col(fill = 'purple') +
  labs(title = 'Variable Importance', y = 'Variable')
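Part (d) also mentions boosted trees. As a minimal sketch (my own addition, assuming the gbm package that is loaded later in this document), a small boosted model on the same simulated data shows the same general pattern: V6 through V10 receive little or no relative influence.

library(gbm)
set.seed(200)
model5 <- gbm(y ~ ., data = simulated, distribution = "gaussian",
              n.trees = 1000, interaction.depth = 3, shrinkage = 0.01)
# Relative influence of each predictor; V6-V10 should again rank near zero.
summary(model5, plotit = FALSE)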
Based on the graph below, the most important variable is V3, which has by far the largest variance of the three simulated predictors.
set.seed(88)
V1 <- runif(500, 2,500)
V2 <- rnorm(500, 2,10)
V3 <- rnorm(500, 1,1000)
y <- V2 + V3
df <- data.frame(V1, V2, V3, y)
test_model <- cforest(y ~ ., data = df, ntree = 10)
test_model_imp <- varimp(test_model, conditional = FALSE)
barplot(sort(test_model_imp), horiz = TRUE, col = rainbow(5))
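As a quick cross-check (my own sketch), a single regression tree fitted to the same data frame should also lean almost entirely on V3, since it contributes most of the variance in y:

library(rpart)
tree_check <- rpart(y ~ ., data = df)
# Split-based importance from the single tree; V3 should dominate.
tree_check$variable.importance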
In stochastic gradient boosting, the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9.

## (a) Why does the model on the right focus its importance on just the first few predictors, whereas the model on the left spreads importance across more predictors?

The model on the right concentrates its importance on a few predictors because it uses a higher learning rate and a higher bagging fraction. With a larger fraction of the data used at each iteration, the trees are more similar to one another, so the same few variables keep being selected and end up with high importance (see the sketch after part (c) below).

## (b) Which model do you think would be more predictive of other samples?

The model with the higher learning rate and bagging fraction is more likely to overfit because it relies on only a few variables. The model on the left, which spreads importance across more predictors, is therefore more likely to be predictive of other samples.

## (c) How would increasing interaction depth affect the slope of predictor importance for either model in Fig. 8.24?

Increasing the interaction depth would bring more predictors into the model. This would lower the RMSE and result in a steeper slope of predictor importance.
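To see the concentration effect from part (a) directly, here is a minimal sketch (my own illustration on the simulated Friedman data from above, not the book's solubility code) that fits two boosted models with both parameters at 0.1 versus 0.9 and compares their relative influence:

library(gbm)
set.seed(200)
fit_low  <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 500, bag.fraction = 0.1, shrinkage = 0.1,
                n.minobsinnode = 5)
fit_high <- gbm(y ~ ., data = simulated, distribution = "gaussian",
                n.trees = 500, bag.fraction = 0.9, shrinkage = 0.9,
                n.minobsinnode = 5)
# The 0.9/0.9 model should concentrate its relative influence on a handful
# of predictors, while the 0.1/0.1 model spreads it more evenly.
summary(fit_low,  plotit = FALSE)
summary(fit_high, plotit = FALSE)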
data(ChemicalManufacturingProcess)
cmp_impute <- preProcess(ChemicalManufacturingProcess[,-c(1)], method=c('bagImpute'))
cmp <- predict(cmp_impute, ChemicalManufacturingProcess[,-c(1)])
set.seed(480)
train_r <- createDataPartition(ChemicalManufacturingProcess$Yield, p=0.8, list=FALSE)
X_train <- cmp[train_r,]
y_train <- ChemicalManufacturingProcess$Yield[train_r]
X_test <- cmp[-train_r,]
y_test <- ChemicalManufacturingProcess$Yield[-train_r]
set.seed(44)
grid <- expand.grid(n.trees = c(50, 100, 150, 200),
                    interaction.depth = c(1, 5, 10, 15),
                    shrinkage = c(0.01, 0.1, 0.5),
                    n.minobsinnode = c(5, 10, 15))
model_gbm1 <- train(x = X_train, y = y_train, method = 'gbm',
                    tuneGrid = grid, verbose = FALSE)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
model_gbm1$bestTune
## n.trees interaction.depth shrinkage n.minobsinnode
## 76 200 10 0.1 5
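To sanity-check the selected combination, caret's ggplot method for train objects can plot the resampling profile across the whole tuning grid (an optional check, my own addition):

# RMSE profile over n.trees, interaction.depth, shrinkage and n.minobsinnode;
# the minimum should correspond to the bestTune row shown above.
ggplot(model_gbm1)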
set.seed(77)
model_rf3 <- train(X_train, y_train, method = 'rf', tuneLength = 10)
model_rf3
## Random Forest
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.334811 0.5276895 1.0719915
## 8 1.254487 0.5561364 0.9842596
## 14 1.237085 0.5602251 0.9599156
## 20 1.238068 0.5549427 0.9565511
## 26 1.236422 0.5521014 0.9517747
## 32 1.240043 0.5464730 0.9510833
## 38 1.244768 0.5418790 0.9518807
## 44 1.251081 0.5360229 0.9552270
## 50 1.258535 0.5292140 0.9582833
## 57 1.268117 0.5218864 0.9645965
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 26.
library(rpart)
set.seed(66)
model_rpart <- train(x = X_train, y = y_train, method = "rpart",
                     tuneLength = 10, control = rpart.control(maxdepth = 2))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
model_rpart
## CART
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.01684157 1.516456 0.3578700 1.162605
## 0.02137409 1.516456 0.3578700 1.162605
## 0.02341864 1.516456 0.3578700 1.162605
## 0.02532733 1.516456 0.3578700 1.162605
## 0.02890208 1.516456 0.3578700 1.162605
## 0.03111746 1.516456 0.3578700 1.162605
## 0.04559790 1.517792 0.3567906 1.165215
## 0.06391370 1.532447 0.3473686 1.174197
## 0.09164355 1.543158 0.3396923 1.183760
## 0.39332469 1.687773 0.2730084 1.329875
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.03111746.
From the resampling summary below, gradient boosting looks like the best model: its RMSE is lower than that of the random forest and the single tree.
summary(resamples(list(Single_True = model_rpart, Random_Forest = model_rf3,
Gradient_Boosting=model_gbm1)))
##
## Call:
## summary.resamples(object = resamples(list(Single_True =
## model_rpart, Random_Forest = model_rf3, Gradient_Boosting = model_gbm1)))
##
## Models: Single_True, Random_Forest, Gradient_Boosting
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Single_True 0.9721591 1.0916421 1.1544688 1.1626050 1.2438397 1.411900
## Random_Forest 0.8413057 0.9003406 0.9357733 0.9517747 0.9861940 1.187571
## Gradient_Boosting 0.8318969 0.8904796 0.9198531 0.9398576 0.9738667 1.112387
## NA's
## Single_True 0
## Random_Forest 0
## Gradient_Boosting 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## Single_True 1.271720 1.426333 1.524448 1.516456 1.614609 1.732767 0
## Random_Forest 1.066881 1.148314 1.237786 1.236422 1.328218 1.504047 0
## Gradient_Boosting 1.066207 1.164416 1.230343 1.231950 1.269784 1.430956 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Single_True 0.1885859 0.2973369 0.3602365 0.3578700 0.4259184 0.5371485
## Random_Forest 0.3948908 0.5083560 0.5547292 0.5521014 0.6138288 0.6830667
## Gradient_Boosting 0.3111637 0.5177063 0.5533567 0.5660252 0.6416362 0.7151902
## NA's
## Single_True 0
## Random_Forest 0
## Gradient_Boosting 0
# Evaluate each fitted model on the test set and collect RMSE, R-squared and MAE.
test_performance <- function(models, testData, testTarget) {
  method <- c()
  res <- data.frame()
  for (model in models) {
    method <- c(method, model$method)
    pred <- predict(model, newdata = testData)
    res <- rbind(res, t(postResample(pred = pred, obs = testTarget)))
  }
  row.names(res) <- method
  return(res)
}
# List the models
models <- list(model_rpart, model_rf3, model_gbm1)
# Run the function
performance <- test_performance(models, X_test, y_test)
performance
## RMSE Rsquared MAE
## rpart 1.5647270 0.3765960 1.2801338
## rf 1.1313936 0.7267739 0.8847341
## gbm 0.8727583 0.8195122 0.6368033
Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
As shown below, both the gradient boosting and SVM models place a mix of biological and manufacturing-process predictors among their most important variables, whereas the partial least squares linear model is dominated by manufacturing-process predictors. Notably, ManufacturingProcess32 appears as an important variable in all three models.
library(gbm)
## Warning: package 'gbm' was built under R version 4.3.3
## Loaded gbm 2.1.9
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
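The boosted model's own ranking is not printed elsewhere in this write-up, so a minimal sketch (my own addition) pulls its top 10 predictors for the comparison:

# Top 10 predictors by caret's importance measure for the tuned gbm model.
gbm_imp <- varImp(model_gbm1)
gbm_imp$importance %>%
  rownames_to_column("var") %>%
  arrange(desc(Overall)) %>%
  head(10)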
model_pls <- train(x = X_train, y = y_train, method = 'pls', metric = 'RMSE',
                   tuneLength = 20, trControl = trainControl(method = 'cv'))
(pls_imp = varImp(model_pls))
## Warning: package 'pls' was built under R version 4.3.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
## pls variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess14 38.56
## BiologicalMaterial03 37.72
## ManufacturingProcess27 35.46
## ManufacturingProcess35 35.22
## ManufacturingProcess24 34.14
## ManufacturingProcess15 32.40
## ManufacturingProcess28 31.96
## ManufacturingProcess25 30.59
## ManufacturingProcess26 29.40
## ManufacturingProcess06 29.10
## BiologicalMaterial06 28.36
## BiologicalMaterial02 27.71
## ManufacturingProcess04 27.69
## ManufacturingProcess02 21.97
## ManufacturingProcess05 18.61
## ManufacturingProcess33 17.73
## ManufacturingProcess19 15.84
## ManufacturingProcess17 14.69
## ManufacturingProcess09 13.91
set.seed(222)
svm_model <- train(x = X_train, y = y_train,
                   method = "svmRadial",
                   tuneLength = 10,
                   preProc = c("center", "scale"))
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
(svm_imp = varImp(svm_model))
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## BiologicalMaterial06 100.00
## ManufacturingProcess13 98.34
## ManufacturingProcess32 96.95
## BiologicalMaterial03 81.49
## ManufacturingProcess17 75.31
## BiologicalMaterial02 73.06
## BiologicalMaterial12 71.59
## ManufacturingProcess09 70.96
## ManufacturingProcess31 67.04
## ManufacturingProcess36 66.24
## ManufacturingProcess06 55.48
## BiologicalMaterial04 54.37
## BiologicalMaterial11 53.94
## ManufacturingProcess33 49.60
## BiologicalMaterial08 48.82
## BiologicalMaterial01 46.53
## ManufacturingProcess02 44.43
## ManufacturingProcess29 40.72
## ManufacturingProcess11 36.51
## ManufacturingProcess12 32.98
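To make the top-10 comparison concrete, here is a minimal sketch (my own addition) that extracts the 10 highest-ranked predictors from each importance object and reports the names they share:

# Helper returning the top-10 predictor names from a caret varImp object.
top10 <- function(imp) {
  imp$importance %>%
    rownames_to_column("var") %>%
    arrange(desc(Overall)) %>%
    slice_head(n = 10) %>%
    pull(var)
}
top_gbm <- top10(varImp(model_gbm1))
top_pls <- top10(pls_imp)
top_svm <- top10(svm_imp)
# Predictors common to all three top-10 lists.
Reduce(intersect, list(top_gbm, top_pls, top_svm))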
Range and impact: the distribution of yield in each terminal node shows the range of outcomes under the conditions that define the node, helping to identify which conditions lead to higher or lower yields.

Non-linearity: the tree structure can uncover non-linear relationships between predictors and yield that might not be apparent in linear models.
model_rpart$finalModel
## n= 144
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 144 472.70180 40.17111
## 2) ManufacturingProcess32< 159.5 83 142.65450 39.19699
## 4) BiologicalMaterial11< 145.075 39 42.89152 38.55615 *
## 5) BiologicalMaterial11>=145.075 44 69.55090 39.76500 *
## 3) ManufacturingProcess32>=159.5 61 144.12200 41.49656
## 6) BiologicalMaterial06< 51.61 34 60.34664 40.74559 *
## 7) BiologicalMaterial06>=51.61 27 40.45527 42.44222 *
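The printed rules above can also be visualised. A minimal sketch (my own addition, using the partykit package loaded earlier) converts the final rpart fit to a party object and plots it, which shows the yield distribution within each terminal node:

# Terminal-node plots of Yield make the split logic and the spread of
# outcomes in each node easier to read than the printed rules.
plot(as.party(model_rpart$finalModel))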