library(tidyverse)
library(reactable)
library(caret)
library(pls)
library(elasticnet)
library(glmnet)
library(modelsummary)
library(Metrics)
library(kableExtra)
library(corrplot)
library(AppliedPredictiveModeling)
data(permeability)
reactable(permeability)
reactable(fingerprints)
The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.
Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?
finger_pred <- nearZeroVar(fingerprints)
pred_left <- fingerprints[,-finger_pred]
cat("Predictors that have low frequency:", length(finger_pred), "\n")
## Predictors that have low frequency: 719
cat("Predictors that are left for modeling:", dim(pred_left)[2])
## Predictors that are left for modeling: 388
# model on the predictors that survived the near-zero-variance filter
df <- as.data.frame(pred_left) |>
  mutate(y = permeability)
set.seed(123)
in_train <- createDataPartition(df$y, times = 1, p = 0.8, list = FALSE)
train_df <- df[in_train, ]
test_df <- df[-in_train, ]
#ctrl <- trainControl(method = "repeatedcv", repeats = 3)
pls_model <- train(
y ~ ., data = train_df, method = "pls",
center = TRUE,
trControl = trainControl("cv", number = 10),
tuneLength = 25
)
pls_model
## Partial Least Squares
##
## 133 samples
## 719 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 121, 121, 118, 119, 119, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 15.05182 0.2150523 11.48709
## 2 14.70895 0.2298232 11.07769
## 3 14.56647 0.2808471 10.84294
## 4 15.12818 0.2359637 11.17423
## 5 14.82068 0.2492830 11.21548
## 6 14.73329 0.2556598 11.03924
## 7 14.96640 0.2444540 11.19515
## 8 15.22244 0.2437605 11.31082
## 9 15.51595 0.2448334 11.55775
## 10 15.68169 0.2479864 11.74880
## 11 15.65403 0.2584212 11.75268
## 12 15.67149 0.2678684 11.82700
## 13 15.87459 0.2632575 11.92749
## 14 15.81526 0.2677245 11.82048
## 15 15.81498 0.2677419 11.76343
## 16 15.74250 0.2731821 11.69094
## 17 15.70578 0.2751794 11.58523
## 18 15.69973 0.2750557 11.56265
## 19 15.73661 0.2732548 11.57690
## 20 15.72602 0.2746024 11.56433
## 21 15.74717 0.2752788 11.55663
## 22 15.77552 0.2755853 11.58918
## 23 15.80427 0.2751538 11.62826
## 24 15.84191 0.2757263 11.66490
## 25 15.86088 0.2757210 11.68947
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
set.seed(123)
pls_model$results |>
filter(ncomp == pls_model$bestTune$ncomp) |>
select(ncomp, RMSE, Rsquared, MAE) |>
kbl() |>
kable_styling(latex_options = "scale_down", bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
| ncomp | RMSE | Rsquared | MAE |
|---|---|---|---|
| 3 | 14.56647 | 0.2808471 | 10.84294 |
ggplot(pls_model)
The optimal number of components is 3, as indicated by the value of ncomp. The corresponding cross-validated \(R^2\) estimate is 0.2808471.
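The selected tuning value and its resampled performance can also be pulled straight from the fitted object; a small sketch using caret's getTrainPerf(), which should reproduce the values in the table above:
# Resampled (cross-validated) performance of the final ncomp = 3 model
getTrainPerf(pls_model)
pls_model$bestTune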
set.seed(123)
pls_predictions <- predict(pls_model, test_df)
results <- data.frame(Model = "PLS",
RMSE = caret::RMSE(pls_predictions, test_df$y),
Rsquared = caret::R2(pls_predictions, test_df$y),
MAE = caret::MAE(pls_predictions, test_df$y))
set.seed(123)
results |>
kbl() |>
kable_styling(latex_options = "scale_down", bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
| Model | RMSE | Rsquared | MAE |
|---|---|---|---|
| PLS | 11.52993 | 0.1796387 | 8.665959 |
The test set estimate of \(R^2\) is 0.1796387, which is worse than the cross-validated training estimate of 0.2808471.
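The three test-set metrics can also be computed in one call with caret::postResample(); a quick sketch, assuming pls_predictions and test_df from the chunks above:
# RMSE, Rsquared, and MAE on the held-out test set in a single call
postResample(pred = pls_predictions, obs = as.numeric(test_df$y))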
Ridge
set.seed(123)
x <- model.matrix(y ~ ., train_df)
x_test <- model.matrix(y ~ ., test_df)
rr_cv <- cv.glmnet(x, train_df$y, alpha = 0)
rr_model <- glmnet(x, train_df$y, alpha = 0, lambda = rr_cv$lambda.min)
rr_predictions <- as.vector(predict(rr_model, x_test))
rr_results <- data.frame(Model = "Ridge Regression",
RMSE = caret::RMSE(rr_predictions, test_df$y),
Rsquared = caret::R2(rr_predictions, test_df$y),
MAE = caret::MAE(rr_predictions, test_df$y))
set.seed(123)
rr_results |>
kbl() |>
kable_styling(latex_options = "scale_down", bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
| Model | RMSE | Rsquared | MAE |
|---|---|---|---|
| Ridge Regression | 11.85528 | 0.1301651 | 9.125384 |
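The penalty chosen by cross-validation can also be inspected visually; a small sketch, assuming rr_cv from the chunk above:
# Mean cross-validated error across the lambda path;
# the vertical dashed lines mark lambda.min and lambda.1se
plot(rr_cv)
rr_cv$lambda.min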
Lasso
set.seed(123)
lr_cv <- cv.glmnet(x, train_df$y, alpha = 1)
lr_model <- glmnet(x, train_df$y, alpha = 1, lambda = lr_cv$lambda.min)
lr_predictions <- as.vector(predict(lr_model, x_test))
lr_results <- data.frame(Model = "Lasso Regression",
RMSE = caret::RMSE(lr_predictions, test_df$y),
Rsquared = caret::R2(lr_predictions, test_df$y),
MAE = caret::MAE(lr_predictions, test_df$y))
set.seed(123)
lr_results |>
kbl() |>
kable_styling(latex_options = "scale_down", bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
| Model | RMSE | Rsquared | MAE |
|---|---|---|---|
| Lasso Regression | 9.199298 | 0.5557454 | 7.834985 |
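Unlike ridge, the lasso drives some coefficients exactly to zero; as a quick sketch (assuming lr_model from above), the number of predictors retained can be counted:
# Coefficients at lambda.min; count the non-zero ones, excluding the intercept
lasso_coefs <- coef(lr_model)
sum(lasso_coefs[-1, 1] != 0)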
Elastic Net
set.seed(123)
enet_model <- train(
y ~ ., data = train_df, method = "glmnet",
trControl = trainControl("cv", number = 10),
tuneLength = 20
)
# Best tuning parameters
enet_model$bestTune
## alpha lambda
## 15 0.1 4.021414
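The coefficients of the selected model can be examined at the tuned penalty; a sketch assuming enet_model from the chunk above:
# Final glmnet fit refit by caret on the training set;
# extract coefficients at the tuned lambda and count the non-zero ones
enet_coefs <- coef(enet_model$finalModel, s = enet_model$bestTune$lambda)
sum(enet_coefs[-1, 1] != 0)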
set.seed(123)
enet_predictions <- enet_model |>
  predict(test_df)  # predict.train expects the original data frame, not the model matrix
enet_results <- data.frame(Model = "Elastic Net Regression",
RMSE = caret::RMSE(enet_predictions, test_df$y),
Rsquared = caret::R2(enet_predictions, test_df$y),
MAE = caret::MAE(enet_predictions, test_df$y))
set.seed(123)
enet_results |>
kbl() |>
kable_styling(latex_options = "scale_down", bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
| Model | RMSE | Rsquared | MAE |
|---|---|---|---|
| Elastic Net Regression | 10.13767 | 0.3968851 | 8.111249 |
Summary of All Models
pls_model$results |>
filter(ncomp == pls_model$bestTune$ncomp) |>
mutate("Model" = "PLS") |>
select(Model, RMSE, Rsquared, MAE) |>
as.data.frame() |>
bind_rows(rr_results) |>
bind_rows(lr_results) |>
bind_rows(enet_results) |>
arrange(desc(Rsquared)) |>
kbl() |>
kable_styling(latex_options = "scale_down", bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
| Model | RMSE | Rsquared | MAE |
|---|---|---|---|
| Lasso Regression | 9.199298 | 0.5557454 | 7.834985 |
| Elastic Net Regression | 10.137674 | 0.3968851 | 8.111249 |
| PLS | 14.566470 | 0.2808471 | 10.842938 |
| Ridge Regression | 11.855285 | 0.1301651 | 9.125384 |
Note that the PLS row reports its cross-validated training metrics (the resampling results at the best ncomp), while the other rows report test-set metrics. Ridge Regression did not perform better than the PLS model in terms of \(R^2\), although its RMSE and MAE were lower than the PLS values shown here. The Elastic Net model had the second-highest \(R^2\) and the second-lowest RMSE and MAE. The Lasso model performed better than all of the other models, with the highest \(R^2\) and the lowest RMSE and MAE.
I would recommend building at least one additional model and comparing its performance against these before settling on a final choice. If time is a factor, however, the Lasso model would be my suggestion, given its high \(R^2\) and low RMSE and MAE relative to the other models.
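To illustrate that recommendation, one more candidate could be fit and scored exactly like the others; a hypothetical sketch using principal component regression (caret method "pcr"), which is an assumption here rather than part of the analysis above:
set.seed(123)
pcr_model <- train(
  y ~ ., data = train_df, method = "pcr",
  trControl = trainControl("cv", number = 10),
  tuneLength = 25
)
pcr_predictions <- predict(pcr_model, test_df)
data.frame(Model = "PCR",
           RMSE = caret::RMSE(pcr_predictions, test_df$y),
           Rsquared = caret::R2(pcr_predictions, test_df$y),
           MAE = caret::MAE(pcr_predictions, test_df$y))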
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the manufacturing process) for the 176 manufacturing runs, and yield contains the percent yield for each run. A small percentage of cells in the predictor set contain missing values, so they are imputed with bagged-tree imputation (bagImpute) before modeling.
set.seed(123)
bag_model <- preProcess(ChemicalManufacturingProcess, "bagImpute")
bag_df_model <- predict(bag_model, ChemicalManufacturingProcess)
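As a quick check that the bagged-tree imputation filled in the missing cells, the number of NA values before and after can be compared (the count after imputation should be zero):
# Missing cells before vs. after bagImpute
sum(is.na(ChemicalManufacturingProcess))
sum(is.na(bag_df_model))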
Filtering out the predictors that have low frequencies using the
nearZeroVar function:
set.seed(123)
bag_df_model2 <- bag_df_model[,-nearZeroVar(bag_df_model)]
dim(bag_df_model2)
## [1] 176 57
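Only one column is dropped by the filter (58 columns down to 57); to see which predictor was flagged, a quick sketch:
# Name of the predictor removed by the near-zero-variance filter
colnames(bag_df_model)[nearZeroVar(bag_df_model)]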
set.seed(123)
train_data <- createDataPartition(bag_df_model2$Yield, times = 1, p = 0.8, list = FALSE)
train_df <- bag_df_model2[train_data, ]
test_df <- bag_df_model2[-train_data, ]
set.seed(123)
ctrl <- trainControl(
method = "repeatedcv",
repeats = 3
)
set.seed(123)
plsFit <- train(
Yield ~ .,
data = train_df,
method = "pls",
preProc = c("center", "scale"),
tuneLength = 20,
trControl = ctrl
)
plsFit
## Partial Least Squares
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.476947 0.4274570 1.164510
## 2 1.850791 0.5070859 1.224986
## 3 1.355649 0.6001753 1.026442
## 4 1.430370 0.5817934 1.045561
## 5 1.640194 0.5694359 1.104722
## 6 1.706507 0.5620935 1.135600
## 7 1.928679 0.5487040 1.224008
## 8 2.169979 0.5335025 1.320986
## 9 2.391891 0.5175290 1.413369
## 10 2.673825 0.5000532 1.522270
## 11 3.021238 0.4760784 1.658234
## 12 3.334262 0.4554791 1.777413
## 13 3.478820 0.4466374 1.824949
## 14 3.537928 0.4377883 1.851143
## 15 3.534322 0.4332394 1.851400
## 16 3.517697 0.4344983 1.844488
## 17 3.462842 0.4365876 1.830382
## 18 3.438765 0.4356924 1.832021
## 19 3.419590 0.4366412 1.838496
## 20 3.392253 0.4402583 1.833642
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
set.seed(123)
plsFit$results |>
filter(ncomp == plsFit$bestTune$ncomp) |>
select(ncomp, RMSE, Rsquared, MAE) |>
kbl() |>
kable_styling(latex_options = "scale_down", bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
| ncomp | RMSE | Rsquared | MAE |
|---|---|---|---|
| 3 | 1.355649 | 0.6001753 | 1.026442 |
ggplot(plsFit)
The optimal number of components is 3, as indicated by the value of ncomp; the corresponding cross-validated \(R^2\) estimate is 0.6001753.
set.seed(123)
pls_pred_model <- predict(plsFit, test_df)
pls_pred_results <- data.frame("RMSE" = caret::RMSE(pls_pred_model, test_df$Yield),
"Rsquared" = caret::R2(pls_pred_model, test_df$Yield),
"MAE" = caret::MAE(pls_pred_model, test_df$Yield))
pls_pred_results |>
kbl() |>
kable_styling(latex_options = "scale_down", bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
| RMSE | Rsquared | MAE |
|---|---|---|
| 1.369543 | 0.4761081 | 1.161469 |
Compared to the resampled performance on the training set, the model did not perform as well on the test set: the RMSE and MAE increased, while the \(R^2\) decreased.
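A predicted-versus-observed plot gives a visual sense of that drop in test-set performance; a small sketch assuming pls_pred_model and test_df from the chunks above:
# Observed vs. predicted yield on the held-out test set;
# points near the dashed 45-degree line indicate accurate predictions
data.frame(observed = test_df$Yield, predicted = pls_pred_model) |>
  ggplot(aes(x = observed, y = predicted)) +
  geom_point() +
  geom_abline(linetype = "dashed") +
  labs(x = "Observed Yield", y = "Predicted Yield")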
varImp(plsFit)
## pls variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess17 87.52
## ManufacturingProcess13 86.38
## ManufacturingProcess09 86.21
## ManufacturingProcess36 85.50
## ManufacturingProcess06 69.80
## ManufacturingProcess33 63.73
## BiologicalMaterial06 62.09
## BiologicalMaterial03 60.97
## BiologicalMaterial02 60.38
## BiologicalMaterial08 60.33
## ManufacturingProcess11 56.69
## BiologicalMaterial12 55.53
## BiologicalMaterial11 55.43
## BiologicalMaterial01 52.39
## BiologicalMaterial04 50.93
## ManufacturingProcess28 48.89
## ManufacturingProcess12 46.56
## ManufacturingProcess37 45.17
## BiologicalMaterial10 42.24
set.seed(123)
pls_model_importance <- varImp(plsFit)$importance |>
as.data.frame() |>
rownames_to_column("Variable") |>
filter(Overall >= 50) |>
arrange(desc(Overall)) |>
mutate(importance = row_number())
set.seed(123)
varImp(plsFit) %>%
plot(., top = max(pls_model_importance$importance), main = "Important Variables - 50% or Greater")
ManufacturingProcess 32, 17, 13, 09, and 36 were the most important variables in the model, each with an importance score greater than 80.
set.seed(123)
pls_model_importance |>
mutate(Variable = str_replace_all(Variable, "[0-9]+", "")) |>
group_by(Variable) |>
count() |>
arrange(desc(n)) |>
kbl() |>
kable_styling(latex_options = "scale_down", bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
| Variable | n |
|---|---|
| BiologicalMaterial | 8 |
| ManufacturingProcess | 8 |
The manufacturing process predictors and the biological material predictors are equally represented among the important variables: eight of each have an importance score of at least 50.
set.seed(123)
top_10_pred <- varImp(plsFit)$importance |>
arrange(desc(Overall)) |>
head(10)
bag_df_model2 |>
select(c("Yield", row.names(top_10_pred))) |>
cor() |>
corrplot(method="square", tl.cex = 0.9, number.cex = 0.6, addCoef.col="black")
This information can help improve yield in future runs of the manufacturing process: the correlation plot identifies important predictors that are highly correlated with other important predictors, so the redundant ones with high multicollinearity can be filtered out before refining the model.
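One way to act on that multicollinearity is caret's findCorrelation(), which flags predictors whose pairwise correlations exceed a cutoff; a sketch on the top-10 predictors (the 0.90 cutoff is an assumption, not a value used above):
# Correlation matrix of the ten most important predictors
top_cor <- bag_df_model2 |>
  select(all_of(row.names(top_10_pred))) |>
  cor()
# Candidate predictors to drop because of high pairwise correlation
findCorrelation(top_cor, cutoff = 0.90, names = TRUE)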