library(tidyverse)
library(reactable)
library(caret)
library(pls)
library(elasticnet)
library(glmnet)
library(modelsummary)
library(Metrics)
library(kableExtra)
library(corrplot)

6.2. Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:

(a) Start R and use these commands to load the data:

library(AppliedPredictiveModeling)
data(permeability)
reactable(permeability)
reactable(fingerprints)

The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains permeability response.

(b) The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

finger_pred <- nearZeroVar(fingerprints)
pred_left <- fingerprints[,-finger_pred]
cat("Predictors that have low frequency:", length(finger_pred), "\n")
## Predictors that have low frequency: 719
cat("Predictors that are left for modeling:", dim(pred_left)[2])
## Predictors that are left for modeling: 388

(c) Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of \(R^2\)?

df <- as.data.frame(fingerprints[, nearZeroVar(fingerprints)]) |>
  mutate(y = permeability)


set.seed(123)

in_train <- createDataPartition(df$y, times = 1, p = 0.8, list = FALSE)
train_df <- df[in_train, ]
test_df <- df[-in_train, ]


#ctrl <- trainControl(method = "repeatedcv", repeats = 3)
pls_model <- train(
  y ~ ., data = train_df, method = "pls",
  center = TRUE,
  trControl = trainControl("cv", number = 10),
  tuneLength = 25
)
pls_model
## Partial Least Squares 
## 
## 133 samples
## 719 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 121, 121, 118, 119, 119, 119, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     15.05182  0.2150523  11.48709
##    2     14.70895  0.2298232  11.07769
##    3     14.56647  0.2808471  10.84294
##    4     15.12818  0.2359637  11.17423
##    5     14.82068  0.2492830  11.21548
##    6     14.73329  0.2556598  11.03924
##    7     14.96640  0.2444540  11.19515
##    8     15.22244  0.2437605  11.31082
##    9     15.51595  0.2448334  11.55775
##   10     15.68169  0.2479864  11.74880
##   11     15.65403  0.2584212  11.75268
##   12     15.67149  0.2678684  11.82700
##   13     15.87459  0.2632575  11.92749
##   14     15.81526  0.2677245  11.82048
##   15     15.81498  0.2677419  11.76343
##   16     15.74250  0.2731821  11.69094
##   17     15.70578  0.2751794  11.58523
##   18     15.69973  0.2750557  11.56265
##   19     15.73661  0.2732548  11.57690
##   20     15.72602  0.2746024  11.56433
##   21     15.74717  0.2752788  11.55663
##   22     15.77552  0.2755853  11.58918
##   23     15.80427  0.2751538  11.62826
##   24     15.84191  0.2757263  11.66490
##   25     15.86088  0.2757210  11.68947
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
set.seed(123)
pls_model$results |>
  filter(ncomp == pls_model$bestTune$ncomp) |>
  select(ncomp, RMSE, Rsquared, MAE) |>
  kbl() |>
  kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
ncomp RMSE Rsquared MAE
3 14.56647 0.2808471 10.84294
ggplot(pls_model)

The optimal variables that are optimal is 3, indicated by the value of ncomp. The estimated \(R^2\) value is 0.2808471.

(d) Predict the response for the test set. What is the test set estimate of \(R^2\)?

set.seed(123)
pls_predictions <- predict(pls_model, test_df)
results <- data.frame(Model = "PLS",
                      RMSE = caret::RMSE(pls_predictions, test_df$y),
                      Rsquared = caret::R2(pls_predictions, test_df$y), 
                      MAE = caret::MAE(pls_predictions, test_df$y))

set.seed(123)
results |>
  kbl() |>
  kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
Model RMSE Rsquared MAE
permeability PLS 11.52993 0.1796387 8.665959

The test set estimate of \(R^2\) is 0.1796387, which is worse than the train set estimate \(R^2\) value of 0.2808471

(e) Try building other models discussed in this chapter. Do any have better predictive performance?

Ridge

set.seed(123)
x <- model.matrix(y ~ ., train_df)
x_test <- model.matrix(y ~ ., test_df)
rr_cv <- cv.glmnet(x, train_df$y, alpha = 0)
rr_model <- glmnet(x, train_df$y, alpha = 0, lambda = rr_cv$lambda.min)
rr_predictions <- as.vector(predict(rr_model, x_test))
rr_results <- data.frame(Model = "Ridge Regression",
                         RMSE = caret::RMSE(rr_predictions, test_df$y),
                         Rsquared = caret::R2(rr_predictions, test_df$y), 
                         MAE = caret::MAE(rr_predictions, test_df$y))
set.seed(123)
rr_results |>
  kbl() |>
  kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
Model RMSE Rsquared MAE
permeability Ridge Regression 11.85528 0.1301651 9.125384

Lasso

set.seed(123)
lr_cv <- cv.glmnet(x, train_df$y, alpha = 1)
lr_model <- glmnet(x, train_df$y, alpha = 1, lambda = lr_cv$lambda.min)
lr_predictions <- as.vector(predict(lr_model, x_test))
lr_results <- data.frame(Model = "Lasso Regression",
                         RMSE = caret::RMSE(lr_predictions, test_df$y),
                         Rsquared = caret::R2(lr_predictions, test_df$y),
                         MAE = caret::MAE(lr_predictions, test_df$y))
set.seed(123)
lr_results |>
  kbl() |>
  kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
Model RMSE Rsquared MAE
permeability Lasso Regression 9.199298 0.5557454 7.834985

Enet

set.seed(123)
enet_model <- train(
  y ~ ., data = train_df, method = "glmnet",
  trControl = trainControl("cv", number = 10),
  tuneLength = 20
)
# Best tuning parameters
enet_model$bestTune
##    alpha   lambda
## 15   0.1 4.021414
set.seed(123)
enet_predictions <- enet_model |> 
  predict(x_test)

enet_results <- data.frame(Model = "Elastic Net Regression",
                         RMSE = caret::RMSE(enet_predictions, test_df$y),
                         Rsquared = caret::R2(enet_predictions, test_df$y),
                         MAE = caret::MAE(enet_predictions, test_df$y))
set.seed(123)
enet_results |>
  kbl() |>
  kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
Model RMSE Rsquared MAE
permeability Elastic Net Regression 10.13767 0.3968851 8.111249

Summary of All Models

pls_model$results |>
  filter(ncomp == pls_model$bestTune$ncomp) |>
  mutate("Model" = "PLS") |>
  select(Model, RMSE, Rsquared, MAE) |>
  as.data.frame() |>
  bind_rows(rr_results) |>
  bind_rows(lr_results) |>
  bind_rows(enet_results) |>
  arrange(desc(Rsquared)) |>
  kbl() |>
  kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
Model RMSE Rsquared MAE
permeability…1 Lasso Regression 9.199298 0.5557454 7.834985
permeability…2 Elastic Net Regression 10.137674 0.3968851 8.111249
…3 PLS 14.566470 0.2808471 10.842938
permeability…4 Ridge Regression 11.855285 0.1301651 9.125384

The Ridge Regression did not perform better than the PLS model in terms of its \(R^2\) value. However, it had lower RMSE and MAE values than the PLS model. The Elastic Net Regression model had the second-highest \(R^2\) and second-lowest RMSE and MAE. The Lasso Regression performed better than the all of the models, having a higher \(R^2\) value and lower RMSE and MAE values.

(f) Would you recommend any of your models to replace the permeability laboratory experiment?

I would recommend creating another model and evaluating its performance in comparison with the other models to determine which would perform best. If time is a factor, however, I think the Lasso model would be the suggested model, due to its high \(R^2\) performance and low RMSE and MAE values in comparison with the other models.

6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:

(a) Start R and use these commands to load the data:

library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)

The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

(b) A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).

set.seed(123)

bag_model <- preProcess(ChemicalManufacturingProcess, "bagImpute")
bag_df_model <- predict(bag_model, ChemicalManufacturingProcess)

(c) Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

Filtering out the predictors that have low frequencies using the nearZeroVar function:

set.seed(123)
bag_df_model2 <- bag_df_model[,-nearZeroVar(bag_df_model)]
dim(bag_df_model2)
## [1] 176  57
set.seed(123)
train_data <- createDataPartition(bag_df_model2$Yield, times = 1, p = 0.8, list = FALSE)
train_df <- bag_df_model2[train_data, ]
test_df <- bag_df_model2[-train_data, ]
set.seed(123)
ctrl <- trainControl(
  method = "repeatedcv", 
  repeats = 3
)

set.seed(123)
plsFit <- train(
  Yield ~ .,
  data = train_df,
  method = "pls",
  preProc = c("center", "scale"),
  tuneLength = 20,
  trControl = ctrl
)
plsFit
## Partial Least Squares 
## 
## 144 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     1.476947  0.4274570  1.164510
##    2     1.850791  0.5070859  1.224986
##    3     1.355649  0.6001753  1.026442
##    4     1.430370  0.5817934  1.045561
##    5     1.640194  0.5694359  1.104722
##    6     1.706507  0.5620935  1.135600
##    7     1.928679  0.5487040  1.224008
##    8     2.169979  0.5335025  1.320986
##    9     2.391891  0.5175290  1.413369
##   10     2.673825  0.5000532  1.522270
##   11     3.021238  0.4760784  1.658234
##   12     3.334262  0.4554791  1.777413
##   13     3.478820  0.4466374  1.824949
##   14     3.537928  0.4377883  1.851143
##   15     3.534322  0.4332394  1.851400
##   16     3.517697  0.4344983  1.844488
##   17     3.462842  0.4365876  1.830382
##   18     3.438765  0.4356924  1.832021
##   19     3.419590  0.4366412  1.838496
##   20     3.392253  0.4402583  1.833642
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
set.seed(123)
plsFit$results |>
  filter(ncomp == plsFit$bestTune$ncomp) |>
  select(ncomp, RMSE, Rsquared, MAE) |>
  kbl() |>
kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
ncomp RMSE Rsquared MAE
3 1.355649 0.6001753 1.026442
ggplot(plsFit)

The optimal value of the performance metric 3, indicated by the value of ncomp.

(d) Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

set.seed(123)
pls_pred_model <- predict(plsFit, test_df)
pls_pred_results <- data.frame("RMSE" = caret::RMSE(pls_pred_model, test_df$Yield),
                               "Rsquared" = caret::R2(pls_pred_model, test_df$Yield), 
                               "MAE" = caret::MAE(pls_pred_model, test_df$Yield))
pls_pred_results |>
  kbl() |>
kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
RMSE Rsquared MAE
1.369543 0.4761081 1.161469

Compared to the performance on the training set, the response for the test set did not perform better. The RMSE and MAE values increased, while the \(R^2\) decreased.

(e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

varImp(plsFit)
## pls variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess17   87.52
## ManufacturingProcess13   86.38
## ManufacturingProcess09   86.21
## ManufacturingProcess36   85.50
## ManufacturingProcess06   69.80
## ManufacturingProcess33   63.73
## BiologicalMaterial06     62.09
## BiologicalMaterial03     60.97
## BiologicalMaterial02     60.38
## BiologicalMaterial08     60.33
## ManufacturingProcess11   56.69
## BiologicalMaterial12     55.53
## BiologicalMaterial11     55.43
## BiologicalMaterial01     52.39
## BiologicalMaterial04     50.93
## ManufacturingProcess28   48.89
## ManufacturingProcess12   46.56
## ManufacturingProcess37   45.17
## BiologicalMaterial10     42.24
set.seed(123)
pls_model_importance <- varImp(plsFit)$importance |>
  as.data.frame() |>
  rownames_to_column("Variable") |>
  filter(Overall >= 50) |>
  arrange(desc(Overall)) |>
  mutate(importance = row_number())

set.seed(123)
varImp(plsFit) %>%
  plot(., top = max(pls_model_importance$importance), main = "Important Variables - 50% or Greater")

Manufacturing Process 32, 17, 13, 09, and 36 were the most important to the model, with each having an importance rating of greater than 80.

set.seed(123)
pls_model_importance |>
  mutate(Variable = str_replace_all(Variable, "[0-9]+", "")) |>
  group_by(Variable) |>
  count() |>
  arrange(desc(n)) |>
  kbl() |>
  kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
Variable n
BiologicalMaterial 8
ManufacturingProcess 8

The Manufacturing Process predictors and Biological Material predictors had the same amount of important predictors.

(f) Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?

set.seed(123)
top_10_pred <- varImp(plsFit)$importance |>
  arrange(desc(Overall)) |>
  head(10)


bag_df_model2 |>
  select(c("Yield", row.names(top_10_pred))) |>
  cor() |>
  corrplot(method="square", tl.cex = 0.9, number.cex = 0.6, addCoef.col="black")

This information can help improve yield in future runs of the manufacturing process by helping identify important predictors that may be highly correlated with other important predictors, filtering out the predictors with high multicollinearity.