library(AppliedPredictiveModeling)
library(caret)
library(pls)
library(tidyverse)
library(corrplot)
library(knitr)
library(glmnet)## [1] 165 1107
## [1] 165
## permeability
## Min. : 0.06
## 1st Qu.: 1.55
## Median : 4.91
## Mean :12.24
## 3rd Qu.:15.47
## Max. :55.60
The dataset contains 165 compounds described by 1107 binary fingerprint predictors, with a continuous permeability response.
Many binary fingerprint predictors are sparse and most molecules
don’t contain a given substructure, so we use nearZeroVar()
to remove uninformative columns.
nzv_idx <- nearZeroVar(fingerprints)
cat("Predictors removed (near-zero variance):", length(nzv_idx), "\n")## Predictors removed (near-zero variance): 719
## Predictors remaining: 388
After filtering, 388 predictors remain for modeling.
We log-transform the response (permeability is right-skewed), split 80/20, and tune PLS via 10-fold cross-validation.
set.seed(123)
# Log-transform the skewed response
log_perm <- log(permeability)
# 80/20 train/test split (stratified on the log response)
train_idx <- createDataPartition(log_perm, p = 0.80, list = FALSE)
X_train <- fp_filtered[train_idx, ]
X_test <- fp_filtered[-train_idx, ]
y_train <- log_perm[train_idx]
y_test <- log_perm[-train_idx]
cat("Training set:", nrow(X_train), "samples\n")## Training set: 133 samples
## Test set: 32 samples
# 10-fold CV control
ctrl <- trainControl(method = "cv", number = 10)
# Tune PLS (search over 1–20 latent variables)
set.seed(123)
pls_fit <- train(
x = X_train,
y = y_train,
method = "pls",
tuneGrid = data.frame(ncomp = 1:20),
trControl = ctrl,
preProcess = c("center", "scale")
)
pls_fit## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 119, 118, 120, 121, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.318623 0.2878538 1.0936803
## 2 1.175250 0.4223872 0.9277603
## 3 1.143653 0.4607692 0.9003402
## 4 1.121302 0.4889732 0.9181559
## 5 1.120204 0.5001917 0.8905865
## 6 1.095501 0.5310705 0.8497398
## 7 1.071736 0.5497272 0.8402284
## 8 1.057515 0.5679785 0.8305465
## 9 1.083285 0.5624847 0.8530669
## 10 1.134648 0.5583097 0.8964011
## 11 1.146658 0.5665197 0.9089386
## 12 1.166392 0.5571221 0.9368057
## 13 1.177877 0.5496513 0.9342370
## 14 1.196855 0.5450016 0.9461325
## 15 1.205878 0.5472701 0.9557193
## 16 1.196609 0.5634834 0.9398086
## 17 1.212974 0.5541142 0.9606699
## 18 1.223674 0.5468819 0.9721771
## 19 1.222901 0.5542604 0.9704077
## 20 1.227794 0.5532549 0.9667679
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 8.
best_ncomp <- pls_fit$bestTune$ncomp
best_r2 <- pls_fit$results$Rsquared[which.min(pls_fit$results$RMSE)]
cat(sprintf("Optimal latent variables: %d\n", best_ncomp))## Optimal latent variables: 8
## Resampled R^2: 0.5680
The optimal PLS model uses 8 latent variable(s) with a resampled R^2 of approximately 0.568 (selected by minimum RMSE).
pls_pred <- predict(pls_fit, newdata = X_test)
test_r2 <- cor(pls_pred, y_test)^2
test_rmse <- RMSE(pls_pred, y_test)
cat(sprintf("Test set R^2: %.4f\n", test_r2))## Test set R^2: 0.3922
## Test set RMSE: 1.1973
# Observed vs. predicted plot
tibble(Observed = y_test, Predicted = pls_pred) %>%
ggplot(aes(Observed, Predicted)) +
geom_point(color = "steelblue", alpha = 0.7) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(title = "PLS: Observed vs. Predicted (Test Set)",
subtitle = sprintf("R^2 = %.3f | RMSE = %.3f", test_r2, test_rmse),
x = "Observed log(Permeability)",
y = "Predicted log(Permeability)") +
theme_minimal()This is notably lower than the resampled R^2 of 0.562, indicating the model does not generalize as well to new compounds. With only 32 test samples, some of this drop may reflect the small test set size, but the scatter plot shows several poorly predicted compounds, particularly at the extremes of the permeability range.
We also try Ridge Regression and Elastic Net, which are well-suited to high-dimensional, correlated predictors.
set.seed(123)
ridge_grid <- expand.grid(
alpha = 0,
lambda = 10^seq(-3, 2, length.out = 20)
)
ridge_fit <- train(
x = X_train,
y = y_train,
method = "glmnet",
tuneGrid = ridge_grid,
trControl = ctrl,
preProcess = c("center", "scale")
)
ridge_pred <- predict(ridge_fit, newdata = X_test)
ridge_r2 <- cor(ridge_pred, y_test)^2
ridge_rmse <- RMSE(ridge_pred, y_test)
cat(sprintf("Ridge — Test R^2: %.4f | RMSE: %.4f\n", ridge_r2, ridge_rmse))## Ridge — Test R^2: 0.3229 | RMSE: 1.2375
set.seed(123)
enet_grid <- expand.grid(
alpha = c(0.1, 0.5, 0.9),
lambda = 10^seq(-2, 1, length.out = 5)
)
enet_fit <- train(
x = X_train,
y = y_train,
method = "glmnet",
tuneGrid = enet_grid,
trControl = ctrl,
preProcess = c("center", "scale")
)
enet_pred <- predict(enet_fit, newdata = X_test)
enet_r2 <- cor(enet_pred, y_test)^2
enet_rmse <- RMSE(enet_pred, y_test)
cat(sprintf("Elastic Net — Test R^2: %.4f | RMSE: %.4f\n", enet_r2, enet_rmse))## Elastic Net — Test R^2: 0.5021 | RMSE: 1.0816
compare_62 <- tibble(
Model = c("PLS", "Ridge", "Elastic Net"),
`Test R²` = round(c(test_r2, ridge_r2, enet_r2), 4),
`Test RMSE` = round(c(test_rmse, ridge_rmse, enet_rmse), 4)
)
kable(compare_62, caption = "Model Comparison on Test Set")| Model | Test R² | Test RMSE |
|---|---|---|
| PLS | 0.3922 | 1.1973 |
| Ridge | 0.3229 | 1.2375 |
| Elastic Net | 0.5021 | 1.0816 |
best_model <- compare_62$Model[which.max(compare_62$`Test R²`)]
cat("Best performing model:", best_model, "\n")## Best performing model: Elastic Net
Elastic net has the best test set performance (R² = 0.502, RMSE = 1.082), but we would not recommend replacing the laboratory permeability experiment with any of these models because:
However, these models could serve as a useful early-stage screening tool to flag low-permeability candidates before expensive lab testing, rather than as a full replacement.
Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch.
## [1] 176 58
## [1] "Yield" "BiologicalMaterial01" "BiologicalMaterial02"
## [4] "BiologicalMaterial03" "BiologicalMaterial04"
The dataset contains 176 manufacturing runs with
57 predictors (12 biological material, 45 manufacturing
process) and a Yield response.
# Separate response and predictors
yield <- ChemicalManufacturingProcess$Yield
chem_x <- ChemicalManufacturingProcess %>% dplyr::select(-Yield)
cat("Response range:", round(min(yield), 2), "to", round(max(yield), 2), "\n")## Response range: 35.25 to 46.34
## Missing values per predictor (top 10):
## ManufacturingProcess03 ManufacturingProcess11 ManufacturingProcess10
## 15 10 9
## ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 5 5 5
## ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 5 5 5
## ManufacturingProcess31
## 5
Yield ranges from 35.25 to 46.34% — a fairly narrow band, which means even small prediction errors matter economically given the $100k/batch revenue impact.
ManufacturingProcess03 has the most missing values (15 out of 176 runs = ~8.5%), which is notable but manageable with imputation.
Missing values are concentrated in the process predictors, not the biological ones — suggesting occasional sensor failures or recording gaps in the manufacturing process.
A small fraction of predictor cells are missing. We use k-nearest
neighbors imputation via preProcess.
# Count missing before imputation
total_missing <- sum(is.na(chem_x))
pct_missing <- round(100 * total_missing / prod(dim(chem_x)), 2)
cat(sprintf("Total missing cells: %d (%.2f%% of predictor matrix)\n",
total_missing, pct_missing))## Total missing cells: 106 (1.06% of predictor matrix)
# KNN imputation
impute_pp <- preProcess(chem_x, method = "knnImpute")
chem_x_imp <- predict(impute_pp, chem_x)
cat("Missing values after imputation:", sum(is.na(chem_x_imp)), "\n")## Missing values after imputation: 0
106 missing cells represent only 1.06% of the predictor matrix, so the missingness is minor and KNN imputation is appropriate. All missing values were successfully filled — 0 remain after imputation.
We split 80/20 and tune a Partial Least Squares (PLS) model via 10-fold CV. PLS is appropriate here given possible collinearity between process predictors.
set.seed(42)
train_idx2 <- createDataPartition(yield, p = 0.80, list = FALSE)
X_train2 <- chem_x_imp[train_idx2, ]
X_test2 <- chem_x_imp[-train_idx2, ]
y_train2 <- yield[train_idx2]
y_test2 <- yield[-train_idx2]
cat("Training set:", nrow(X_train2), "samples\n")## Training set: 144 samples
## Test set: 32 samples
ctrl2 <- trainControl(method = "cv", number = 10)
set.seed(42)
pls_fit2 <- train(
x = X_train2,
y = y_train2,
method = "pls",
tuneGrid = data.frame(ncomp = 1:15),
trControl = ctrl2,
preProcess = c("center", "scale")
)
pls_fit2## Partial Least Squares
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 129, 130, 129, 130, 130, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.646603 0.3824172 1.238103
## 2 2.091968 0.4563824 1.271448
## 3 1.343115 0.5577216 1.020864
## 4 1.728039 0.5094557 1.154045
## 5 2.298369 0.4673390 1.313287
## 6 2.323376 0.4696938 1.321661
## 7 2.440565 0.4849592 1.352095
## 8 2.434288 0.4933797 1.356165
## 9 2.827671 0.4857789 1.493314
## 10 2.943715 0.4687688 1.529403
## 11 3.067121 0.4601724 1.563973
## 12 3.169587 0.4485196 1.609779
## 13 3.206828 0.4448435 1.633663
## 14 3.227088 0.4401647 1.655101
## 15 3.183676 0.4326873 1.642474
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
The optimal PLS model uses 3 latent variables with a resampled RMSE of 1.343 and R^2 of 0.558 (selected by minimum RMSE). RMSE rises sharply beyond 3 components, indicating overfitting with additional latent variables.
best_ncomp2 <- pls_fit2$bestTune$ncomp
best_rmse2 <- min(pls_fit2$results$RMSE)
best_r2_2 <- pls_fit2$results$Rsquared[which.min(pls_fit2$results$RMSE)]
cat(sprintf("Optimal latent variables: %d\n", best_ncomp2))## Optimal latent variables: 3
## Resampled RMSE: 1.3431
## Resampled R^2: 0.5577
The optimal model uses 3 latent variable(s) with a resampled RMSE of 1.343 and R² of 0.558.
pls_pred2 <- predict(pls_fit2, newdata = X_test2)
test_r2_2 <- cor(pls_pred2, y_test2)^2
test_rmse2 <- RMSE(pls_pred2, y_test2)
cat(sprintf("Test set R²: %.4f\n", test_r2_2))## Test set R²: 0.6866
## Test set RMSE: 1.1288
tibble(Observed = y_test2, Predicted = pls_pred2) %>%
ggplot(aes(Observed, Predicted)) +
geom_point(color = "darkorange", alpha = 0.8) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(title = "PLS: Observed vs. Predicted Yield (Test Set)",
subtitle = sprintf("R² = %.3f | RMSE = %.3f", test_r2_2, test_rmse2),
x = "Observed Yield (%)",
y = "Predicted Yield (%)") +
theme_minimal()The test set R^2 of 0.687 and RMSE of 1.129 are notably better than the resampled estimates (R² = 0.558, RMSE = 1.343). This is somewhat unusual — CV estimates are typically conservative — but can occur with small datasets where the random train/test split happened to place easier-to-predict compounds in the test set. The scatter plot shows a reasonable linear trend with some spread, particularly in the 37.5–40% yield range where the model tends to overpredict.
Comparison: The resampled (training) RMSE was 1.343 and the test set RMSE is 1.129. The test performance is consistent with the cross-validated estimate, suggesting the model generalizes reasonably.
vi <- varImp(pls_fit2, scale = TRUE)
plot(vi, top = 20, main = "Top 20 Predictors by Importance (PLS)")# Tabulate top 20
vi_df <- vi$importance %>%
rownames_to_column("Predictor") %>%
arrange(desc(Overall)) %>%
head(20)
# Annotate predictor type
vi_df <- vi_df %>%
mutate(Type = ifelse(str_detect(Predictor, "^BiologicalMaterial"),
"Biological", "Process"))
kable(vi_df, caption = "Top 20 Predictors by Variable Importance",
digits = 2)| Predictor | Overall | Type |
|---|---|---|
| ManufacturingProcess32 | 100.00 | Process |
| ManufacturingProcess09 | 90.23 | Process |
| ManufacturingProcess13 | 84.70 | Process |
| ManufacturingProcess36 | 82.46 | Process |
| ManufacturingProcess17 | 81.80 | Process |
| ManufacturingProcess06 | 60.17 | Process |
| ManufacturingProcess11 | 56.87 | Process |
| BiologicalMaterial02 | 56.79 | Biological |
| BiologicalMaterial06 | 55.71 | Biological |
| BiologicalMaterial03 | 55.21 | Biological |
| ManufacturingProcess33 | 53.30 | Process |
| ManufacturingProcess12 | 52.67 | Process |
| BiologicalMaterial08 | 49.99 | Biological |
| BiologicalMaterial12 | 47.60 | Biological |
| ManufacturingProcess34 | 45.63 | Process |
| BiologicalMaterial11 | 45.44 | Biological |
| BiologicalMaterial01 | 45.30 | Biological |
| BiologicalMaterial04 | 43.90 | Biological |
| ManufacturingProcess04 | 39.25 | Process |
| ManufacturingProcess28 | 36.67 | Process |
# Breakdown by type
type_counts <- vi_df %>% count(Type)
kable(type_counts, caption = "Top 20: Biological vs. Process Predictors")| Type | n |
|---|---|
| Biological | 8 |
| Process | 12 |
Process predictors dominate the top 20 (12 of 20), with ManufacturingProcess32 being the single most important predictor by a clear margin. However, biological material predictors are well represented (8 of 20), with BiologicalMaterial02, 06, and 03 all appearing in the top 10. This suggests that both raw material quality and process conditions meaningfully influence yield and that neither category can be ignored.
# Select top 6 predictors
top6 <- vi_df$Predictor[1:6]
# Reconstruct full (imputed) data frame with yield for training set
train_df2 <- as.data.frame(X_train2) %>%
mutate(Yield = y_train2)
# Scatter plots for top 6 vs yield
train_df2 %>%
dplyr::select(all_of(c(top6, "Yield"))) %>%
pivot_longer(-Yield, names_to = "Predictor", values_to = "Value") %>%
ggplot(aes(Value, Yield)) +
geom_point(alpha = 0.5, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 0.8) +
facet_wrap(~Predictor, scales = "free_x", ncol = 2) +
labs(title = "Top 6 Predictors vs. Yield (Training Set)",
x = "Predictor Value", y = "Yield (%)") +
theme_minimal()The scatter plots reveal actionable patterns for each top process predictor:
ManufacturingProcess32 (most important): clear positive relationship — higher values associate with higher yield. This is a prime candidate for process optimization; operators should aim to maximize this parameter.
ManufacturingProcess06: strong positive relationship with wide confidence interval at high values, suggesting yield gains are possible but variable at extreme settings.
ManufacturingProcess09: positive relationship but with considerable scatter, indicating other factors moderate its effect.
ManufacturingProcess13 and ManufacturingProcess17: both show negative relationships — higher values associate with lower yield. These are parameters to minimize or tightly control.
ManufacturingProcess36: appears largely discrete/categorical with a slight negative trend. The clustering of values suggests this may be a stepped or categorical process setting rather than truly continuous.
This information is directly actionable: increase Process32 and Process06, reduce Process13 and Process17, and investigate whether Process36 settings can be standardized to the higher-yield levels.
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] glmnet_4.1-10 Matrix_1.7-3
## [3] knitr_1.50 corrplot_0.95
## [5] lubridate_1.9.4 forcats_1.0.1
## [7] stringr_1.5.2 dplyr_1.1.4
## [9] purrr_1.1.0 readr_2.1.5
## [11] tidyr_1.3.1 tibble_3.3.0
## [13] tidyverse_2.0.0 pls_2.9-0
## [15] caret_7.0-1 lattice_0.22-6
## [17] ggplot2_4.0.0.9000 AppliedPredictiveModeling_1.1-7
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 timeDate_4052.112 farver_2.1.2
## [4] S7_0.2.0 fastmap_1.2.0 RANN_2.6.2
## [7] pROC_1.19.0.1 digest_0.6.37 rpart_4.1.24
## [10] timechange_0.3.0 lifecycle_1.0.4 cluster_2.1.8.1
## [13] survival_3.8-3 magrittr_2.0.4 compiler_4.5.0
## [16] rlang_1.1.6 sass_0.4.10 tools_4.5.0
## [19] plotrix_3.8-14 yaml_2.3.10 data.table_1.17.8
## [22] labeling_0.4.3 plyr_1.8.9 RColorBrewer_1.1-3
## [25] withr_3.0.2 nnet_7.3-20 grid_4.5.0
## [28] stats4_4.5.0 future_1.68.0 globals_0.18.0
## [31] scales_1.4.0 iterators_1.0.14 MASS_7.3-65
## [34] cli_3.6.5 ellipse_0.5.0 rmarkdown_2.30
## [37] generics_0.1.4 rstudioapi_0.17.1 future.apply_1.20.1
## [40] reshape2_1.4.4 tzdb_0.5.0 cachem_1.1.0
## [43] splines_4.5.0 parallel_4.5.0 vctrs_0.6.5
## [46] hardhat_1.4.2 jsonlite_2.0.0 hms_1.1.3
## [49] listenv_0.10.0 foreach_1.5.2 gower_1.0.2
## [52] jquerylib_0.1.4 recipes_1.3.1 glue_1.8.0
## [55] parallelly_1.46.0 codetools_0.2-20 shape_1.4.6.1
## [58] stringi_1.8.7 gtable_0.3.6 rpart.plot_3.1.4
## [61] CORElearn_1.57.3.1 pillar_1.11.1 htmltools_0.5.8.1
## [64] ipred_0.9-15 lava_1.8.2 R6_2.6.1
## [67] evaluate_1.0.5 bslib_0.9.0 class_7.3-23
## [70] Rcpp_1.1.0 nlme_3.1-168 prodlim_2025.04.28
## [73] mgcv_1.9-1 xfun_0.53 pkgconfig_2.0.3
## [76] ModelMetrics_1.2.2.2