Problem 6.2 – Permeability Data

(a) Load the Data

library(AppliedPredictiveModeling)
library(caret)
library(pls)
library(elasticnet)  # for ridge / enet
library(MASS)        # for ridge regression
library(tidyverse)

# Load the permeability example data; the statements below use the
# `fingerprints` and `permeability` objects it provides.
data("permeability")

# Report rows and columns of the predictor matrix, then the response length
cat("fingerprints dimensions:", nrow(fingerprints), ncol(fingerprints), "\n")
## fingerprints dimensions: 165 1107
cat("permeability length:    ", length(x = permeability), "\n")
## permeability length:     165

The fingerprints matrix has 165 compounds (rows) described by 1107 binary predictors (columns). The response permeability holds one continuous numeric value per compound, 165 in total.


(b) Remove Near-Zero Variance Predictors

Binary fingerprint predictors are often sparse — many substructures appear in only a handful of molecules. We use nearZeroVar() from caret to identify and remove these low-information predictors.

# Column positions of the near-zero-variance predictors flagged by caret
nzv_idx <- nearZeroVar(fingerprints)

cat("Predictors before filtering:", ncol(fingerprints), "\n")
## Predictors before filtering: 1107
cat("Near-zero variance predictors removed:", length(nzv_idx), "\n")
## Near-zero variance predictors removed: 719
# Guard the negative index: if nothing is flagged, `-integer(0)` would select
# zero columns and silently discard every predictor. `drop = FALSE` keeps the
# result a matrix even if only one column survives.
fingerprints_filtered <- if (length(nzv_idx) > 0) {
  fingerprints[, -nzv_idx, drop = FALSE]
} else {
  fingerprints
}

cat("Predictors remaining after filtering:", ncol(fingerprints_filtered), "\n")
## Predictors remaining after filtering: 388

After removing near-zero variance predictors, 388 predictors remain for modeling.


(c) Train/Test Split, Pre-processing, and PLS Tuning

We assign 80% of the compounds to the training set and the remaining 20% to the test set, then tune a Partial Least Squares (PLS) model via 10-fold cross-validation on the training set.

# Fix the RNG state so the partition below is reproducible
set.seed(614)

# Training-row indices for an 80/20 split (list = FALSE: not returned as a list)
train_idx <- createDataPartition(y = permeability, p = 0.8, list = FALSE)

# Response: training rows, then everything else as the test set
y_train <- permeability[train_idx]
y_test  <- permeability[-train_idx]

# Predictors, split with the same indices
X_train <- fingerprints_filtered[train_idx, ]
X_test  <- fingerprints_filtered[-train_idx, ]

# Resampling scheme: 10-fold cross-validation. The train() calls that pass
# `trControl = ctrl` request centering and scaling via their own preProcess
# argument.
ctrl <- trainControl(method = "cv", number = 10)

# Re-seed before training so the resampling is reproducible
set.seed(614)

# Tune a PLS model over 20 candidate values of ncomp (tuneLength = 20),
# centering and scaling the predictors as a pre-processing step
pls_model <- train(
  x = X_train,
  y = y_train,
  method = "pls",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneLength = 20
)

# Print the resampling summary
pls_model
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 121, 120, 120, 119, 119, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     12.53358  0.3553953   9.567597
##    2     11.61019  0.4890810   8.091447
##    3     11.80130  0.4757242   8.778872
##    4     12.19362  0.4352315   9.219110
##    5     12.28300  0.4387307   9.390196
##    6     12.08771  0.4535351   9.055823
##    7     12.08226  0.4509900   9.120314
##    8     11.95363  0.4632350   9.268706
##    9     11.93529  0.4698718   9.422742
##   10     12.09437  0.4693474   9.568933
##   11     12.29766  0.4546334   9.622594
##   12     12.44555  0.4481673   9.786040
##   13     12.62766  0.4410592   9.850552
##   14     12.90519  0.4196692  10.058005
##   15     13.01441  0.4128496  10.109928
##   16     13.15024  0.4127095  10.329062
##   17     13.29641  0.4080838  10.480771
##   18     13.51509  0.3942294  10.772092
##   19     13.33906  0.4070753  10.558082
##   20     13.45224  0.4002228  10.674754
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 2.
# Cross-validated RMSE profile across the candidate numbers of components
plot(pls_model, main = "PLS – Cross-Validated RMSE by Number of Components")

# Number of components selected by caret (smallest cross-validated RMSE)
best_lv <- pls_model$bestTune$ncomp
# Resampled R² of that same tuning-grid row. Taking max() over the whole grid
# would report the best R² of any candidate, which need not belong to the
# RMSE-selected model.
best_r2 <- pls_model$results$Rsquared[pls_model$results$ncomp == best_lv]

cat("Optimal number of latent variables:", best_lv, "\n")
## Optimal number of latent variables: 2
cat("Resampled R² at optimal LV:        ", round(best_r2, 4), "\n")
## Resampled R² at optimal LV:         0.4891

The optimal PLS model uses 2 latent variables, with a cross-validated (resampled) R² of approximately 0.489.


(d) Test Set Prediction and R²

# Predict permeability for the held-out compounds with the tuned PLS model
pls_pred <- predict(object = pls_model, newdata = X_test)

# Test-set R² computed as 1 - SS_res / SS_tot, plus the test-set RMSE
ss_res    <- sum((pls_pred - y_test)^2)
ss_tot    <- sum((y_test - mean(y_test))^2)
test_r2   <- 1 - (ss_res / ss_tot)
test_rmse <- RMSE(pred = pls_pred, obs = y_test)

cat("Test set R²:  ", round(test_r2, digits = 4), "\n")
## Test set R²:   0.3539
cat("Test set RMSE:", round(test_rmse, digits = 4), "\n")
## Test set RMSE: 13.5836
# Scatter of observed vs. predicted test-set values; the dashed line is y = x
ggplot(
  data    = tibble(Observed = y_test, Predicted = pls_pred),
  mapping = aes(x = Observed, y = Predicted)
) +
  geom_point(size = 2.5, alpha = 0.7, color = "#2C5C8F") +
  geom_abline(
    intercept = 0,
    slope     = 1,
    color     = "firebrick",
    linetype  = "dashed"
  ) +
  labs(
    x        = "Observed Permeability",
    y        = "Predicted Permeability",
    title    = "PLS – Observed vs. Predicted (Test Set)",
    subtitle = paste0("Test R² = ", round(test_r2, 3))
  ) +
  theme_minimal(base_size = 13)

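The test set R² is 0.354, which is lower than the resampled estimate from training (0.489) — typical for a model evaluated on truly held-out data. Note that the two figures are not computed the same way: caret's resampled Rsquared is the squared correlation between observed and predicted values, whereas the test value above is 1 − SS_res/SS_tot, so the comparison is only approximate.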


(e) Alternative Models

We try Ridge Regression and Elastic Net as alternative linear models discussed in Chapter 6.

Ridge Regression

# Re-seed so the resampling for this model is reproducible
set.seed(614)

# Ridge regression tuned over 15 candidate settings (tuneLength = 15), with
# the same resampling control and pre-processing as the PLS fit
ridge_model <- train(
  x = X_train,
  y = y_train,
  method = "ridge",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneLength = 15
)

# Held-out predictions, then R² as 1 - SS_res / SS_tot and RMSE
ridge_pred <- predict(object = ridge_model, newdata = X_test)
ridge_r2   <- 1 -
  sum((ridge_pred - y_test)^2) / sum((y_test - mean(y_test))^2)
ridge_rmse <- RMSE(pred = ridge_pred, obs = y_test)

cat("Ridge – Test R²:  ", round(ridge_r2, digits = 4), "\n")
## Ridge – Test R²:   0.4963
cat("Ridge – Test RMSE:", round(ridge_rmse, digits = 4), "\n")
## Ridge – Test RMSE: 11.9942

Elastic Net

# Re-seed so the resampling for this model is reproducible
set.seed(614)

# Elastic net with tuneLength = 15, using the same resampling control and
# pre-processing as the other permeability models
enet_model <- train(
  x = X_train,
  y = y_train,
  method = "enet",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  tuneLength = 15
)

# Held-out predictions, then R² as 1 - SS_res / SS_tot and RMSE
enet_pred <- predict(object = enet_model, newdata = X_test)
enet_r2   <- 1 -
  sum((enet_pred - y_test)^2) / sum((y_test - mean(y_test))^2)
enet_rmse <- RMSE(pred = enet_pred, obs = y_test)

cat("Elastic Net – Test R²:  ", round(enet_r2, digits = 4), "\n")
## Elastic Net – Test R²:   0.4588
cat("Elastic Net – Test RMSE:", round(enet_rmse, digits = 4), "\n")
## Elastic Net – Test RMSE: 12.4324

Model Comparison Summary

# One row per model with rounded test-set metrics, sorted so the model with
# the highest test R² comes first
comparison_6.2 <- arrange(
  tibble(
    Model       = c("PLS", "Ridge Regression", "Elastic Net"),
    `Test R²`   = round(c(test_r2, ridge_r2, enet_r2), digits = 3),
    `Test RMSE` = round(c(test_rmse, ridge_rmse, enet_rmse), digits = 3)
  ),
  desc(`Test R²`)
)

comparison_6.2

(f) Recommendation

Ridge regression achieved the highest test set R² (0.496), ahead of Elastic Net (0.459) and PLS (0.354).

Would I recommend replacing the permeability lab experiment?

Not yet. While the model explains a meaningful portion of variance in permeability, an R² below ~0.75–0.80 still leaves substantial unexplained variance — a concern for a pharmaceutical application where prediction errors could mean pursuing non-viable drug candidates or discarding promising ones. A few considerations:

  • Cost-benefit: If the lab assay is expensive and slow, even an imperfect model could be used to screen out clearly poor candidates before lab testing, reducing overall cost without replacing the experiment entirely.
  • Data size: With only 165 compounds, the model may not generalize well to chemically diverse new molecules.
  • Recommendation: Use the model as a pre-screening filter, not as a direct replacement for the lab measurement. As more data accumulates, model accuracy should improve.


Problem 6.3 – Chemical Manufacturing Process Data

(a) Load the Data

# Load the chemical manufacturing data set
data("ChemicalManufacturingProcess")

# Work on a shorter alias; the messages below describe column 1 as the
# response (yield) and the remaining columns as predictors
chem_df <- ChemicalManufacturingProcess

cat("Dimensions:", nrow(chem_df), ncol(chem_df), "\n")
## Dimensions: 176 58
cat("Response:   yield (column 1)\n")
## Response:   yield (column 1)
cat("Predictors: columns 2 –", ncol(x = chem_df), "\n")
## Predictors: columns 2 – 58

The dataset contains 176 manufacturing runs with 1 response (Yield) and 57 predictors (12 biological + 45 process).


(b) Impute Missing Values

# NA count for every column of chem_df, reshaped to one row per column; only
# columns with at least one NA are kept, most-missing first
missing_summary <- chem_df %>%
  summarise(across(everything(), function(col) sum(is.na(col)))) %>%
  pivot_longer(
    cols      = everything(),
    names_to  = "Variable",
    values_to = "Missing"
  ) %>%
  filter(Missing > 0) %>%
  arrange(desc(Missing))

cat("Predictors with missing values:", nrow(missing_summary), "\n")
## Predictors with missing values: 28
missing_summary

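We use k-nearest-neighbor (kNN) imputation via preProcess() from caret, which is well-suited to this type of continuous numeric data. Two caveats: method = "knnImpute" also centers and scales the predictors as part of the imputation, and the imputation below is fit on all 176 runs before the train/test split, so the held-out runs contribute to the imputed training values.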

# Pull out the response column and keep every other column as predictors
yield      <- chem_df[["Yield"]]
predictors <- dplyr::select(chem_df, -Yield)

# Fit a 5-nearest-neighbour imputer on the predictors and apply it to them.
# NOTE(review): this is fit on every row, before the train/test split made
# later in the file, so test rows influence imputed training values. Consider
# moving "knnImpute" into train()'s preProcess instead.
impute_obj   <- preProcess(predictors, method = "knnImpute", k = 5)
pred_imputed <- predict(impute_obj, newdata = predictors)

cat("Missing values after imputation:", sum(is.na(pred_imputed)), "\n")
## Missing values after imputation: 0

After kNN imputation, there are 0 missing values in the predictor matrix.


(c) Train/Test Split, Pre-processing, and Model Tuning

We use an Elastic Net model, which simultaneously performs shrinkage (like ridge) and variable selection (like lasso) — well-suited to this problem with correlated predictors of mixed importance.

# Fix the RNG state so the partition below is reproducible
set.seed(63)

# Training-row indices for an 80/20 split of the manufacturing runs
train_idx2 <- createDataPartition(y = yield, p = 0.8, list = FALSE)

# Response: training rows, then everything else as the test set
y_train2 <- yield[train_idx2]
y_test2  <- yield[-train_idx2]

# Imputed predictors, split with the same indices
X_train2 <- pred_imputed[train_idx2, ]
X_test2  <- pred_imputed[-train_idx2, ]

# Resampling scheme for tuning: 10-fold cross-validation
ctrl2 <- trainControl(number = 10, method = "cv")

# Re-seed before training so the resampling is reproducible
set.seed(63)

# Elastic net on the training runs with tuneLength = 20, evaluated with ctrl2;
# centering and scaling are requested as pre-processing
enet_chem <- train(
  x = X_train2,
  y = y_train2,
  method = "enet",
  preProcess = c("center", "scale"),
  trControl = ctrl2,
  tuneLength = 20
)

# Print the resampling summary
enet_chem
## Elasticnet 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 131, 130, 130, 130, 131, 129, ... 
## Resampling results across tuning parameters:
## 
##   lambda        fraction  RMSE      Rsquared   MAE      
##   0.0000000000  0.05      1.186108  0.6147880  0.9464485
##   0.0000000000  0.10      1.131212  0.6291962  0.9149650
##   0.0000000000  0.15      1.396410  0.5482895  1.0307046
##   0.0000000000  0.20      1.785212  0.5017366  1.1474710
##   0.0000000000  0.25      2.169950  0.4570006  1.2640063
##   0.0000000000  0.30      2.630179  0.4283351  1.3969039
##   0.0000000000  0.35      2.962627  0.4131297  1.5022143
##   0.0000000000  0.40      3.501100  0.4027609  1.6605875
##   0.0000000000  0.45      4.283376  0.3975417  1.8692519
##   0.0000000000  0.50      4.629154  0.3954879  1.9658172
##   0.0000000000  0.55      4.813848  0.3944340  2.0173018
##   0.0000000000  0.60      5.029671  0.3939696  2.0780288
##   0.0000000000  0.65      5.265348  0.3965452  2.1435145
##   0.0000000000  0.70      5.535822  0.4044852  2.2100021
##   0.0000000000  0.75      5.878113  0.3916772  2.3121040
##   0.0000000000  0.80      6.200252  0.3700210  2.4217128
##   0.0000000000  0.85      6.683295  0.3622836  2.5628459
##   0.0000000000  0.90      7.192252  0.3578218  2.7081781
##   0.0000000000  0.95      7.847778  0.3552880  2.8870927
##   0.0000000000  1.00      8.336394  0.3532957  3.0174684
##   0.0001000000  0.05      1.217514  0.6067799  0.9766506
##   0.0001000000  0.10      1.119281  0.6372945  0.9075049
##   0.0001000000  0.15      1.184786  0.6084910  0.9386593
##   0.0001000000  0.20      1.446029  0.5466735  1.0426481
##   0.0001000000  0.25      1.787256  0.4971469  1.1421239
##   0.0001000000  0.30      2.256608  0.4539964  1.2800889
##   0.0001000000  0.35      2.641824  0.4389425  1.3914526
##   0.0001000000  0.40      2.924071  0.4276684  1.4793189
##   0.0001000000  0.45      3.933359  0.4138696  1.7499053
##   0.0001000000  0.50      4.481777  0.4050432  1.9031085
##   0.0001000000  0.55      4.836093  0.3997405  2.0076281
##   0.0001000000  0.60      5.117117  0.3976395  2.0891472
##   0.0001000000  0.65      5.338259  0.3963177  2.1514644
##   0.0001000000  0.70      5.566175  0.3952836  2.2149197
##   0.0001000000  0.75      5.794625  0.3946125  2.2784660
##   0.0001000000  0.80      5.987644  0.3949633  2.3319189
##   0.0001000000  0.85      6.158954  0.3974844  2.3769119
##   0.0001000000  0.90      6.419971  0.4032102  2.4376950
##   0.0001000000  0.95      6.758463  0.3976828  2.5250837
##   0.0001000000  1.00      7.275880  0.3766970  2.6801222
##   0.0001467799  0.05      1.225644  0.6064609  0.9834487
##   0.0001467799  0.10      1.119681  0.6380129  0.9049313
##   0.0001467799  0.15      1.179142  0.6108015  0.9364938
##   0.0001467799  0.20      1.372086  0.5618095  1.0179678
##   0.0001467799  0.25      1.685743  0.5113035  1.1116720
##   0.0001467799  0.30      2.163798  0.4625382  1.2527618
##   0.0001467799  0.35      2.556453  0.4423037  1.3654092
##   0.0001467799  0.40      2.834946  0.4331458  1.4503989
##   0.0001467799  0.45      3.798447  0.4197879  1.7093548
##   0.0001467799  0.50      4.336548  0.4108204  1.8590943
##   0.0001467799  0.55      4.782835  0.4036562  1.9868102
##   0.0001467799  0.60      5.052960  0.3997018  2.0667124
##   0.0001467799  0.65      5.287089  0.3981186  2.1336258
##   0.0001467799  0.70      5.514842  0.3968170  2.1974570
##   0.0001467799  0.75      5.750422  0.3956389  2.2631539
##   0.0001467799  0.80      5.979914  0.3950460  2.3267366
##   0.0001467799  0.85      6.155770  0.3952349  2.3748989
##   0.0001467799  0.90      6.370050  0.3977342  2.4299118
##   0.0001467799  0.95      6.642514  0.4032959  2.4932970
##   0.0001467799  1.00      7.031994  0.3986727  2.5924600
##   0.0002154435  0.05      1.233984  0.6075134  0.9937932
##   0.0002154435  0.10      1.120361  0.6392658  0.9019755
##   0.0002154435  0.15      1.174629  0.6121090  0.9343811
##   0.0002154435  0.20      1.247354  0.5903820  0.9753086
##   0.0002154435  0.25      1.596458  0.5227810  1.0850731
##   0.0002154435  0.30      2.073855  0.4713886  1.2248748
##   0.0002154435  0.35      2.454173  0.4479132  1.3352410
##   0.0002154435  0.40      2.725787  0.4377531  1.4174638
##   0.0002154435  0.45      3.550275  0.4257805  1.6394921
##   0.0002154435  0.50      4.140398  0.4169760  1.8022658
##   0.0002154435  0.55      4.629866  0.4091566  1.9390493
##   0.0002154435  0.60      4.965547  0.4034881  2.0370184
##   0.0002154435  0.65      5.215321  0.3999876  2.1103753
##   0.0002154435  0.70      5.428620  0.3986862  2.1702093
##   0.0002154435  0.75      5.661630  0.3973712  2.2349441
##   0.0002154435  0.80      5.899915  0.3961691  2.3014857
##   0.0002154435  0.85      6.116117  0.3952579  2.3608315
##   0.0002154435  0.90      6.343428  0.3953208  2.4215598
##   0.0002154435  0.95      6.573370  0.3973439  2.4805817
##   0.0002154435  1.00      6.821532  0.4020555  2.5399918
##   0.0003162278  0.05      1.245848  0.6079478  1.0062526
##   0.0003162278  0.10      1.121299  0.6401457  0.8987805
##   0.0003162278  0.15      1.171374  0.6122070  0.9327262
##   0.0003162278  0.20      1.163956  0.6157750  0.9327256
##   0.0003162278  0.25      1.520425  0.5341429  1.0611962
##   0.0003162278  0.30      1.958086  0.4818745  1.1884075
##   0.0003162278  0.35      2.340780  0.4570534  1.3021228
##   0.0003162278  0.40      2.606751  0.4424332  1.3821450
##   0.0003162278  0.45      3.210910  0.4316914  1.5472562
##   0.0003162278  0.50      3.952195  0.4223649  1.7480802
##   0.0003162278  0.55      4.412785  0.4152619  1.8761488
##   0.0003162278  0.60      4.832496  0.4088814  1.9950247
##   0.0003162278  0.65      5.110886  0.4039388  2.0762884
##   0.0003162278  0.70      5.316443  0.4007911  2.1362045
##   0.0003162278  0.75      5.539625  0.3993951  2.1984922
##   0.0003162278  0.80      5.773400  0.3979851  2.2634352
##   0.0003162278  0.85      5.993556  0.3965382  2.3242132
##   0.0003162278  0.90      6.214806  0.3953390  2.3842126
##   0.0003162278  0.95      6.435653  0.3951438  2.4434252
##   0.0003162278  1.00      6.627148  0.3960871  2.4938713
##   0.0004641589  0.05      1.261087  0.6085782  1.0189820
##   0.0004641589  0.10      1.123669  0.6390425  0.9000095
##   0.0004641589  0.15      1.164993  0.6133115  0.9298008
##   0.0004641589  0.20      1.189075  0.6096881  0.9377882
##   0.0004641589  0.25      1.427461  0.5497882  1.0335619
##   0.0004641589  0.30      1.843010  0.4927518  1.1528130
##   0.0004641589  0.35      2.227449  0.4661581  1.2686915
##   0.0004641589  0.40      2.485938  0.4499742  1.3462404
##   0.0004641589  0.45      2.829363  0.4376523  1.4468181
##   0.0004641589  0.50      3.742314  0.4273524  1.6878918
##   0.0004641589  0.55      4.174946  0.4207975  1.8081983
##   0.0004641589  0.60      4.594340  0.4141781  1.9261794
##   0.0004641589  0.65      4.975399  0.4090406  2.0348872
##   0.0004641589  0.70      5.181959  0.4046936  2.0950356
##   0.0004641589  0.75      5.376541  0.4015630  2.1509527
##   0.0004641589  0.80      5.600711  0.4000369  2.2133656
##   0.0004641589  0.85      5.816207  0.3984793  2.2731981
##   0.0004641589  0.90      6.034677  0.3966788  2.3331875
##   0.0004641589  0.95      6.238300  0.3954088  2.3886154
##   0.0004641589  1.00      6.406618  0.3943766  2.4349194
##   0.0006812921  0.05      1.281693  0.6087269  1.0353114
##   0.0006812921  0.10      1.127002  0.6374669  0.8996686
##   0.0006812921  0.15      1.155688  0.6166920  0.9261218
##   0.0006812921  0.20      1.217676  0.6003011  0.9474664
##   0.0006812921  0.25      1.315994  0.5734656  0.9948136
##   0.0006812921  0.30      1.688673  0.5086936  1.1077105
##   0.0006812921  0.35      2.053796  0.4773739  1.2185612
##   0.0006812921  0.40      2.370642  0.4580996  1.3121358
##   0.0006812921  0.45      2.616587  0.4452304  1.3865277
##   0.0006812921  0.50      3.436090  0.4323490  1.6052334
##   0.0006812921  0.55      3.926544  0.4255385  1.7384778
##   0.0006812921  0.60      4.298585  0.4197740  1.8424947
##   0.0006812921  0.65      4.697001  0.4136272  1.9551790
##   0.0006812921  0.70      5.009771  0.4093401  2.0448762
##   0.0006812921  0.75      5.182474  0.4056273  2.0945144
##   0.0006812921  0.80      5.380687  0.4022837  2.1512275
##   0.0006812921  0.85      5.581299  0.4005099  2.2070248
##   0.0006812921  0.90      5.794796  0.3984975  2.2660614
##   0.0006812921  0.95      5.984991  0.3966966  2.3185091
##   0.0006812921  1.00      6.143082  0.3950393  2.3627282
##   0.0010000000  0.05      1.307274  0.6081179  1.0540774
##   0.0010000000  0.10      1.134096  0.6349536  0.9034576
##   0.0010000000  0.15      1.147587  0.6205941  0.9240831
##   0.0010000000  0.20      1.201005  0.6031614  0.9424712
##   0.0010000000  0.25      1.233183  0.5885640  0.9613114
##   0.0010000000  0.30      1.503934  0.5360578  1.0549407
##   0.0010000000  0.35      1.908809  0.4891862  1.1733133
##   0.0010000000  0.40      2.234790  0.4676660  1.2713060
##   0.0010000000  0.45      2.460303  0.4536412  1.3400658
##   0.0010000000  0.50      2.885898  0.4405867  1.4599312
##   0.0010000000  0.55      3.628130  0.4302802  1.6572277
##   0.0010000000  0.60      4.008087  0.4246165  1.7607962
##   0.0010000000  0.65      4.348573  0.4193416  1.8570767
##   0.0010000000  0.70      4.718019  0.4134599  1.9620255
##   0.0010000000  0.75      4.958149  0.4096779  2.0314933
##   0.0010000000  0.80      5.120290  0.4063857  2.0775435
##   0.0010000000  0.85      5.301122  0.4032548  2.1296013
##   0.0010000000  0.90      5.493665  0.4008964  2.1831544
##   0.0010000000  0.95      5.676431  0.3986480  2.2336972
##   0.0010000000  1.00      5.834378  0.3967462  2.2775728
##   0.0014677993  0.05      1.336930  0.6059884  1.0771207
##   0.0014677993  0.10      1.147535  0.6300074  0.9126731
##   0.0014677993  0.15      1.138124  0.6263365  0.9180509
##   0.0014677993  0.20      1.188752  0.6050468  0.9382396
##   0.0014677993  0.25      1.157149  0.6203220  0.9286790
##   0.0014677993  0.30      1.367047  0.5635909  1.0111239
##   0.0014677993  0.35      1.760235  0.5030525  1.1275270
##   0.0014677993  0.40      2.060531  0.4785165  1.2200719
##   0.0014677993  0.45      2.323230  0.4620950  1.2989880
##   0.0014677993  0.50      2.544062  0.4504286  1.3654961
##   0.0014677993  0.55      3.182997  0.4376965  1.5377266
##   0.0014677993  0.60      3.686233  0.4298476  1.6733191
##   0.0014677993  0.65      4.013812  0.4245110  1.7630776
##   0.0014677993  0.70      4.327575  0.4193539  1.8521821
##   0.0014677993  0.75      4.642173  0.4140941  1.9420619
##   0.0014677993  0.80      4.855948  0.4103833  2.0038625
##   0.0014677993  0.85      4.992817  0.4069993  2.0431492
##   0.0014677993  0.90      5.168439  0.4040451  2.0934017
##   0.0014677993  0.95      5.338836  0.4013673  2.1411903
##   0.0014677993  1.00      5.487433  0.3990975  2.1830212
##   0.0021544347  0.05      1.370232  0.6016072  1.1033356
##   0.0021544347  0.10      1.165647  0.6237541  0.9300735
##   0.0021544347  0.15      1.128098  0.6331985  0.9099633
##   0.0021544347  0.20      1.176118  0.6082328  0.9332465
##   0.0021544347  0.25      1.202817  0.6032622  0.9434320
##   0.0021544347  0.30      1.296384  0.5726402  0.9841014
##   0.0021544347  0.35      1.534713  0.5302471  1.0633884
##   0.0021544347  0.40      1.894491  0.4911468  1.1687439
##   0.0021544347  0.45      2.171975  0.4718793  1.2523988
##   0.0021544347  0.50      2.379829  0.4584789  1.3168504
##   0.0021544347  0.55      2.627098  0.4479158  1.3892756
##   0.0021544347  0.60      3.306595  0.4365146  1.5701412
##   0.0021544347  0.65      3.662866  0.4301082  1.6679081
##   0.0021544347  0.70      3.959955  0.4249446  1.7493977
##   0.0021544347  0.75      4.234595  0.4200120  1.8273679
##   0.0021544347  0.80      4.503245  0.4150764  1.9045378
##   0.0021544347  0.85      4.696623  0.4110910  1.9604966
##   0.0021544347  0.90      4.837812  0.4075508  2.0006868
##   0.0021544347  0.95      4.981946  0.4047469  2.0422107
##   0.0021544347  1.00      5.116349  0.4021495  2.0809789
##   0.0031622777  0.05      1.404961  0.5952974  1.1321970
##   0.0031622777  0.10      1.187779  0.6160621  0.9519746
##   0.0031622777  0.15      1.124638  0.6357857  0.9044624
##   0.0031622777  0.20      1.161004  0.6139027  0.9286379
##   0.0031622777  0.25      1.202845  0.6014257  0.9418581
##   0.0031622777  0.30      1.195762  0.6016910  0.9473830
##   0.0031622777  0.35      1.370611  0.5608942  1.0107014
##   0.0031622777  0.40      1.715585  0.5074123  1.1145893
##   0.0031622777  0.45      1.982261  0.4837592  1.1959785
##   0.0031622777  0.50      2.207395  0.4683017  1.2651389
##   0.0031622777  0.55      2.421678  0.4561834  1.3294288
##   0.0031622777  0.60      2.745568  0.4463501  1.4201122
##   0.0031622777  0.65      3.303331  0.4370008  1.5694361
##   0.0031622777  0.70      3.588889  0.4312461  1.6489964
##   0.0031622777  0.75      3.853152  0.4260513  1.7218294
##   0.0031622777  0.80      4.098154  0.4213014  1.7906936
##   0.0031622777  0.85      4.307681  0.4166796  1.8513403
##   0.0031622777  0.90      4.501612  0.4119583  1.9079742
##   0.0031622777  0.95      4.619864  0.4089147  1.9411705
##   0.0031622777  1.00      4.739317  0.4061408  1.9756685
##   0.0046415888  0.05      1.440468  0.5867300  1.1609584
##   0.0046415888  0.10      1.212117  0.6083304  0.9798743
##   0.0046415888  0.15      1.130121  0.6344791  0.9029952
##   0.0046415888  0.20      1.145037  0.6224401  0.9210046
##   0.0046415888  0.25      1.187633  0.6041232  0.9361156
##   0.0046415888  0.30      1.193602  0.6052406  0.9406589
##   0.0046415888  0.35      1.318973  0.5650618  0.9942009
##   0.0046415888  0.40      1.459425  0.5429218  1.0424589
##   0.0046415888  0.45      1.817526  0.4974982  1.1452100
##   0.0046415888  0.50      2.041558  0.4792819  1.2137051
##   0.0046415888  0.55      2.247275  0.4660972  1.2771440
##   0.0046415888  0.60      2.439037  0.4557937  1.3345667
##   0.0046415888  0.65      2.779148  0.4463093  1.4283488
##   0.0046415888  0.70      3.222853  0.4386778  1.5481363
##   0.0046415888  0.75      3.481173  0.4332276  1.6202025
##   0.0046415888  0.80      3.714615  0.4280398  1.6852075
##   0.0046415888  0.85      3.929905  0.4231614  1.7458404
##   0.0046415888  0.90      4.104130  0.4184473  1.7966817
##   0.0046415888  0.95      4.254873  0.4143737  1.8404374
##   0.0046415888  1.00      4.373898  0.4113009  1.8738763
##   0.0068129207  0.05      1.474064  0.5789141  1.1878568
##   0.0068129207  0.10      1.236672  0.6067533  1.0024735
##   0.0068129207  0.15      1.143126  0.6301670  0.9130552
##   0.0068129207  0.20      1.131566  0.6314688  0.9112303
##   0.0068129207  0.25      1.168478  0.6107905  0.9306833
##   0.0068129207  0.30      1.205254  0.6001458  0.9416007
##   0.0068129207  0.35      1.221083  0.5945777  0.9535330
##   0.0068129207  0.40      1.382060  0.5532733  1.0156574
##   0.0068129207  0.45      1.578796  0.5230626  1.0795386
##   0.0068129207  0.50      1.884260  0.4916370  1.1657571
##   0.0068129207  0.55      2.079741  0.4762018  1.2255957
##   0.0068129207  0.60      2.273992  0.4653025  1.2847309
##   0.0068129207  0.65      2.443131  0.4564024  1.3357450
##   0.0068129207  0.70      2.764270  0.4475863  1.4240967
##   0.0068129207  0.75      3.119008  0.4409924  1.5208579
##   0.0068129207  0.80      3.362813  0.4357531  1.5885725
##   0.0068129207  0.85      3.564214  0.4305254  1.6454117
##   0.0068129207  0.90      3.744293  0.4255372  1.6967347
##   0.0068129207  0.95      3.899093  0.4210867  1.7418821
##   0.0068129207  1.00      4.032804  0.4177116  1.7806410
##   0.0100000000  0.05      1.505265  0.5705768  1.2139121
##   0.0100000000  0.10      1.266933  0.6056718  1.0231165
##   0.0100000000  0.15      1.168416  0.6209151  0.9320595
##   0.0100000000  0.20      1.129635  0.6328435  0.9077632
##   0.0100000000  0.25      1.153427  0.6182822  0.9250164
##   0.0100000000  0.30      1.188288  0.6034599  0.9359787
##   0.0100000000  0.35      1.220116  0.5967331  0.9474529
##   0.0100000000  0.40      1.318990  0.5637728  0.9929971
##   0.0100000000  0.45      1.445662  0.5447754  1.0390620
##   0.0100000000  0.50      1.673143  0.5122192  1.1087586
##   0.0100000000  0.55      1.936603  0.4883477  1.1822598
##   0.0100000000  0.60      2.115010  0.4753447  1.2361329
##   0.0100000000  0.65      2.298547  0.4656101  1.2912642
##   0.0100000000  0.70      2.447155  0.4580558  1.3367381
##   0.0100000000  0.75      2.712134  0.4505034  1.4103972
##   0.0100000000  0.80      2.999601  0.4444034  1.4896002
##   0.0100000000  0.85      3.237364  0.4389740  1.5556988
##   0.0100000000  0.90      3.422917  0.4336991  1.6087835
##   0.0100000000  0.95      3.584445  0.4291336  1.6552388
##   0.0100000000  1.00      3.722405  0.4252350  1.6948360
##   0.0146779927  0.05      1.535451  0.5620304  1.2401129
##   0.0146779927  0.10      1.303199  0.6040350  1.0503075
##   0.0146779927  0.15      1.197241  0.6102881  0.9567412
##   0.0146779927  0.20      1.137649  0.6303964  0.9119417
##   0.0146779927  0.25      1.139151  0.6269886  0.9164829
##   0.0146779927  0.30      1.171109  0.6094635  0.9326383
##   0.0146779927  0.35      1.202685  0.5989755  0.9398210
##   0.0146779927  0.40      1.267788  0.5834510  0.9652585
##   0.0146779927  0.45      1.400033  0.5491907  1.0197708
##   0.0146779927  0.50      1.522594  0.5356085  1.0654042
##   0.0146779927  0.55      1.735470  0.5078293  1.1288549
##   0.0146779927  0.60      1.975024  0.4874380  1.1944996
##   0.0146779927  0.65      2.154286  0.4757904  1.2467162
##   0.0146779927  0.70      2.304436  0.4674026  1.2923398
##   0.0146779927  0.75      2.465939  0.4602466  1.3409753
##   0.0146779927  0.80      2.694767  0.4536329  1.4057422
##   0.0146779927  0.85      2.915852  0.4481500  1.4678702
##   0.0146779927  0.90      3.119378  0.4429750  1.5248746
##   0.0146779927  0.95      3.299409  0.4377867  1.5759133
##   0.0146779927  1.00      3.443916  0.4335340  1.6174509
##   0.0215443469  0.05      1.563293  0.5528020  1.2638000
##   0.0215443469  0.10      1.344401  0.6004597  1.0834252
##   0.0215443469  0.15      1.221068  0.6053702  0.9842889
##   0.0215443469  0.20      1.157699  0.6227582  0.9243210
##   0.0215443469  0.25      1.134577  0.6298205  0.9131010
##   0.0215443469  0.30      1.155775  0.6170244  0.9265459
##   0.0215443469  0.35      1.184604  0.6045599  0.9365370
##   0.0215443469  0.40      1.225067  0.5928658  0.9474625
##   0.0215443469  0.45      1.345903  0.5609572  0.9966271
##   0.0215443469  0.50      1.470178  0.5403426  1.0412527
##   0.0215443469  0.55      1.600650  0.5272727  1.0905880
##   0.0215443469  0.60      1.797710  0.5035319  1.1491733
##   0.0215443469  0.65      2.017396  0.4868430  1.2090220
##   0.0215443469  0.70      2.177907  0.4767624  1.2542771
##   0.0215443469  0.75      2.316461  0.4692377  1.2953161
##   0.0215443469  0.80      2.488779  0.4627515  1.3448784
##   0.0215443469  0.85      2.681514  0.4568569  1.4007050
##   0.0215443469  0.90      2.868321  0.4517280  1.4536692
##   0.0215443469  0.95      3.038324  0.4467716  1.5019374
##   0.0215443469  1.00      3.195395  0.4421923  1.5467725
##   0.0316227766  0.05      1.588739  0.5423444  1.2854412
##   0.0316227766  0.10      1.385369  0.5953357  1.1164756
##   0.0316227766  0.15      1.246723  0.6051276  1.0061874
##   0.0316227766  0.20      1.185831  0.6118613  0.9454854
##   0.0316227766  0.25      1.143049  0.6261014  0.9186401
##   0.0316227766  0.30      1.143618  0.6249266  0.9199002
##   0.0316227766  0.35      1.171148  0.6099075  0.9340062
##   0.0316227766  0.40      1.198654  0.6001559  0.9407270
##   0.0316227766  0.45      1.311391  0.5770032  0.9733055
##   0.0316227766  0.50      1.420382  0.5475765  1.0214277
##   0.0316227766  0.55      1.535499  0.5350259  1.0603739
##   0.0316227766  0.60      1.662120  0.5213415  1.1104086
##   0.0316227766  0.65      1.855503  0.5015567  1.1674636
##   0.0316227766  0.70      2.060835  0.4873443  1.2243337
##   0.0316227766  0.75      2.195574  0.4784772  1.2615212
##   0.0316227766  0.80      2.331588  0.4713736  1.3012672
##   0.0316227766  0.85      2.509187  0.4651801  1.3515697
##   0.0316227766  0.90      2.674131  0.4599048  1.3989055
##   0.0316227766  0.95      2.831068  0.4553268  1.4432331
##   0.0316227766  1.00      2.973072  0.4508809  1.4840442
##   0.0464158883  0.05      1.611632  0.5309059  1.3044164
##   0.0464158883  0.10      1.423575  0.5895608  1.1472426
##   0.0464158883  0.15      1.276940  0.6058016  1.0286919
##   0.0464158883  0.20      1.209197  0.6049652  0.9693286
##   0.0464158883  0.25      1.162890  0.6174244  0.9320109
##   0.0464158883  0.30      1.143445  0.6243240  0.9204647
##   0.0464158883  0.35      1.160245  0.6157401  0.9292250
##   0.0464158883  0.40      1.185184  0.6046207  0.9399566
##   0.0464158883  0.45      1.232267  0.5912267  0.9509805
##   0.0464158883  0.50      1.369922  0.5630096  0.9970738
##   0.0464158883  0.55      1.481891  0.5400363  1.0408619
##   0.0464158883  0.60      1.590975  0.5315824  1.0799847
##   0.0464158883  0.65      1.715838  0.5167363  1.1275289
##   0.0464158883  0.70      1.892013  0.5008766  1.1793458
##   0.0464158883  0.75      2.072236  0.4887266  1.2306127
##   0.0464158883  0.80      2.197335  0.4807201  1.2666206
##   0.0464158883  0.85      2.357585  0.4740049  1.3104636
##   0.0464158883  0.90      2.507179  0.4687802  1.3525811
##   0.0464158883  0.95      2.649057  0.4638283  1.3941042
##   0.0464158883  1.00      2.772117  0.4594633  1.4301562
##   0.0681292069  0.05      1.631128  0.5190381  1.3204229
##   0.0681292069  0.10      1.457310  0.5834218  1.1748313
##   0.0681292069  0.15      1.311313  0.6047727  1.0561665
##   0.0681292069  0.20      1.229183  0.6038432  0.9918742
##   0.0681292069  0.25      1.188228  0.6071181  0.9525000
##   0.0681292069  0.30      1.155764  0.6175540  0.9279759
##   0.0681292069  0.35      1.152348  0.6202132  0.9252219
##   0.0681292069  0.40      1.174352  0.6100173  0.9361877
##   0.0681292069  0.45      1.196760  0.6011317  0.9444277
##   0.0681292069  0.50      1.298769  0.5792360  0.9708692
##   0.0681292069  0.55      1.406366  0.5549177  1.0118624
##   0.0681292069  0.60      1.518197  0.5393399  1.0513934
##   0.0681292069  0.65      1.629960  0.5297564  1.0937695
##   0.0681292069  0.70      1.756748  0.5125897  1.1404414
##   0.0681292069  0.75      1.905258  0.5002103  1.1847206
##   0.0681292069  0.80      2.060563  0.4897539  1.2308170
##   0.0681292069  0.85      2.189924  0.4826309  1.2684846
##   0.0681292069  0.90      2.338299  0.4764315  1.3098212
##   0.0681292069  0.95      2.471186  0.4719395  1.3467334
##   0.0681292069  1.00      2.587782  0.4679744  1.3803261
##   0.1000000000  0.05      1.645964  0.5109364  1.3333671
##   0.1000000000  0.10      1.484293  0.5785023  1.1976417
##   0.1000000000  0.15      1.343738  0.6023878  1.0832944
##   0.1000000000  0.20      1.252230  0.6036400  1.0099199
##   0.1000000000  0.25      1.205839  0.6018246  0.9718727
##   0.1000000000  0.30      1.176162  0.6076903  0.9422578
##   0.1000000000  0.35      1.160372  0.6140498  0.9298181
##   0.1000000000  0.40      1.165646  0.6155270  0.9293363
##   0.1000000000  0.45      1.189108  0.6054055  0.9419029
##   0.1000000000  0.50      1.259432  0.5856497  0.9624213
##   0.1000000000  0.55      1.345413  0.5696491  0.9897656
##   0.1000000000  0.60      1.449320  0.5493717  1.0279881
##   0.1000000000  0.65      1.547000  0.5397846  1.0600239
##   0.1000000000  0.70      1.667307  0.5294023  1.1078486
##   0.1000000000  0.75      1.792070  0.5124870  1.1517126
##   0.1000000000  0.80      1.931381  0.5005557  1.1931494
##   0.1000000000  0.85      2.044137  0.4923821  1.2290822
##   0.1000000000  0.90      2.174279  0.4856289  1.2671823
##   0.1000000000  0.95      2.306005  0.4802241  1.3047760
##   0.1000000000  1.00      2.417157  0.4765075  1.3362797
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.1 and lambda = 1e-04.
# Tuning profile: cross-validated RMSE across the fraction/lambda grid.
plot(enet_chem, main = "Elastic Net – Cross-Validated RMSE")

# Selected tuning parameters, plus the resampled metrics of the grid row with
# the lowest CV RMSE (RMSE is the selection metric reported by caret above).
# The row index is computed once so RMSE and R² always come from the same row.
best_params  <- enet_chem$bestTune
best_row     <- which.min(enet_chem$results$RMSE)
best_cv_rmse <- enet_chem$results$RMSE[best_row]
best_cv_r2   <- enet_chem$results$Rsquared[best_row]

# `fraction` and `lambda` are two distinct tuning parameters, so each one is
# labelled by its own name; labels are padded to a common width for alignment.
cat("Best fraction:         ", best_params$fraction, "\n")
## Best fraction:          0.1
cat("Best lambda:           ", best_params$lambda, "\n")
## Best lambda:            1e-04
cat("CV RMSE:               ", round(best_cv_rmse, 4), "\n")
## CV RMSE:                1.1193
cat("CV R²:                 ", round(best_cv_r2,   4), "\n")
## CV R²:                  0.6373

The optimal Elastic Net model (fraction = 0.1, lambda = 1e-04) achieves a cross-validated RMSE of approximately 1.119 and R² of 0.637.


(d) Test Set Performance

# Score the held-out samples with the tuned Elastic Net model.
chem_pred  <- predict(enet_chem, newdata = X_test2)

# Test-set RMSE, and R² computed as 1 - SSE / SST around the test-set mean.
# NOTE(review): the CV R² taken from enet_chem$results is presumably caret's
# default squared-correlation R²; if so, it is not defined the same way as the
# value computed here — confirm before comparing the two side by side.
test_rmse2 <- RMSE(pred = chem_pred, obs = y_test2)
sse_test   <- sum((y_test2 - chem_pred)^2)
sst_test   <- sum((y_test2 - mean(y_test2))^2)
test_r2_2  <- 1 - sse_test / sst_test

cat("Test RMSE:", round(test_rmse2, 4), "\n")
## Test RMSE: 1.149
cat("Test R²:  ", round(test_r2_2,  4), "\n")
## Test R²:   0.5739
# Observed vs. predicted yield on the test set; the dashed red line is the
# identity line (slope 1, intercept 0), so points on it are perfect predictions.
ggplot(
  data    = tibble(Observed = y_test2, Predicted = chem_pred),
  mapping = aes(x = Observed, y = Predicted)
) +
  geom_point(size = 2.5, alpha = 0.75, color = "#5C8A2C") +
  geom_abline(intercept = 0, slope = 1, color = "firebrick", linetype = "dashed") +
  labs(
    x        = "Observed Yield (%)",
    y        = "Predicted Yield (%)",
    title    = "Elastic Net – Observed vs. Predicted Yield (Test Set)",
    subtitle = paste0(
      "Test R² = ", round(test_r2_2, 3),
      "  |  CV R² = ", round(best_cv_r2, 3)
    )
  ) +
  theme_minimal(base_size = 13)

Metric CV (Training) Test Set
RMSE 1.119 1.149
R² 0.637 0.574

The test-set performance (RMSE 1.149, R² 0.574) is close to the resampled CV estimate (RMSE 1.119, R² 0.637), with only a modest drop in R², suggesting the model generalizes reasonably well.


(e) Variable Importance

# Scaled variable importance for the tuned model, with a plot of the top 20.
imp <- varImp(enet_chem, scale = TRUE)
plot(imp, top = 20, main = "Top 20 Most Important Predictors (Elastic Net)")

# Move predictor names from row names into a column, rank by overall
# importance (highest first), and keep the 20 highest-ranked predictors.
imp_ranked <- arrange(
  rownames_to_column(imp$importance, var = "Predictor"),
  desc(Overall)
)
imp_df <- head(imp_ranked, 20)

imp_df

# Split the top 20 by predictor family, using the family name that appears in
# each predictor's label (BiologicalMaterial vs. ManufacturingProcess).
bio_top  <- filter(imp_df, str_detect(Predictor, "BiologicalMaterial"))
proc_top <- filter(imp_df, str_detect(Predictor, "ManufacturingProcess"))

cat("Biological predictors in top 20:         ", nrow(bio_top),  "\n")
## Biological predictors in top 20:          8
cat("Manufacturing process predictors in top 20:", nrow(proc_top), "\n")
## Manufacturing process predictors in top 20: 12

Interpretation: The top 20 predictors include both biological and process variables, but manufacturing process predictors dominate the list. This is meaningful: process variables can be actively controlled, making them the most actionable targets for yield improvement.


(f) Relationships Between Top Predictors and Yield

We visualize the top 6 predictors against yield to understand directionality and form.

# Names of the six highest-ranked predictors.
top6 <- imp_df$Predictor[seq_len(6)]

# Attach the response to the training predictors, keep only the top six plus
# Yield, then reshape to long form: one row per (sample, predictor) pair.
train_with_yield <- bind_cols(as.data.frame(X_train2), Yield = y_train2)
plot_df <- pivot_longer(
  dplyr::select(train_with_yield, all_of(top6), Yield),
  cols      = -Yield,
  names_to  = "Predictor",
  values_to = "Value"
)

# One panel per predictor; x-axes are free because the predictors are on
# different scales, while the Yield axis is shared across panels.
ggplot(data = plot_df, mapping = aes(x = Value, y = Yield)) +
  geom_point(size = 1.8, color = "#2C5C8F", alpha = 0.45) +
  geom_smooth(method = "loess", se = TRUE, linewidth = 1, color = "firebrick") +
  facet_wrap(~ Predictor, ncol = 2, scales = "free_x") +
  labs(
    x        = "Predictor Value",
    y        = "Yield (%)",
    title    = "Top 6 Predictors vs. Yield (Training Set)",
    subtitle = "LOESS smooth shown in red"
  ) +
  theme_minimal(base_size = 12) +
  theme(strip.text = element_text(face = "bold"))

How this information can improve yield:

  1. Positive linear relationships — If a process variable (e.g., a temperature or pressure setting) shows a clear positive slope with yield, operators can target higher values of that variable in future runs, subject to feasibility constraints.

  2. Negative relationships — Variables with inverse relationships to yield may represent impurities or inefficiencies; reducing them is a lever for improvement.

  3. Non-linear patterns — Curved LOESS fits (e.g., an inverted-U shape) suggest optimal operating windows. Running the process near the peak of such a curve would maximize yield.

  4. Biological material predictors — Although these cannot be changed in the process itself, they can serve as incoming quality controls. Batches with unfavorable biological predictor values could be flagged before processing, avoiding low-yield runs.

  5. Financial impact — Since a 1% improvement in yield generates ~$100,000 per batch, even modest optimization (e.g., +1–2% yield by targeting favorable process settings) would represent substantial revenue gains.


Session Info

# Print the R version, platform, locale, and attached/loaded package versions
# so the analysis environment is recorded alongside the results.
sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lubridate_1.9.5                 forcats_1.0.1                  
##  [3] stringr_1.6.0                   dplyr_1.2.0                    
##  [5] purrr_1.2.1                     readr_2.1.6                    
##  [7] tidyr_1.3.2                     tibble_3.3.1                   
##  [9] tidyverse_2.0.0                 MASS_7.3-65                    
## [11] elasticnet_1.3                  lars_1.3                       
## [13] pls_2.9-0                       caret_7.0-1                    
## [15] lattice_0.22-7                  ggplot2_4.0.2                  
## [17] AppliedPredictiveModeling_1.1-7
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4052.112    farver_2.1.2        
##  [4] S7_0.2.1             fastmap_1.2.0        RANN_2.6.2          
##  [7] pROC_1.19.0.1        digest_0.6.39        rpart_4.1.24        
## [10] timechange_0.4.0     lifecycle_1.0.5      cluster_2.1.8.1     
## [13] survival_3.8-3       magrittr_2.0.4       compiler_4.5.2      
## [16] rlang_1.1.7          sass_0.4.10          tools_4.5.2         
## [19] plotrix_3.8-14       yaml_2.3.12          data.table_1.18.2.1 
## [22] knitr_1.51           labeling_0.4.3       plyr_1.8.9          
## [25] RColorBrewer_1.1-3   withr_3.0.2          nnet_7.3-20         
## [28] grid_4.5.2           stats4_4.5.2         future_1.69.0       
## [31] globals_0.19.0       scales_1.4.0         iterators_1.0.14    
## [34] cli_3.6.5            ellipse_0.5.0        rmarkdown_2.30      
## [37] generics_0.1.4       otel_0.2.0           rstudioapi_0.18.0   
## [40] future.apply_1.20.2  tzdb_0.5.0           reshape2_1.4.5      
## [43] cachem_1.1.0         splines_4.5.2        parallel_4.5.2      
## [46] vctrs_0.7.1          hardhat_1.4.2        Matrix_1.7-4        
## [49] jsonlite_2.0.0       hms_1.1.4            listenv_0.10.0      
## [52] foreach_1.5.2        gower_1.0.2          jquerylib_0.1.4     
## [55] recipes_1.3.1        glue_1.8.0           parallelly_1.46.1   
## [58] codetools_0.2-20     stringi_1.8.7        gtable_0.3.6        
## [61] rpart.plot_3.1.4     CORElearn_1.57.3.1   pillar_1.11.1       
## [64] htmltools_0.5.9      ipred_0.9-15         lava_1.8.2          
## [67] R6_2.6.1             evaluate_1.0.5       bslib_0.10.0        
## [70] class_7.3-23         Rcpp_1.1.1           nlme_3.1-168        
## [73] prodlim_2025.04.28   mgcv_1.9-3           xfun_0.56           
## [76] pkgconfig_2.0.3      ModelMetrics_1.2.2.2