library(AppliedPredictiveModeling)
library(caret)
library(pls)
library(tidyverse)
library(corrplot)
library(knitr)
library(glmnet)

Exercise 6.2 — Predicting Pharmaceutical Permeability

(a) Load the Data

data(permeability)
dim(fingerprints)
## [1]  165 1107
length(permeability)
## [1] 165
summary(permeability)
##   permeability  
##  Min.   : 0.06  
##  1st Qu.: 1.55  
##  Median : 4.91  
##  Mean   :12.24  
##  3rd Qu.:15.47  
##  Max.   :55.60

The dataset contains 165 compounds described by 1107 binary fingerprint predictors, with a continuous permeability response.


(b) Remove Near-Zero Variance Predictors

Many binary fingerprint predictors are sparse and most molecules don’t contain a given substructure, so we use nearZeroVar() to remove uninformative columns.

nzv_idx <- nearZeroVar(fingerprints)
cat("Predictors removed (near-zero variance):", length(nzv_idx), "\n")
## Predictors removed (near-zero variance): 719
fp_filtered <- fingerprints[, -nzv_idx]
cat("Predictors remaining:", ncol(fp_filtered), "\n")
## Predictors remaining: 388

After filtering, 388 predictors remain for modeling.


(c) Split, Pre-process, and Tune a PLS Model

We log-transform the response (permeability is right-skewed), split 80/20, and tune PLS via 10-fold cross-validation.

set.seed(123)

# Log-transform the skewed response
log_perm <- log(permeability)

# 80/20 train/test split (stratified on the log response)
train_idx <- createDataPartition(log_perm, p = 0.80, list = FALSE)

X_train <- fp_filtered[train_idx, ]
X_test  <- fp_filtered[-train_idx, ]
y_train <- log_perm[train_idx]
y_test  <- log_perm[-train_idx]

cat("Training set:", nrow(X_train), "samples\n")
## Training set: 133 samples
cat("Test set:    ", nrow(X_test),  "samples\n")
## Test set:     32 samples
# 10-fold CV control
ctrl <- trainControl(method = "cv", number = 10)

# Tune PLS (search over 1–20 latent variables)
set.seed(123)
pls_fit <- train(
  x          = X_train,
  y          = y_train,
  method     = "pls",
  tuneGrid   = data.frame(ncomp = 1:20),
  trControl  = ctrl,
  preProcess = c("center", "scale")
)

pls_fit
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 119, 118, 120, 121, 119, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     1.318623  0.2878538  1.0936803
##    2     1.175250  0.4223872  0.9277603
##    3     1.143653  0.4607692  0.9003402
##    4     1.121302  0.4889732  0.9181559
##    5     1.120204  0.5001917  0.8905865
##    6     1.095501  0.5310705  0.8497398
##    7     1.071736  0.5497272  0.8402284
##    8     1.057515  0.5679785  0.8305465
##    9     1.083285  0.5624847  0.8530669
##   10     1.134648  0.5583097  0.8964011
##   11     1.146658  0.5665197  0.9089386
##   12     1.166392  0.5571221  0.9368057
##   13     1.177877  0.5496513  0.9342370
##   14     1.196855  0.5450016  0.9461325
##   15     1.205878  0.5472701  0.9557193
##   16     1.196609  0.5634834  0.9398086
##   17     1.212974  0.5541142  0.9606699
##   18     1.223674  0.5468819  0.9721771
##   19     1.222901  0.5542604  0.9704077
##   20     1.227794  0.5532549  0.9667679
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 8.
plot(pls_fit, main = "PLS: CV R² vs. Number of Latent Variables")

best_ncomp <- pls_fit$bestTune$ncomp
best_r2 <- pls_fit$results$Rsquared[which.min(pls_fit$results$RMSE)]
cat(sprintf("Optimal latent variables: %d\n", best_ncomp))
## Optimal latent variables: 8
cat(sprintf("Resampled R^2:             %.4f\n", best_r2))
## Resampled R^2:             0.5680

The optimal PLS model uses 8 latent variable(s) with a resampled R^2 of approximately 0.568 (selected by minimum RMSE).


(d) Test Set Prediction

pls_pred   <- predict(pls_fit, newdata = X_test)
test_r2    <- cor(pls_pred, y_test)^2
test_rmse  <- RMSE(pls_pred, y_test)

cat(sprintf("Test set R^2:   %.4f\n", test_r2))
## Test set R^2:   0.3922
cat(sprintf("Test set RMSE: %.4f\n", test_rmse))
## Test set RMSE: 1.1973
# Observed vs. predicted plot
tibble(Observed = y_test, Predicted = pls_pred) %>%
  ggplot(aes(Observed, Predicted)) +
  geom_point(color = "steelblue", alpha = 0.7) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(title = "PLS: Observed vs. Predicted (Test Set)",
       subtitle = sprintf("R^2 = %.3f | RMSE = %.3f", test_r2, test_rmse),
       x = "Observed log(Permeability)",
       y = "Predicted log(Permeability)") +
  theme_minimal()

This is notably lower than the resampled R^2 of 0.562, indicating the model does not generalize as well to new compounds. With only 32 test samples, some of this drop may reflect the small test set size, but the scatter plot shows several poorly predicted compounds, particularly at the extremes of the permeability range.


(e) Compare Additional Models

We also try Ridge Regression and Elastic Net, which are well-suited to high-dimensional, correlated predictors.

set.seed(123)
ridge_grid <- expand.grid(
  alpha  = 0,
  lambda = 10^seq(-3, 2, length.out = 20)
)

ridge_fit <- train(
  x          = X_train,
  y          = y_train,
  method     = "glmnet",
  tuneGrid   = ridge_grid,
  trControl  = ctrl,
  preProcess = c("center", "scale")
)

ridge_pred <- predict(ridge_fit, newdata = X_test)
ridge_r2   <- cor(ridge_pred, y_test)^2
ridge_rmse <- RMSE(ridge_pred, y_test)
cat(sprintf("Ridge — Test R^2: %.4f | RMSE: %.4f\n", ridge_r2, ridge_rmse))
## Ridge — Test R^2: 0.3229 | RMSE: 1.2375
set.seed(123)
enet_grid <- expand.grid(
  alpha  = c(0.1, 0.5, 0.9),
  lambda = 10^seq(-2, 1, length.out = 5)
)

enet_fit <- train(
  x          = X_train,
  y          = y_train,
  method     = "glmnet",
  tuneGrid   = enet_grid,
  trControl  = ctrl,
  preProcess = c("center", "scale")
)

enet_pred <- predict(enet_fit, newdata = X_test)
enet_r2   <- cor(enet_pred, y_test)^2
enet_rmse <- RMSE(enet_pred, y_test)
cat(sprintf("Elastic Net — Test R^2: %.4f | RMSE: %.4f\n", enet_r2, enet_rmse))
## Elastic Net — Test R^2: 0.5021 | RMSE: 1.0816
compare_62 <- tibble(
  Model       = c("PLS", "Ridge", "Elastic Net"),
  `Test R²`   = round(c(test_r2, ridge_r2, enet_r2), 4),
  `Test RMSE` = round(c(test_rmse, ridge_rmse, enet_rmse), 4)
)

kable(compare_62, caption = "Model Comparison on Test Set")
Model Comparison on Test Set
Model Test R² Test RMSE
PLS 0.3922 1.1973
Ridge 0.3229 1.2375
Elastic Net 0.5021 1.0816

(f) Recommendation

best_model <- compare_62$Model[which.max(compare_62$`Test R²`)]
cat("Best performing model:", best_model, "\n")
## Best performing model: Elastic Net

Elastic net has the best test set performance (R² = 0.502, RMSE = 1.082), but we would not recommend replacing the laboratory permeability experiment with any of these models because:

  • An R^2 of 0.502 means the best model explains only ~50% of the variance in permeability and substantial prediction error remains.
  • The dataset is small (165 compounds, ~33 test samples), making the test set estimate unstable.
  • The drop from elastic net’s training CV performance to test performance suggests the model may not generalize reliably to novel chemical scaffolds.

However, these models could serve as a useful early-stage screening tool to flag low-permeability candidates before expensive lab testing, rather than as a full replacement.


Exercise 6.3 — Chemical Manufacturing Process Yield

Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch.


(a) Load the Data

data(ChemicalManufacturingProcess)
dim(ChemicalManufacturingProcess)
## [1] 176  58
names(ChemicalManufacturingProcess)[1:5]
## [1] "Yield"                "BiologicalMaterial01" "BiologicalMaterial02"
## [4] "BiologicalMaterial03" "BiologicalMaterial04"

The dataset contains 176 manufacturing runs with 57 predictors (12 biological material, 45 manufacturing process) and a Yield response.

# Separate response and predictors
yield  <- ChemicalManufacturingProcess$Yield
chem_x <- ChemicalManufacturingProcess %>% dplyr::select(-Yield)
cat("Response range:", round(min(yield), 2), "to", round(max(yield), 2), "\n")
## Response range: 35.25 to 46.34
cat("Missing values per predictor (top 10):\n")
## Missing values per predictor (top 10):
sort(colSums(is.na(chem_x)), decreasing = TRUE)[1:10]
## ManufacturingProcess03 ManufacturingProcess11 ManufacturingProcess10 
##                     15                     10                      9 
## ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27 
##                      5                      5                      5 
## ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30 
##                      5                      5                      5 
## ManufacturingProcess31 
##                      5

Yield ranges from 35.25 to 46.34% — a fairly narrow band, which means even small prediction errors matter economically given the $100k/batch revenue impact.

ManufacturingProcess03 has the most missing values (15 out of 176 runs = ~8.5%), which is notable but manageable with imputation.

Missing values are concentrated in the process predictors, not the biological ones — suggesting occasional sensor failures or recording gaps in the manufacturing process.


(b) Impute Missing Values

A small fraction of predictor cells are missing. We use k-nearest neighbors imputation via preProcess.

# Count missing before imputation
total_missing <- sum(is.na(chem_x))
pct_missing   <- round(100 * total_missing / prod(dim(chem_x)), 2)
cat(sprintf("Total missing cells: %d (%.2f%% of predictor matrix)\n",
            total_missing, pct_missing))
## Total missing cells: 106 (1.06% of predictor matrix)
# KNN imputation
impute_pp  <- preProcess(chem_x, method = "knnImpute")
chem_x_imp <- predict(impute_pp, chem_x)

cat("Missing values after imputation:", sum(is.na(chem_x_imp)), "\n")
## Missing values after imputation: 0

106 missing cells represent only 1.06% of the predictor matrix, so the missingness is minor and KNN imputation is appropriate. All missing values were successfully filled — 0 remain after imputation.


(c) Split, Pre-process, and Tune a Model

We split 80/20 and tune a Partial Least Squares (PLS) model via 10-fold CV. PLS is appropriate here given possible collinearity between process predictors.

set.seed(42)
train_idx2 <- createDataPartition(yield, p = 0.80, list = FALSE)

X_train2 <- chem_x_imp[train_idx2, ]
X_test2  <- chem_x_imp[-train_idx2, ]
y_train2 <- yield[train_idx2]
y_test2  <- yield[-train_idx2]

cat("Training set:", nrow(X_train2), "samples\n")
## Training set: 144 samples
cat("Test set:    ", nrow(X_test2),  "samples\n")
## Test set:     32 samples
ctrl2 <- trainControl(method = "cv", number = 10)

set.seed(42)
pls_fit2 <- train(
  x          = X_train2,
  y          = y_train2,
  method     = "pls",
  tuneGrid   = data.frame(ncomp = 1:15),
  trControl  = ctrl2,
  preProcess = c("center", "scale")
)

pls_fit2
## Partial Least Squares 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 129, 129, 130, 129, 130, 130, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     1.646603  0.3824172  1.238103
##    2     2.091968  0.4563824  1.271448
##    3     1.343115  0.5577216  1.020864
##    4     1.728039  0.5094557  1.154045
##    5     2.298369  0.4673390  1.313287
##    6     2.323376  0.4696938  1.321661
##    7     2.440565  0.4849592  1.352095
##    8     2.434288  0.4933797  1.356165
##    9     2.827671  0.4857789  1.493314
##   10     2.943715  0.4687688  1.529403
##   11     3.067121  0.4601724  1.563973
##   12     3.169587  0.4485196  1.609779
##   13     3.206828  0.4448435  1.633663
##   14     3.227088  0.4401647  1.655101
##   15     3.183676  0.4326873  1.642474
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
plot(pls_fit2, main = "PLS (Yield): CV RMSE vs. Number of Latent Variables")

The optimal PLS model uses 3 latent variables with a resampled RMSE of 1.343 and R^2 of 0.558 (selected by minimum RMSE). RMSE rises sharply beyond 3 components, indicating overfitting with additional latent variables.

best_ncomp2 <- pls_fit2$bestTune$ncomp
best_rmse2  <- min(pls_fit2$results$RMSE)
best_r2_2   <- pls_fit2$results$Rsquared[which.min(pls_fit2$results$RMSE)]

cat(sprintf("Optimal latent variables: %d\n", best_ncomp2))
## Optimal latent variables: 3
cat(sprintf("Resampled RMSE:           %.4f\n", best_rmse2))
## Resampled RMSE:           1.3431
cat(sprintf("Resampled R^2:             %.4f\n", best_r2_2))
## Resampled R^2:             0.5577

The optimal model uses 3 latent variable(s) with a resampled RMSE of 1.343 and R² of 0.558.


(d) Test Set Performance

pls_pred2  <- predict(pls_fit2, newdata = X_test2)
test_r2_2  <- cor(pls_pred2, y_test2)^2
test_rmse2 <- RMSE(pls_pred2, y_test2)

cat(sprintf("Test set R²:   %.4f\n", test_r2_2))
## Test set R²:   0.6866
cat(sprintf("Test set RMSE: %.4f\n", test_rmse2))
## Test set RMSE: 1.1288
tibble(Observed = y_test2, Predicted = pls_pred2) %>%
  ggplot(aes(Observed, Predicted)) +
  geom_point(color = "darkorange", alpha = 0.8) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(title = "PLS: Observed vs. Predicted Yield (Test Set)",
       subtitle = sprintf("R² = %.3f | RMSE = %.3f", test_r2_2, test_rmse2),
       x = "Observed Yield (%)",
       y = "Predicted Yield (%)") +
  theme_minimal()

The test set R^2 of 0.687 and RMSE of 1.129 are notably better than the resampled estimates (R² = 0.558, RMSE = 1.343). This is somewhat unusual — CV estimates are typically conservative — but can occur with small datasets where the random train/test split happened to place easier-to-predict compounds in the test set. The scatter plot shows a reasonable linear trend with some spread, particularly in the 37.5–40% yield range where the model tends to overpredict.

Comparison: The resampled (training) RMSE was 1.343 and the test set RMSE is 1.129. The test performance is consistent with the cross-validated estimate, suggesting the model generalizes reasonably.


(e) Variable Importance

vi <- varImp(pls_fit2, scale = TRUE)
plot(vi, top = 20, main = "Top 20 Predictors by Importance (PLS)")

# Tabulate top 20
vi_df <- vi$importance %>%
  rownames_to_column("Predictor") %>%
  arrange(desc(Overall)) %>%
  head(20)

# Annotate predictor type
vi_df <- vi_df %>%
  mutate(Type = ifelse(str_detect(Predictor, "^BiologicalMaterial"),
                       "Biological", "Process"))

kable(vi_df, caption = "Top 20 Predictors by Variable Importance",
      digits = 2)
Top 20 Predictors by Variable Importance
Predictor Overall Type
ManufacturingProcess32 100.00 Process
ManufacturingProcess09 90.23 Process
ManufacturingProcess13 84.70 Process
ManufacturingProcess36 82.46 Process
ManufacturingProcess17 81.80 Process
ManufacturingProcess06 60.17 Process
ManufacturingProcess11 56.87 Process
BiologicalMaterial02 56.79 Biological
BiologicalMaterial06 55.71 Biological
BiologicalMaterial03 55.21 Biological
ManufacturingProcess33 53.30 Process
ManufacturingProcess12 52.67 Process
BiologicalMaterial08 49.99 Biological
BiologicalMaterial12 47.60 Biological
ManufacturingProcess34 45.63 Process
BiologicalMaterial11 45.44 Biological
BiologicalMaterial01 45.30 Biological
BiologicalMaterial04 43.90 Biological
ManufacturingProcess04 39.25 Process
ManufacturingProcess28 36.67 Process
# Breakdown by type
type_counts <- vi_df %>% count(Type)
kable(type_counts, caption = "Top 20: Biological vs. Process Predictors")
Top 20: Biological vs. Process Predictors
Type n
Biological 8
Process 12

Process predictors dominate the top 20 (12 of 20), with ManufacturingProcess32 being the single most important predictor by a clear margin. However, biological material predictors are well represented (8 of 20), with BiologicalMaterial02, 06, and 03 all appearing in the top 10. This suggests that both raw material quality and process conditions meaningfully influence yield and that neither category can be ignored.


(f) Relationship Between Top Predictors and Yield

# Select top 6 predictors
top6 <- vi_df$Predictor[1:6]

# Reconstruct full (imputed) data frame with yield for training set
train_df2 <- as.data.frame(X_train2) %>%
  mutate(Yield = y_train2)

# Scatter plots for top 6 vs yield
train_df2 %>%
  dplyr::select(all_of(c(top6, "Yield"))) %>%
  pivot_longer(-Yield, names_to = "Predictor", values_to = "Value") %>%
  ggplot(aes(Value, Yield)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 0.8) +
  facet_wrap(~Predictor, scales = "free_x", ncol = 2) +
  labs(title = "Top 6 Predictors vs. Yield (Training Set)",
       x = "Predictor Value", y = "Yield (%)") +
  theme_minimal()

The scatter plots reveal actionable patterns for each top process predictor:

  • ManufacturingProcess32 (most important): clear positive relationship — higher values associate with higher yield. This is a prime candidate for process optimization; operators should aim to maximize this parameter.

  • ManufacturingProcess06: strong positive relationship with wide confidence interval at high values, suggesting yield gains are possible but variable at extreme settings.

  • ManufacturingProcess09: positive relationship but with considerable scatter, indicating other factors moderate its effect.

  • ManufacturingProcess13 and ManufacturingProcess17: both show negative relationships — higher values associate with lower yield. These are parameters to minimize or tightly control.

  • ManufacturingProcess36: appears largely discrete/categorical with a slight negative trend. The clustering of values suggests this may be a stepped or categorical process setting rather than truly continuous.

This information is directly actionable: increase Process32 and Process06, reduce Process13 and Process17, and investigate whether Process36 settings can be standardized to the higher-yield levels.


Session Info

sessionInfo()
## R version 4.5.0 (2025-04-11)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] glmnet_4.1-10                   Matrix_1.7-3                   
##  [3] knitr_1.50                      corrplot_0.95                  
##  [5] lubridate_1.9.4                 forcats_1.0.1                  
##  [7] stringr_1.5.2                   dplyr_1.1.4                    
##  [9] purrr_1.1.0                     readr_2.1.5                    
## [11] tidyr_1.3.1                     tibble_3.3.0                   
## [13] tidyverse_2.0.0                 pls_2.9-0                      
## [15] caret_7.0-1                     lattice_0.22-6                 
## [17] ggplot2_4.0.0.9000              AppliedPredictiveModeling_1.1-7
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4052.112    farver_2.1.2        
##  [4] S7_0.2.0             fastmap_1.2.0        RANN_2.6.2          
##  [7] pROC_1.19.0.1        digest_0.6.37        rpart_4.1.24        
## [10] timechange_0.3.0     lifecycle_1.0.4      cluster_2.1.8.1     
## [13] survival_3.8-3       magrittr_2.0.4       compiler_4.5.0      
## [16] rlang_1.1.6          sass_0.4.10          tools_4.5.0         
## [19] plotrix_3.8-14       yaml_2.3.10          data.table_1.17.8   
## [22] labeling_0.4.3       plyr_1.8.9           RColorBrewer_1.1-3  
## [25] withr_3.0.2          nnet_7.3-20          grid_4.5.0          
## [28] stats4_4.5.0         future_1.68.0        globals_0.18.0      
## [31] scales_1.4.0         iterators_1.0.14     MASS_7.3-65         
## [34] cli_3.6.5            ellipse_0.5.0        rmarkdown_2.30      
## [37] generics_0.1.4       rstudioapi_0.17.1    future.apply_1.20.1 
## [40] reshape2_1.4.4       tzdb_0.5.0           cachem_1.1.0        
## [43] splines_4.5.0        parallel_4.5.0       vctrs_0.6.5         
## [46] hardhat_1.4.2        jsonlite_2.0.0       hms_1.1.3           
## [49] listenv_0.10.0       foreach_1.5.2        gower_1.0.2         
## [52] jquerylib_0.1.4      recipes_1.3.1        glue_1.8.0          
## [55] parallelly_1.46.0    codetools_0.2-20     shape_1.4.6.1       
## [58] stringi_1.8.7        gtable_0.3.6         rpart.plot_3.1.4    
## [61] CORElearn_1.57.3.1   pillar_1.11.1        htmltools_0.5.8.1   
## [64] ipred_0.9-15         lava_1.8.2           R6_2.6.1            
## [67] evaluate_1.0.5       bslib_0.9.0          class_7.3-23        
## [70] Rcpp_1.1.0           nlme_3.1-168         prodlim_2025.04.28  
## [73] mgcv_1.9-1           xfun_0.53            pkgconfig_2.0.3     
## [76] ModelMetrics_1.2.2.2