Problem 6.2: Predicting Molecular Permeability

(a) Load Data

library(AppliedPredictiveModeling)
library(caret)
library(pls)
library(tidyverse)
data(permeability)

(b) Filter Low Frequency Predictors

nzv <- nearZeroVar(fingerprints)
filtered_fingerprints <- fingerprints[, -nzv]
num_predictors_left <- ncol(filtered_fingerprints)
num_predictors_left
## [1] 388

(c) Train/Test Split and PLS Tuning

set.seed(123)
trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
X_train <- filtered_fingerprints[trainIndex, ]
X_test <- filtered_fingerprints[-trainIndex, ]
y_train <- permeability[trainIndex]
y_test <- permeability[-trainIndex]

preproc <- preProcess(X_train)
X_train_pp <- predict(preproc, X_train)
X_test_pp <- predict(preproc, X_test)

ctrl <- trainControl(method = "cv", number = 10)
pls_model <- train(X_train_pp, y_train,
                   method = "pls",
                   tuneLength = 20,
                   trControl = ctrl,
                   metric = "Rsquared")
pls_model
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 121, 121, 118, 119, 119, 119, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.36940  0.3397706  10.315252
##    2     11.78929  0.4816147   8.575238
##    3     11.90097  0.4849485   9.105076
##    4     12.00362  0.4962113   9.407429
##    5     11.76339  0.5219280   9.024291
##    6     11.55813  0.5310320   8.646010
##    7     11.56404  0.5294165   8.761596
##    8     11.83939  0.5159611   9.222387
##    9     11.97478  0.5174448   9.199224
##   10     12.57694  0.4792439   9.678215
##   11     12.73620  0.4725734   9.714152
##   12     13.03145  0.4514959   9.968887
##   13     13.06366  0.4396748   9.834200
##   14     13.38089  0.4168034  10.057344
##   15     13.60993  0.4011096  10.200724
##   16     13.74283  0.3964039  10.201961
##   17     13.97149  0.3844652  10.412445
##   18     14.21149  0.3695810  10.612168
##   19     14.23455  0.3709200  10.593149
##   20     14.31085  0.3732814  10.675872
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 6.

(d) Test Set Prediction and R²

pls_pred <- predict(pls_model, X_test_pp)
test_R2 <- cor(pls_pred, y_test)^2
test_R2
## [1] 0.3244542

(e) Other Models

set.seed(123)
rf_model <- train(X_train_pp, y_train, method = "rf", trControl = ctrl, metric = "Rsquared")
svm_model <- train(X_train_pp, y_train, method = "svmRadial", trControl = ctrl, metric = "Rsquared")
rf_model
## Random Forest 
## 
## 133 samples
## 388 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 119, 118, 120, 121, 119, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##     2   11.99516  0.5315122  9.116588
##   195   10.42245  0.5809743  7.487867
##   388   10.59277  0.5611845  7.612958
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 195.
svm_model
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 133 samples
## 388 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 120, 119, 120, 120, 119, ... 
## Resampling results across tuning parameters:
## 
##   C     RMSE      Rsquared   MAE     
##   0.25  12.85182  0.5586969  8.422283
##   0.50  11.94997  0.5669842  7.962381
##   1.00  11.02712  0.5849114  7.547417
## 
## Tuning parameter 'sigma' was held constant at a value of 0.001587463
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.001587463 and C = 1.

(f) Recommendation

While the Random Forest and SVM models provide reasonable predictive power (with R^2 ≈ 0.58), they are not yet accurate enough to fully replace the permeability lab experiment.

Problem 6.3: Modeling Pharmaceutical Manufacturing Yield

(a) Load Data

library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
ls()
##  [1] "ChemicalManufacturingProcess" "ctrl"                        
##  [3] "filtered_fingerprints"        "fingerprints"                
##  [5] "num_predictors_left"          "nzv"                         
##  [7] "permeability"                 "pls_model"                   
##  [9] "pls_pred"                     "preproc"                     
## [11] "rf_model"                     "svm_model"                   
## [13] "test_R2"                      "trainIndex"                  
## [15] "X_test"                       "X_test_pp"                   
## [17] "X_train"                      "X_train_pp"                  
## [19] "y_test"                       "y_train"

(b) Handle Missing Values

str(ChemicalManufacturingProcess)
## 'data.frame':    176 obs. of  58 variables:
##  $ Yield                 : num  38 42.4 42 41.4 42.5 ...
##  $ BiologicalMaterial01  : num  6.25 8.01 8.01 8.01 7.47 6.12 7.48 6.94 6.94 6.94 ...
##  $ BiologicalMaterial02  : num  49.6 61 61 61 63.3 ...
##  $ BiologicalMaterial03  : num  57 67.5 67.5 67.5 72.2 ...
##  $ BiologicalMaterial04  : num  12.7 14.7 14.7 14.7 14 ...
##  $ BiologicalMaterial05  : num  19.5 19.4 19.4 19.4 17.9 ...
##  $ BiologicalMaterial06  : num  43.7 53.1 53.1 53.1 54.7 ...
##  $ BiologicalMaterial07  : num  100 100 100 100 100 100 100 100 100 100 ...
##  $ BiologicalMaterial08  : num  16.7 19 19 19 18.2 ...
##  $ BiologicalMaterial09  : num  11.4 12.6 12.6 12.6 12.8 ...
##  $ BiologicalMaterial10  : num  3.46 3.46 3.46 3.46 3.05 3.78 3.04 3.85 3.85 3.85 ...
##  $ BiologicalMaterial11  : num  138 154 154 154 148 ...
##  $ BiologicalMaterial12  : num  18.8 21.1 21.1 21.1 21.1 ...
##  $ ManufacturingProcess01: num  NA 0 0 0 10.7 12 11.5 12 12 12 ...
##  $ ManufacturingProcess02: num  NA 0 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess03: num  NA NA NA NA NA NA 1.56 1.55 1.56 1.55 ...
##  $ ManufacturingProcess04: num  NA 917 912 911 918 924 933 929 928 938 ...
##  $ ManufacturingProcess05: num  NA 1032 1004 1015 1028 ...
##  $ ManufacturingProcess06: num  NA 210 207 213 206 ...
##  $ ManufacturingProcess07: num  NA 177 178 177 178 178 177 178 177 177 ...
##  $ ManufacturingProcess08: num  NA 178 178 177 178 178 178 178 177 177 ...
##  $ ManufacturingProcess09: num  43 46.6 45.1 44.9 45 ...
##  $ ManufacturingProcess10: num  NA NA NA NA NA NA 11.6 10.2 9.7 10.1 ...
##  $ ManufacturingProcess11: num  NA NA NA NA NA NA 11.5 11.3 11.1 10.2 ...
##  $ ManufacturingProcess12: num  NA 0 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess13: num  35.5 34 34.8 34.8 34.6 34 32.4 33.6 33.9 34.3 ...
##  $ ManufacturingProcess14: num  4898 4869 4878 4897 4992 ...
##  $ ManufacturingProcess15: num  6108 6095 6087 6102 6233 ...
##  $ ManufacturingProcess16: num  4682 4617 4617 4635 4733 ...
##  $ ManufacturingProcess17: num  35.5 34 34.8 34.8 33.9 33.4 33.8 33.6 33.9 35.3 ...
##  $ ManufacturingProcess18: num  4865 4867 4877 4872 4886 ...
##  $ ManufacturingProcess19: num  6049 6097 6078 6073 6102 ...
##  $ ManufacturingProcess20: num  4665 4621 4621 4611 4659 ...
##  $ ManufacturingProcess21: num  0 0 0 0 -0.7 -0.6 1.4 0 0 1 ...
##  $ ManufacturingProcess22: num  NA 3 4 5 8 9 1 2 3 4 ...
##  $ ManufacturingProcess23: num  NA 0 1 2 4 1 1 2 3 1 ...
##  $ ManufacturingProcess24: num  NA 3 4 5 18 1 1 2 3 4 ...
##  $ ManufacturingProcess25: num  4873 4869 4897 4892 4930 ...
##  $ ManufacturingProcess26: num  6074 6107 6116 6111 6151 ...
##  $ ManufacturingProcess27: num  4685 4630 4637 4630 4684 ...
##  $ ManufacturingProcess28: num  10.7 11.2 11.1 11.1 11.3 11.4 11.2 11.1 11.3 11.4 ...
##  $ ManufacturingProcess29: num  21 21.4 21.3 21.3 21.6 21.7 21.2 21.2 21.5 21.7 ...
##  $ ManufacturingProcess30: num  9.9 9.9 9.4 9.4 9 10.1 11.2 10.9 10.5 9.8 ...
##  $ ManufacturingProcess31: num  69.1 68.7 69.3 69.3 69.4 68.2 67.6 67.9 68 68.5 ...
##  $ ManufacturingProcess32: num  156 169 173 171 171 173 159 161 160 164 ...
##  $ ManufacturingProcess33: num  66 66 66 68 70 70 65 65 65 66 ...
##  $ ManufacturingProcess34: num  2.4 2.6 2.6 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
##  $ ManufacturingProcess35: num  486 508 509 496 468 490 475 478 491 488 ...
##  $ ManufacturingProcess36: num  0.019 0.019 0.018 0.018 0.017 0.018 0.019 0.019 0.019 0.019 ...
##  $ ManufacturingProcess37: num  0.5 2 0.7 1.2 0.2 0.4 0.8 1 1.2 1.8 ...
##  $ ManufacturingProcess38: num  3 2 2 2 2 2 2 2 3 3 ...
##  $ ManufacturingProcess39: num  7.2 7.2 7.2 7.2 7.3 7.2 7.3 7.3 7.4 7.1 ...
##  $ ManufacturingProcess40: num  NA 0.1 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess41: num  NA 0.15 0 0 0 0 0 0 0 0 ...
##  $ ManufacturingProcess42: num  11.6 11.1 12 10.6 11 11.5 11.7 11.4 11.4 11.3 ...
##  $ ManufacturingProcess43: num  3 0.9 1 1.1 1.1 2.2 0.7 0.8 0.9 0.8 ...
##  $ ManufacturingProcess44: num  1.8 1.9 1.8 1.8 1.7 1.8 2 2 1.9 1.9 ...
##  $ ManufacturingProcess45: num  2.4 2.2 2.3 2.1 2.1 2 2.2 2.2 2.1 2.4 ...
processPredictors <- ChemicalManufacturingProcess[, -1]
yield <- ChemicalManufacturingProcess[, 1]
sum(is.na(processPredictors))
## [1] 106
preproc2 <- preProcess(processPredictors, method = "knnImpute")
processPredictors_imp <- predict(preproc2, processPredictors)

(c) Train/Test Split and Model Tuning

set.seed(123)
trainIndex2 <- createDataPartition(yield, p = 0.8, list = FALSE)
X_train2 <- processPredictors_imp[trainIndex2, ]
X_test2 <- processPredictors_imp[-trainIndex2, ]
y_train2 <- yield[trainIndex2]
y_test2 <- yield[-trainIndex2]

ctrl2 <- trainControl(method = "cv", number = 10)
ridge_model <- train(X_train2, y_train2,
                     method = "ridge",
                     preProcess = c("center", "scale"),
                     tuneLength = 10,
                     trControl = ctrl2,
                     metric = "Rsquared")
ridge_model
## Ridge Regression 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 131, 130, 130, 129, 131, 129, ... 
## Resampling results across tuning parameters:
## 
##   lambda        RMSE      Rsquared   MAE     
##   0.0000000000  6.742444  0.3775102  2.664958
##   0.0001000000  5.991244  0.3985266  2.427099
##   0.0002371374  5.392968  0.4254371  2.228818
##   0.0005623413  4.839187  0.4227875  2.092529
##   0.0013335214  4.477744  0.4089676  2.003610
##   0.0031622777  4.199031  0.4081150  1.918281
##   0.0074989421  3.842863  0.4184952  1.797845
##   0.0177827941  3.323832  0.4410475  1.626668
##   0.0421696503  2.716389  0.4781883  1.433809
##   0.1000000000  2.171541  0.5257725  1.263974
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was lambda = 0.1.

(d) Test Set Prediction and R²

ridge_pred <- predict(ridge_model, X_test2)
test_R2_chem <- cor(ridge_pred, y_test2)^2
test_R2_chem
## [1] 0.4997636

(e) Predictor Importance

importance <- varImp(ridge_model, scale = TRUE)
plot(importance, top = 15)

(f) Interpretation and Future Implications

top_predictors <- rownames(importance$importance)[order(importance$importance$Overall, decreasing = TRUE)][1:5]
top_predictors
## [1] "ManufacturingProcess32" "BiologicalMaterial06"   "BiologicalMaterial03"  
## [4] "ManufacturingProcess13" "ManufacturingProcess36"

The top five predictors most strongly associated with product yield were ManufacturingProcess32, BiologicalMaterial06, BiologicalMaterial03, ManufacturingProcess13, and ManufacturingProcess36. This suggests that both process parameters and raw material quality play important roles in determining yield. While biological predictors like BiologicalMaterial06 and BiologicalMaterial03 cannot be modified during production, they can be used to screen and select higher-quality raw materials. On the other hand, process variables such as ManufacturingProcess32 offer opportunities for optimization and control to enhance yield in future manufacturing runs.