library(AppliedPredictiveModeling)
library(caret) # nearZeroVar, createDataPartition, train, postResample, resamples
data(permeability)
dim(fingerprints)
## [1] 165 1107
length(permeability)
## [1] 165
The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response for each compound.
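Because the fingerprints are binary, many columns are dominated by a single value. A quick sparsity check (a minimal sketch in base R, assuming 0/1 coding) motivates the near-zero-variance filtering that follows.
# Overall fraction of 1s in the binary predictor matrix
mean(fingerprints)
# Per-column fraction of 1s; values near 0 or 1 signal near-zero variance
summary(colMeans(fingerprints))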
# Identify near-zero variance predictors
nzv <- nearZeroVar(fingerprints)
# How many were identified?
cat("\nNear-Zero Variance Predictors:", (length(nzv)))
##
## Near-Zero Variance Predictors: 719
# Remove them
filtered_fingerprints <- fingerprints[, -nzv]
# Count the remaining predictors
remaining <- ncol(filtered_fingerprints)
cat("\n",remaining,"predictors remain")
##
## 388 predictors remain
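To see why predictors were flagged, nearZeroVar can return its diagnostics instead of indices: freqRatio (frequency of the most common value relative to the second most common) and percentUnique. A short sketch using the same caret function:
# Inspect the near-zero-variance diagnostics for the flagged columns
nzv_metrics <- nearZeroVar(fingerprints, saveMetrics = TRUE)
head(nzv_metrics[nzv_metrics$nzv, ])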
set.seed(2000)
trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
train_fingerprints <- filtered_fingerprints[trainIndex, ]
test_fingerprints <- filtered_fingerprints[-trainIndex, ]
train_permeability <- permeability[trainIndex]
test_permeability <- permeability[-trainIndex]
ctrl <- trainControl(method = "cv", # 10-fold cross-validation
number = 10)
set.seed(2000)
pls_model <- train(
x = train_fingerprints,
y = train_permeability,
method = "pls",
preProcess = c("center", "scale"),
tuneLength = 20, # test up to 20 components
trControl = ctrl
)
pls_model
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 119, 119, 121, 120, 120, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.41795 0.2764716 10.120980
## 2 12.55694 0.3789597 9.074513
## 3 12.63942 0.3680981 9.318871
## 4 12.86336 0.3460122 9.401722
## 5 12.50244 0.3786661 9.065023
## 6 12.28430 0.3721996 9.117472
## 7 12.16283 0.3845695 9.152741
## 8 12.12427 0.3970405 9.327471
## 9 12.22012 0.4038059 9.295827
## 10 12.28955 0.3916603 9.448883
## 11 12.66676 0.3708377 9.689703
## 12 12.91660 0.3573279 9.969366
## 13 13.23584 0.3345804 10.317099
## 14 13.23736 0.3357853 10.305494
## 15 13.49081 0.3153397 10.500522
## 16 13.61978 0.3166509 10.566463
## 17 14.00326 0.3020118 10.773887
## 18 13.99029 0.3133309 10.765515
## 19 13.96125 0.3239681 10.840184
## 20 14.33791 0.3051138 11.222270
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 8.
The optimal number of latent variables is 8, with a corresponding resampled Rsquared of 0.397.
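The tuning profile makes this trade-off easier to see than the table: RMSE falls through roughly 8 components and then rises as extra components start fitting noise. caret provides a one-line plot method for train objects:
# Resampled RMSE across the 20 candidate component counts;
# the minimum at ncomp = 8 matches the table above
plot(pls_model)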
# Predict on the test set
pls_pred <- predict(pls_model, newdata = test_fingerprints)
pls_pred
## [1] 31.7198703 3.2490750 10.6774955 -3.1807912 -5.5216300 -3.4350936
## [7] 15.3769562 -0.6934557 0.1318053 2.0609708 1.0839686 1.2949157
## [13] 1.2615286 -4.3297064 0.1600624 1.5144468 28.5992554 34.0855017
## [19] 9.3385724 -11.0455351 5.7705793 7.3241894 24.8509832 28.1520296
## [25] 30.2086872 41.3329724 31.0212005 33.8246322 30.4643122 1.8458102
## [31] -10.7816562 0.1704149
# Compare predictions to actual permeability values
pls_results <- postResample(pred = pls_pred, obs = test_permeability)
pls_results
## RMSE Rsquared MAE
## 11.5713467 0.5327504 8.6921279
The test-set Rsquared is approximately 0.53, meaning the model explains about 53% of the variation in permeability for new, unseen compounds. That is respectable given the small sample size (n = 165) and the large number of predictors (388 after filtering). The RMSE (~11.57) and MAE (~8.69) summarize the average prediction error, giving a good sense of how close the model's predictions are to the actual values.
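Plotting observed against predicted values complements these summary statistics, and it also exposes artifacts such as the negative predictions visible above (permeability cannot be negative). A minimal sketch with lattice, which caret attaches:
# Observed vs predicted permeability on the test set
xyplot(test_permeability ~ pls_pred,
       xlab = "Predicted permeability", ylab = "Observed permeability",
       panel = function(...) {
         panel.xyplot(...)
         panel.abline(0, 1, lty = 2) # perfect-prediction line
       })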
# Filter sparse predictors (from part b)
nzv <- nearZeroVar(fingerprints)
filtered_fingerprints <- fingerprints[, -nzv]
set.seed(2000)
trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
train_x <- filtered_fingerprints[trainIndex, ]
test_x <- filtered_fingerprints[-trainIndex, ]
train_y <- permeability[trainIndex]
test_y <- permeability[-trainIndex]
ctrl <- trainControl(method = "cv", number = 10)
set.seed(2000)
pls_model <- train(
x = train_x, y = train_y,
method = "pls",
preProcess = c("center", "scale"),
tuneLength = 20,
trControl = ctrl
)
set.seed(2000)
pcr_model <- train(
x = train_x, y = train_y,
method = "pcr",
preProcess = c("center", "scale"),
tuneLength = 20,
trControl = ctrl
)
set.seed(2000)
ridge_model <- train(
x = train_x, y = train_y,
method = "ridge",
preProcess = c("center", "scale"),
tuneLength = 25,
trControl = ctrl
)
set.seed(2000)
lasso_model <- train(
x = train_x, y = train_y,
method = "glmnet",
preProcess = c("center", "scale"),
tuneGrid = expand.grid(alpha = 1, lambda = seq(0.0001, 1, length = 25)),
trControl = ctrl
)
set.seed(2000)
enet_model <- train(
x = train_x, y = train_y,
method = "glmnet",
preProcess = c("center", "scale"),
tuneGrid = expand.grid(alpha = seq(0, 1, length = 10),
lambda = seq(0.0001, 1, length = 10)),
trControl = ctrl
)
resamps <- resamples(list(
PLS = pls_model,
PCR = pcr_model,
Ridge = ridge_model,
Lasso = lasso_model,
ElasticNet = enet_model
))
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: PLS, PCR, Ridge, Lasso, ElasticNet
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## PLS 7.076393 7.970348 8.859370 9.327471 9.950452 14.50668 0
## PCR 6.159372 8.223635 9.433957 9.342160 10.271137 12.63368 0
## Ridge 6.213315 8.574332 10.309985 9.934395 10.874256 13.55892 0
## Lasso 6.526063 8.029437 8.976769 9.309936 9.434883 14.95368 0
## ElasticNet 6.061060 7.029357 9.030271 8.783810 9.430936 12.62041 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## PLS 9.063152 10.555886 12.17900 12.12427 13.01773 16.49118 0
## PCR 7.269815 11.605115 12.59395 12.69494 14.78929 16.08053 0
## Ridge 9.103980 11.372282 13.40596 12.95877 14.16245 15.86788 0
## Lasso 10.178398 10.667863 11.49465 12.44725 13.41017 19.32848 0
## ElasticNet 7.299224 9.935326 12.19719 12.07952 13.79525 16.41967 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## PLS 0.045191718 0.2377321 0.4744234 0.3970405 0.5461079 0.6409265 0
## PCR 0.079182195 0.1391263 0.2455999 0.3842431 0.6578316 0.8552646 0
## Ridge 0.081713899 0.2097779 0.3789017 0.3648712 0.4927643 0.7003389 0
## Lasso 0.003947468 0.1351255 0.3638928 0.3712542 0.6031675 0.7295333 0
## ElasticNet 0.124490845 0.1548822 0.2897370 0.3975842 0.6534233 0.8387160 0
bwplot(resamps, metric = "Rsquared")
After comparing the models, Partial Least Squares (PLS) and Elastic Net provided the strongest predictive performance: PLS achieved a mean cross-validated Rsquared of 0.397 and Elastic Net of 0.398, both with similar RMSE (~12).
Principal Component Regression (PCR) followed closely, showing slightly higher predictive power than Ridge regression, while Lasso regression lagged behind. Overall, Elastic Net provided the best balance between bias and variance, making it the most reliable choice for predicting compound permeability.
R2 measures how much of the variation in permeability the model can explain. With R2 values around 0.4, the best models capture only part of the real-world variability. This is decent for early screening, but not accurate enough to replace lab measurements. In scientific contexts, models usually need R2 values close to 0.9 and very low error rates before they’re trusted to stand in for experiments, especially for properties tied to safety and regulatory decisions.
So, while none of these models are accurate enough to replace the permeability lab experiment, the PLS and Elastic Net models show reasonable predictive power. They could be useful as pre-screening tools to help decide which compounds should move forward to lab testing first. Using them alongside lab experiments could save time and cost while still keeping results scientifically sound.
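The comparison above is based on the training-fold resamples; as a final check, each fitted model can also be scored on the held-out test set. A short sketch reusing the five model objects created above:
# Test-set RMSE, Rsquared, and MAE for each fitted model
models <- list(PLS = pls_model, PCR = pcr_model, Ridge = ridge_model,
               Lasso = lasso_model, ElasticNet = enet_model)
sapply(models, function(m) {
  postResample(pred = predict(m, newdata = test_x), obs = test_y)
})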
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
# Inspect it
dim(ChemicalManufacturingProcess)
## [1] 176 58
# Separate predictors and response
processPredictors <- ChemicalManufacturingProcess[, -1] # all columns except Yield
yield <- ChemicalManufacturingProcess$Yield
dim(processPredictors)
## [1] 176 57
The data frame processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the manufacturing process) for the 176 manufacturing runs, and yield contains the percent yield for each run.
# Check
dim(processPredictors)
## [1] 176 57
sum(is.na(processPredictors))
## [1] 106
# Impute missing values and standardize predictors
preProc <- preProcess(processPredictors,
method = c("knnImpute", "center", "scale"))
# Apply the transformation
imputedPredictors <- predict(preProc, processPredictors)
# Verify that no missing values remain
sum(is.na(imputedPredictors))
## [1] 0
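Before relying on the imputation, it helps to see where the 106 missing values sit; a short base-R sketch is below. Note that the train/test split that follows refits the preprocessing on the training data only, so no information from the test set leaks into the imputation.
# Count missing values per predictor and list the affected columns
na_counts <- colSums(is.na(processPredictors))
sort(na_counts[na_counts > 0], decreasing = TRUE)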
set.seed(2000)
# 80% training, 20% testing
trainIndex <- createDataPartition(yield, p = 0.8, list = FALSE)
train_x <- processPredictors[trainIndex, ]
test_x <- processPredictors[-trainIndex, ]
train_y <- yield[trainIndex]
test_y <- yield[-trainIndex]
preProc <- preProcess(train_x,
method = c("knnImpute", "center", "scale"))
train_pp <- predict(preProc, train_x)
test_pp <- predict(preProc, test_x)
set.seed(2000)
ctrl <- trainControl(method = "cv", number = 10)
pls_model <- train(
x = train_pp,
y = train_y,
method = "pls",
tuneLength = 20,
preProcess = NULL, # already pre-processed
trControl = ctrl
)
pls_model
## Partial Least Squares
##
## 144 samples
## 57 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 128, 130, 131, 129, 130, 130, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.420364 0.4679612 1.1670911
## 2 1.239961 0.5987144 1.0201621
## 3 1.197046 0.6199545 0.9892591
## 4 1.220618 0.6176468 1.0056464
## 5 1.243927 0.6078181 1.0152531
## 6 1.270206 0.5906142 1.0354282
## 7 1.278082 0.5900651 1.0501498
## 8 1.298981 0.5816187 1.0760832
## 9 1.317696 0.5807568 1.0960564
## 10 1.328232 0.5807916 1.1026338
## 11 1.349818 0.5787871 1.1064332
## 12 1.338006 0.5871866 1.1010177
## 13 1.356129 0.5762249 1.1175860
## 14 1.375862 0.5756547 1.1311661
## 15 1.384161 0.5762779 1.1356926
## 16 1.388906 0.5754148 1.1343138
## 17 1.383065 0.5811255 1.1255779
## 18 1.387202 0.5790426 1.1263117
## 19 1.395208 0.5730710 1.1306956
## 20 1.408497 0.5686318 1.1416159
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
The optimal model used 3 latent components and achieved a cross-validated Rsquared of 0.62 with an RMSE of 1.20. This indicates the model explains approximately 62% of the variation in product yield during resampling, with an average prediction error of about 1.2 yield percentage points.
# Predict product yield for the test set
pls_pred <- predict(pls_model, newdata = test_pp)
pls_test_results <- postResample(pred = pls_pred, obs = test_y)
pls_test_results
## RMSE Rsquared MAE
## 1.0907625 0.5698560 0.8573382
The PLS model achieved a cross-validated Rsquared=0.62 during training and a test-set Rsquared=0.57 with an RMSE of 1.09 and MAE of 0.86. The small drop in R2 and similar error values indicate that the model’s performance on new data closely matches its training results. Therefore, the model generalizes well and provides reliable predictions for product yield in new manufacturing runs.
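A side-by-side table makes this comparison explicit; a sketch using caret's getTrainPerf to pull the resampled metrics for the selected ncomp:
# Resampled (CV) vs test-set performance for the final PLS model
cv_perf <- getTrainPerf(pls_model)
rbind(
  CV   = c(RMSE = cv_perf$TrainRMSE, Rsquared = cv_perf$TrainRsquared,
           MAE = cv_perf$TrainMAE),
  Test = pls_test_results
)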
# Variable importance for the trained PLS model
pls_importance <- varImp(pls_model, scale = TRUE)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
plot(pls_importance, top = 20)
colnames(processPredictors)[1:12] # biological
## [1] "BiologicalMaterial01" "BiologicalMaterial02" "BiologicalMaterial03"
## [4] "BiologicalMaterial04" "BiologicalMaterial05" "BiologicalMaterial06"
## [7] "BiologicalMaterial07" "BiologicalMaterial08" "BiologicalMaterial09"
## [10] "BiologicalMaterial10" "BiologicalMaterial11" "BiologicalMaterial12"
colnames(processPredictors)[13:57] # process
## [1] "ManufacturingProcess01" "ManufacturingProcess02" "ManufacturingProcess03"
## [4] "ManufacturingProcess04" "ManufacturingProcess05" "ManufacturingProcess06"
## [7] "ManufacturingProcess07" "ManufacturingProcess08" "ManufacturingProcess09"
## [10] "ManufacturingProcess10" "ManufacturingProcess11" "ManufacturingProcess12"
## [13] "ManufacturingProcess13" "ManufacturingProcess14" "ManufacturingProcess15"
## [16] "ManufacturingProcess16" "ManufacturingProcess17" "ManufacturingProcess18"
## [19] "ManufacturingProcess19" "ManufacturingProcess20" "ManufacturingProcess21"
## [22] "ManufacturingProcess22" "ManufacturingProcess23" "ManufacturingProcess24"
## [25] "ManufacturingProcess25" "ManufacturingProcess26" "ManufacturingProcess27"
## [28] "ManufacturingProcess28" "ManufacturingProcess29" "ManufacturingProcess30"
## [31] "ManufacturingProcess31" "ManufacturingProcess32" "ManufacturingProcess33"
## [34] "ManufacturingProcess34" "ManufacturingProcess35" "ManufacturingProcess36"
## [37] "ManufacturingProcess37" "ManufacturingProcess38" "ManufacturingProcess39"
## [40] "ManufacturingProcess40" "ManufacturingProcess41" "ManufacturingProcess42"
## [43] "ManufacturingProcess43" "ManufacturingProcess44" "ManufacturingProcess45"
The most influential predictors in the model were primarily manufacturing process variables, particularly ManufacturingProcess32, 13, 17, 36, and 09. While several biological material variables contributed moderately, the process predictors clearly dominated the top-importance rankings. This suggests that process control parameters have the greatest impact on product yield, whereas biological characteristics play a secondary role in determining baseline performance.
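The importance plot shows the ranking visually; tabulating the top 20 predictors confirms the process-versus-biological split numerically. A sketch built on the varImp object above:
# Rank predictors by scaled importance and count types among the top 20
imp <- pls_importance$importance
imp$Predictor <- rownames(imp)
top20 <- imp[order(-imp$Overall), ][1:20, ]
table(ifelse(grepl("^Manufacturing", top20$Predictor),
             "Process", "Biological"))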
library(ggplot2)
# Add yield back to the preprocessed data
data_plot <- data.frame(Yield = train_y, train_pp)
# Top predictors from varImp
top_predictors <- c("ManufacturingProcess32", "ManufacturingProcess13",
"ManufacturingProcess17", "ManufacturingProcess36",
"ManufacturingProcess09")
# Plot yield vs each top predictor
# Plot yield vs each top predictor
# (aes_string() is deprecated; use tidy evaluation via .data[[var]])
for (var in top_predictors) {
  print(
    ggplot(data_plot, aes(x = .data[[var]], y = Yield)) +
      geom_point(alpha = 0.6, color = "steelblue") +
      geom_smooth(method = "lm", color = "darkred", se = FALSE) +
      labs(title = paste("Yield vs", var),
           x = var,
           y = "Product Yield") +
      theme_minimal()
  )
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Several of the top manufacturing process variables show clear, roughly linear relationships with product yield in these plots. Understanding these relationships helps engineers and scientists focus process optimization efforts where they will matter most.
In future manufacturing runs, these insights could guide targeted adjustments and experimental trials to identify optimal process settings, improving yield consistency and overall productivity without needing to test every variable blindly.
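As a starting point for such targeted adjustments, the sign of each top predictor's correlation with yield in the training data suggests the direction in which a setting might be moved; a minimal sketch using the plotting data frame built above:
# Correlation of each top predictor with yield (preprocessed training data);
# positive values suggest raising the setting, negative values lowering it
sapply(top_predictors, function(v) cor(data_plot[[v]], data_plot$Yield))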