library(fpp3)
library(dplyr)
library(ggplot2)
library(glmnet)
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:fabletools':
##
## MAE, RMSE
library(pls)
## Warning: package 'pls' was built under R version 4.4.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
library(glmnet)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(gbm)
## Warning: package 'gbm' was built under R version 4.4.3
## Loaded gbm 2.2.3
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
data(permeability)
set.seed(123)
This exercise uses molecular fingerprint data to predict compound permeability. The predictor matrix contains 1,107 binary molecular descriptors for 165 compounds, and the response variable is permeability.
x <- as.data.frame(fingerprints)
y <- permeability
dim(x)
## [1] 165 1107
length(y)
## [1] 165
The predictor matrix contains 165 observations and 1,107 molecular fingerprint predictors.
Since fingerprint predictors are sparse, we remove predictors with very low information using nearZeroVar() from the caret package.
nzv <- nearZeroVar(x)
x_nzv <- x[, -nzv]
cat("Original number of predictors:", ncol(x), "\n")
## Original number of predictors: 1107
cat("Number removed by nearZeroVar:", length(nzv), "\n")
## Number removed by nearZeroVar: 719
cat("Number left for modeling:", ncol(x_nzv), "\n")
## Number left for modeling: 388
Answer: After filtering with nearZeroVar(), r ncol(x_nzv) predictors remain for modeling.
We split the data into training and test sets using an 80/20 split. A Partial Least Squares (PLS) model is tuned using repeated 10-fold cross-validation. The predictors are centered and scaled before fitting.
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
x_train <- x_nzv[train_index, ]
x_test <- x_nzv[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]
ctrl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 5
)
set.seed(123)
pls_fit <- train(
x = x_train,
y = y_train,
method = "pls",
tuneLength = 20,
metric = "Rsquared",
preProcess = c("center", "scale"),
trControl = ctrl
)
pls_fit
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 120, 119, 118, 120, 121, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.38797 0.3533888 10.284137
## 2 11.65145 0.5098159 8.372496
## 3 11.72077 0.5109591 8.966913
## 4 11.90051 0.4995055 9.132150
## 5 11.80999 0.5049593 8.903070
## 6 11.68016 0.5120501 8.649013
## 7 11.63782 0.5133375 8.909445
## 8 11.68993 0.5190079 8.989466
## 9 11.88654 0.5182558 9.115443
## 10 12.24348 0.4983417 9.271576
## 11 12.38183 0.4879627 9.416005
## 12 12.57818 0.4764429 9.612964
## 13 12.61936 0.4703638 9.565395
## 14 12.76579 0.4600944 9.666269
## 15 12.92812 0.4523409 9.828174
## 16 13.06989 0.4454467 9.926024
## 17 13.29583 0.4350569 10.069689
## 18 13.53772 0.4253171 10.271930
## 19 13.61403 0.4231635 10.326181
## 20 13.73461 0.4195437 10.411078
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 8.
plot(pls_fit)
best_pls_ncomp <- pls_fit$bestTune$ncomp
best_pls_r2 <- getTrainPerf(pls_fit)$TrainRsquared
cat("Best number of latent variables:", best_pls_ncomp, "\n")
## Best number of latent variables: 8
cat("Best resampled R^2:", best_pls_r2, "\n")
## Best resampled R^2: 0.5190079
We now use the tuned PLS model to predict permeability for the test set and compute the out-of-sample R².
pls_pred <- predict(pls_fit, x_test)
pls_perf <- postResample(pls_pred, y_test)
pls_test_r2 <- cor(pls_pred, y_test)^2
pls_perf
## RMSE Rsquared MAE
## 11.8439471 0.3729547 8.4698200
cat("PLS test set R^2:", pls_test_r2, "\n")
## PLS test set R^2: 0.3729547
Answer: The test set estimate of R² for the PLS model is r round(pls_test_r2, 4).
In addition to PLS, we fit several other models :
Elastic Net Random Forest Gradient Boosting Machine (GBM) Support Vector Machine with Radial Basis Kernel (SVM Radial)
ctrl <- trainControl(
method = "cv",
number = 3
)
set.seed(123)
glmnet_fit <- train(
x = x_train,
y = y_train,
method = "glmnet",
metric = "Rsquared",
preProcess = c("center", "scale"),
tuneLength = 3,
trControl = ctrl
)
set.seed(123)
rf_fit <- train(
x = x_train,
y = y_train,
method = "rf",
metric = "Rsquared",
tuneLength = 3,
trControl = ctrl,
importance = TRUE,
ntree = 150
)
rf_fit
## Random Forest
##
## 133 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 89, 89, 88
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 12.35774 0.4648536 9.256543
## 195 11.34319 0.5267624 7.930896
## 388 11.38599 0.5255588 7.984449
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 195.
rf_pred <- predict(rf_fit, x_test)
rf_test_r2 <- cor(rf_pred, y_test)^2
# -----------------------------
# GBM
# -----------------------------
set.seed(123)
gbm_fit <- train(
x = x_train,
y = y_train,
method = "gbm",
metric = "Rsquared",
tuneLength = 10,
verbose = FALSE,
trControl = ctrl
)
gbm_pred <- predict(gbm_fit, x_test)
gbm_test_r2 <- cor(gbm_pred, y_test)^2
# -----------------------------
# SVM Radial
# -----------------------------
set.seed(123)
svm_fit <- train(
x = x_train,
y = y_train,
method = "svmRadial",
metric = "Rsquared",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = ctrl
)
svm_pred <- predict(svm_fit, x_test)
svm_test_r2 <- cor(svm_pred, y_test)^2
glmnet_pred <- predict(glmnet_fit, x_test)
glmnet_test_r2 <- cor(glmnet_pred, y_test)^2
glmnet_test_r2
## [1] 0.3581228
# -----------------------------
# Model Comparison
# -----------------------------
results <- data.frame(
Model = c("PLS", "GLMNET", "Random Forest", "GBM", "SVM Radial"),
Test_R2 = round(c(pls_test_r2, glmnet_test_r2, rf_test_r2, gbm_test_r2, svm_test_r2), 4)
)
results <- results[order(-results$Test_R2), ]
results
Answer: The table above compares the test set R² values across models. The model with the highest test-set R² is the best-performing model for this dataset.
A predictive model for permeability could potentially save time and money by screening out weak compounds before laboratory testing. However, I would not recommend fully replacing the laboratory experiment with any model developed here unless the predictive accuracy were extremely high and validated on new external data.
A more practical recommendation would be to use the model as a screening tool:
prioritize compounds with favorable predicted permeability, reduce the number of molecules sent for lab testing, and keep laboratory testing as the final decision step.
Thus, these models can be valuable for decision support, but they should supplement rather than replace the experimental assay.
##Conclusion
In this exercise, sparse molecular fingerprint predictors were filtered using nearZeroVar(), then several predictive models were trained and evaluated for permeability prediction. PLS provided a reasonable baseline after tuning the number of latent variables, while other models such as Random Forest, GBM, Elastic Net, and SVM were also compared on the same test data. The final recommendation depends on the model with the best out-of-sample R², but in general, the models are more appropriate for early-stage screening than for full replacement of laboratory experiments.
The goal of this exercise is to model product yield using both biological predictors and manufacturing process predictors. Biological predictors describe the raw material and cannot be changed, while process predictors can potentially be adjusted to improve yield.
library(AppliedPredictiveModeling)
library(caret)
library(gbm)
library(randomForest)
library(ggplot2)
library(dplyr)
set.seed(123)
data(ChemicalManufacturingProcess)
df <- ChemicalManufacturingProcess
# Split predictors and response
x <- df[, colnames(df) != "Yield"]
y <- df$Yield
dim(x)
## [1] 176 57
length(y)
## [1] 176
# Check missing values
sum(is.na(x))
## [1] 106
# Preprocess (impute + scale)
pp <- preProcess(x, method = c("medianImpute", "center", "scale"))
x_pp <- predict(pp, x)
Missing values were handled using median imputation, and the predictors were standardized through centering and scaling.
# Train/test split
set.seed(123)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
x_train <- x_pp[train_index, ]
x_test <- x_pp[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]
ctrl <- trainControl(method = "cv", number = 5)
set.seed(123)
pls_fit <- train(
x = x_train,
y = y_train,
method = "pls",
tuneLength = 10,
trControl = ctrl
)
pls_fit
## Partial Least Squares
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 116, 116, 115, 116, 113
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.374009 0.4663943 1.1259596
## 2 1.166388 0.5931561 0.9462169
## 3 1.172749 0.6092352 0.9587660
## 4 1.170526 0.6215004 0.9611829
## 5 1.187658 0.6086952 0.9769572
## 6 1.212169 0.5985403 0.9892013
## 7 1.236181 0.5883427 1.0065309
## 8 1.303887 0.5625551 1.0423878
## 9 1.344552 0.5525198 1.0600339
## 10 1.435224 0.5191705 1.1108164
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 2.
#Test performance
pls_pred <- predict(pls_fit, x_test)
postResample(pls_pred, y_test)
## RMSE Rsquared MAE
## 1.3026967 0.5070011 1.1236997
### (d) Predict on the test set
pls_pred <- predict(pls_fit, x_test)
test_perf <- postResample(pls_pred, y_test)
test_perf
## RMSE Rsquared MAE
## 1.3026967 0.5070011 1.1236997
getTrainPerf(pls_fit)
The test-set performance is comparable to the resampled training performance, indicating that the model generalizes reasonably well and is not overfitting.
### (e) Variable importance
pls_imp <- varImp(pls_fit)
imp_df <- pls_imp$importance
imp_df$Predictor <- rownames(imp_df)
top20 <- imp_df %>%
arrange(desc(Overall)) %>%
slice(1:20)
top20
plot(pls_imp, top = 20)
Variable importance analysis shows that manufacturing process predictors dominate the most influential variables, indicating they play a larger role in determining yield than biological factors.
# Top 5 predictors
top5_names <- top20$Predictor[1:5]
top5_names
## [1] "ManufacturingProcess32" "ManufacturingProcess13" "ManufacturingProcess09"
## [4] "ManufacturingProcess17" "ManufacturingProcess36"
# Plot relationships
par(mfrow = c(2, 3))
for (v in top5_names) {
plot(df[[v]], df$Yield,
xlab = v,
ylab = "Yield",
main = paste("Yield vs", v),
pch = 19, col = "blue")
abline(lm(df$Yield ~ df[[v]]), col = "red", lwd = 2)
}
par(mfrow = c(1, 1))
The relationships between top predictors and yield suggest that
adjusting key process variables can improve output, while biological
variables help in screening raw material quality before production.