library(fpp3)
library(dplyr)
library(ggplot2)
library(glmnet)
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:fabletools':
## 
##     MAE, RMSE
library(pls)
## Warning: package 'pls' was built under R version 4.4.3
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
library(glmnet)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(gbm)
## Warning: package 'gbm' was built under R version 4.4.3
## Loaded gbm 2.2.3
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
data(permeability)
set.seed(123)

6.2 Developing a Model to Predict Permeability

This exercise uses molecular fingerprint data to predict compound permeability. The predictor matrix contains 1,107 binary molecular descriptors for 165 compounds, and the response variable is permeability.

(a) Load the data

x <- as.data.frame(fingerprints)
y <- permeability

dim(x)
## [1]  165 1107
length(y)
## [1] 165

The predictor matrix contains 165 observations and 1,107 molecular fingerprint predictors.

(b) Filter out low-frequency predictors

Since fingerprint predictors are sparse, we remove predictors with very low information using nearZeroVar() from the caret package.

nzv <- nearZeroVar(x)
x_nzv <- x[, -nzv]

cat("Original number of predictors:", ncol(x), "\n")
## Original number of predictors: 1107
cat("Number removed by nearZeroVar:", length(nzv), "\n")
## Number removed by nearZeroVar: 719
cat("Number left for modeling:", ncol(x_nzv), "\n")
## Number left for modeling: 388

Answer: After filtering with nearZeroVar(), r ncol(x_nzv) predictors remain for modeling.

(c) Split the data, preprocess, and tune a PLS model

We split the data into training and test sets using an 80/20 split. A Partial Least Squares (PLS) model is tuned using repeated 10-fold cross-validation. The predictors are centered and scaled before fitting.

train_index <- createDataPartition(y, p = 0.8, list = FALSE)

x_train <- x_nzv[train_index, ]
x_test  <- x_nzv[-train_index, ]
y_train <- y[train_index]
y_test  <- y[-train_index]

ctrl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5
)

set.seed(123)
pls_fit <- train(
  x = x_train,
  y = y_train,
  method = "pls",
  tuneLength = 20,
  metric = "Rsquared",
  preProcess = c("center", "scale"),
  trControl = ctrl
)

pls_fit
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 120, 119, 118, 120, 121, 119, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.38797  0.3533888  10.284137
##    2     11.65145  0.5098159   8.372496
##    3     11.72077  0.5109591   8.966913
##    4     11.90051  0.4995055   9.132150
##    5     11.80999  0.5049593   8.903070
##    6     11.68016  0.5120501   8.649013
##    7     11.63782  0.5133375   8.909445
##    8     11.68993  0.5190079   8.989466
##    9     11.88654  0.5182558   9.115443
##   10     12.24348  0.4983417   9.271576
##   11     12.38183  0.4879627   9.416005
##   12     12.57818  0.4764429   9.612964
##   13     12.61936  0.4703638   9.565395
##   14     12.76579  0.4600944   9.666269
##   15     12.92812  0.4523409   9.828174
##   16     13.06989  0.4454467   9.926024
##   17     13.29583  0.4350569  10.069689
##   18     13.53772  0.4253171  10.271930
##   19     13.61403  0.4231635  10.326181
##   20     13.73461  0.4195437  10.411078
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 8.
plot(pls_fit)

best_pls_ncomp <- pls_fit$bestTune$ncomp
best_pls_r2 <- getTrainPerf(pls_fit)$TrainRsquared

cat("Best number of latent variables:", best_pls_ncomp, "\n")
## Best number of latent variables: 8
cat("Best resampled R^2:", best_pls_r2, "\n")
## Best resampled R^2: 0.5190079

Answer: The optimal number of latent variables is r best_pls_ncomp, with a resampled estimate of R² = r round(best_pls_r2, 4).

(d) Predict the response for the test set

We now use the tuned PLS model to predict permeability for the test set and compute the out-of-sample R².

pls_pred <- predict(pls_fit, x_test)
pls_perf <- postResample(pls_pred, y_test)
pls_test_r2 <- cor(pls_pred, y_test)^2

pls_perf
##       RMSE   Rsquared        MAE 
## 11.8439471  0.3729547  8.4698200
cat("PLS test set R^2:", pls_test_r2, "\n")
## PLS test set R^2: 0.3729547

Answer: The test set estimate of R² for the PLS model is r round(pls_test_r2, 4).

(e) Try other models and compare performance

In addition to PLS, we fit several other models :

Elastic Net Random Forest Gradient Boosting Machine (GBM) Support Vector Machine with Radial Basis Kernel (SVM Radial)

ctrl <- trainControl(
  method = "cv",
  number = 3
)

set.seed(123)
glmnet_fit <- train(
  x = x_train,
  y = y_train,
  method = "glmnet",
  metric = "Rsquared",
  preProcess = c("center", "scale"),
  tuneLength = 3,
  trControl = ctrl
)
set.seed(123)

rf_fit <- train(
  x = x_train,
  y = y_train,
  method = "rf",
  metric = "Rsquared",
  tuneLength = 3,              
  trControl = ctrl,
  importance = TRUE,
  ntree = 150                  
)

rf_fit
## Random Forest 
## 
## 133 samples
## 388 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 89, 89, 88 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##     2   12.35774  0.4648536  9.256543
##   195   11.34319  0.5267624  7.930896
##   388   11.38599  0.5255588  7.984449
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 195.
rf_pred <- predict(rf_fit, x_test)
rf_test_r2 <- cor(rf_pred, y_test)^2


# -----------------------------
# GBM
# -----------------------------
set.seed(123)
gbm_fit <- train(
  x = x_train,
  y = y_train,
  method = "gbm",
  metric = "Rsquared",
  tuneLength = 10,
  verbose = FALSE,
  trControl = ctrl
)

gbm_pred <- predict(gbm_fit, x_test)
gbm_test_r2 <- cor(gbm_pred, y_test)^2


# -----------------------------
# SVM Radial
# -----------------------------
set.seed(123)
svm_fit <- train(
  x = x_train,
  y = y_train,
  method = "svmRadial",
  metric = "Rsquared",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = ctrl
)
svm_pred <- predict(svm_fit, x_test)
svm_test_r2 <- cor(svm_pred, y_test)^2
glmnet_pred <- predict(glmnet_fit, x_test)
glmnet_test_r2 <- cor(glmnet_pred, y_test)^2

glmnet_test_r2
## [1] 0.3581228
# -----------------------------
# Model Comparison
# -----------------------------
results <- data.frame(
  Model = c("PLS", "GLMNET", "Random Forest", "GBM", "SVM Radial"),
  Test_R2 = round(c(pls_test_r2, glmnet_test_r2, rf_test_r2, gbm_test_r2, svm_test_r2), 4)
)

results <- results[order(-results$Test_R2), ]

results

Answer: The table above compares the test set R² values across models. The model with the highest test-set R² is the best-performing model for this dataset.

(f) Recommendation

A predictive model for permeability could potentially save time and money by screening out weak compounds before laboratory testing. However, I would not recommend fully replacing the laboratory experiment with any model developed here unless the predictive accuracy were extremely high and validated on new external data.

A more practical recommendation would be to use the model as a screening tool:

prioritize compounds with favorable predicted permeability, reduce the number of molecules sent for lab testing, and keep laboratory testing as the final decision step.

Thus, these models can be valuable for decision support, but they should supplement rather than replace the experimental assay.

##Conclusion

In this exercise, sparse molecular fingerprint predictors were filtered using nearZeroVar(), then several predictive models were trained and evaluated for permeability prediction. PLS provided a reasonable baseline after tuning the number of latent variables, while other models such as Random Forest, GBM, Elastic Net, and SVM were also compared on the same test data. The final recommendation depends on the model with the best out-of-sample R², but in general, the models are more appropriate for early-stage screening than for full replacement of laboratory experiments.

6.3 6.3 Understanding Product Yield in a Chemical Manufacturing Process

The goal of this exercise is to model product yield using both biological predictors and manufacturing process predictors. Biological predictors describe the raw material and cannot be changed, while process predictors can potentially be adjusted to improve yield.

(a) Load the data

library(AppliedPredictiveModeling)
library(caret)
library(gbm)
library(randomForest)
library(ggplot2)
library(dplyr)

set.seed(123)
data(ChemicalManufacturingProcess)

df <- ChemicalManufacturingProcess

# Split predictors and response
x <- df[, colnames(df) != "Yield"]
y <- df$Yield

dim(x)
## [1] 176  57
length(y)
## [1] 176

B Impute missing values

# Check missing values
sum(is.na(x))
## [1] 106
# Preprocess (impute + scale)
pp <- preProcess(x, method = c("medianImpute", "center", "scale"))

x_pp <- predict(pp, x)

Missing values were handled using median imputation, and the predictors were standardized through centering and scaling.

  1. Train/test split and model (PLS)
# Train/test split
set.seed(123)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)

x_train <- x_pp[train_index, ]
x_test  <- x_pp[-train_index, ]
y_train <- y[train_index]
y_test  <- y[-train_index]
ctrl <- trainControl(method = "cv", number = 5)

set.seed(123)
pls_fit <- train(
  x = x_train,
  y = y_train,
  method = "pls",
  tuneLength = 10,
  trControl = ctrl
)

pls_fit
## Partial Least Squares 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 116, 116, 115, 116, 113 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     1.374009  0.4663943  1.1259596
##    2     1.166388  0.5931561  0.9462169
##    3     1.172749  0.6092352  0.9587660
##    4     1.170526  0.6215004  0.9611829
##    5     1.187658  0.6086952  0.9769572
##    6     1.212169  0.5985403  0.9892013
##    7     1.236181  0.5883427  1.0065309
##    8     1.303887  0.5625551  1.0423878
##    9     1.344552  0.5525198  1.0600339
##   10     1.435224  0.5191705  1.1108164
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 2.

#Test performance

pls_pred <- predict(pls_fit, x_test)
postResample(pls_pred, y_test)
##      RMSE  Rsquared       MAE 
## 1.3026967 0.5070011 1.1236997

(d)Predict on test set

### (d) Predict on the test set

pls_pred <- predict(pls_fit, x_test)

test_perf <- postResample(pls_pred, y_test)
test_perf
##      RMSE  Rsquared       MAE 
## 1.3026967 0.5070011 1.1236997
getTrainPerf(pls_fit)

The test-set performance is comparable to the resampled training performance, indicating that the model generalizes reasonably well and is not overfitting.

(e)Variable importance

### (e) Variable importance

pls_imp <- varImp(pls_fit)

imp_df <- pls_imp$importance
imp_df$Predictor <- rownames(imp_df)

top20 <- imp_df %>%
  arrange(desc(Overall)) %>%
  slice(1:20)

top20
plot(pls_imp, top = 20)

Variable importance analysis shows that manufacturing process predictors dominate the most influential variables, indicating they play a larger role in determining yield than biological factors.

(f)Relationships between top predictors and yield

# Top 5 predictors
top5_names <- top20$Predictor[1:5]
top5_names
## [1] "ManufacturingProcess32" "ManufacturingProcess13" "ManufacturingProcess09"
## [4] "ManufacturingProcess17" "ManufacturingProcess36"
# Plot relationships
par(mfrow = c(2, 3))

for (v in top5_names) {
  plot(df[[v]], df$Yield,
       xlab = v,
       ylab = "Yield",
       main = paste("Yield vs", v),
       pch = 19, col = "blue")
  
  abline(lm(df$Yield ~ df[[v]]), col = "red", lwd = 2)
}

par(mfrow = c(1, 1))

The relationships between top predictors and yield suggest that adjusting key process variables can improve output, while biological variables help in screening raw material quality before production.