# libraries
library(AppliedPredictiveModeling)
library(caret)
library(pls)
library(RANN)
library(glmnet)
library(ggplot2)

Exercise 6.2

Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:

  (a) Start R and use these commands to load the data: library(AppliedPredictiveModeling) data(permeability)

The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.

  (b) The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

  (c) Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?

  (d) Predict the response for the test set. What is the test set estimate of R2?

  (e) Try building other models discussed in this chapter. Do any have better predictive performance?

  (f) Would you recommend any of your models to replace the permeability laboratory experiment?

(a) Start R and use these commands to load the data

# loading the data
data(permeability)

dim(fingerprints)
## [1]  165 1107
length(permeability)
## [1] 165

(b) Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

nzv <- nearZeroVar(fingerprints)

filtered_fingerprints <- fingerprints[, -nzv]

n_remaining <- ncol(filtered_fingerprints)
n_remaining
## [1] 388
#summary(filtered_fingerprints)

388 predictors remain.
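
For context, nearZeroVar() flags predictors whose most frequent value is overwhelmingly dominant (default freqCut = 95/5) and whose percentage of unique values is low (default uniqueCut = 10). If you want to inspect those metrics directly, a quick optional check:

# Optional: look at the frequency-ratio and percent-unique metrics behind the filter
nzv_metrics <- nearZeroVar(fingerprints, saveMetrics = TRUE)
head(nzv_metrics[order(-nzv_metrics$freqRatio), ])  # most imbalanced predictors
table(nzv_metrics$nzv)                              # TRUE = flagged as near-zero variance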

(c) Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?

set.seed(123) 

# train/test split
trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
X_train <- filtered_fingerprints[trainIndex, ]
X_test <- filtered_fingerprints[-trainIndex, ]
y_train <- permeability[trainIndex]
y_test <- permeability[-trainIndex]

# training control
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

# PLS model
pls_fit <- train(
  x = X_train, y = y_train,
  method = "pls",
  preProc = c("center", "scale"), 
  trControl = ctrl,
  tuneLength = 20,
  metric = "Rsquared")

# RMSE and R2 vs number of components
plot(pls_fit)

# Best number of components
best_ncomp <- pls_fit$bestTune$ncomp
best_ncomp
## [1] 6
# Best cross-validated R2
best_cv_R2 <- max(pls_fit$results$Rsquared)
best_cv_R2
## [1] 0.5307411

The best number of components was 6, with a repeated-CV R2 of 0.5307. The plot suggests overfitting sets in beyond roughly 6–8 components, with resampled performance dropping steadily after that.

PLS explains a moderate share of the variance (about 53% in CV), but it may not capture non-linear structure in the fingerprints, which is not surprising: that is a known limitation of linear methods such as PLS.
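
For completeness, the resampled RMSE and R2 at the chosen number of components can be read straight from the results table (a small cross-check on the numbers above):

# Resampled performance at the selected number of latent variables
subset(pls_fit$results, ncomp == best_ncomp)[, c("ncomp", "RMSE", "Rsquared", "MAE")]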

(d) Predict the response for the test set. What is the test set estimate of R2?

# Predict on test set
pls_pred <- predict(pls_fit, newdata = X_test)

# test set R2
pls_R2 <- cor(pls_pred, y_test)^2
pls_RMSE <- sqrt(mean((pls_pred - y_test)^2))
pls_R2
## [1] 0.3244542
pls_RMSE
## [1] 12.34869

The test-set R2 is 0.324 and the RMSE is 12.35. The drop from 0.53 (resampled) to 0.32 (test) points to some optimism in the cross-validation estimate and to limited generalizability of the PLS model on unseen data.
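
As a cross-check, caret's postResample() returns RMSE, R2, and MAE in one call and should agree (up to rounding) with the manual calculations above:

# Test-set RMSE, R2, and MAE in a single call
postResample(pred = pls_pred, obs = y_test)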

(e) Try building other models.

# Lasso
set.seed(123)
lasso_fit <- train(
  x = X_train, y = y_train,
  method = "glmnet",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = expand.grid(
    alpha = 1,
    lambda = 10^seq(-4, 1, length = 100)))
lasso_pred <- predict(lasso_fit, newdata = X_test)
lasso_R2 <- cor(lasso_pred, y_test)^2


# Ridge
set.seed(123)
ridge_fit <- train(
  x = X_train, y = y_train,
  method = "glmnet",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = 10^seq(-4, 1, length = 100)))
ridge_pred <- predict(ridge_fit, newdata = X_test)
ridge_R2 <- cor(ridge_pred, y_test)^2


# Elastic Net
set.seed(123)
enet_fit <- train(
  x = X_train, y = y_train,
  method = "glmnet",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneLength = 10)
enet_pred <- predict(enet_fit, newdata = X_test)
enet_R2 <- cor(enet_pred, y_test)^2

# I'll add a few more out of curiosity:

# Random Forest
set.seed(123)
rf_fit <- train(
  x = X_train, y = y_train,
  method = "rf",
  trControl = ctrl,
  importance = TRUE)
rf_pred <- predict(rf_fit, X_test)
rf_R2 <- cor(rf_pred, y_test)^2

# SVM
set.seed(123)
svm_fit <- train(
  x = X_train, y = y_train,
  method = "svmRadial",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneLength = 10)
svm_pred <- predict(svm_fit, X_test)
svm_R2 <- cor(svm_pred, y_test)^2

# Compare models
model_comparison <- data.frame(
  Model = c("PLS", "Random Forest", "SVM", "Lasso", "Ridge", "Elastic Net"),
  R2_Test = c(pls_R2, rf_R2, svm_R2, lasso_R2, ridge_R2, enet_R2))
#model_comparison[order(-model_comparison$R2_Test), ]
model_rmse <- data.frame(
  Model = c("PLS", "Random Forest", "SVM", "Lasso", "Ridge", "Elastic Net"),
  RMSE_Test = c(
    sqrt(mean((pls_pred - y_test)^2)),
    sqrt(mean((rf_pred - y_test)^2)),
    sqrt(mean((svm_pred - y_test)^2)),
    sqrt(mean((lasso_pred - y_test)^2)),
    sqrt(mean((ridge_pred - y_test)^2)),
    sqrt(mean((enet_pred - y_test)^2))))
#model_rmse[order(model_rmse$RMSE_Test), ]

model_summary <- merge(model_comparison, model_rmse, by = "Model")
model_summary <- model_summary[order(-model_summary$R2_Test), ]
model_summary
##           Model   R2_Test RMSE_Test
## 6           SVM 0.5038552  10.29432
## 4 Random Forest 0.4929775  10.08085
## 2         Lasso 0.3888176  10.99783
## 1   Elastic Net 0.3655409  11.05010
## 5         Ridge 0.3266984  11.02039
## 3           PLS 0.3244542  12.34869

Notes:

  • The top-performing models (SVM and Random Forest) explain about 50% of the variance in test-set permeability.
  • All models show test RMSEs of roughly 10–12 permeability units; on average, the predictions deviate from the true permeability by about ±10 units (a resampling-based comparison of the fits is sketched below).
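
Because the test set holds only about 30 compounds, these single-split numbers are noisy. A resampling-based comparison with caret's resamples() is a useful complement; a minimal sketch, assuming all the fitted objects above are still in scope:

# Compare the cross-validated performance distributions across models
resamps <- resamples(list(
  PLS = pls_fit, Lasso = lasso_fit, Ridge = ridge_fit,
  ENet = enet_fit, RF = rf_fit, SVM = svm_fit))
summary(resamps)                      # resampled RMSE / R2 / MAE per model
bwplot(resamps, metric = "Rsquared")  # side-by-side box plots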

(f) Would you recommend any of your models to replace the permeability laboratory experiment?

Based on the models’ performance:

  • The best model (SVM) explains ~50% of the variance in test-set permeability. That is moderately predictive, but not accurate enough to completely replace the laboratory permeability assay, especially in high-stakes applications (e.g., drug approval).
  • These models could, however, be useful as a screening tool to prioritize compounds before running the assay.

===============================================================================

Exercise 6.3

6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:

  (a) Start R and use these commands to load the data: library(AppliedPredictiveModeling) data(chemicalManufacturingProcess) The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

  (b) A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).

  (c) Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

  (d) Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

  (e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

  (f) Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?

(a) Start R and use these commands to load the data

# fixing an error with data(chemicalManufacturingProcess): load the .RData file from the package directly
load(system.file("data", "chemicalManufacturingProcess.RData", package = "AppliedPredictiveModeling"))
# str(ChemicalManufacturingProcess)
# ls()


processPredictors <- ChemicalManufacturingProcess[, -1]
yield <- ChemicalManufacturingProcess$Yield


dim(processPredictors)
## [1] 176  57
length(yield) 
## [1] 176

(b) Use an imputation function to fill in the missing values

sum(is.na(processPredictors))
## [1] 106
preProc <- preProcess(processPredictors, method = c("knnImpute", "center", "scale"))

# Impute and scale
processed_data <- predict(preProc, processPredictors)

# Check
sum(is.na(processed_data))
## [1] 0

There were 106 missing values; they were filled in with k-nearest-neighbors imputation, and the predictors were centered and scaled in the same pre-processing step.
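
Optionally, one can also check which predictors carry the missing values before imputing; a quick look with base R:

# Number of missing values per predictor (only columns with at least one NA)
na_counts <- colSums(is.na(processPredictors))
sort(na_counts[na_counts > 0], decreasing = TRUE)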

(c) Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

set.seed(123)

# Train-test split
trainIndex <- createDataPartition(yield, p = 0.8, list = FALSE)
X_train <- processed_data[trainIndex, ]
X_test <- processed_data[-trainIndex, ]
y_train <- yield[trainIndex]
y_test <- yield[-trainIndex]

# Training control
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

# Train Random Forest
rf_fit <- train(
  x = X_train, y = y_train,
  method = "rf",
  trControl = ctrl,
  importance = TRUE)
rf_fit$results
##   mtry     RMSE  Rsquared       MAE    RMSESD RsquaredSD     MAESD
## 1    2 1.236134 0.6302038 0.9809238 0.2223745  0.1282774 0.1564944
## 2   29 1.134331 0.6465247 0.8748643 0.2454849  0.1375425 0.1698118
## 3   57 1.148726 0.6296347 0.8641660 0.2654705  0.1459044 0.1821881
best_rmse <- rf_fit$results[rf_fit$results$mtry == rf_fit$bestTune$mtry, "RMSE"]
best_rmse
## [1] 1.134331
# Train Lasso
lasso_fit <- train(
  x = X_train, y = y_train,
  method = "glmnet",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = expand.grid(
    alpha = 1,                      
    lambda = 10^seq(-4, 1, length = 100)))
lasso_fit$bestTune
##    alpha     lambda
## 60     1 0.09545485
min(lasso_fit$results$RMSE)
## [1] 1.144071
# Train an Elastic net
enet_fit <- train(
  x = X_train, y = y_train,
  method = "glmnet",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneLength = 10)
enet_fit$bestTune
##    alpha     lambda
## 97     1 0.07886757
min(enet_fit$results$RMSE)
## [1] 1.1619
# train a ridge regression
ridge_fit <- train(
  x = X_train, y = y_train,
  method = "glmnet",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = expand.grid(
    alpha = 0, 
    lambda = 10^seq(-4, 1, length = 100)))
ridge_fit$bestTune
##    alpha   lambda
## 88     0 2.477076
min(ridge_fit$results$RMSE)
## [1] 1.337693
# Results
model_results <- data.frame(
  Model = c("Random Forest", "Lasso", "Elastic Net", "Ridge"),
  RMSE = c(
    min(rf_fit$results$RMSE),
    min(lasso_fit$results$RMSE),
    min(enet_fit$results$RMSE),
    min(ridge_fit$results$RMSE)))
model_results
##           Model     RMSE
## 1 Random Forest 1.134331
## 2         Lasso 1.144071
## 3   Elastic Net 1.161900
## 4         Ridge 1.337693
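
One caveat on this comparison: set.seed() was called only once, so each train() call above used its own random CV folds. To make the resampled RMSEs strictly comparable, the folds can be fixed up front and passed to trainControl(); a hedged sketch (not run here):

# Fix the resampling indices so every model is evaluated on the same folds
set.seed(123)
cv_folds <- createMultiFolds(y_train, k = 10, times = 5)
ctrl_fixed <- trainControl(method = "repeatedcv", number = 10, repeats = 5,
                           index = cv_folds)
# then refit, e.g. train(x = X_train, y = y_train, method = "rf",
#                        trControl = ctrl_fixed, importance = TRUE)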

Of the models covered in this chapter, the lasso has the lowest resampled RMSE, so it would be my pick from the chapter. The random forest (tried out of curiosity, and not a chapter-6 model) showed a slightly lower RMSE still, so I will carry both forward to see how they perform on the test set:

(d) Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

# Random Forest
rf_pred <- predict(rf_fit, newdata = X_test)
rf_test_R2 <- cor(rf_pred, y_test)^2
rf_test_RMSE <- sqrt(mean((rf_pred - y_test)^2))
rf_train_R2 <- getTrainPerf(rf_fit)$TrainRsquared
rf_train_RMSE <- getTrainPerf(rf_fit)$TrainRMSE

# Lasso
lasso_pred <- predict(lasso_fit, newdata = X_test)
lasso_test_R2 <- cor(lasso_pred, y_test)^2
lasso_test_RMSE <- sqrt(mean((lasso_pred - y_test)^2))
lasso_train_R2 <- getTrainPerf(lasso_fit)$TrainRsquared
lasso_train_RMSE <- getTrainPerf(lasso_fit)$TrainRMSE

# Compare
performance_summary <- data.frame(
  Model = c("Random Forest", "Lasso"),
  Train_R2 = c(rf_train_R2, lasso_train_R2),
  Train_RMSE = c(rf_train_RMSE, lasso_train_RMSE),
  Test_R2 = c(rf_test_R2, lasso_test_R2),
  Test_RMSE = c(rf_test_RMSE, lasso_test_RMSE))

performance_summary
##           Model  Train_R2 Train_RMSE   Test_R2 Test_RMSE
## 1 Random Forest 0.6465247   1.134331 0.5401572  1.268733
## 2         Lasso 0.6252550   1.144071 0.5743982  1.206413

Lasso vs Random Forest

  • Resampled (training) performance is slightly better for the random forest (higher R2, lower RMSE).
  • Test performance is better for the lasso: its R2 is higher and its RMSE is lower, despite it being a simpler linear model.
  • The random forest may be overfitting slightly, while the lasso generalizes better; the CV-to-test gap is quantified in the short sketch below.
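
The CV-to-test gap can be made explicit with one extra column on the summary table built above:

# Positive gap = test RMSE worse than the resampled (training) RMSE
performance_summary$RMSE_gap <- performance_summary$Test_RMSE - performance_summary$Train_RMSE
performance_summary[, c("Model", "Train_RMSE", "Test_RMSE", "RMSE_gap")]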

(e) Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

I am curious how the lasso and the random forest compare here, so I am going to keep both!

# --- RANDOM FOREST TOP PREDICTORS
varImp_rf <- varImp(rf_fit, scale = TRUE)
top20_rf <- varImp_rf$importance
top20_rf$Variable <- rownames(top20_rf)
top20_rf$Type <- ifelse(top20_rf$Variable %in% colnames(processPredictors)[1:12], "Biological", "Process")
top20_rf <- top20_rf[order(-top20_rf$Overall), ][1:20, ]

ggplot(top20_rf, aes(x = reorder(Variable, Overall), y = Overall, fill = Type)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 20 Predictors - Random Forest",
       x = "Predictor", y = "Importance (Scaled)",
       fill = "Predictor Type") +
  theme_minimal()

# --- LASSO TOP PREDICTORS ---
best_lambda <- lasso_fit$bestTune$lambda
lasso_coefs <- coef(lasso_fit$finalModel, s = best_lambda)

lasso_coef_df <- as.data.frame(as.matrix(lasso_coefs))
colnames(lasso_coef_df) <- "Coefficient"
lasso_coef_df$Variable <- rownames(lasso_coef_df)
lasso_coef_df <- lasso_coef_df[lasso_coef_df$Variable != "(Intercept)", ]
nonzero_lasso <- subset(lasso_coef_df, Coefficient != 0)

nonzero_lasso$Type <- ifelse(nonzero_lasso$Variable %in% colnames(processPredictors)[1:12], "Biological", "Process")
nonzero_lasso$AbsCoef <- abs(nonzero_lasso$Coefficient)
top20_lasso <- nonzero_lasso[order(-nonzero_lasso$AbsCoef), ][1:20, ]

ggplot(top20_lasso, aes(x = reorder(Variable, AbsCoef), y = AbsCoef, fill = Type)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 20 Predictors - Lasso",
       x = "Predictor", y = "Absolute Coefficient",
       fill = "Predictor Type") +
  theme_minimal()

Notes:

  • Random Forest: among the top 20, 7 predictors are biological and 13 are process variables. Top 5: ManufacturingProcess32, ManufacturingProcess17, BiologicalMaterial06, BiologicalMaterial03, BiologicalMaterial12.
  • Lasso: process variables dominate heavily among the non-zero coefficients; of the top 5, 4 are process-related: ManufacturingProcess32, ManufacturingProcess09, ManufacturingProcess17, ManufacturingProcess06, and BiologicalMaterial06. (The type counts are tallied in the sketch below.)
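
For reference, the biological-vs-process counts can be tallied directly from the data frames built above:

# Count predictor types among each model's top-20 list
table(top20_rf$Type)
table(top20_lasso$Type)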

(f) Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?

I decided to focus only on the lasso here; the equivalent random-forest plotting code is kept below, commented out:

# #Top 4 from Random Forest
# varImp_rf <- varImp(rf_fit, scale = TRUE)
# top_predictors_sorted <- varImp_rf$importance
# top_predictors_sorted$Variable <- rownames(top_predictors_sorted)
# top_predictors_sorted <- top_predictors_sorted[order(-top_predictors_sorted$Overall), ]
# top_predictors_sorted$Type <- ifelse(
#   top_predictors_sorted$Variable %in% colnames(processPredictors)[1:12],
#   "Biological", "Process")
# top4_rf <- rownames(top_predictors_sorted)[1:4] 
# 
# for (pred in top4_rf) {
#   p <- ggplot(data.frame(x = processed_data[[pred]], y = yield), aes(x = x, y = y)) +
#     geom_point(alpha = 0.6) +
#     geom_smooth(method = "loess", color = "blue") +
#     labs(
#       title = paste("Yield vs", pred, "(Random Forest)"),
#       x = pred,
#       y = "Yield"
#     ) +
#     theme_minimal()
#   print(p)}


#Top 4 from Lasso
nonzero_lasso <- nonzero_lasso[order(-nonzero_lasso$AbsCoef), ]
top4_lasso <- head(nonzero_lasso$Variable, 4)

for (pred in top4_lasso) {
  p <- ggplot(data.frame(x = processed_data[[pred]], y = yield), aes(x = x, y = y)) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "loess", color = "red") +
    labs(
      title = paste("Yield vs", pred, "(Lasso)"),
      x = pred,
      y = "Yield"
    ) +
    theme_minimal()
  print(p)}
(Four scatter plots of Yield against each of the top-4 lasso predictors are produced here, each with a loess smoother; `geom_smooth()` reports using formula = 'y ~ x'.)

Notes:

  • ManufacturingProcess32: Nonlinear S-shaped curve. Yield is low at low levels of the predictor, increases steadily as the variable increases, and then plateaus or even slightly declines at high values. Operational insight: Maintain ManufacturingProcess32 within a mid-to-upper optimal range, but avoid the extreme high tail.
  • ManufacturingProcess09: Clear positive linear relationship. We should consider increasing this parameter (if feasible/safe) to further improve yield.
  • ManufacturingProcess17: Strong inverse U-shaped (concave) relationship. This is a critical tuning parameter. We should identify and operate within its optimal band to prevent under- or over-processing.
  • ManufacturingProcess06: Rapid rise initially, then a plateau. We should use just enough of this process variable to reach the yield plateau; higher values don't add value and may waste resources or energy. (A rough way to locate such plateaus and peaks from the smoothers is sketched below.)
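
To turn these visual patterns into candidate operating targets, one rough, purely illustrative approach is to read off where the loess-smoothed yield peaks for each top predictor; actual set points would of course need engineering and experimental validation. A minimal sketch, reusing the objects above:

# Rough sketch: find the predictor value (in centered/scaled units) where the
# loess-smoothed yield is highest; illustrative only, not a designed experiment
find_peak <- function(pred_name) {
  df <- data.frame(x = processed_data[[pred_name]], y = yield)
  fit <- loess(y ~ x, data = df)
  grid <- data.frame(x = seq(min(df$x), max(df$x), length.out = 200))
  grid$yhat <- predict(fit, newdata = grid)
  grid$x[which.max(grid$yhat)]
}
sapply(top4_lasso, find_peak)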