6.2. Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:

  1. Start R and use these commands to load the data:
library(AppliedPredictiveModeling)
library(caret)
data(permeability)

The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.
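
As a quick sanity check (a sketch; output not shown), the dimensions can be confirmed directly:

# fingerprints should be 165 x 1107; permeability is stored as a 165 x 1 matrix
dim(fingerprints)
dim(permeability)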

  2. The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?
# drop near-zero-variance predictors
filter_prints <- fingerprints[, -nearZeroVar(fingerprints)]
dim(filter_prints)
## [1] 165 388

Of the 1,107 predictors, only 388 are left for modeling.
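
To see why so many predictors are dropped, nearZeroVar can report its per-column diagnostics (a sketch; output not shown):

# saveMetrics = TRUE returns the frequency ratio and percent-unique value per column
nzv_stats <- nearZeroVar(fingerprints, saveMetrics = TRUE)
head(nzv_stats)
# columns flagged nzv = TRUE fail caret's default freqCut/uniqueCut thresholds
table(nzv_stats$nzv)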

  3. Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R²?
set.seed(2024)

#prep data
response_var <- permeability
feature_matrix <- filter_prints

# stratified splitting
response_groups <- cut(response_var, 
                       breaks = quantile(response_var, probs = seq(0, 1, 0.2)),
                       include.lowest = TRUE)
split_indices <- createDataPartition(response_groups, p = 0.75, list = FALSE)

x_training <- feature_matrix[split_indices, ]
y_training <- response_var[split_indices]
x_testing <- feature_matrix[-split_indices, ]
y_testing <- response_var[-split_indices]

#train PLS with repeated CV
control <- trainControl(method = "repeatedcv", number = 5, repeats = 3)
pls_model <- train(x_training, y_training, 
                   method = "pls",
                   tuneLength = 20,
                   trControl = control,
                   preProcess = c("center", "scale"),
                   metric = "Rsquared")

plot(pls_model)

pls_model
## Partial Least Squares 
## 
## 125 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (5 fold, repeated 3 times) 
## Summary of sample sizes: 100, 101, 100, 101, 98, 99, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.27841  0.2632483   9.991065
##    2     12.40055  0.3603328   8.720806
##    3     12.23602  0.3972240   8.974394
##    4     12.50742  0.3902796   9.229317
##    5     12.60172  0.3934566   9.225058
##    6     12.26538  0.4126069   8.968100
##    7     12.04593  0.4319011   8.823445
##    8     11.95945  0.4498600   9.047216
##    9     12.10201  0.4436340   8.991114
##   10     12.23726  0.4417904   9.017227
##   11     12.53254  0.4293280   9.266068
##   12     12.85222  0.4131497   9.521330
##   13     13.15930  0.3970853   9.757412
##   14     13.40589  0.3855165   9.926069
##   15     13.78026  0.3670988  10.050599
##   16     14.17459  0.3560211  10.295451
##   17     14.64715  0.3372384  10.599658
##   18     14.95076  0.3348392  10.863048
##   19     15.42855  0.3164574  11.251152
##   20     15.68827  0.3133719  11.487606
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 8.

Rsquared was used to select the optimal model with the largest value. The final model used ncomp = 8, and the corresponding resampled estimate of R² is 0.4499.
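
Rather than reading the value off the table, the resampled metrics for the chosen tuning can be pulled from the train object directly (a sketch; output not shown):

# resampling results for the selected number of components
subset(pls_model$results, ncomp == pls_model$bestTune$ncomp)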

  4. Predict the response for the test set. What is the test set estimate of R²?
# predict the test set with the optimal PLS model
test_predictions <- predict(pls_model, x_testing)

# calculate the test set R²
test_metrics <- postResample(pred = test_predictions, obs = y_testing)
test_R2 <- test_metrics["Rsquared"]

round(test_R2, 4)
## Rsquared 
##   0.4484

The test set estimate of R² is 0.4484.
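
Note that postResample computes R² as the squared correlation between predictions and observations, so the value can be verified by hand (a sketch):

# should match the postResample value above
cor(test_predictions, y_testing)^2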

  5. Try building other models discussed in this chapter. Do any have better predictive performance?

Let’s try building an Elastic Net model and comparing it to the PLS model. Note that this part uses a new seed and a new train/test split, so the PLS test set R² reported below differs from the value in the previous part.

library(elasticnet)
set.seed(1580)

response_var <- permeability
feature_matrix <- filter_prints

#stratified split
response_groups <- cut(response_var, 
                       breaks = quantile(response_var, probs = seq(0, 1, 0.2)),
                       include.lowest = TRUE)
split_indices <- createDataPartition(response_groups, p = 0.75, list = FALSE)

x_train <- feature_matrix[split_indices, ]
y_train <- response_var[split_indices]
x_test <- feature_matrix[-split_indices, ]
y_test <- response_var[-split_indices]

#train elastic net model
ctrl <- trainControl(method = "repeatedcv", number = 5, repeats = 3)

# use nonzero lambda values and fractions below 1 to avoid numerical errors
enet_grid <- expand.grid(.lambda = c(0.01, 0.05, 0.1, 0.5), 
                         .fraction = seq(0.2, 0.9, length = 10))

#suppressing the warnings
enet_model <- suppressWarnings(
  train(x_train, y_train,
        method = "enet",
        tuneGrid = enet_grid,
        trControl = ctrl,
        preProcess = c("center", "scale"),
        metric = "Rsquared")
)

#predict test set
enet_pred <- predict(enet_model, x_test)
enet_R2 <- postResample(enet_pred, y_test)["Rsquared"]

#PLS predictions
pls_pred <- predict(pls_model, x_test)
pls_R2 <- postResample(pls_pred, y_test)["Rsquared"]

# Compare results
cat("PLS Test R²:", round(pls_R2, 4), "\n")
## PLS Test R²: 0.6462
cat("Elastic Net Test R²:", round(enet_R2, 4), "\n\n")
## Elastic Net Test R²: 0.3975
if(enet_R2 > pls_R2) {
  cat("✓ Yes, Elastic Net has better predictive performance.\n")
} else {
  cat("✗ No, PLS still has better predictive performance.\n")
}
## ✗ No, PLS still has better predictive performance.
# View optimal parameters
cat("\nElastic Net optimal parameters:\n")
## 
## Elastic Net optimal parameters:
print(enet_model$bestTune)
##     fraction lambda
## 38 0.7444444    0.5
  6. Would you recommend any of your models to replace the permeability laboratory experiment?

I would not recommend any of my models to replace the permeability laboratory experiment; with test set R² values no higher than about 0.65, the models are not accurate enough to substitute for the assay.
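
One way to put that accuracy in context (a sketch; output not shown): compare the test set RMSE of the better model to the spread of the observed permeability values. If the RMSE is a large fraction of the response's standard deviation, the model adds little precision.

# test set RMSE of the PLS model versus the spread of the response
postResample(pls_pred, y_test)["RMSE"]
sd(y_test)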

6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:

  1. Start R and use these commands to load the data:
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)

# inspect the dimensions rather than printing the full data frame
dim(ChemicalManufacturingProcess)

The data frame ChemicalManufacturingProcess contains measurements for 176 manufacturing runs: 57 predictors (12 describing the input biological materials and 45 describing the manufacturing process) plus the response Yield, the percent yield of each run.
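
The predictor counts can be verified by pattern-matching the column names (a sketch; output not shown):

# count biological vs. process predictor columns
sum(grepl("^Biological", names(ChemicalManufacturingProcess)))
sum(grepl("^Manufacturing", names(ChemicalManufacturingProcess)))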

  2. A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).
sum(is.na(ChemicalManufacturingProcess))
## [1] 106
miss <- preProcess(ChemicalManufacturingProcess, method = "bagImpute")
Chemical <- predict(miss, ChemicalManufacturingProcess)

sum(is.na(Chemical))
## [1] 0

The dataset has 106 missing values. Using preProcess with method = "bagImpute", each missing value is filled in by a bagged-tree model that predicts it from the other variables.
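
To see which predictors account for the missing cells (a sketch; output not shown):

# missing-value count per column, largest first
na_counts <- colSums(is.na(ChemicalManufacturingProcess))
head(sort(na_counts[na_counts > 0], decreasing = TRUE))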

  3. Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

I will be tuning an Elastic Net model from this chapter.

# remove near-zero-variance predictors before modeling
near0 <- nearZeroVar(Chemical)
cmp_clean <- Chemical[, -near0]

#split training and test data
set.seed(2024)
trainingRows <- createDataPartition(cmp_clean$Yield, p = 0.80, list = FALSE)
train <- cmp_clean[trainingRows, ]
test <- cmp_clean[-trainingRows, ]

#preprocess and tune Elastic net model
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

enet_grid <- expand.grid(lambda = c(0.01, 0.05, 0.1, 0.5), 
                         fraction = seq(0.1, 0.9, length = 10))

enet_model <- train(train[, -which(names(train) == "Yield")], 
                    train$Yield,
                    method = "enet",
                    tuneGrid = enet_grid,
                    preProcess = c("center", "scale"),
                    trControl = ctrl,
                    metric = "Rsquared")

cat("performance metric of R^2:", round(max(enet_model$results$Rsquared), 4), "\n")
## performance metric of R^2: 0.6094
cat("lambda:", enet_model$bestTune$lambda, 
    ", fraction:", enet_model$bestTune$fraction, "\n")
## lambda: 0.01 , fraction: 0.1

From the Elastic Net model, we can see that the optimal resampled R² is 0.6094, achieved with lambda = 0.01 and fraction = 0.1.
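
The tuning profile can also be inspected visually, with one curve per lambda value (a sketch; the plot is not shown here):

# resampled R² against the L1 fraction for each lambda
plot(enet_model, metric = "Rsquared")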

  4. Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
# predict on the test set using the tuned Elastic Net model
enet_test_pred <- predict(enet_model, test[, -which(names(test) == "Yield")])

# calculate test set performance metrics
test_metrics <- postResample(pred = enet_test_pred, obs = test$Yield)

#get resampled training performance from cross-validation
train_metrics <- enet_model$results[enet_model$results$lambda == enet_model$bestTune$lambda &
                                      enet_model$results$fraction == enet_model$bestTune$fraction, ]

performance_table <- data.frame(
  Metric = c("R²", "RMSE", "MAE"),
  Resampled_Training = c(
    round(train_metrics$Rsquared, 4),
    round(train_metrics$RMSE, 4),
    round(train_metrics$MAE, 4)
  ),
  Test_Set = c(
    round(test_metrics["Rsquared"], 4),
    round(test_metrics["RMSE"], 4),
    round(test_metrics["MAE"], 4)
  )
)

print(performance_table)
##          Metric Resampled_Training Test_Set
## Rsquared     R²             0.6094   0.6213
## RMSE       RMSE             1.2782   1.2512
## MAE         MAE             1.0399   1.0569

The resampled training R² is 0.6094, while the test set R² is 0.6213, about 0.01 higher. The close agreement between the resampled and test set estimates suggests the model generalizes well and is not overfitting.
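
A predicted-versus-observed plot is a quick check that the test set errors are well behaved (a sketch; the plot is not shown here):

# points near the 45-degree line indicate accurate predictions
plot(enet_test_pred, test$Yield, xlab = "Predicted Yield", ylab = "Observed Yield")
abline(0, 1, col = "red")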

  5. Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?
# extract coefficients from the final Elastic Net model
final_model <- enet_model$finalModel

# get the coefficients at the optimal fraction
coef_result <- predict(final_model, 
                       s = enet_model$bestTune$fraction, 
                       mode = "fraction", 
                       type = "coefficients")

#extract coefficient vector 
coef_vector <- coef_result$coefficients
names(coef_vector) <- colnames(train[, -which(names(train) == "Yield")])

# drop entries with missing names
coef_vector <- coef_vector[!is.na(names(coef_vector))]

# ten largest (signed) coefficients; note this ranking ignores large negative coefficients
top10 <- head(sort(coef_vector, decreasing = TRUE), 10)

# separate biological and process predictors
bio_predictors <- grepl("BiologicalMaterial", names(coef_vector))
process_predictors <- grepl("ManufacturingProcess", names(coef_vector))

sum_bio_abs <- sum(abs(coef_vector[bio_predictors]))
sum_process_abs <- sum(abs(coef_vector[process_predictors]))
count_bio <- sum(bio_predictors)
count_process <- sum(process_predictors)

top10_df <- data.frame(Predictor = names(top10), Coefficient = round(top10, 4))
print(top10_df)
##                                     Predictor Coefficient
## ManufacturingProcess32 ManufacturingProcess32      0.6936
## ManufacturingProcess09 ManufacturingProcess09      0.3669
## BiologicalMaterial01     BiologicalMaterial01      0.0000
## BiologicalMaterial02     BiologicalMaterial02      0.0000
## BiologicalMaterial03     BiologicalMaterial03      0.0000
## BiologicalMaterial04     BiologicalMaterial04      0.0000
## BiologicalMaterial05     BiologicalMaterial05      0.0000
## BiologicalMaterial06     BiologicalMaterial06      0.0000
## BiologicalMaterial08     BiologicalMaterial08      0.0000
## BiologicalMaterial09     BiologicalMaterial09      0.0000

The most important predictors are ManufacturingProcess32 and ManufacturingProcess09. The process predictors dominate the model.
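
Since the elastic net zeroes out most coefficients at fraction = 0.1, it is worth tabulating which predictors survive the shrinkage (a sketch; output not shown):

# nonzero coefficients, split into biological vs. process predictors
nonzero <- coef_vector[coef_vector != 0]
table(ifelse(grepl("^Biological", names(nonzero)), "Biological", "Process"))
round(sort(abs(nonzero), decreasing = TRUE), 4)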

  6. Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?

The top predictors, ManufacturingProcess32 and ManufacturingProcess09, are both positively related to yield. Because process predictors can be changed, adjusting these settings upward (within validated operating ranges) could improve yield in future runs; it could also be beneficial to conduct a biological assessment of the raw materials and to improve the accuracy of the manufacturing measurements.

library(ggplot2)
library(gridExtra)
#top predictors from elastic net output
top_predictors <- c("ManufacturingProcess32", "ManufacturingProcess09")

#scatter plots
plots <- list()
for(i in 1:length(top_predictors)) {
  p <- ggplot(Chemical, aes(x = .data[[top_predictors[i]]], y = Yield)) +
    geom_point(alpha = 0.6, color = "steelblue") +
    geom_smooth(method = "lm", se = TRUE, color = "red") +
    labs(title = paste("Yield vs", top_predictors[i]),
         x = top_predictors[i], y = "Yield") +
    theme_minimal()
  plots[[i]] <- p
}
grid.arrange(grobs = plots, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

# fit a linear model with the top predictors
lm_model <- lm(Yield ~ ManufacturingProcess32 + ManufacturingProcess09, 
               data = Chemical)

summary(lm_model)
## 
## Call:
## lm(formula = Yield ~ ManufacturingProcess32 + ManufacturingProcess09, 
##     data = Chemical)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6017 -0.9306  0.0157  0.6466  3.4215 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -17.84503    3.62551  -4.922 1.99e-06 ***
## ManufacturingProcess32   0.20131    0.01647  12.224  < 2e-16 ***
## ManufacturingProcess09   0.57208    0.05748   9.953  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.175 on 173 degrees of freedom
## Multiple R-squared:  0.5994, Adjusted R-squared:  0.5948 
## F-statistic: 129.4 on 2 and 173 DF,  p-value: < 2.2e-16
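
Both coefficients are positive and highly significant, and these two predictors alone explain roughly 60% of the variance in yield (adjusted R² = 0.59). This supports the recommendation above: within validated operating ranges, increasing ManufacturingProcess32 and ManufacturingProcess09 is a sensible first lever for improving yield in future runs.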