Required Libraries

library(tidyverse)
library(AppliedPredictiveModeling)
library(kableExtra)
library(latex2exp)
library(caret)
library(janitor)
library(gridExtra)
library(ggcorrplot)

6.2

Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:

  1. Start R and use these commands to load the data: The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.
data(permeability)

kbl(fingerprints[1:5, 1:10],
    caption = "Fingerprints") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote(general_title = "Dimensions: ",
          TeX(paste0(nrow(fingerprints), " x ", ncol(fingerprints))))
Fingerprints

| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 |
|----|----|----|----|----|----|----|----|----|-----|
| 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |

Dimensions: 165 x 1107
kbl(head(permeability)) |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote(general_title = "Dimensions: ",
          TeX(paste0(nrow(permeability), " x ", ncol(permeability))))
| permeability |
|--------------|
| 12.520 |
| 1.120 |
| 19.405 |
| 1.730 |
| 1.680 |
| 0.510 |

Dimensions: 165 x 1
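Before filtering the predictors, a quick sketch (my addition) of the response distribution; the head of the matrix above suggests the values are right-skewed:

# Distribution of the permeability response
tibble(permeability = as.numeric(permeability)) |>
  ggplot(aes(x = permeability)) +
  geom_histogram(bins = 30, fill = "#4E79A7") +
  theme_bw() +
  labs(title = "Permeability Response")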
  2. The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

After applying the nearZeroVar function, we are left with 388 predictors for modeling.

nearZeroVar_filter <- 
  fingerprints |>
  nearZeroVar(names = TRUE, saveMetrics = FALSE)

filtered_fingerprints <-
  as_tibble(fingerprints) |>
  select(-all_of(nearZeroVar_filter)) |>
  clean_names()
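To see why predictors were flagged, nearZeroVar can also return the screening metrics it uses; a quick sketch (my addition — saveMetrics = TRUE returns frequency-ratio and percent-unique columns):

# Inspect the metrics behind the near-zero-variance filter
nzv_metrics <- nearZeroVar(fingerprints, saveMetrics = TRUE)
head(nzv_metrics[nzv_metrics$nzv, ])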

kbl(filtered_fingerprints[1:5, 1:10],
    caption = "Filtered Fingerprints") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote(general_title = "Dimensions: ",
          TeX(paste0(nrow(filtered_fingerprints), " x ", ncol(filtered_fingerprints))))
Filtered Fingerprints

| x1 | x2 | x3 | x4 | x5 | x6 | x11 | x12 | x15 | x16 |
|----|----|----|----|----|----|-----|-----|-----|-----|
| 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |

Dimensions: 165 x 388
  3. Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of \(R^2\)?

We don’t have any missing data in our combined dataset of predictors and response variables.

# permeability is a one-column matrix; store it as a plain numeric column
filtered_fingerprints$permeability <- as.numeric(permeability)
paste("Missing Values:", any(is.na(filtered_fingerprints)))
## [1] "Missing Values: FALSE"

We will set our seed for reproducibility and split the data into a training and test set.

set.seed(42)

rows <- sample(nrow(filtered_fingerprints))
filtered_fingerprints <- filtered_fingerprints[rows, ]

sample <- 
  sample(c(TRUE, FALSE), 
         nrow(filtered_fingerprints), 
         replace=TRUE, 
         prob=c(0.7,0.3))

train_df <- filtered_fingerprints[sample, ]
train_x <- train_df |>
    select(-permeability)
train_y <- train_df$permeability
train_y <- as.numeric(train_y)

test_df <- filtered_fingerprints[!sample, ]
test_x <- test_df |>
    select(-permeability)

test_y <- test_df$permeability
test_y <- as.numeric(test_y)
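As an aside, caret's createDataPartition offers a stratified alternative to the Bernoulli split above: it samples within quantiles of the response, so the 70/30 proportions are exact and the outcome distributions stay balanced. A minimal sketch (the object names in_train, strat_train, and strat_test are mine):

set.seed(42)

# Stratified 70/30 split on the response
in_train <- createDataPartition(filtered_fingerprints$permeability,
                                p = 0.7, list = FALSE)
strat_train <- filtered_fingerprints[in_train, ]
strat_test  <- filtered_fingerprints[-in_train, ]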

We use ten-fold cross-validation and set the pre-processing to center and scale within our PLS model.

set.seed(42)

ctrl <- 
  trainControl(method = "cv",
               number = 10)

plsTune <- 
  train(train_x, train_y, 
        method = "pls", 
        tuneLength = 20,
        trControl = ctrl, 
        preProc = c("center", "scale"))

plsTune
## Partial Least Squares 
## 
## 121 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 109, 109, 109, 109, 109, 109, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.15906  0.3304974  10.076105
##    2     11.83888  0.4725912   8.209110
##    3     11.76765  0.4792891   9.089389
##    4     12.05513  0.4827926   9.211061
##    5     12.04366  0.4760738   9.094754
##    6     11.95211  0.4853975   9.079037
##    7     12.07644  0.4672773   9.340922
##    8     12.07147  0.4604480   9.352997
##    9     12.12336  0.4523207   9.390719
##   10     12.71329  0.4235732   9.582716
##   11     13.26484  0.4056196   9.985620
##   12     13.81725  0.3857177  10.369833
##   13     14.42787  0.3649552  10.700331
##   14     15.12876  0.3471364  11.160827
##   15     15.31352  0.3413391  11.139511
##   16     16.12273  0.3245996  11.783037
##   17     16.42704  0.3240388  11.944609
##   18     16.84677  0.3171920  12.194782
##   19     17.50543  0.2866305  12.731654
##   20     17.74646  0.2875936  12.867303
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.

We end up with a best tune of \(ncomp = 3\) and a corresponding resampled \(R^2 = 0.479\).

results <-
  plsTune$results |>
  filter(ncomp == 3)

kbl(results,
    caption = "PLS Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
PLS Model

| ncomp | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|-------|------|----------|-----|--------|------------|-------|
| 3 | 11.76765 | 0.4792891 | 9.089389 | 1.71757 | 0.138497 | 1.279859 |
plot(plsTune)

  4. Predict the response for the test set. What is the test set estimate of \(R^2\)?

We get an \(R^2\) of 0.396 on the test set.

pls_pred <- predict(plsTune, test_x)

# Model performance metrics
results <- data.frame(RMSE = caret::RMSE(pls_pred, test_y),
                      Rsquared = caret::R2(pls_pred, test_y))
kbl(results, caption = "PLS Predictions") |>
  kable_classic(full_width = F, html_font = "Cambria")
PLS Predictions

| RMSE | Rsquared |
|------|----------|
| 12.48383 | 0.3964867 |
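A quick diagnostic sketch of the test-set fit (my addition); points near the dashed identity line are well predicted:

# Observed vs. predicted permeability on the test set
tibble(observed = test_y, predicted = pls_pred) |>
  ggplot(aes(x = predicted, y = observed)) +
  geom_point(colour = "#4E79A7") +
  geom_abline(linetype = "dashed") +
  theme_bw() +
  labs(title = "PLS: Observed vs. Predicted (Test Set)")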
  5. Try building other models discussed in this chapter. Do any have better predictive performance?

Ridge Regression

The ridge regression model has better predictive performance on the test set (ridge \(R^2 = 0.4537\) vs. PLS \(R^2 = 0.3965\)), even though its resampled training performance is slightly worse (ridge \(R^2 = 0.4395\) vs. PLS \(R^2 = 0.4793\)). Note the degenerate RMSE at \(\lambda = 0\) in the output below: with 388 predictors and only 121 training samples, unpenalized least squares is ill-posed, so some shrinkage is required.

set.seed(42)

ridgeGrid <- data.frame(.lambda = seq(0, 1, length.out = 15))

ridge <- train(train_x, train_y,
               method = "ridge",
               preProcess = c("center", "scale"),
               trControl = ctrl,
               tuneGrid = ridgeGrid)

ridge
## Ridge Regression 
## 
## 121 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 109, 109, 109, 109, 109, 109, ... 
## Resampling results across tuning parameters:
## 
##   lambda      RMSE          Rsquared   MAE         
##   0.00000000  4.636227e+22  0.2220327  2.293202e+22
##   0.07142857  1.467755e+01  0.3799053  1.051587e+01
##   0.14285714  1.399918e+01  0.4172643  1.023236e+01
##   0.21428571  1.380433e+01  0.4394862  1.021073e+01
##   0.28571429  1.380573e+01  0.4548502  1.032430e+01
##   0.35714286  1.392012e+01  0.4662088  1.049601e+01
##   0.42857143  1.411608e+01  0.4750284  1.072538e+01
##   0.50000000  1.435985e+01  0.4821513  1.100673e+01
##   0.57142857  1.464429e+01  0.4879994  1.130376e+01
##   0.64285714  1.497295e+01  0.4927583  1.163218e+01
##   0.71428571  1.533318e+01  0.4966683  1.199063e+01
##   0.78571429  1.572050e+01  0.4999762  1.236046e+01
##   0.85714286  1.613106e+01  0.5027340  1.274891e+01
##   0.92857143  1.656438e+01  0.5050831  1.314606e+01
##   1.00000000  1.701388e+01  0.5070745  1.356125e+01
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.2142857.
plot(ridge)

results <-
  ridge$results |>
  filter(round(lambda, 7) == 0.2142857)

kbl(results,
    caption = "Ridge Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
Ridge Model

| lambda | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|--------|------|----------|-----|--------|------------|-------|
| 0.2142857 | 13.80433 | 0.4394862 | 10.21073 | 2.600612 | 0.1847115 | 1.940164 |
ridge_pred <- predict(ridge, test_x)

# Model performance metrics
results <- data.frame(RMSE = caret::RMSE(ridge_pred, test_y),
                      Rsquared = caret::R2(ridge_pred, test_y))
kbl(results, caption = "Ridge Predictions") |>
  kable_classic(full_width = F, html_font = "Cambria")
Ridge Predictions

| RMSE | Rsquared |
|------|----------|
| 12.21753 | 0.4537338 |

Principal Component Regression

The ridge regression model also has better predictive performance than principal component regression on the test set (ridge \(R^2 = 0.4537\) vs. PCR \(R^2 = 0.4138\)), even though its resampled training performance is slightly worse (ridge \(R^2 = 0.4395\) vs. PCR \(R^2 = 0.4501\)).

set.seed(42)

pca <- train(train_x, train_y,
             method = "pcr",
             preProcess = c("center", "scale"),
             trControl = ctrl,
             tuneLength = 15)

pca
## Principal Component Analysis 
## 
## 121 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 109, 109, 109, 109, 109, 109, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     15.23271  0.1057016  11.796539
##    2     15.25690  0.1056819  11.829238
##    3     14.01283  0.2673234  10.682211
##    4     13.11927  0.3488303   9.873778
##    5     13.00804  0.3702847   9.343916
##    6     13.03019  0.3414406   9.325229
##    7     12.95938  0.3622568   9.227836
##    8     12.15867  0.4445602   8.765973
##    9     12.13636  0.4328711   9.091685
##   10     11.74169  0.4500606   8.955268
##   11     11.84011  0.4620500   9.066807
##   12     11.84831  0.4655223   8.986896
##   13     11.86410  0.4652278   8.951864
##   14     11.91279  0.4568032   8.971189
##   15     12.07052  0.4485460   9.223338
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 10.
plot(pca)

results <-
  pca$results |>
  filter(ncomp == 10)

kbl(results,
    caption = "PCR Model") |>
  kable_classic(full_width = F, html_font = "Cambria")

PCR Model

| ncomp | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|-------|------|----------|-----|--------|------------|-------|
| 10 | 11.74169 | 0.4500606 | 8.955268 | 1.928418 | 0.1736516 | 1.406815 |
pca_pred <- predict(pca, test_x)

# Model performance metrics
results <- data.frame(RMSE = caret::RMSE(pca_pred, test_y),
                      Rsquared = caret::R2(pca_pred, test_y))
kbl(results, caption = "PCA Predictions") |>
  kable_classic(full_width = F, html_font = "Cambria")
PCA Predictions
RMSE Rsquared
12.23152 0.413801
  6. Would you recommend any of your models to replace the permeability laboratory experiment?

If the objective is to cut costs by predicting permeability rather than measuring it in the laboratory, I would consider the ridge regression model. Its resampled training performance is not as strong as the PLS model's, but its noticeably better test-set RMSE and \(R^2\) mean it could potentially save significant resources, as the comparison below shows.
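To make the comparison concrete, a small sketch (my addition) collecting the test-set metrics computed above into one table:

# Side-by-side test-set performance for the three models
comparison <- tibble(
  model = c("PLS", "Ridge", "PCR"),
  RMSE = c(caret::RMSE(pls_pred, test_y),
           caret::RMSE(ridge_pred, test_y),
           caret::RMSE(pca_pred, test_y)),
  Rsquared = c(caret::R2(pls_pred, test_y),
               caret::R2(ridge_pred, test_y),
               caret::R2(pca_pred, test_y)))

kbl(comparison, caption = "Test-Set Comparison") |>
  kable_classic(full_width = F, html_font = "Cambria")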

6.3

A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1 % will boost revenue by approximately one hundred thousand dollars per batch:

  1. Start R and use these commands to load the data:

The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

data(ChemicalManufacturingProcess)

chem <- ChemicalManufacturingProcess
kbl(chem[1:5, 1:10],
    caption = "Chemical Manufacturing Process") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote(general_title = "Dimensions: ",
          TeX(paste0(nrow(chem), " x ", ncol(chem))))
Chemical Manufacturing Process

| Yield | BiologicalMaterial01 | BiologicalMaterial02 | BiologicalMaterial03 | BiologicalMaterial04 | BiologicalMaterial05 | BiologicalMaterial06 | BiologicalMaterial07 | BiologicalMaterial08 | BiologicalMaterial09 |
|-------|----------------------|----------------------|----------------------|----------------------|----------------------|----------------------|----------------------|----------------------|----------------------|
| 38.00 | 6.25 | 49.58 | 56.97 | 12.74 | 19.51 | 43.73 | 100 | 16.66 | 11.44 |
| 42.44 | 8.01 | 60.97 | 67.48 | 14.65 | 19.36 | 53.14 | 100 | 19.04 | 12.55 |
| 42.03 | 8.01 | 60.97 | 67.48 | 14.65 | 19.36 | 53.14 | 100 | 19.04 | 12.55 |
| 41.42 | 8.01 | 60.97 | 67.48 | 14.65 | 19.36 | 53.14 | 100 | 19.04 | 12.55 |
| 42.49 | 7.47 | 63.33 | 72.25 | 14.02 | 17.91 | 54.66 | 100 | 18.22 | 12.80 |

Dimensions: 176 x 58
  2. A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).

We can see which variables have missing values below. Afterwards, we can use a KNN imputation method to replace these NA values and confirm that we have zero missing values. Note that caret's knnImpute pre-processing also centers and scales the data, which is why the imputed values shown later appear standardized.

missing <- 
  chem |>
  summarise(across(everything(), ~ sum(is.na(.x)))) |>
  pivot_longer(cols = everything()) |>
  filter(value != 0) |>
  rename(variable = name, 
         total = value) |>
  mutate(pct = round(total/nrow(chem), 4)) |>
  arrange(desc(pct), variable) |>
  mutate(variable = fct_reorder(variable, pct))

kbl(missing,
  caption = "Missing Values") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  kable_styling(latex_options = "HOLD_position")
Missing Values

| variable | total | pct |
|----------|-------|-----|
| ManufacturingProcess03 | 15 | 0.0852 |
| ManufacturingProcess11 | 10 | 0.0568 |
| ManufacturingProcess10 | 9 | 0.0511 |
| ManufacturingProcess25 | 5 | 0.0284 |
| ManufacturingProcess26 | 5 | 0.0284 |
| ManufacturingProcess27 | 5 | 0.0284 |
| ManufacturingProcess28 | 5 | 0.0284 |
| ManufacturingProcess29 | 5 | 0.0284 |
| ManufacturingProcess30 | 5 | 0.0284 |
| ManufacturingProcess31 | 5 | 0.0284 |
| ManufacturingProcess33 | 5 | 0.0284 |
| ManufacturingProcess34 | 5 | 0.0284 |
| ManufacturingProcess35 | 5 | 0.0284 |
| ManufacturingProcess36 | 5 | 0.0284 |
| ManufacturingProcess02 | 3 | 0.0170 |
| ManufacturingProcess06 | 2 | 0.0114 |
| ManufacturingProcess01 | 1 | 0.0057 |
| ManufacturingProcess04 | 1 | 0.0057 |
| ManufacturingProcess05 | 1 | 0.0057 |
| ManufacturingProcess07 | 1 | 0.0057 |
| ManufacturingProcess08 | 1 | 0.0057 |
| ManufacturingProcess12 | 1 | 0.0057 |
| ManufacturingProcess14 | 1 | 0.0057 |
| ManufacturingProcess22 | 1 | 0.0057 |
| ManufacturingProcess23 | 1 | 0.0057 |
| ManufacturingProcess24 | 1 | 0.0057 |
| ManufacturingProcess40 | 1 | 0.0057 |
| ManufacturingProcess41 | 1 | 0.0057 |
missing |>
  ggplot(aes(x = variable, y = pct)) +
  geom_col(fill = "#4E79A7") +
  coord_flip() +
  theme_bw() +
  labs(x = "") +
  geom_text(aes(label = paste0(round(pct*100, 2), "%")),
            colour = "white", hjust = 1, size = 3, fontface = "bold") +
  scale_y_continuous(labels = scales::percent)

set.seed(42)

knn_impute <- preProcess(chem, "knnImpute", k=5)
impute_chem <- predict(knn_impute, chem)

impute_missing <-
  impute_chem |>
  summarise(across(everything(), ~ sum(is.na(.x)))) |>
  pivot_longer(cols = everything()) |>
  filter(value != 0) |>
  nrow()

print(paste0("Missing Values: ", impute_missing > 0))
## [1] "Missing Values: FALSE"
  3. Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

We will filter out predictors with low frequencies using the nearZeroVar function, then tune a lasso regression model to find the best fit. The results show a best tune of \(\lambda = 0.1\), with a resampled \(R^2 = 0.6168\).

nzv_predictors <- nearZeroVar(impute_chem |> select(-Yield),
                              names = TRUE, saveMetrics = FALSE)

filtered_chem <-
  as_tibble(impute_chem) |>
  select(-all_of(nzv_predictors)) |>
  clean_names()

kbl(filtered_chem[1:5, 1:10],
    caption = "Filtered Chemical Manufacturing Process") |>
  kable_classic(full_width = F, html_font = "Cambria") |>
  footnote(general_title = "Dimensions: ",
          TeX(paste0(nrow(filtered_chem), " x ", ncol(filtered_chem))))
Filtered Chemical Manufacturing Process

| yield | biological_material01 | biological_material02 | biological_material03 | biological_material04 | biological_material05 | biological_material06 | biological_material08 | biological_material09 | biological_material10 |
|-------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|
| -1.1792673 | -0.2261036 | -1.514098 | -2.683036 | 0.2201765 | 0.4941942 | -1.382888 | -1.233131 | -3.3962895 | 1.1005296 |
| 1.2263678 | 2.2391498 | 1.308996 | -0.056235 | 1.2964386 | 0.4128555 | 1.129077 | 2.282619 | -0.7227225 | 1.1005296 |
| 1.0042258 | 2.2391498 | 1.308996 | -0.056235 | 1.2964386 | 0.4128555 | 1.129077 | 2.282619 | -0.7227225 | 1.1005296 |
| 0.6737219 | 2.2391498 | 1.308996 | -0.056235 | 1.2964386 | 0.4128555 | 1.129077 | 2.282619 | -0.7227225 | 1.1005296 |
| 1.2534583 | 1.4827653 | 1.893939 | 1.135948 | 0.9414412 | -0.3734185 | 1.534835 | 1.071310 | -0.1205678 | 0.4162193 |

Dimensions: 176 x 57
rows <- sample(nrow(filtered_chem))

filtered_chem <- filtered_chem[rows, ]

sample <- 
  sample(c(TRUE, FALSE), nrow(filtered_chem),
         replace=TRUE, prob=c(0.7,0.3))

train_df <- filtered_chem[sample, ]
train_x <- train_df |>
    select(-yield)
train_y <- train_df$yield
train_y <- as.numeric(train_y)

test_df <- filtered_chem[!sample, ]
test_x <- test_df |>
    select(-yield)
test_y <- test_df$yield
test_y <- as.numeric(test_y)
lassoGrid <- expand.grid(alpha = 1, lambda = seq(0, 1, 0.10))

lasso <- train(train_x, train_y, method = "glmnet",
                     tuneGrid = lassoGrid, trControl = ctrl,
                     preProc = c("center","scale"))
lasso
## glmnet 
## 
## 130 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 118, 118, 118, 117, 117, 116, ... 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE       Rsquared   MAE      
##   0.0     2.1679137  0.4574132  1.0827952
##   0.1     0.6507601  0.6167738  0.5409456
##   0.2     0.6982033  0.6032534  0.5690600
##   0.3     0.7641762  0.5914106  0.6199460
##   0.4     0.8455375  0.5775348  0.6868610
##   0.5     0.9387491  0.4949481  0.7628042
##   0.6     0.9987038  0.2257833  0.8134976
##   0.7     0.9995421        NaN  0.8139303
##   0.8     0.9995421        NaN  0.8139303
##   0.9     0.9995421        NaN  0.8139303
##   1.0     0.9995421        NaN  0.8139303
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.1.
plot(lasso)

results <-
  lasso$results |>
  filter(round(lambda, 1) == 0.1)

kbl(results,
    caption = "Lasso Model") |>
  kable_classic(full_width = F, html_font = "Cambria")
Lasso Model

| alpha | lambda | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|-------|--------|------|----------|-----|--------|------------|-------|
| 1 | 0.1 | 0.6507601 | 0.6167738 | 0.5409456 | 0.1550173 | 0.1522287 | 0.1182374 |
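One appeal of the lasso is sparsity. A quick sketch (my addition) of how many predictors keep nonzero coefficients at the chosen penalty, using the glmnet object caret stores in finalModel:

# Count nonzero coefficients at the selected lambda (excluding the intercept)
lasso_coefs <- coef(lasso$finalModel, s = lasso$bestTune$lambda)
sum(lasso_coefs[-1, 1] != 0)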
  4. Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

We get an \(R^2 = 0.3344\) and an \(RMSE = 0.8368\) on the test set. This is noticeably worse than the resampled training performance of \(R^2 = 0.6168\) and \(RMSE = 0.6508\), suggesting the model generalizes less well than the resampling estimate implied.

lasso_pred <- predict(lasso, test_x)

# Model performance metrics
results <- data.frame(RMSE = caret::RMSE(lasso_pred, test_y),
                      Rsquared = caret::R2(lasso_pred, test_y))
kbl(results, caption = "Lasso Predictions") |>
  kable_classic(full_width = F, html_font = "Cambria")
Lasso Predictions

| RMSE | Rsquared |
|------|----------|
| 0.8367912 | 0.334423 |
  5. Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

We can see that process predictors clearly dominate the top 15 list; the tally sketch after the table confirms this.

top_pred <- 
  varImp(lasso)$importance |>
  arrange(desc(Overall))

kbl(head(top_pred, 15),
    caption = "Top 15 Predictors") |>
  kable_classic(full_width = F, html_font = "Cambria")
Top 15 Predictors

| predictor | Overall |
|-----------|---------|
| manufacturing_process32 | 100.000000 |
| manufacturing_process09 | 89.943827 |
| manufacturing_process36 | 43.299434 |
| manufacturing_process17 | 23.714786 |
| manufacturing_process37 | 18.876765 |
| manufacturing_process31 | 15.202819 |
| manufacturing_process13 | 12.547803 |
| manufacturing_process39 | 7.173467 |
| manufacturing_process45 | 6.211404 |
| manufacturing_process34 | 5.600002 |
| biological_material01 | 0.000000 |
| biological_material02 | 0.000000 |
| biological_material03 | 0.000000 |
| biological_material04 | 0.000000 |
| biological_material05 | 0.000000 |
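A quick tally by predictor type (my addition), counting how many predictors of each kind carry nonzero importance:

# Tally nonzero-importance predictors by type
top_pred |>
  rownames_to_column("predictor") |>
  filter(Overall > 0) |>
  mutate(type = if_else(str_starts(predictor, "manufacturing"),
                        "process", "biological")) |>
  count(type)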
  6. Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?

The correlation plot below shows the strength of the relationships between yield and the top predictors. manufacturing_process32, the most important predictor in our model, has a moderately strong correlation of 0.61 with yield. If we can manage these processes better, along with the other processes they are strongly correlated with, we could potentially improve future yields. Some A/B-style experimentation could help here: holding known factors fixed while varying one process at a time would show which changes actually move the yield.

# Top 9 predictors by importance, plus the response
filtered_pred <-
  head(top_pred, 9) |>
  rownames_to_column("predictor")

pred_corr <-
  filtered_chem |>
  select(yield, all_of(filtered_pred$predictor))

q <- cor(pred_corr)

ggcorrplot(q, type = "upper", outline.color = "white",
           ggtheme = theme_classic,
           colors = c("#F28E2B", "white", "#4E79A7"),
           lab = TRUE, show.legend = F, tl.cex = 5, lab_size = 3)
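To read the same information numerically, a short sketch (my addition) sorting the top predictors' correlations with yield:

# Correlations of the top predictors with yield, strongest first
sort(q["yield", -1], decreasing = TRUE)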