# Load libraries
library(caret)
library(elasticnet)
library(MASS)
library(lars)
library(pls)
library(AppliedPredictiveModeling)
library(RANN)
library(corrplot)
library(car)

Exercise 6.2

Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:



Load Data

Start R and use these commands to load the data. The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.

# Load data
data(permeability)

# View data
head(fingerprints[, 1:10])
##   X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1  0  0  0  0  0  1  1  1  0   0
## 2  0  0  0  0  0  0  1  1  0   0
## 3  0  0  0  0  0  1  1  1  0   0
## 4  0  0  0  0  0  0  1  1  0   0
## 5  0  0  0  0  0  0  1  1  0   0
## 6  0  0  0  0  0  0  1  1  0   0
head(permeability)
##   permeability
## 1       12.520
## 2        1.120
## 3       19.405
## 4        1.730
## 5        1.680
## 6        0.510



Remove Zero Variance Predictors

The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

After filtering with the nearZeroVar function, 388 of the original 1,107 predictors remain for modeling.

# Find near zero variance predictors
nzv <- nearZeroVar(fingerprints)

# Remove near zero variance predictors
filtered_fingerprints <- fingerprints[, -nzv]

# Number of remaining predictors
ncol(filtered_fingerprints)
## [1] 388



Tune PLS Model

Split the data into a training and a test set, preprocess the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R²?

For the PLS model, 6 latent variables are optimal. The best resampled R² on the training set is 0.534, which makes this a mediocre model.

# Create 80/20 training - test split
set.seed(123)
trainIndex <- createDataPartition(permeability, p = .8, list = FALSE)
x_train <- filtered_fingerprints[trainIndex, ]
x_test <- filtered_fingerprints[-trainIndex, ]
y_train <- permeability[trainIndex]
y_test <- permeability[-trainIndex]

# Set up 10 fold cross-validation
cross_val_10 <- trainControl(method = "cv", 
                   number = 10)

# Train PLS model with preprocessing and tuning
pls_model <- train(x = x_train,
                 y = y_train,
                 method = "pls",
                 trControl = cross_val_10,
                 preProcess = c("center", "scale"),
                 tuneLength = 20)

# Optimal number of components
pls_model$bestTune
##   ncomp
## 6     6
# RSquared for training set
max(pls_model$results$Rsquared)
## [1] 0.5335956
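
The resampling profile across the candidate component counts can also be visualized with caret's plot method for train objects (a quick sketch; plot output not shown here):

# Plot resampled R-squared versus number of PLS components
plot(pls_model, metric = "Rsquared")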



Predict PLS model

Predict the response for the test set. What is the test set estimate of R²?

On the test set, the predictions yield an R² of 0.324. This is poor performance and falls well short of the resampled R² on the training data.

# Make prediction on test set
test_pred <- predict(pls_model, x_test)


# R-squared for prediction test set
postResample(pred = test_pred, obs = y_test)[2]
##  Rsquared 
## 0.3244542
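
A quick observed-versus-predicted plot (a minimal sketch using base R graphics; plot output not shown here) makes the weak test-set fit visible:

# Observed vs. predicted permeability on the test set
plot(y_test, test_pred,
     xlab = "Observed permeability",
     ylab = "Predicted permeability")
abline(0, 1, col = "red")   # 45-degree reference line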



Other model performance

Try building other models discussed in this chapter. Do any have better predictive performance?

  • Ridge: With lambda = 0.1, the model achieved a test-set R² of 0.40, giving moderate predictive power while applying fairly mild shrinkage to the coefficients.
  • LASSO: With fraction = 0.1 (the coefficient vector's L1 norm constrained to 10% of that of the full least-squares solution), the model performed poorly, with a resampled training R² of 0.22 and a test-set R² of only 0.02.
  • Elastic Net: With fraction = 0.3 and lambda = 0.089, the model achieved a test-set R² of 0.40, retaining enough variables and balancing coefficient shrinkage similarly to Ridge to give moderate predictive power.
  • PCR: With 11 principal components, the model reduced dimensionality but performed poorly overall, with a test-set R² of 0.29.

Despite trying different regularization approaches (lambda = 0.1 for Ridge, fraction = 0.1 for LASSO, mixed parameters for Elastic Net) and dimension reduction (11 components for PCR), none of the models achieved strong predictive performance. A side-by-side comparison of the test-set results is sketched after the code below.

set.seed(123)

# Ridge model
ridge_model <- train(x = x_train,
                   y = y_train,
                   method = "ridge",
                   trControl = cross_val_10,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)

# Ridge results
print("Ridge Results:")
## [1] "Ridge Results:"
# Optimal lambda
ridge_model$bestTune
##    lambda
## 20    0.1
# Best Train R2
max(ridge_model$results$Rsquared)
## [1] 0.5449848
ridge_pred <- predict(ridge_model, x_test)

# Test R2
print("Test R2:")
## [1] "Test R2:"
postResample(pred = ridge_pred, obs = y_test)[2]  
##  Rsquared 
## 0.4004427
cat("\n\n")
# Lasso model
lasso_model <- train(x = x_train,
                   y = y_train,
                   method = "lasso",
                   trControl = cross_val_10,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)

# Lasso results
print("Lasso Results:")
## [1] "Lasso Results:"
# Optimal fraction
lasso_model$bestTune
##   fraction
## 1      0.1
# Best Train R2
max(lasso_model$results$Rsquared)  
## [1] 0.219283
lasso_pred <- predict(lasso_model, x_test)

# Test R2
print("Test R2:")
## [1] "Test R2:"
postResample(pred = lasso_pred, obs = y_test)[2]
##   Rsquared 
## 0.01791884
cat("\n\n")
# Elastic Net model
enet_grid <- expand.grid(lambda = seq(0, 0.1, length = 10),
                       fraction = seq(0.1, 1, length = 10))

enet_model <- train(x = x_train,
                  y = y_train,
                  method = "enet",
                  trControl = cross_val_10,
                  tuneGrid = enet_grid,
                  preProcess = c("center", "scale"))

# Elastic Net results
print("Elastic Net Results:")
## [1] "Elastic Net Results:"
# Optimal lambda and fraction
enet_model$bestTune
##    fraction     lambda
## 83      0.3 0.08888889
# Best R2
max(enet_model$results$Rsquared)  
## [1] 0.5415084
enet_pred <- predict(enet_model, x_test)

# Test R2
print("Test R2:")
## [1] "Test R2:"
postResample(pred = enet_pred, obs = y_test)[2]  # Test R2
##  Rsquared 
## 0.4014303
cat("\n\n")
# PCR model
pcr_model <- train(x = x_train,
                   y = y_train,
                   method = "pcr",
                   trControl = cross_val_10,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)

# PCR results
print("PCR Results:")
## [1] "PCR Results:"
# Optimal number of components
pcr_model$bestTune
##    ncomp
## 11    11
# Best Train R2
max(pcr_model$results$Rsquared)  
## [1] 0.5109579
pcr_pred <- predict(pcr_model, x_test)

# Test R2
print("Test R2:")
## [1] "Test R2:"
postResample(pred = pcr_pred, obs = y_test)[2]
##  Rsquared 
## 0.2922108
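
For convenience, the test-set R² values of the models above can be gathered into a single sorted vector for side-by-side comparison (a minimal sketch assuming the prediction objects created above are still in the workspace; output not shown here):

# Gather test-set R2 for each model fit above
test_r2 <- c(
  PLS        = postResample(test_pred,  y_test)[["Rsquared"]],
  Ridge      = postResample(ridge_pred, y_test)[["Rsquared"]],
  LASSO      = postResample(lasso_pred, y_test)[["Rsquared"]],
  ElasticNet = postResample(enet_pred,  y_test)[["Rsquared"]],
  PCR        = postResample(pcr_pred,   y_test)[["Rsquared"]]
)
round(sort(test_r2, decreasing = TRUE), 3)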



Recommendation

Would you recommend any of your models to replace the permeability laboratory experiment?

All of the models had mediocre R² values on the training set and poor performance on the test data, so I would not recommend any of them to replace the permeability laboratory experiment.



Exercise 6.3

A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1 % will boost revenue by approximately one hundred thousand dollars per batch:



Load data

Start R and use these commands to load the data. The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.

data("ChemicalManufacturingProcess")



Impute Missing Predictors

A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values.

We use caret's KNN imputation method (preProcess with method = "knnImpute") to fill in the missing predictor values. The FALSE output from anyNA() below confirms that no missing values remain after imputation.

# Separate predictors and outcome variables
predictors <- ChemicalManufacturingProcess[, !names(ChemicalManufacturingProcess) %in% "Yield"]
outcome <- ChemicalManufacturingProcess$Yield

# Create a KNN imputation model using caret's preProcess function
imputation_model <- preProcess(predictors, method = "knnImpute")

# Apply the imputation model to fill in missing values
predictors_imputed <- predict(imputation_model, newdata = predictors)

# Verify imputation; returns FALSE if there are no remaining NA values
anyNA(predictors_imputed)
## [1] FALSE
# Recombine the imputed predictor variables with the original outcome
data_imputed <- cbind(predictors_imputed, Yield = outcome)
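
To quantify the "small percentage" of missing cells, the original (unimputed) predictor matrix can be checked directly (a short sketch; output not shown here):

# Count and percentage of missing cells before imputation
sum(is.na(predictors))
round(100 * mean(is.na(predictors)), 2)   # percent of all predictor cells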



Tune PLS model

Split the data into a training and a test set, preprocess the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?

The model achieved a moderate resampled R² of 0.598. The optimal tune of ncomp = 3 indicates that the final PLS model uses 3 latent variables to capture the underlying patterns.

set.seed(123)

# Split data into training and test sets
trainIndex <- createDataPartition(data_imputed$Yield, p = 0.8, list = FALSE)
train_data <- data_imputed[trainIndex, ]
test_data <- data_imputed[-trainIndex, ]

# Extract x and y for training and testing
x_train <- train_data[, -ncol(train_data)]
y_train <- train_data$Yield
x_test <- test_data[, -ncol(test_data)]
y_test <- test_data$Yield

# Set up cross validation for model training
cross_val_10 <- trainControl(method = "cv", number = 10)

# PLS model
pls_model <- train(x = x_train,
                   y = y_train,
                   method = "pls",
                   trControl = cross_val_10,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)

# PLS results
print("PLS Results:")
## [1] "PLS Results:"
# Optimal number of components
print(pls_model$bestTune)
##   ncomp
## 3     3
# Best resampled R2
print("Best resampled R2:")
## [1] "Best resampled R2:"
print(max(pls_model$results$Rsquared))  
## [1] 0.5979192



Predict PLS model

Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?

The model still performed moderately on the test set (R² = 0.481), although this falls short of the resampled training estimate of 0.598.

# Predict on test data set
pls_pred <- predict(pls_model, x_test)

# Test R2
print("Test R2:")
## [1] "Test R2:"
print(postResample(pred = pls_pred, obs = y_test)[2])  
##  Rsquared 
## 0.4805676



Identify Important PLS components

Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?

The top 20 predictors are shown below. Manufacturing process predictors hold the highest positions, but the remainder of the list is a mix of biological and process predictors. The final PLS model uses only 3 latent variables:

  • Component 1 explains 18.00% of the variance
  • Component 2 explains 6.95% of the variance
  • Component 3 explains 6.17% of the variance
# Extract loadings of PLS model
loadings <- loadings(pls_model$finalModel)
print(loadings)
## 
## Loadings:
##                        Comp 1 Comp 2 Comp 3
## BiologicalMaterial01    0.255 -0.197       
## BiologicalMaterial02    0.299 -0.137       
## BiologicalMaterial03    0.242              
## BiologicalMaterial04    0.252 -0.173       
## BiologicalMaterial05    0.122 -0.129       
## BiologicalMaterial06    0.292 -0.117       
## BiologicalMaterial07                 -0.250
## BiologicalMaterial08    0.276 -0.173 -0.109
## BiologicalMaterial09                 -0.157
## BiologicalMaterial10    0.195 -0.219       
## BiologicalMaterial11    0.255 -0.151       
## BiologicalMaterial12    0.256 -0.141 -0.126
## ManufacturingProcess01                     
## ManufacturingProcess02 -0.208  0.200       
## ManufacturingProcess03                0.136
## ManufacturingProcess04 -0.181              
## ManufacturingProcess05  0.104 -0.111       
## ManufacturingProcess06  0.149  0.148       
## ManufacturingProcess07                     
## ManufacturingProcess08                     
## ManufacturingProcess09  0.159  0.291 -0.226
## ManufacturingProcess10         0.137 -0.316
## ManufacturingProcess11  0.132  0.224 -0.201
## ManufacturingProcess12  0.108  0.197 -0.212
## ManufacturingProcess13 -0.125 -0.361  0.240
## ManufacturingProcess14        -0.174  0.362
## ManufacturingProcess15  0.146 -0.123  0.315
## ManufacturingProcess16                     
## ManufacturingProcess17        -0.409       
## ManufacturingProcess18                0.186
## ManufacturingProcess19  0.136 -0.260  0.190
## ManufacturingProcess20                0.185
## ManufacturingProcess21        -0.161 -0.195
## ManufacturingProcess22                     
## ManufacturingProcess23                     
## ManufacturingProcess24 -0.106              
## ManufacturingProcess25        -0.133  0.219
## ManufacturingProcess26        -0.120  0.213
## ManufacturingProcess27        -0.130  0.216
## ManufacturingProcess28  0.212 -0.122       
## ManufacturingProcess29  0.109 -0.144  0.216
## ManufacturingProcess30                     
## ManufacturingProcess31        -0.121  0.204
## ManufacturingProcess32  0.243         0.249
## ManufacturingProcess33  0.220         0.220
## ManufacturingProcess34         0.120       
## ManufacturingProcess35                     
## ManufacturingProcess36 -0.224        -0.185
## ManufacturingProcess37        -0.132 -0.104
## ManufacturingProcess38                     
## ManufacturingProcess39         0.130       
## ManufacturingProcess40                     
## ManufacturingProcess41                     
## ManufacturingProcess42         0.119       
## ManufacturingProcess43                     
## ManufacturingProcess44         0.158       
## ManufacturingProcess45         0.127       
## 
##                Comp 1 Comp 2 Comp 3
## SS loadings     1.144  1.219  1.287
## Proportion Var  0.020  0.021  0.023
## Cumulative Var  0.020  0.041  0.064
# Get variable importance scores PLS model
var_importance <- varImp(pls_model)
print(var_importance)
## pls variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess17   89.46
## ManufacturingProcess13   88.72
## ManufacturingProcess09   88.40
## ManufacturingProcess36   84.23
## ManufacturingProcess06   70.26
## ManufacturingProcess33   64.20
## BiologicalMaterial06     63.03
## BiologicalMaterial03     62.32
## BiologicalMaterial02     61.55
## BiologicalMaterial08     61.12
## ManufacturingProcess11   60.01
## BiologicalMaterial11     55.96
## BiologicalMaterial12     55.90
## BiologicalMaterial01     53.84
## BiologicalMaterial04     52.30
## ManufacturingProcess28   48.86
## ManufacturingProcess12   47.58
## ManufacturingProcess37   46.57
## BiologicalMaterial10     43.70
# Get explained variance for each component PLS model
explained_var <- explvar(pls_model$finalModel)
print(explained_var)
##    Comp 1    Comp 2    Comp 3 
## 18.004878  6.949832  6.174550
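
To make the process-versus-biological split among the top 20 predictors explicit, the importance scores can be tabulated (a small sketch using the var_importance object above; output not shown here):

# Classify the 20 highest-ranked predictors as Process or Biological
imp_scores <- var_importance$importance
top20 <- rownames(imp_scores)[order(imp_scores$Overall, decreasing = TRUE)][1:20]
table(ifelse(grepl("^Manufacturing", top20), "Process", "Biological"))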



Comparison Against Lasso Model

  • By comparison, the lasso model achieved an R² of 0.562 on the test set, explaining about 56% of the variance in the outcome.
  • The variable importance output lists the 20 most influential predictors (out of 57), with ManufacturingProcess32 and BiologicalMaterial06 at the top (importance scores of 100 and 94, respectively).
  • At the chosen fraction of 0.1, many of the remaining coefficients are shrunk toward or to zero (a sketch verifying this follows the code below).
  • The predictive power is moderate and better than our PLS model, but a fair amount of the variance (about 44%) remains unexplained by the lasso model.
# Lasso model
lasso_model <- train(x = x_train,
                   y = y_train,
                   method = "lasso",
                   trControl = cross_val_10,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)

# Lasso results
print("Lasso Results:")
## [1] "Lasso Results:"
# Optimal fraction
print(lasso_model$bestTune)
##   fraction
## 1      0.1
# Best R2
print(max(lasso_model$results$Rsquared))
## [1] 0.6035836
# Test set prediction and R2
lasso_pred <- predict(lasso_model, x_test)

print("Test R2:")
## [1] "Test R2:"
print(postResample(pred = lasso_pred, obs = y_test)[2])
##  Rsquared 
## 0.5619242
# Extract Coefficients of Lasso model
importance <- varImp(lasso_model)

print("Variable Importance from LASSO:")
## [1] "Variable Importance from LASSO:"
print(importance)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## BiologicalMaterial06     94.06
## BiologicalMaterial03     81.27
## ManufacturingProcess13   80.63
## ManufacturingProcess36   78.46
## BiologicalMaterial02     76.04
## ManufacturingProcess17   75.92
## ManufacturingProcess31   75.30
## ManufacturingProcess09   73.04
## BiologicalMaterial12     69.48
## ManufacturingProcess06   66.28
## BiologicalMaterial11     59.72
## ManufacturingProcess33   58.31
## ManufacturingProcess29   54.48
## BiologicalMaterial04     53.93
## ManufacturingProcess11   47.55
## BiologicalMaterial01     45.62
## BiologicalMaterial08     44.93
## BiologicalMaterial09     40.88
## ManufacturingProcess30   40.40
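
To check how many predictors the lasso actually retains at the chosen fraction, the coefficient vector of the final enet fit can be extracted (a sketch using predict() with type = "coefficients" and mode = "fraction" from the elasticnet package; output not shown here):

# Lasso coefficients at the tuned fraction
lasso_coefs <- predict(lasso_model$finalModel,
                       type = "coefficients",
                       s = lasso_model$bestTune$fraction,
                       mode = "fraction")$coefficients

# Number of predictors with non-zero coefficients
sum(lasso_coefs != 0)

# Largest non-zero coefficients in absolute value
head(sort(abs(lasso_coefs[lasso_coefs != 0]), decreasing = TRUE), 10)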



Top Predictor Correlations

Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?

The correlation analysis reveals strong relationships among several biological materials (the strongest being BiologicalMaterial02 and BiologicalMaterial06 at 0.954) and between certain manufacturing processes (such as Process32 and Process33 at 0.856). These strong correlations suggest that the variables either influence each other or respond to similar underlying factors in the manufacturing process. This information could be valuable for process optimization and quality control, since an issue or change in one variable is likely to be reflected in its correlated partner. The correlations of each top predictor with Yield itself are sketched after the correlation output below.

# Define the important variables from lasso results
important_vars <- c("ManufacturingProcess32", "BiologicalMaterial06",
                   "BiologicalMaterial03", "ManufacturingProcess13",
                   "ManufacturingProcess36", "BiologicalMaterial02",
                   "ManufacturingProcess17", "ManufacturingProcess31",
                   "ManufacturingProcess09", "BiologicalMaterial12",
                   "ManufacturingProcess06", "BiologicalMaterial11",
                   "ManufacturingProcess33", "ManufacturingProcess29",
                   "BiologicalMaterial04", "ManufacturingProcess11",
                   "BiologicalMaterial01", "BiologicalMaterial08",
                   "BiologicalMaterial09", "ManufacturingProcess30")

# Create subset with important variables
important_data <- data_imputed[, important_vars]

# Calculate correlation matrix
cor_matrix <- cor(important_data)

# Create correlation plot
corrplot(cor_matrix, 
         method = "color",
         type = "upper",
         addCoef.col = "black",
         number.cex = 0.7,
         tl.cex = 0.7,
         diag = FALSE)

# Find strongest correlations
cor_df <- as.data.frame(as.table(cor_matrix))
names(cor_df) <- c("Var1", "Var2", "Correlation")
cor_df <- subset(cor_df, as.character(Var1) < as.character(Var2))
cor_df <- cor_df[order(abs(cor_df$Correlation), decreasing = TRUE), ]

# Print correlations above 0.8
print("Strong correlations > .8:")
## [1] "Strong correlations > .8:"
print(subset(cor_df, abs(Correlation) > 0.8))
##                       Var1                   Var2 Correlation
## 26    BiologicalMaterial02   BiologicalMaterial06   0.9543113
## 192   BiologicalMaterial11   BiologicalMaterial12   0.9037209
## 23    BiologicalMaterial03   BiologicalMaterial06   0.8723637
## 46    BiologicalMaterial02   BiologicalMaterial03   0.8607901
## 241 ManufacturingProcess32 ManufacturingProcess33   0.8555693
## 154 ManufacturingProcess29 ManufacturingProcess31   0.8430391
## 297   BiologicalMaterial01   BiologicalMaterial04   0.8196394
## 182   BiologicalMaterial06   BiologicalMaterial12   0.8128540
## 238   BiologicalMaterial08   BiologicalMaterial11   0.8003035
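
Since the exercise asks about the relationship between each top predictor and the response, a simple sketch (assuming the important_data and data_imputed objects above; output not shown here) ranks the top predictors by the strength of their correlation with Yield:

# Correlation of each top predictor with the response
cor_with_yield <- cor(important_data, data_imputed$Yield)
cor_with_yield <- cor_with_yield[order(abs(cor_with_yield), decreasing = TRUE), , drop = FALSE]
colnames(cor_with_yield) <- "Correlation_with_Yield"
round(cor_with_yield, 3)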