# libraries
library(AppliedPredictiveModeling)
library(caret)
library(pls)
library(RANN)
library(glmnet)
library(ggplot2)
Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:
The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains the permeability response.
The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse, meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?
Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?
Predict the response for the test set. What is the test set estimate of R2?
Try building other models discussed in this chapter. Do any have better predictive performance?
Would you recommend any of your models to replace the permeability laboratory experiment?
# loading the data
data(permeability)
dim(fingerprints)
## [1] 165 1107
length(permeability)
## [1] 165
Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?
nzv <- nearZeroVar(fingerprints)
filtered_fingerprints <- fingerprints[, -nzv]
n_remaining <- ncol(filtered_fingerprints)
n_remaining
## [1] 388
#summary(filtered_fingerprints)
388 predictors remain.
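As an optional check (my own addition, not required by the exercise), the column means of the retained binary predictors show how sparse the filtered fingerprints still are:
# fraction of compounds containing each retained substructure
summary(colMeans(filtered_fingerprints))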
Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?
set.seed(123)
# train/test split
trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
X_train <- filtered_fingerprints[trainIndex, ]
X_test <- filtered_fingerprints[-trainIndex, ]
y_train <- permeability[trainIndex]
y_test <- permeability[-trainIndex]
# training control
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
# PLS model
pls_fit <- train(
  x = X_train, y = y_train,
  method = "pls",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneLength = 20,
  metric = "Rsquared")
# resampled R2 vs number of components
plot(pls_fit)
# Best number of components
best_ncomp <- pls_fit$bestTune$ncomp
best_ncomp
## [1] 6
# Best cross-validated R2
best_cv_R2 <- max(pls_fit$results$Rsquared)
best_cv_R2
## [1] 0.5307411
The optimal model used 6 latent variables, with a repeated-CV R2 of 0.531. The plot shows performance falling off steadily beyond roughly 6–8 components, consistent with overfitting as more latent variables are added.
PLS explains only moderate variance (about 53% in cross-validation); it may not capture non-linear relationships between the fingerprints and permeability, a known limitation of linear methods such as PLS.
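The same pattern can be read directly from caret's stored resampling results (a quick look at pls_fit$results; the column names are those produced by train()):
# resampled RMSE and R2 for the first ten numbers of components
subset(pls_fit$results, ncomp <= 10)[, c("ncomp", "RMSE", "Rsquared")]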
# Predict on test set
pls_pred <- predict(pls_fit, newdata = X_test)
# test set R2
pls_R2 <- cor(pls_pred, y_test)^2
pls_RMSE <- sqrt(mean((pls_pred - y_test)^2))
pls_R2
## [1] 0.3244542
pls_RMSE
## [1] 12.34869
Test set R2 = 0.324 and RMSE = 12.35. The drop from 0.53 (resampled) to 0.32 (test) points to some optimism in the cross-validation estimate and to limited generalizability of the PLS model on unseen data.
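For reference, caret's postResample() computes the same test-set metrics (RMSE, R2, MAE) in one call, keeping the definitions consistent with the resampling output; a minimal sketch:
postResample(pred = pls_pred, obs = y_test)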
# Lasso
set.seed(123)
lasso_fit <- train(
  x = X_train, y = y_train,
  method = "glmnet",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = expand.grid(
    alpha = 1,
    lambda = 10^seq(-4, 1, length = 100)))
lasso_pred <- predict(lasso_fit, newdata = X_test)
lasso_R2 <- cor(lasso_pred, y_test)^2
# Ridge
set.seed(123)
ridge_fit <- train(
  x = X_train, y = y_train,
  method = "glmnet",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = 10^seq(-4, 1, length = 100)))
ridge_pred <- predict(ridge_fit, newdata = X_test)
ridge_R2 <- cor(ridge_pred, y_test)^2
# Elastic Net
set.seed(123)
enet_fit <- train(
  x = X_train, y = y_train,
  method = "glmnet",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneLength = 10)
enet_pred <- predict(enet_fit, newdata = X_test)
enet_R2 <- cor(enet_pred, y_test)^2
# I'll add a few more out of curiosity:
# Random Forest
set.seed(123)
rf_fit <- train(
  x = X_train, y = y_train,
  method = "rf",
  trControl = ctrl,
  importance = TRUE)
rf_pred <- predict(rf_fit, X_test)
rf_R2 <- cor(rf_pred, y_test)^2
# SVM
set.seed(123)
svm_fit <- train(
  x = X_train, y = y_train,
  method = "svmRadial",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneLength = 10)
svm_pred <- predict(svm_fit, X_test)
svm_R2 <- cor(svm_pred, y_test)^2
# Compare models
model_comparison <- data.frame(
  Model = c("PLS", "Random Forest", "SVM", "Lasso", "Ridge", "Elastic Net"),
  R2_Test = c(pls_R2, rf_R2, svm_R2, lasso_R2, ridge_R2, enet_R2))
#model_comparison[order(-model_comparison$R2_Test), ]
model_rmse <- data.frame(
  Model = c("PLS", "Random Forest", "SVM", "Lasso", "Ridge", "Elastic Net"),
  RMSE_Test = c(
    sqrt(mean((pls_pred - y_test)^2)),
    sqrt(mean((rf_pred - y_test)^2)),
    sqrt(mean((svm_pred - y_test)^2)),
    sqrt(mean((lasso_pred - y_test)^2)),
    sqrt(mean((ridge_pred - y_test)^2)),
    sqrt(mean((enet_pred - y_test)^2))))
#model_rmse[order(model_rmse$RMSE_Test), ]
model_summary <- merge(model_comparison, model_rmse, by = "Model")
model_summary <- model_summary[order(-model_summary$R2_Test), ]
model_summary
## Model R2_Test RMSE_Test
## 6 SVM 0.5038552 10.29432
## 4 Random Forest 0.4929775 10.08085
## 2 Lasso 0.3888176 10.99783
## 1 Elastic Net 0.3655409 11.05010
## 5 Ridge 0.3266984 11.02039
## 3 PLS 0.3244542 12.34869
Notes:
Based on the models' performance, the SVM (test R2 ≈ 0.50, RMSE ≈ 10.3) and random forest (R2 ≈ 0.49, RMSE ≈ 10.1) clearly outperform the linear models, with PLS the weakest (R2 ≈ 0.32). Even the best model explains only about half of the variance in permeability on the test set, so I would not recommend replacing the laboratory permeability experiment outright; at most these models could serve as an inexpensive screen to prioritize which compounds are sent to the assay.
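To visualize what a test-set R2 of roughly 0.50 means in practice, here is a quick observed-versus-predicted plot for the model with the highest test-set R2, the SVM (a sketch using ggplot2; the plotting choices are my own):
svm_plot_df <- data.frame(Predicted = svm_pred, Observed = y_test)
ggplot(svm_plot_df, aes(x = Predicted, y = Observed)) +
  geom_point(alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +  # perfect-prediction line
  labs(title = "SVM: Observed vs Predicted Permeability (Test Set)",
       x = "Predicted permeability", y = "Observed permeability") +
  theme_minimal()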
===============================================================================
6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:
Start R and use these commands to load the data: library(AppliedPredictiveModeling) data(chemicalManufacturingProcess) The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
A small percentage of cells in the predictor set contain missing values. Use an imputation function to fill in these missing values (e.g., see Sect. 3.8).
Split the data into a training and a test set, pre-process the data, and tune a model of your choice from this chapter. What is the optimal value of the performance metric?
Predict the response for the test set. What is the value of the performance metric and how does this compare with the resampled performance metric on the training set?
Which predictors are most important in the model you have trained? Do either the biological or process predictors dominate the list?
Explore the relationships between each of the top predictors and the response. How could this information be helpful in improving yield in future runs of the manufacturing process?
# the data set is stored as ChemicalManufacturingProcess (capital C), so the exercise's data(chemicalManufacturingProcess) call does not load it; load the .RData file directly instead
load(system.file("data", "chemicalManufacturingProcess.RData", package = "AppliedPredictiveModeling"))
# str(ChemicalManufacturingProcess)
# ls()
processPredictors <- ChemicalManufacturingProcess[, -1]
yield <- ChemicalManufacturingProcess$Yield
dim(processPredictors)
## [1] 176 57
length(yield)
## [1] 176
sum(is.na(processPredictors))
## [1] 106
preProc <- preProcess(processPredictors, method = c("knnImpute", "center", "scale"))
# Impute and scale
processed_data <- predict(preProc, processPredictors)
# Check
sum(is.na(processed_data))
## [1] 0
There were 106 missing values; they were filled in by k-nearest-neighbors imputation (knnImpute), with centering and scaling applied in the same preprocessing step.
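As an optional check (my own addition), the missing values can be located by predictor before imputing, to confirm that no single column is mostly missing:
# number of missing values per predictor, largest first
miss_counts <- colSums(is.na(processPredictors))
head(sort(miss_counts[miss_counts > 0], decreasing = TRUE), 10)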
set.seed(123)
# Train-test split
trainIndex <- createDataPartition(yield, p = 0.8, list = FALSE)
X_train <- processed_data[trainIndex, ]
X_test <- processed_data[-trainIndex, ]
y_train <- yield[trainIndex]
y_test <- yield[-trainIndex]
# Training control
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
# Train Random Forest
rf_fit <- train(
  x = X_train, y = y_train,
  method = "rf",
  trControl = ctrl,
  importance = TRUE)
rf_fit$results
## mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 2 1.236134 0.6302038 0.9809238 0.2223745 0.1282774 0.1564944
## 2 29 1.134331 0.6465247 0.8748643 0.2454849 0.1375425 0.1698118
## 3 57 1.148726 0.6296347 0.8641660 0.2654705 0.1459044 0.1821881
best_rmse <- rf_fit$results[rf_fit$results$mtry == rf_fit$bestTune$mtry, "RMSE"]
best_rmse
## [1] 1.134331
# Train Lasso
lasso_fit <- train(
  x = X_train, y = y_train,
  method = "glmnet",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = expand.grid(
    alpha = 1,
    lambda = 10^seq(-4, 1, length = 100)))
lasso_fit$bestTune
## alpha lambda
## 60 1 0.09545485
min(lasso_fit$results$RMSE)
## [1] 1.144071
# Train an Elastic net
enet_fit <- train(
  x = X_train, y = y_train,
  method = "glmnet",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneLength = 10)
enet_fit$bestTune
## alpha lambda
## 97 1 0.07886757
min(enet_fit$results$RMSE)
## [1] 1.1619
# train a ridge regression
ridge_fit <- train(
  x = X_train, y = y_train,
  method = "glmnet",
  preProc = c("center", "scale"),
  trControl = ctrl,
  tuneGrid = expand.grid(
    alpha = 0,
    lambda = 10^seq(-4, 1, length = 100)))
ridge_fit$bestTune
## alpha lambda
## 88 0 2.477076
min(ridge_fit$results$RMSE)
## [1] 1.337693
# Results
model_results <- data.frame(
  Model = c("Random Forest", "Lasso", "Elastic Net", "Ridge"),
  RMSE = c(
    min(rf_fit$results$RMSE),
    min(lasso_fit$results$RMSE),
    min(enet_fit$results$RMSE),
    min(ridge_fit$results$RMSE)))
model_results
## Model RMSE
## 1 Random Forest 1.134331
## 2 Lasso 1.144071
## 3 Elastic Net 1.161900
## 4 Ridge 1.337693
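These are point estimates; caret's resamples() can collect the full cross-validation distributions behind them (a sketch: all four fits share the same repeated-CV trainControl, though the fold indices were not re-seeded between fits, so paired comparisons should be read loosely):
resamp <- resamples(list(RF = rf_fit, Lasso = lasso_fit, ENet = enet_fit, Ridge = ridge_fit))
summary(resamp)$statistics$RMSE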
Of the models covered in this chapter, the lasso has the lowest resampled RMSE, so it would be my choice for the test set. The random forest, which is not a chapter model, came in slightly lower still, so I will evaluate both on the test set:
# Random Forest
rf_pred <- predict(rf_fit, newdata = X_test)
rf_test_R2 <- cor(rf_pred, y_test)^2
rf_test_RMSE <- sqrt(mean((rf_pred - y_test)^2))
rf_train_R2 <- getTrainPerf(rf_fit)$TrainRsquared
rf_train_RMSE <- getTrainPerf(rf_fit)$TrainRMSE
# Lasso
lasso_pred <- predict(lasso_fit, newdata = X_test)
lasso_test_R2 <- cor(lasso_pred, y_test)^2
lasso_test_RMSE <- sqrt(mean((lasso_pred - y_test)^2))
lasso_train_R2 <- getTrainPerf(lasso_fit)$TrainRsquared
lasso_train_RMSE <- getTrainPerf(lasso_fit)$TrainRMSE
# Compare
performance_summary <- data.frame(
  Model = c("Random Forest", "Lasso"),
  Train_R2 = c(rf_train_R2, lasso_train_R2),
  Train_RMSE = c(rf_train_RMSE, lasso_train_RMSE),
  Test_R2 = c(rf_test_R2, lasso_test_R2),
  Test_RMSE = c(rf_test_RMSE, lasso_test_RMSE))
performance_summary
## Model Train_R2 Train_RMSE Test_R2 Test_RMSE
## 1 Random Forest 0.6465247 1.134331 0.5401572 1.268733
## 2 Lasso 0.6252550 1.144071 0.5743982 1.206413
Lasso vs Random Forest
On the test set the lasso (R2 = 0.574, RMSE = 1.21) slightly outperforms the random forest (R2 = 0.540, RMSE = 1.27). For both models the test RMSE is somewhat higher, and R2 somewhat lower, than the resampled training estimates (RMSE ≈ 1.13–1.14, R2 ≈ 0.63–0.65), the usual mild optimism of resampling; the gap is small enough that both models appear to generalize reasonably well. I am curious how the two compare on variable importance, so I will keep both for now.
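As a sanity check (my own addition, not asked for in the exercise), both test RMSEs can be compared against a no-information baseline that always predicts the training-set mean yield; the models should sit well below it if they are adding value:
baseline_RMSE <- sqrt(mean((mean(y_train) - y_test)^2))
baseline_RMSE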
# --- RANDOM FOREST TOP PREDICTORS ---
varImp_rf <- varImp(rf_fit, scale = TRUE)
top20_rf <- varImp_rf$importance
top20_rf$Variable <- rownames(top20_rf)
top20_rf$Type <- ifelse(top20_rf$Variable %in% colnames(processPredictors)[1:12], "Biological", "Process")
top20_rf <- top20_rf[order(-top20_rf$Overall), ][1:20, ]
ggplot(top20_rf, aes(x = reorder(Variable, Overall), y = Overall, fill = Type)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 20 Predictors - Random Forest",
       x = "Predictor", y = "Importance (Scaled)",
       fill = "Predictor Type") +
  theme_minimal()
# --- LASSO TOP PREDICTORS ---
best_lambda <- lasso_fit$bestTune$lambda
lasso_coefs <- coef(lasso_fit$finalModel, s = best_lambda)
lasso_coef_df <- as.data.frame(as.matrix(lasso_coefs))
colnames(lasso_coef_df) <- "Coefficient"
lasso_coef_df$Variable <- rownames(lasso_coef_df)
lasso_coef_df <- lasso_coef_df[lasso_coef_df$Variable != "(Intercept)", ]
nonzero_lasso <- subset(lasso_coef_df, Coefficient != 0)
nonzero_lasso$Type <- ifelse(nonzero_lasso$Variable %in% colnames(processPredictors)[1:12], "Biological", "Process")
nonzero_lasso$AbsCoef <- abs(nonzero_lasso$Coefficient)
top20_lasso <- nonzero_lasso[order(-nonzero_lasso$AbsCoef), ][1:20, ]
ggplot(top20_lasso, aes(x = reorder(Variable, AbsCoef), y = AbsCoef, fill = Type)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 20 Predictors - Lasso",
       x = "Predictor", y = "Absolute Coefficient",
       fill = "Predictor Type") +
  theme_minimal()
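To answer whether biological or process predictors dominate the two top-20 lists, a simple tally of the Type column built above is enough (a minimal sketch using the objects already in memory):
# count of Biological vs Process predictors among each model's top 20
table(top20_rf$Type)
table(top20_lasso$Type)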
Notes:
For this part I focus only on the lasso's top predictors; the equivalent random-forest version is left commented out below:
# #Top 4 from Random Forest
# varImp_rf <- varImp(rf_fit, scale = TRUE)
# top_predictors_sorted <- varImp_rf$importance
# top_predictors_sorted$Variable <- rownames(top_predictors_sorted)
# top_predictors_sorted <- top_predictors_sorted[order(-top_predictors_sorted$Overall), ]
# top_predictors_sorted$Type <- ifelse(
# top_predictors_sorted$Variable %in% colnames(processPredictors)[1:12],
# "Biological", "Process")
# top4_rf <- rownames(top_predictors_sorted)[1:4]
#
# for (pred in top4_rf) {
# p <- ggplot(data.frame(x = processed_data[[pred]], y = yield), aes(x = x, y = y)) +
# geom_point(alpha = 0.6) +
# geom_smooth(method = "loess", color = "blue") +
# labs(
# title = paste("Yield vs", pred, "(Random Forest)"),
# x = pred,
# y = "Yield"
# ) +
# theme_minimal()
# print(p)}
# Top 4 from Lasso
nonzero_lasso <- nonzero_lasso[order(-nonzero_lasso$AbsCoef), ]
top4_lasso <- head(nonzero_lasso$Variable, 4)
for (pred in top4_lasso) {
  p <- ggplot(data.frame(x = processed_data[[pred]], y = yield), aes(x = x, y = y)) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "loess", color = "red") +
    labs(
      title = paste("Yield vs", pred, "(Lasso)"),
      x = pred,
      y = "Yield"
    ) +
    theme_minimal()
  print(p)
}
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
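To put a number on the direction of each trend shown above, the simple correlations between the top-four lasso predictors and yield can be computed on the imputed, scaled data (a sketch using objects already defined):
# correlation of each top predictor with yield
cor(processed_data[, top4_lasso], yield)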
Notes:
The loess fits above show the direction and approximate shape of each top predictor's relationship with yield. Because the manufacturing process predictors can be adjusted while the biological predictors cannot, any process variable with a clear positive or negative trend is a candidate control knob: steering it toward the range associated with higher yield in these plots (and confirming the effect with controlled runs) could improve yield in future batches. Trends in the biological predictors, by contrast, are mainly useful for screening the quality of incoming raw material before processing.