library(AppliedPredictiveModeling)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.3
library(pls)
## Warning: package 'pls' was built under R version 4.3.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
library(RColorBrewer)
library(reshape2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
6.2. Developing a model to predict permeability (see Sect. 1.4) could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug: (a) Start R and use these commands to load the data: > library(AppliedPredictiveModeling) > data(permeability) The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains permeability response.
library(AppliedPredictiveModeling)
data(permeability)
nzvFingerprints <- nearZeroVar(fingerprints)
noNzvFingerprints <- fingerprints[, -nzvFingerprints]
num_predictors_remaining <- ncol(noNzvFingerprints)
num_predictors_remaining
## [1] 388
set.seed(1234)
trainingRows <- createDataPartition(permeability, p = 0.75, list = FALSE)
trainFingerprints <- noNzvFingerprints[trainingRows, ]
trainPermeability <- permeability[trainingRows]
testFingerprints <- noNzvFingerprints[-trainingRows, ]
testPermeability <- permeability[-trainingRows]
# Model tuning
ctrl <- trainControl(method = "LGOCV")
plsTune <- train(x = trainFingerprints,
y = log10(trainPermeability),
method = "pls",
tuneGrid = expand.grid(ncomp = 1:15),
trControl = ctrl)
optimal_ncomp <- plsTune$bestTune$ncomp
optimal_R2 <- max(plsTune$results$Rsquared)
list(optimal_ncomp = optimal_ncomp, optimal_R2 = optimal_R2)
## $optimal_ncomp
## [1] 7
##
## $optimal_R2
## [1] 0.47583
test_predictions <- predict(plsTune, newdata = testFingerprints)
test_R2 <- cor(log10(testPermeability), test_predictions)^2
test_R2 # RSquared on the test set
## [1] 0.5725418
# Ridge Regression
ridgeTune <- train(x = trainFingerprints,
y = log10(trainPermeability),
method = "ridge",
trControl = ctrl)
## Warning: model fit failed for Resample04: lambda=0e+00 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Resample05: lambda=0e+00 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Resample07: lambda=0e+00 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Resample13: lambda=0e+00 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Resample22: lambda=0e+00 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
ridge_R2 <- max(ridgeTune$results$Rsquared)
# Elastic Net
enetTune <- train(x = trainFingerprints,
y = log10(trainPermeability),
method = "enet",
trControl = ctrl)
## Warning: model fit failed for Resample07: lambda=0e+00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Resample17: lambda=0e+00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
enet_R2 <- max(enetTune$results$Rsquared)
list(ridge_R2 = ridge_R2, enet_R2 = enet_R2)
## $ridge_R2
## [1] 0.4402529
##
## $enet_R2
## [1] 0.4545963
Based on the RSquared values, the elastic net model performs best. I would use this as a replacement for the permeability laboratory test due to its accuracy in predicting permeability.
6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect. 1.4. In this problem, the objective is to understand the relationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing process. Improving product yield by 1 % will boost revenue by approximately one hundred thousand dollars per batch: (a) Start R and use these commands to load the data: > library(AppliedPredictiveModeling) > data(chemicalManufacturing) The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
library(AppliedPredictiveModeling)
library(RANN)
## Warning: package 'RANN' was built under R version 4.3.3
library(caret)
data(ChemicalManufacturingProcess)
processPredictors <- ChemicalManufacturingProcess[, -1]
yield <- ChemicalManufacturingProcess$Yield
# Check the structure to confirm data is loaded correctly
str(processPredictors)
## 'data.frame': 176 obs. of 57 variables:
## $ BiologicalMaterial01 : num 6.25 8.01 8.01 8.01 7.47 6.12 7.48 6.94 6.94 6.94 ...
## $ BiologicalMaterial02 : num 49.6 61 61 61 63.3 ...
## $ BiologicalMaterial03 : num 57 67.5 67.5 67.5 72.2 ...
## $ BiologicalMaterial04 : num 12.7 14.7 14.7 14.7 14 ...
## $ BiologicalMaterial05 : num 19.5 19.4 19.4 19.4 17.9 ...
## $ BiologicalMaterial06 : num 43.7 53.1 53.1 53.1 54.7 ...
## $ BiologicalMaterial07 : num 100 100 100 100 100 100 100 100 100 100 ...
## $ BiologicalMaterial08 : num 16.7 19 19 19 18.2 ...
## $ BiologicalMaterial09 : num 11.4 12.6 12.6 12.6 12.8 ...
## $ BiologicalMaterial10 : num 3.46 3.46 3.46 3.46 3.05 3.78 3.04 3.85 3.85 3.85 ...
## $ BiologicalMaterial11 : num 138 154 154 154 148 ...
## $ BiologicalMaterial12 : num 18.8 21.1 21.1 21.1 21.1 ...
## $ ManufacturingProcess01: num NA 0 0 0 10.7 12 11.5 12 12 12 ...
## $ ManufacturingProcess02: num NA 0 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess03: num NA NA NA NA NA NA 1.56 1.55 1.56 1.55 ...
## $ ManufacturingProcess04: num NA 917 912 911 918 924 933 929 928 938 ...
## $ ManufacturingProcess05: num NA 1032 1004 1015 1028 ...
## $ ManufacturingProcess06: num NA 210 207 213 206 ...
## $ ManufacturingProcess07: num NA 177 178 177 178 178 177 178 177 177 ...
## $ ManufacturingProcess08: num NA 178 178 177 178 178 178 178 177 177 ...
## $ ManufacturingProcess09: num 43 46.6 45.1 44.9 45 ...
## $ ManufacturingProcess10: num NA NA NA NA NA NA 11.6 10.2 9.7 10.1 ...
## $ ManufacturingProcess11: num NA NA NA NA NA NA 11.5 11.3 11.1 10.2 ...
## $ ManufacturingProcess12: num NA 0 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess13: num 35.5 34 34.8 34.8 34.6 34 32.4 33.6 33.9 34.3 ...
## $ ManufacturingProcess14: num 4898 4869 4878 4897 4992 ...
## $ ManufacturingProcess15: num 6108 6095 6087 6102 6233 ...
## $ ManufacturingProcess16: num 4682 4617 4617 4635 4733 ...
## $ ManufacturingProcess17: num 35.5 34 34.8 34.8 33.9 33.4 33.8 33.6 33.9 35.3 ...
## $ ManufacturingProcess18: num 4865 4867 4877 4872 4886 ...
## $ ManufacturingProcess19: num 6049 6097 6078 6073 6102 ...
## $ ManufacturingProcess20: num 4665 4621 4621 4611 4659 ...
## $ ManufacturingProcess21: num 0 0 0 0 -0.7 -0.6 1.4 0 0 1 ...
## $ ManufacturingProcess22: num NA 3 4 5 8 9 1 2 3 4 ...
## $ ManufacturingProcess23: num NA 0 1 2 4 1 1 2 3 1 ...
## $ ManufacturingProcess24: num NA 3 4 5 18 1 1 2 3 4 ...
## $ ManufacturingProcess25: num 4873 4869 4897 4892 4930 ...
## $ ManufacturingProcess26: num 6074 6107 6116 6111 6151 ...
## $ ManufacturingProcess27: num 4685 4630 4637 4630 4684 ...
## $ ManufacturingProcess28: num 10.7 11.2 11.1 11.1 11.3 11.4 11.2 11.1 11.3 11.4 ...
## $ ManufacturingProcess29: num 21 21.4 21.3 21.3 21.6 21.7 21.2 21.2 21.5 21.7 ...
## $ ManufacturingProcess30: num 9.9 9.9 9.4 9.4 9 10.1 11.2 10.9 10.5 9.8 ...
## $ ManufacturingProcess31: num 69.1 68.7 69.3 69.3 69.4 68.2 67.6 67.9 68 68.5 ...
## $ ManufacturingProcess32: num 156 169 173 171 171 173 159 161 160 164 ...
## $ ManufacturingProcess33: num 66 66 66 68 70 70 65 65 65 66 ...
## $ ManufacturingProcess34: num 2.4 2.6 2.6 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ ManufacturingProcess35: num 486 508 509 496 468 490 475 478 491 488 ...
## $ ManufacturingProcess36: num 0.019 0.019 0.018 0.018 0.017 0.018 0.019 0.019 0.019 0.019 ...
## $ ManufacturingProcess37: num 0.5 2 0.7 1.2 0.2 0.4 0.8 1 1.2 1.8 ...
## $ ManufacturingProcess38: num 3 2 2 2 2 2 2 2 3 3 ...
## $ ManufacturingProcess39: num 7.2 7.2 7.2 7.2 7.3 7.2 7.3 7.3 7.4 7.1 ...
## $ ManufacturingProcess40: num NA 0.1 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess41: num NA 0.15 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess42: num 11.6 11.1 12 10.6 11 11.5 11.7 11.4 11.4 11.3 ...
## $ ManufacturingProcess43: num 3 0.9 1 1.1 1.1 2.2 0.7 0.8 0.9 0.8 ...
## $ ManufacturingProcess44: num 1.8 1.9 1.8 1.8 1.7 1.8 2 2 1.9 1.9 ...
## $ ManufacturingProcess45: num 2.4 2.2 2.3 2.1 2.1 2 2.2 2.2 2.1 2.4 ...
str(yield)
## num [1:176] 38 42.4 42 41.4 42.5 ...
preProcessModel <- preProcess(processPredictors, method = "knnImpute")
imputedPredictors <- predict(preProcessModel, processPredictors)
# Set seed for reproducibility
set.seed(4321)
# Split the data into training (70%) and test sets (30%)
trainingRows <- createDataPartition(yield, p = 0.70, list = FALSE)
trainPredictors <- imputedPredictors[trainingRows, ]
trainYield <- yield[trainingRows]
testPredictors <- imputedPredictors[-trainingRows, ]
testYield <- yield[-trainingRows]
# Model Tuning
ctrl <- trainControl(method = "LGOCV")
# Train a PLS model
plsTune <- train(x = trainPredictors, y = trainYield, method = "pls", trControl = ctrl)
# Find the optimal R-squared value on training data
optimal_R2_train <- max(plsTune$results$Rsquared)
optimal_R2_train
## [1] 0.5691764
# Make predictions on the test set
testPredictions <- predict(plsTune, newdata = testPredictors)
test_R2 <- cor(testYield, testPredictions)^2
list(train_R2 = optimal_R2_train, test_R2 = test_R2)
## $train_R2
## [1] 0.5691764
##
## $test_R2
## [1] 0.487234
# Determine variable importance in PLS model
variableImportance <- varImp(plsTune)
print(variableImportance)
## pls variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess13 100.00
## ManufacturingProcess32 99.31
## ManufacturingProcess09 97.58
## ManufacturingProcess17 91.58
## ManufacturingProcess36 82.51
## BiologicalMaterial02 71.06
## ManufacturingProcess11 70.23
## ManufacturingProcess06 69.06
## ManufacturingProcess12 68.70
## BiologicalMaterial08 67.87
## BiologicalMaterial06 67.39
## BiologicalMaterial12 61.87
## ManufacturingProcess33 61.86
## BiologicalMaterial03 61.30
## BiologicalMaterial11 60.54
## BiologicalMaterial04 55.56
## ManufacturingProcess04 54.84
## BiologicalMaterial01 54.53
## ManufacturingProcess28 47.85
## ManufacturingProcess30 45.41
# Extract top predictors based on variable importance
topPredictors <- rownames(variableImportance$importance[order(variableImportance$importance$Overall, decreasing = TRUE),])[1:5]
# Relationship between each of the top predictors and yield
PredictorsRelations<- par(mfrow = c(2, 3))
for (predictor in topPredictors) {
plot(trainPredictors[[predictor]], trainYield,
main = paste("Relationship of", predictor, "with Yield"),
xlab = predictor, ylab = "Yield", pch = 19, col = "blue")
}
PredictorsRelations
## $mfrow
## [1] 1 1