library(AppliedPredictiveModeling)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(RANN)
Developing a model to predict permeability (see Sect. 1.4) could save sig- nificant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug:
data(permeability)
str(fingerprints)
## num [1:165, 1:1107] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:165] "1" "2" "3" "4" ...
## ..$ : chr [1:1107] "X1" "X2" "X3" "X4" ...
str(permeability)
## num [1:165, 1] 12.52 1.12 19.41 1.73 1.68 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:165] "1" "2" "3" "4" ...
## ..$ : chr "permeability"
nzv <- nearZeroVar(fingerprints)
filtered_fingerprints <- fingerprints[, -nzv]
cat("Number of predictors remaining:", ncol(filtered_fingerprints), "\n")
## Number of predictors remaining: 388
Originally there were roughly 720 near zero variance fingerprints, with 388 predictors remaining for modeling. This is a significant reduction.
set.seed(624)
trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
trainX <- filtered_fingerprints[trainIndex, ]
testX <- filtered_fingerprints[-trainIndex, ]
trainY <- permeability[trainIndex]
testY <- permeability[-trainIndex]
set.seed(624)
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 5)
pls_model <- train(
x = trainX, y = trainY,
method = "pls",
preProcess = c("center", "scale"),
tuneLength = 20,
trControl = ctrl
)
print(pls_model)
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 121, 118, 119, 121, 119, 120, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.52338 0.3040039 10.322412
## 2 12.38787 0.4200082 8.930705
## 3 12.47117 0.4257174 9.482308
## 4 12.45276 0.4360255 9.403024
## 5 12.32778 0.4500968 9.058762
## 6 12.22362 0.4540961 9.022149
## 7 12.04225 0.4536415 9.064597
## 8 11.84075 0.4687327 9.137174
## 9 11.82483 0.4737577 9.141689
## 10 11.89296 0.4682978 9.147988
## 11 11.85878 0.4721590 9.082044
## 12 11.85413 0.4742938 9.088010
## 13 12.04888 0.4630905 9.106178
## 14 12.27021 0.4463921 9.256883
## 15 12.51958 0.4286859 9.370305
## 16 12.62154 0.4233383 9.427397
## 17 12.90120 0.4112245 9.593511
## 18 12.96692 0.4104097 9.642338
## 19 12.99103 0.4120062 9.624802
## 20 12.87135 0.4203440 9.534101
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 9.
plot(pls_model)
cat("Optimal number of latent variables:", pls_model$bestTune$ncomp, "\n")
## Optimal number of latent variables: 9
cat("Resampled R^2 estimate:", max(pls_model$results$Rsquared), "\n")
## Resampled R^2 estimate: 0.4742938
pls_pred <- predict(pls_model, testX)
pls_R2 <- cor(pls_pred, testY)^2
cat("Test set R^2:", pls_R2, "\n")
## Test set R^2: 0.530305
set.seed(624)
rf_model <- train(
x = trainX, y = trainY,
method = "rf",
trControl = ctrl,
importance = TRUE
)
rf_pred <- predict(rf_model, testX)
rf_R2 <- cor(rf_pred, testY)^2
cat("Random Forest test R^2:", rf_R2, "\n")
## Random Forest test R^2: 0.353759
I would recommend the PLS model since it has R^2 value of 0.53 which is better then the other models I had tried.
6.3. A chemical manufacturing process for a pharmaceutical product was discussed in Sect.1.4. In this problem, the objective is to understand the re- lationship between biological measurements of the raw materials (predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing pro- cess. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:
data("ChemicalManufacturingProcess")
processPredictors <- subset(ChemicalManufacturingProcess,select = -Yield)
yield <- subset(ChemicalManufacturingProcess, select = "Yield")
sum(is.na(processPredictors))
## [1] 106
set.seed(624)
trainingRows <- createDataPartition(yield$Yield,
p = 0.8,
list = FALSE)
trainPredictors <- processPredictors[trainingRows,]
testPredictors <- processPredictors[-trainingRows,]
trainYield <- yield[trainingRows,]
testYield <- yield[-trainingRows,]
preprocess <- preProcess(trainPredictors,method=c("medianImpute", "center", "scale"))
preprocessTrainPredictors <- predict(preprocess,trainPredictors)
preprocessTestPredictors <- predict(preprocess,testPredictors)
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
set.seed(123)
rfModel <- train(x = preprocessTrainPredictors, y = trainYield, method = "rf", trControl = ctrl, tuneLength = 5, metric = "RMSE")
rfModel
## Random Forest
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.222643 0.6222791 1.0063736
## 15 1.117762 0.6531000 0.9001873
## 29 1.115297 0.6446640 0.8984263
## 43 1.126134 0.6293753 0.8995005
## 57 1.136023 0.6178496 0.9042034
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 29.
rfModel$results[which.min(rfModel$results$RMSE), ]
## mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 3 29 1.115297 0.644664 0.8984263 0.2456929 0.1280491 0.1939157
rfPred <- predict(rfModel, newdata = preprocessTestPredictors)
testResults <- postResample(pred = rfPred, obs = testYield)
testResults
## RMSE Rsquared MAE
## 1.2199526 0.7359082 0.8711505
resampled <- rfModel$results[which.min(rfModel$results$RMSE), ]
comparison <- data.frame(
Data = c("Training (CV)", "Test"),
RMSE = c(resampled$RMSE, testResults["RMSE"]),
Rsquared = c(resampled$Rsquared, testResults["Rsquared"])
)
comparison
## Data RMSE Rsquared
## Training (CV) 1.115297 0.6446640
## RMSE Test 1.219953 0.7359082
varImpRF <- varImp(rfModel)
plot(varImpRF, top = 15)
topVars <- varImpRF$importance %>%
arrange(desc(Overall)) %>%
head(10) %>%
rownames()
topVars
## [1] "ManufacturingProcess32" "BiologicalMaterial12" "BiologicalMaterial06"
## [4] "BiologicalMaterial03" "ManufacturingProcess09" "ManufacturingProcess31"
## [7] "ManufacturingProcess13" "ManufacturingProcess06" "ManufacturingProcess36"
## [10] "BiologicalMaterial11"
bio_predictors <- colnames(processPredictors)[1:12]
process_predictors <- colnames(processPredictors)[13:57]
data.frame(
Predictor = topVars,
Type = ifelse(topVars %in% bio_predictors, "Biological", "Process")
)
## Predictor Type
## 1 ManufacturingProcess32 Process
## 2 BiologicalMaterial12 Biological
## 3 BiologicalMaterial06 Biological
## 4 BiologicalMaterial03 Biological
## 5 ManufacturingProcess09 Process
## 6 ManufacturingProcess31 Process
## 7 ManufacturingProcess13 Process
## 8 ManufacturingProcess06 Process
## 9 ManufacturingProcess36 Process
## 10 BiologicalMaterial11 Biological
library(tidyr)
topData <- processPredictors[, topVars]
yield <- unlist(yield)
topData$Yield <- as.numeric(yield)
top6 <- topVars[1:6]
topData_long <- topData %>%
pivot_longer(cols = all_of(top6), names_to = "Predictor", values_to = "Value")
ggplot(topData_long, aes(x = Value, y = Yield)) +
geom_point(alpha = 0.6, color = "blue") +
geom_smooth(method = "loess", color = "red", se = FALSE) +
facet_wrap(~ Predictor, scales = "free", ncol = 3) +
labs(
title = "Relationships Between Top Predictors and Product Yield",
x = "Predictor Value",
y = "Percent Yield"
) + theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).