library(AppliedPredictiveModeling)
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.3 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:fabletools':
##
## MAE, RMSE
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.3
library(RANN)
Developinga model to predict permeability(see Sect.1.4) could savesignificant resources for a pharmaceutical company,while at the same time morerapidly identifying molecules that have a sufficient permeability to become a drug:
data(permeability)
#View(permeability)
glimpse(permeability)
## num [1:165, 1] 12.52 1.12 19.41 1.73 1.68 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:165] "1" "2" "3" "4" ...
## ..$ : chr "permeability"
The matrix fingerprints contains the 1,107 binary molecular predic- tors for the 165 compounds, while permeability contains permeability response.
nearzerovar_fingerprints <- nearZeroVar(fingerprints)
filtered_fingerprints <- fingerprints[, -nearzerovar_fingerprints]
dim(filtered_fingerprints)
## [1] 165 388
After filtering, we have 388 predictors.
set.seed(0503)
train_index <- createDataPartition(permeability, p= 0.8, list = FALSE)
x_train <- filtered_fingerprints[train_index,]
x_test <- filtered_fingerprints[-train_index,]
y_train <- permeability[train_index,]
y_test <- permeability[-train_index,]
any(is.na(x_train))
## [1] FALSE
any(is.na(x_test))
## [1] FALSE
ctrl <- trainControl(method="CV",
number=10)
pls_tune <- train(x_train,
y_train,
method = "pls",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneLength=20)
pls_tune
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 119, 120, 119, 120, 120, 120, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.46999 0.3679526 10.556785
## 2 12.31298 0.4869120 9.041411
## 3 12.35822 0.4473346 9.651138
## 4 12.35352 0.4679431 9.839669
## 5 12.12249 0.4787008 9.408666
## 6 12.07879 0.4762009 9.332341
## 7 12.04802 0.4748375 9.423561
## 8 12.23943 0.4668724 9.481606
## 9 12.40785 0.4680487 9.615128
## 10 12.79892 0.4425942 9.840574
## 11 13.13415 0.4275947 9.956607
## 12 13.40018 0.4165695 10.129414
## 13 13.71757 0.4108012 10.220958
## 14 13.92359 0.4051230 10.476302
## 15 14.30322 0.3962766 10.666065
## 16 14.66161 0.3918510 10.897557
## 17 14.98470 0.3796551 11.098809
## 18 14.95145 0.3842898 11.161107
## 19 15.22247 0.3789123 11.458546
## 20 15.29168 0.3720533 11.474786
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 7.
plot(pls_tune)
The final value used for the model was ncomp = 7. with R^2 =
0.4814842(it is also the highest).
pls_pred <- predict(pls_tune, x_test)
R2(pls_pred, y_test) # 0.6389721
## [1] 0.6389721
RMSE(pls_pred, y_test) # 8.853719
## [1] 8.853719
The test set estimate of R2 is 0.6389 (64%)
#Principal Component Analysis
set.seed(0504)
pca_mod <- train(x_train,
y_train,
method = "pcr",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneLength=20)
pca_mod
## Principal Component Analysis
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 118, 121, 119, 120, 118, 121, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 14.98337 0.1821949 11.607003
## 2 14.99990 0.1810004 11.659622
## 3 14.18866 0.2693764 10.736653
## 4 13.24690 0.3443807 9.880768
## 5 13.16045 0.3529597 9.673684
## 6 13.08983 0.3634369 9.676521
## 7 13.00443 0.3646903 9.473810
## 8 12.82564 0.3822575 9.355700
## 9 12.82556 0.3997470 9.430378
## 10 12.96405 0.3920471 9.443043
## 11 12.66686 0.4295612 9.406121
## 12 12.44184 0.4484645 9.227178
## 13 12.59260 0.4515390 9.426943
## 14 12.78488 0.4439821 9.758528
## 15 12.83986 0.4389411 9.837777
## 16 12.89239 0.4354282 9.893251
## 17 13.00310 0.4217388 10.022086
## 18 12.69085 0.4410085 9.863376
## 19 12.71802 0.4387575 9.935759
## 20 12.81464 0.4328489 9.961656
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 12.
plot(pca_mod)
#The final value used for the model was ncomp = 12. Rsquared = 0.4484645
pca_pred <- predict(pca_mod, x_test)
R2(pca_pred, y_test) # 0.6004573
## [1] 0.6004573
RMSE(pca_pred, y_test) # 8.866286
## [1] 8.866286
# Lasso model
set.seed(0505)
lasso_model <- train(x_train,
y_train,
method = "glmnet",
trControl = ctrl,
preProcess = c("center", "scale"),
tuneLength = 10)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
lasso_pred <- predict(lasso_model, x_test)
R2(lasso_pred, y_test) #0.6361564
## [1] 0.6361564
RMSE(lasso_pred, y_test) #8.450471
## [1] 8.450471
Among all tested models (PLS, PCR, and Lasso), the Lasso regression using glmnet achieved the lowest test RMSE (8.45) and comparable Rsquared (0.636), making it the most accurate model for predicting molecular permeability based on fingerprint features.
No, while the lasso regression model using molecular fingerprint data achieved the best predictive performance among tested models, it still fails to explain approximately 36% of the variability in permeability.
A chemical manufacturing process for a pharmaceutical product was discussed in Sect.1.4. In this problem, the objective is to understand the re- lationshipbetween biologicalmeasurementsof the rawmaterials(predictors), measurements of the manufacturing process (predictors), and the response of product yield. Biological predictors cannot be changed but can be used to assess the quality of the raw material before processing. On the other hand, manufacturing process predictors can be changed in the manufacturing pro- cess. Improving product yield by 1% will boost revenue by approximately one hundred thousand dollars per batch:
data("ChemicalManufacturingProcess")
#head(ChemicalManufacturingProcess)
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
set.seed(0506)
cmp <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
cmp_imputed <- predict(cmp, ChemicalManufacturingProcess)
any(is.na(cmp_imputed)) # no missing data
## [1] FALSE
#removing near zero values
cmp_filtered <- cmp_imputed[,-nearZeroVar(cmp_imputed)]
set.seed(0507)
cmp_train_index <- createDataPartition(cmp_filtered$Yield, p= 0.8, list = FALSE)
cmp_train <- cmp_filtered[cmp_train_index,]
cmp_test <- cmp_filtered[-cmp_train_index,]
cmp_lasso_mod <- train(Yield ~.,
data=cmp_train,
method = "glmnet",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneGrid = expand.grid(.alpha = 1, .lambda = seq(0, 1, 0.05)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
cmp_lasso_mod
## glmnet
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 129, 128, 130, 129, 130, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.00 2.0505100 0.5152534 1.0297892
## 0.05 0.6377701 0.6205448 0.4986494
## 0.10 0.6232801 0.6392475 0.4988210
## 0.15 0.6442306 0.6340215 0.5177560
## 0.20 0.6697603 0.6322496 0.5389214
## 0.25 0.7023557 0.6293676 0.5681246
## 0.30 0.7389366 0.6273027 0.5991210
## 0.35 0.7795083 0.6236189 0.6331589
## 0.40 0.8238814 0.6182048 0.6693041
## 0.45 0.8707997 0.6057815 0.7081918
## 0.50 0.9199569 0.5592471 0.7483660
## 0.55 0.9651399 0.4639947 0.7856539
## 0.60 0.9910700 0.3131442 0.8069767
## 0.65 0.9926487 NaN 0.8077715
## 0.70 0.9926487 NaN 0.8077715
## 0.75 0.9926487 NaN 0.8077715
## 0.80 0.9926487 NaN 0.8077715
## 0.85 0.9926487 NaN 0.8077715
## 0.90 0.9926487 NaN 0.8077715
## 0.95 0.9926487 NaN 0.8077715
## 1.00 0.9926487 NaN 0.8077715
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.1.
#The final values used for the model were alpha = 1 and lambda = 0.1. (0.10 rmse= 0.6232801, Rsquared = 0.6392475, MAE = 0.4988210)
plot(cmp_lasso_mod)
cmp_lasso_pred <- predict(cmp_lasso_mod, cmp_test)
R2(cmp_lasso_pred, cmp_test$Yield) #0.4785318
## [1] 0.4785318
RMSE(cmp_lasso_pred, cmp_test$Yield) #0.7258316
## [1] 0.7258316
The lasso model achieved a test set RMSE of 0.726 and an R² of 0.479, indicating that it explains approximately 48% of the variance in yield on unseen data. In comparison, the resampled performance on the training set showed a lower RMSE of 0.623 and a higher R² of 0.639, suggesting the model performed better during cross-validation.
varImp(cmp_lasso_mod)
## glmnet variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess09 67.071
## ManufacturingProcess17 24.745
## ManufacturingProcess34 10.763
## ManufacturingProcess13 10.232
## ManufacturingProcess06 7.916
## ManufacturingProcess36 4.542
## BiologicalMaterial06 1.553
## ManufacturingProcess37 1.392
## BiologicalMaterial05 0.103
## ManufacturingProcess44 0.000
## ManufacturingProcess42 0.000
## ManufacturingProcess07 0.000
## ManufacturingProcess41 0.000
## ManufacturingProcess04 0.000
## ManufacturingProcess08 0.000
## ManufacturingProcess22 0.000
## ManufacturingProcess35 0.000
## ManufacturingProcess39 0.000
## ManufacturingProcess38 0.000
The most important predictors in the lasso model are primarily process-related variables, with ManufacturingProcess32, ManufacturingProcess09, and ManufacturingProcess17 having the highest influence on yield prediction. Most of the predictos com efrom manufacturing process category.
ggplot(cmp_test, aes(x = ManufacturingProcess32, y = Yield)) +
geom_point() +
geom_smooth(method = "loess") +
labs(title = "Yield vs. ManufacturingProcess32")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(cmp_test, aes(x = ManufacturingProcess09, y = Yield)) +
geom_point() +
geom_smooth(method = "loess") +
labs(title = "Yield vs. ManufacturingProcess09")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(cmp_test, aes(x = ManufacturingProcess17, y = Yield)) +
geom_point() +
geom_smooth(method = "loess") +
labs(title = "Yield vs. ManufacturingProcess17")
## `geom_smooth()` using formula = 'y ~ x'
The top 3 predictors are clearly related to yield. For example,
when ManufacturingProcess32 increases, yield also tends to go up. The
relationships for ManufacturingProcess09 and 17 are more complex, with
certain levels leading to better results than others.