#(a)
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.5.3
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(pls)
## Warning: package 'pls' was built under R version 4.5.3
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.3
data(permeability)
#(b) Remove near-zero variance predictors
nzv <- nearZeroVar(fingerprints)
fp_filtered <- fingerprints[, -nzv]
ncol(fp_filtered) #how many predictors remain
## [1] 388
#(c) Train/test split, PLS with CV
set.seed(33)
trainingRows <- createDataPartition(permeability, p = .80, list = FALSE)
trainX <- fp_filtered[trainingRows, ]
trainY <- permeability[trainingRows]
testX <- fp_filtered[-trainingRows, ]
testY <- permeability[-trainingRows]
ctrl <- trainControl(method = "cv", number = 10)
plsTune <- train(x = trainX, y = trainY,
method = "pls",
tuneLength = 20,
trControl = ctrl,
preProc = c("center", "scale"))
plsTune #optimal ncomp and resampled R2
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 120, 119, 120, 119, 120, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 12.96044 0.3385896 9.814830
## 2 11.71718 0.4794072 8.202805
## 3 11.60545 0.4836197 8.579933
## 4 11.75710 0.4769434 8.895524
## 5 11.78095 0.4830989 8.628903
## 6 11.39458 0.5065292 8.265292
## 7 10.90941 0.5397751 8.034502
## 8 11.08559 0.5327053 8.403888
## 9 10.86964 0.5497278 8.267434
## 10 11.21539 0.5363033 8.419640
## 11 11.13202 0.5471470 8.343467
## 12 11.15027 0.5524030 8.140555
## 13 11.46763 0.5403655 8.309229
## 14 11.78047 0.5212671 8.470645
## 15 12.16061 0.5013709 8.569210
## 16 12.43940 0.4913964 8.734320
## 17 13.09830 0.4631508 9.095790
## 18 13.23468 0.4602916 9.188634
## 19 13.33250 0.4568817 9.176185
## 20 13.65889 0.4466607 9.305059
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 9.
plot(plsTune)
(d)
#(d) Test set R2
plsPred <- predict(plsTune, testX)
postResample(pred = plsPred, obs = testY)
## RMSE Rsquared MAE
## 11.7137162 0.4225956 9.0405316
#(e) Compare Ridge, Lasso, Elastic Net
#Ridge Model
ridgeGrid <- data.frame(.lambda = seq(0, 0.1, length = 15))
set.seed(33)
ridgeTune <- train(x = trainX, y = trainY,
method = "ridge",
tuneGrid = ridgeGrid,
trControl = ctrl, #Uses the same 10-fold CV from 6.2c
preProc = c("center", "scale"))
print("Ridge:")
## [1] "Ridge:"
postResample(predict(ridgeTune, testX), testY)
## RMSE Rsquared MAE
## 11.6066468 0.4374522 8.6189977
#Elastic Net Model
enetGrid <- expand.grid(.lambda = c(0, 0.01, .1),
.fraction = seq(.05, 1, length = 20))
set.seed(33)
enetTune <- train(x = trainX, y = trainY,
method = "enet",
tuneGrid = enetGrid,
trControl = ctrl,
preProc = c("center", "scale"))
print("Elastic Net:")
## [1] "Elastic Net:"
postResample(predict(enetTune, testX), testY)
## RMSE Rsquared MAE
## 12.3351189 0.3675049 9.4256366
#Compare Models
modelComp <- resamples(list(PLS = plsTune,
Ridge = ridgeTune,
Enet = enetTune))
summary(modelComp)
##
## Call:
## summary.resamples(object = modelComp)
##
## Models: PLS, Ridge, Enet
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## PLS 5.293014 6.796946 7.488752 8.267434 10.180476 11.19106 0
## Ridge 6.088223 7.628777 9.061338 9.283594 10.790152 12.66483 0
## Enet 5.317290 5.942186 8.432479 8.080082 9.085679 11.76096 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## PLS 5.972881 8.667151 10.28859 10.86964 13.59774 15.36246 0
## Ridge 8.687797 10.026379 11.96286 12.77383 15.68575 18.83456 0
## Enet 6.780212 8.378967 11.37924 10.92617 12.23813 15.99878 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## PLS 0.27505748 0.4037813 0.5358367 0.5497278 0.6786614 0.8460760 0
## Ridge 0.09763276 0.4783394 0.5029562 0.5011416 0.6743897 0.7564346 0
## Enet 0.12155830 0.4532943 0.6365406 0.5547601 0.6924048 0.8604844 0
Ridge had the best test Rsquared at 0.44, slightly better than PLS at 0.42
#(a)
data(ChemicalManufacturingProcess)
#(b) Impute missing values (kNN imputation)
preProcValues <- preProcess(ChemicalManufacturingProcess[, -1], method = "knnImpute")
imputedPredictors <- predict(preProcValues, ChemicalManufacturingProcess[, -1])
yield <- ChemicalManufacturingProcess$Yield
#(c) Split the data and tune a PLS model
set.seed(33)
trainingRows <- createDataPartition(yield, p = .80, list = FALSE)
trainX <- imputedPredictors[trainingRows, ]
trainY <- yield[trainingRows]
testX <- imputedPredictors[-trainingRows, ]
testY <- yield[-trainingRows]
ctrl <- trainControl(method = "cv", number = 10)
plsTune63 <- train(x = trainX, y = trainY,
method = "pls",
tuneLength = 20,
trControl = ctrl,
preProc = c("center", "scale"))
plsTune63
## Partial Least Squares
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 131, 129, 131, 130, 131, 130, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.545001 0.4302606 1.187743
## 2 1.834443 0.4848352 1.214064
## 3 1.370258 0.5216314 1.072437
## 4 1.553686 0.4817194 1.120148
## 5 1.767308 0.4742125 1.185193
## 6 1.859092 0.4657679 1.216020
## 7 2.065680 0.4481543 1.275070
## 8 2.112160 0.4265472 1.297104
## 9 2.336034 0.4102914 1.354632
## 10 2.445747 0.4177970 1.371772
## 11 2.535817 0.4139401 1.386860
## 12 2.513643 0.4054740 1.398265
## 13 2.477060 0.4068159 1.388435
## 14 2.433151 0.4073522 1.387000
## 15 2.339621 0.4220440 1.360800
## 16 2.303184 0.4507302 1.344827
## 17 2.332383 0.4559876 1.342788
## 18 2.384233 0.4485271 1.370815
## 19 2.429933 0.4436180 1.393820
## 20 2.584977 0.4080036 1.448723
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
plot(plsTune63)
#(d) Predict and calculate performance
plsPred63 <- predict(plsTune63, testX)
postResample(pred = plsPred63, obs = testY)
## RMSE Rsquared MAE
## 1.1378032 0.6344559 0.8480494
#(e) Identify predictor importance
#The process variables dominate the list
varImp(plsTune63)
## pls variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 87.11
## ManufacturingProcess09 86.69
## ManufacturingProcess36 82.68
## ManufacturingProcess17 80.80
## BiologicalMaterial02 65.94
## ManufacturingProcess11 65.46
## BiologicalMaterial06 63.91
## ManufacturingProcess06 61.98
## ManufacturingProcess33 61.14
## BiologicalMaterial08 59.89
## BiologicalMaterial04 58.72
## BiologicalMaterial12 57.99
## BiologicalMaterial11 57.83
## ManufacturingProcess12 56.63
## BiologicalMaterial03 56.52
## BiologicalMaterial01 51.92
## ManufacturingProcess28 48.83
## ManufacturingProcess04 42.34
## BiologicalMaterial10 40.30
Looking at the variable importance output, manufacturing process predictors take up most of the top spots, with ManufacturingProcess32 ranking highest. This suggests that yield is more influenced by how the process is run than by the raw material inputs. The biological material predictors still show up in the top 20, but process variables seem to matter more overall. Since process variables can actually be adjusted, this information could help guide decisions about how to run future batches. The biological predictors are less actionable but could help identify when incoming raw materials might lead to a poor yield before the batch is started. But overall, the process variables dominating is a positive thing, because they allow more control over the process.