6.2

#(a)
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.5.3
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(pls)
## Warning: package 'pls' was built under R version 4.5.3
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.3
data(permeability)
#(b) Remove near-zero variance predictors
nzv <- nearZeroVar(fingerprints)
fp_filtered <- fingerprints[, -nzv]
ncol(fp_filtered)      #how many predictors remain
## [1] 388
#(c) Train/test split, PLS with CV
set.seed(33)
trainingRows <- createDataPartition(permeability, p = .80, list = FALSE)
trainX <- fp_filtered[trainingRows, ]
trainY <- permeability[trainingRows]
testX <- fp_filtered[-trainingRows, ]
testY <- permeability[-trainingRows]

ctrl <- trainControl(method = "cv", number = 10)

plsTune <- train(x = trainX, y = trainY,
                 method = "pls",
                 tuneLength = 20,
                 trControl = ctrl,
                 preProc = c("center", "scale"))

plsTune        #optimal ncomp and resampled R2
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 120, 120, 119, 120, 119, 120, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     12.96044  0.3385896  9.814830
##    2     11.71718  0.4794072  8.202805
##    3     11.60545  0.4836197  8.579933
##    4     11.75710  0.4769434  8.895524
##    5     11.78095  0.4830989  8.628903
##    6     11.39458  0.5065292  8.265292
##    7     10.90941  0.5397751  8.034502
##    8     11.08559  0.5327053  8.403888
##    9     10.86964  0.5497278  8.267434
##   10     11.21539  0.5363033  8.419640
##   11     11.13202  0.5471470  8.343467
##   12     11.15027  0.5524030  8.140555
##   13     11.46763  0.5403655  8.309229
##   14     11.78047  0.5212671  8.470645
##   15     12.16061  0.5013709  8.569210
##   16     12.43940  0.4913964  8.734320
##   17     13.09830  0.4631508  9.095790
##   18     13.23468  0.4602916  9.188634
##   19     13.33250  0.4568817  9.176185
##   20     13.65889  0.4466607  9.305059
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 9.
plot(plsTune)

(d)

#(d) Test set R2
plsPred <- predict(plsTune, testX)
postResample(pred = plsPred, obs = testY)
##       RMSE   Rsquared        MAE 
## 11.7137162  0.4225956  9.0405316
#(e) Compare Ridge, Lasso, Elastic Net

#Ridge Model
ridgeGrid <- data.frame(.lambda = seq(0, 0.1, length = 15))
set.seed(33)
ridgeTune <- train(x = trainX, y = trainY,
                   method = "ridge",
                   tuneGrid = ridgeGrid,
                   trControl = ctrl, #Uses the same 10-fold CV from 6.2c
                   preProc = c("center", "scale"))
print("Ridge:")
## [1] "Ridge:"
postResample(predict(ridgeTune, testX), testY)
##       RMSE   Rsquared        MAE 
## 11.6066468  0.4374522  8.6189977
#Elastic Net Model
enetGrid <- expand.grid(.lambda = c(0, 0.01, .1), 
                        .fraction = seq(.05, 1, length = 20))
set.seed(33)
enetTune <- train(x = trainX, y = trainY,
                   method = "enet",
                   tuneGrid = enetGrid,
                   trControl = ctrl,
                   preProc = c("center", "scale"))
print("Elastic Net:")
## [1] "Elastic Net:"
postResample(predict(enetTune, testX), testY)
##       RMSE   Rsquared        MAE 
## 12.3351189  0.3675049  9.4256366
#Compare Models
modelComp <- resamples(list(PLS = plsTune, 
                            Ridge = ridgeTune, 
                            Enet = enetTune))
summary(modelComp)
## 
## Call:
## summary.resamples(object = modelComp)
## 
## Models: PLS, Ridge, Enet 
## Number of resamples: 10 
## 
## MAE 
##           Min.  1st Qu.   Median     Mean   3rd Qu.     Max. NA's
## PLS   5.293014 6.796946 7.488752 8.267434 10.180476 11.19106    0
## Ridge 6.088223 7.628777 9.061338 9.283594 10.790152 12.66483    0
## Enet  5.317290 5.942186 8.432479 8.080082  9.085679 11.76096    0
## 
## RMSE 
##           Min.   1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## PLS   5.972881  8.667151 10.28859 10.86964 13.59774 15.36246    0
## Ridge 8.687797 10.026379 11.96286 12.77383 15.68575 18.83456    0
## Enet  6.780212  8.378967 11.37924 10.92617 12.23813 15.99878    0
## 
## Rsquared 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## PLS   0.27505748 0.4037813 0.5358367 0.5497278 0.6786614 0.8460760    0
## Ridge 0.09763276 0.4783394 0.5029562 0.5011416 0.6743897 0.7564346    0
## Enet  0.12155830 0.4532943 0.6365406 0.5547601 0.6924048 0.8604844    0

Ridge had the best test Rsquared at 0.44, slightly better than PLS at 0.42

  1. Based on the results, NO. These models are not ready to replace the lab experiment. The best Rsquared on the test set was only 0.44 (ridge), meaning the model explains only 44% of the variance in permeability, which is weak. That said, it could still be helpful as a preliminary filter to screen out obviously poor candidates before testing, but for any serious decision you would still want the actual lab result.

6.3

#(a)
data(ChemicalManufacturingProcess)
#(b) Impute missing values (kNN imputation) 
preProcValues <- preProcess(ChemicalManufacturingProcess[, -1], method = "knnImpute")
imputedPredictors <- predict(preProcValues, ChemicalManufacturingProcess[, -1])

yield <- ChemicalManufacturingProcess$Yield
#(c) Split the data and tune a PLS model
set.seed(33)

trainingRows <- createDataPartition(yield, p = .80, list = FALSE)
trainX <- imputedPredictors[trainingRows, ]
trainY <- yield[trainingRows]
testX  <- imputedPredictors[-trainingRows, ]
testY  <- yield[-trainingRows]

ctrl <- trainControl(method = "cv", number = 10)

plsTune63 <- train(x = trainX, y = trainY,
                   method = "pls",
                   tuneLength = 20,
                   trControl = ctrl,
                   preProc = c("center", "scale"))
plsTune63
## Partial Least Squares 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 131, 129, 131, 130, 131, 130, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     1.545001  0.4302606  1.187743
##    2     1.834443  0.4848352  1.214064
##    3     1.370258  0.5216314  1.072437
##    4     1.553686  0.4817194  1.120148
##    5     1.767308  0.4742125  1.185193
##    6     1.859092  0.4657679  1.216020
##    7     2.065680  0.4481543  1.275070
##    8     2.112160  0.4265472  1.297104
##    9     2.336034  0.4102914  1.354632
##   10     2.445747  0.4177970  1.371772
##   11     2.535817  0.4139401  1.386860
##   12     2.513643  0.4054740  1.398265
##   13     2.477060  0.4068159  1.388435
##   14     2.433151  0.4073522  1.387000
##   15     2.339621  0.4220440  1.360800
##   16     2.303184  0.4507302  1.344827
##   17     2.332383  0.4559876  1.342788
##   18     2.384233  0.4485271  1.370815
##   19     2.429933  0.4436180  1.393820
##   20     2.584977  0.4080036  1.448723
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
plot(plsTune63)

#(d) Predict and calculate performance
plsPred63 <- predict(plsTune63, testX)
postResample(pred = plsPred63, obs = testY)
##      RMSE  Rsquared       MAE 
## 1.1378032 0.6344559 0.8480494
#(e) Identify predictor importance
#The process variables dominate the list
varImp(plsTune63)
## pls variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   87.11
## ManufacturingProcess09   86.69
## ManufacturingProcess36   82.68
## ManufacturingProcess17   80.80
## BiologicalMaterial02     65.94
## ManufacturingProcess11   65.46
## BiologicalMaterial06     63.91
## ManufacturingProcess06   61.98
## ManufacturingProcess33   61.14
## BiologicalMaterial08     59.89
## BiologicalMaterial04     58.72
## BiologicalMaterial12     57.99
## BiologicalMaterial11     57.83
## ManufacturingProcess12   56.63
## BiologicalMaterial03     56.52
## BiologicalMaterial01     51.92
## ManufacturingProcess28   48.83
## ManufacturingProcess04   42.34
## BiologicalMaterial10     40.30

Looking at the variable importance output, manufacturing process predictors take up most of the top spots, with ManufacturingProcess32 ranking highest. This suggests that yield is more influenced by how the process is run than by the raw material inputs. The biological material predictors still show up in the top 20, but process variables seem to matter more overall. Since process variables can actually be adjusted, this information could help guide decisions about how to run future batches. The biological predictors are less actionable but could help identify when incoming raw materials might lead to a poor yield before the batch is started. But overall, the process variables dominating is a positive thing, because they allow more control over the process.