6.2

library(AppliedPredictiveModeling)
library(pls)
library(caret)
library(tidyverse)
library(caTools)
library(imputeR)
library(RANN)
library(corrplot)
data(permeability)
str(permeability)
##  num [1:165, 1] 12.52 1.12 19.41 1.73 1.68 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:165] "1" "2" "3" "4" ...
##   ..$ : chr "permeability"
# drop near-zero-variance fingerprint columns
fp <- fingerprints[, !caret::nearZeroVar(fingerprints, saveMetrics = TRUE)$nzv]
ncol(fp)
## [1] 388

388 predictors remain after the near-zero-variance filter.
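
To see why columns were dropped, the metrics behind the filter can be inspected; freqRatio and percentUnique drive the nzv flag. A quick sketch (nzv_metrics is an illustrative name):

nzv_metrics <- caret::nearZeroVar(fingerprints, saveMetrics = TRUE)
head(nzv_metrics[nzv_metrics$nzv, ])  # a few of the flagged columns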

  1. Split into train-test sets, 70-30.
set.seed(2023)

# logical split indicator: TRUE = training row (named in_train rather than
# shadowing base::sample)
in_train <- sample(c(TRUE, FALSE), nrow(fp), replace = TRUE, prob = c(0.7, 0.3))
fp_train <- fp[in_train, ]
fp_test  <- fp[!in_train, ]

perm_train <- permeability[in_train, ]
perm_test  <- permeability[!in_train, ]
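
A quick sanity check that the realized split is near 70/30 (the draw is random, so the counts vary with the seed):

mean(in_train)  # proportion assigned to training
c(train = nrow(fp_train), test = nrow(fp_test))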

Fit a PLS model to the training data, tuning the number of components with 10-fold cross-validation.

plsFit <- train(x=fp_train,
                y=perm_train, 
                method='pls',
                metric='Rsquared',
                tuneLength=20,
                trControl=trainControl(method='cv'),
                preProcess=c('center', 'scale')
                )
plsFit
## Partial Least Squares 
## 
## 112 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 103, 100, 100, 101, 100, 100, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.52985  0.4064140  10.622771
##    2     12.13852  0.5477037   8.895561
##    3     12.42477  0.5294471   9.624263
##    4     12.70108  0.5169363  10.033632
##    5     12.99927  0.4943773  10.018969
##    6     13.08905  0.4937630   9.544333
##    7     13.61761  0.4616061   9.930939
##    8     14.01740  0.4467908  10.478495
##    9     14.58091  0.4051257  11.013284
##   10     14.57757  0.4099755  11.154281
##   11     14.91762  0.3967600  11.386112
##   12     15.28398  0.3830693  11.615241
##   13     15.69304  0.3565963  11.856757
##   14     16.02992  0.3402901  11.933335
##   15     16.33094  0.3196311  12.213135
##   16     16.68416  0.3045479  12.562069
##   17     16.93708  0.2880668  12.784575
##   18     17.24787  0.2829286  12.968920
##   19     17.41798  0.2784598  13.051885
##   20     17.68560  0.2663404  13.426939
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 2.

Two latent variables are optimal; the corresponding resampled \(R^2\) is 0.55.
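
caret's ggplot method shows the full tuning profile; a sketch of the call (ggplot2 is loaded via tidyverse):

ggplot(plsFit)  # resampled R^2 versus number of components, peaking at ncomp = 2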

y_pred <- predict(plsFit, fp_test)
plsValues1 <- data.frame(obs=perm_test, pred=y_pred)
caret::defaultSummary(plsValues1)
##       RMSE   Rsquared        MAE 
## 11.5827866  0.3432959  7.9126393

\(R^2\) for the test set is 0.34.
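
A quick observed-versus-predicted diagnostic on the test set (a sketch, base graphics):

plot(perm_test, y_pred,
     xlab = "Observed permeability", ylab = "Predicted permeability")
abline(0, 1, lty = 2)  # perfect-prediction reference line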

Ordinary Least Squares (OLS)

# note: no tuneLength here, since lm has no tuning parameters
lmFit <- train(x=fp_train,
               y=perm_train, 
               method='lm',
               metric='Rsquared',
               trControl=trainControl(method='cv'),
               preProcess=c('center', 'scale')
               )
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases

The same warning is raised on every one of the 10 CV folds.
lmValues1 <- data.frame(obs=perm_test, pred=predict(lmFit, fp_test))
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
caret::defaultSummary(lmValues1)
##       RMSE   Rsquared        MAE 
## 28.8481320  0.1502712 17.3058131

The rank-deficient warnings are expected: with 388 predictors and only 112 training samples, OLS cannot estimate all coefficients, and the test-set \(R^2\) of 0.15 confirms it generalizes poorly.
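
The rank deficiency is easy to verify directly (a sketch):

# numerical rank of the training design matrix: at most 112 (< 388 columns)
qr(fp_train)$rank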

Ridge Regression

ridgeGrid <- data.frame(.lambda = seq(0.09, 0.19, length.out = 15))
ridgeRegFit <- train(fp_train, perm_train,
                     method="ridge",
                     tuneGrid = ridgeGrid,
                     trControl=trainControl(method='cv'),
                     preProc=c("center", "scale"))
ridgeRegFit
## Ridge Regression 
## 
## 112 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 101, 100, 101, 101, 101, 101, ... 
## Resampling results across tuning parameters:
## 
##   lambda      RMSE      Rsquared   MAE     
##   0.09000000  13.91542  0.4570979  10.71888
##   0.09714286  13.80724  0.4636023  10.62552
##   0.10428571  13.74813  0.4679485  10.57313
##   0.11142857  13.70987  0.4715927  10.53741
##   0.11857143  13.66342  0.4751519  10.49449
##   0.12571429  13.63209  0.4780563  10.46303
##   0.13285714  13.59427  0.4813051  10.42907
##   0.14000000  13.57259  0.4838577  10.40943
##   0.14714286  13.54868  0.4865985  10.38070
##   0.15428571  13.53189  0.4889904  10.35929
##   0.16142857  13.51768  0.4911542  10.33984
##   0.16857143  13.49583  0.4940736  10.32289
##   0.17571429  13.50365  0.4948727  10.31646
##   0.18285714  13.49655  0.4969546  10.29567
##   0.19000000  13.49523  0.4986061  10.28384
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.19.
ridgeRegValues1 <- data.frame(obs=perm_test, pred=predict(ridgeRegFit, fp_test))
caret::defaultSummary(ridgeRegValues1)
##       RMSE   Rsquared        MAE 
## 12.4433089  0.4758471  9.2618412

The test-set \(R^2\) is 0.48 for the selected \(\lambda\) = 0.19. Since the optimum sits at the upper edge of the tuning grid, a wider grid might do slightly better.
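
One way to follow up on the boundary solution (a sketch; ridgeGrid2 is an illustrative name, same CV setup assumed):

ridgeRegFit$bestTune  # confirms lambda = 0.19, the grid's upper bound
# ridgeGrid2 <- data.frame(.lambda = seq(0.09, 0.35, length.out = 15))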

Elastic net

enetGrid <- expand.grid(.lambda=c(0, 0.01, 0.1),
                        .fraction=seq(0.05, 1, length.out=20))
enetFit <- train(fp_train, perm_train,
                  method="enet",
                  tuneGrid=enetGrid,
                  trControl = trainControl(method='cv'),
                  preProc = c("center", "scale"))
## Warning: model fit failed for Fold04: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold06: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
enetFit
## Elasticnet 
## 
## 112 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 101, 101, 101, 100, 102, 100, ... 
## Resampling results across tuning parameters:
## 
##   lambda  fraction  RMSE       Rsquared   MAE       
##   0.00    0.05       13.49085  0.4390815   10.818087
##   0.00    0.10       13.52793  0.3931288   10.148836
##   0.00    0.15       13.49570  0.3829701    9.990216
##   0.00    0.20       13.54795  0.3783042   10.243401
##   0.00    0.25       13.66982  0.3784918   10.352493
##   0.00    0.30       13.99155  0.3659896   10.525835
##   0.00    0.35       14.38458  0.3537674   10.714959
##   0.00    0.40       14.70947  0.3421817   10.860545
##   0.00    0.45       15.00643  0.3347216   11.039789
##   0.00    0.50       15.32483  0.3287930   11.204226
##   0.00    0.55       15.63020  0.3236367   11.391611
##   0.00    0.60       15.87683  0.3186143   11.552603
##   0.00    0.65       16.07157  0.3149693   11.618335
##   0.00    0.70       16.28390  0.3118922   11.733622
##   0.00    0.75       16.57148  0.3054544   11.983895
##   0.00    0.80       16.83430  0.2992465   12.221619
##   0.00    0.85       17.10483  0.2931047   12.435122
##   0.00    0.90       17.37499  0.2874976   12.653240
##   0.00    0.95       17.64191  0.2832744   12.908339
##   0.00    1.00       17.88991  0.2797537   13.198035
##   0.01    0.05       22.51503  0.4426000   16.282726
##   0.01    0.10       33.36855  0.4144366   23.513085
##   0.01    0.15       44.44389  0.3954547   30.761194
##   0.01    0.20       55.27020  0.3810975   38.005037
##   0.01    0.25       66.16109  0.3643499   45.540471
##   0.01    0.30       77.32099  0.3444027   53.314684
##   0.01    0.35       88.36872  0.3300942   61.105363
##   0.01    0.40       99.39284  0.3176869   68.898317
##   0.01    0.45      110.32577  0.3094629   76.559295
##   0.01    0.50      121.14730  0.3073861   84.042323
##   0.01    0.55      131.92676  0.3049393   91.385017
##   0.01    0.60      142.74110  0.3034785   98.762239
##   0.01    0.65      153.56296  0.3015215  106.138732
##   0.01    0.70      164.42936  0.2974431  113.560364
##   0.01    0.75      175.40014  0.2916935  121.071819
##   0.01    0.80      186.52736  0.2862411  128.640321
##   0.01    0.85      197.66741  0.2832659  136.171302
##   0.01    0.90      208.54404  0.2810403  143.509120
##   0.01    0.95      219.37310  0.2788047  150.816385
##   0.01    1.00      230.19182  0.2754254  158.108170
##   0.10    0.05       12.93279  0.5130220   10.515766
##   0.10    0.10       12.29215  0.5145981    9.330316
##   0.10    0.15       12.15497  0.5123721    9.120304
##   0.10    0.20       12.37399  0.4904010    9.445715
##   0.10    0.25       12.61572  0.4755153    9.665847
##   0.10    0.30       12.79141  0.4696586    9.771812
##   0.10    0.35       12.96353  0.4639780    9.809157
##   0.10    0.40       13.16242  0.4563807    9.849483
##   0.10    0.45       13.39655  0.4471064    9.941345
##   0.10    0.50       13.55341  0.4431284   10.011498
##   0.10    0.55       13.67007  0.4413891   10.092751
##   0.10    0.60       13.78927  0.4395460   10.217036
##   0.10    0.65       13.89710  0.4375694   10.336767
##   0.10    0.70       14.00715  0.4352059   10.460078
##   0.10    0.75       14.10795  0.4332948   10.575282
##   0.10    0.80       14.17904  0.4339272   10.662894
##   0.10    0.85       14.24089  0.4343330   10.735624
##   0.10    0.90       14.28246  0.4350946   10.790848
##   0.10    0.95       14.32410  0.4356979   10.845036
##   0.10    1.00       14.36160  0.4365495   10.888850
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.15 and lambda = 0.1.
enetValues1 <- data.frame(obs=perm_test, pred=predict(enetFit, fp_test))
caret::defaultSummary(enetValues1)
##       RMSE   Rsquared        MAE 
## 10.9854253  0.4665241  7.5849687

The elastic net's test-set \(R^2\) of 0.47 is essentially tied with ridge, but its RMSE (10.99) and MAE are the best of all the models, so I recommend the elastic net model to the laboratory.
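
A practical bonus of the elastic net is sparsity. Counting the nonzero coefficients of the final fit shows how few fingerprints the model actually uses (a sketch using elasticnet's predict method; s = 0.15 is the selected fraction, and enetCoefs is an illustrative name):

enetCoefs <- predict(enetFit$finalModel, s = 0.15, mode = "fraction",
                     type = "coefficients")$coefficients
sum(enetCoefs != 0)  # number of fingerprints retained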

6.3

data("ChemicalManufacturingProcess")
X <- ChemicalManufacturingProcess[2:58]  # 57 process/biological predictors
y <- ChemicalManufacturingProcess[1]     # Yield
sum(is.na(X))
## [1] 106

Impute the missing values with k-nearest neighbors (knnImpute also centers and scales the predictors, as the preProcess summary below shows).

prep <- preProcess(X, method=c("knnImpute"))
prep
## Created from 152 samples and 57 variables
## 
## Pre-processing:
##   - centered (57)
##   - ignored (0)
##   - 5 nearest neighbor imputation (57)
##   - scaled (57)
X_imputed <- predict(prep, X)
sum(is.na(X_imputed))
## [1] 0
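
For reference, the predictors with the most missing values can be listed before judging the imputation (a sketch):

head(sort(colSums(is.na(X)), decreasing = TRUE))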

Train-test split, 70-30. The indicator from 6.2 has 165 entries while this data set has 176 rows, so R would silently recycle it; generate a fresh one instead.

set.seed(2023)
in_train <- sample(c(TRUE, FALSE), nrow(X_imputed), replace = TRUE, prob = c(0.7, 0.3))

X_imputed_train <- X_imputed[in_train, ]
X_imputed_test  <- X_imputed[!in_train, ]

y_train <- y[in_train, ]
y_test  <- y[!in_train, ]
enetGrid <- expand.grid(.lambda=c(0, 0.01, 0.1),
                        .fraction=seq(0.05, 1, length.out=20))
# note: tuneLength is ignored when tuneGrid is supplied, so it is omitted
enetFit1 <- train(x=X_imputed_train,
                  y=y_train,
                  method="enet",
                  metric="Rsquared",
                  tuneGrid = enetGrid,
                  trControl=trainControl(method='cv'),
                  preProcess=c('center', 'scale', 'nzv')
                  )
enetFit1
## Elasticnet 
## 
## 122 samples
##  57 predictor
## 
## Pre-processing: centered (56), scaled (56), remove (1) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 109, 110, 109, 110, 110, 110, ... 
## Resampling results across tuning parameters:
## 
##   lambda  fraction  RMSE       Rsquared   MAE      
##   0.00    0.05       1.217496  0.6008515  0.9980246
##   0.00    0.10       1.295821  0.5686964  1.0017868
##   0.00    0.15       1.400105  0.5562201  1.0313118
##   0.00    0.20       1.407037  0.5420843  1.0718490
##   0.00    0.25       1.531749  0.5164215  1.1572958
##   0.00    0.30       1.922847  0.4608039  1.2982429
##   0.00    0.35       2.172605  0.4461493  1.3909402
##   0.00    0.40       2.462178  0.4308700  1.4887419
##   0.00    0.45       2.773216  0.4239986  1.5820519
##   0.00    0.50       3.800187  0.3358725  1.9135588
##   0.00    0.55       4.677914  0.3235321  2.1823593
##   0.00    0.60       5.370345  0.3146906  2.3951199
##   0.00    0.65       6.002394  0.3067693  2.5923909
##   0.00    0.70       6.608647  0.3009509  2.7836815
##   0.00    0.75       7.194915  0.2952938  2.9692166
##   0.00    0.80       7.781423  0.2882713  3.1529841
##   0.00    0.85       8.397650  0.2790664  3.3446707
##   0.00    0.90       9.003324  0.2711198  3.5321354
##   0.00    0.95       9.521716  0.2645712  3.6929968
##   0.00    1.00      10.018380  0.2593721  3.8459100
##   0.01    0.05       1.516640  0.5849878  1.2543266
##   0.01    0.10       1.299864  0.5931283  1.0616577
##   0.01    0.15       1.231442  0.5750992  1.0095907
##   0.01    0.20       1.280832  0.5520518  1.0247302
##   0.01    0.25       1.316432  0.5491537  1.0296771
##   0.01    0.30       1.345982  0.5535104  1.0360158
##   0.01    0.35       1.367106  0.5638224  1.0335807
##   0.01    0.40       1.348014  0.5785550  1.0203630
##   0.01    0.45       1.306522  0.5928784  1.0072681
##   0.01    0.50       1.278121  0.6021565  1.0023196
##   0.01    0.55       1.307261  0.6057863  1.0278534
##   0.01    0.60       1.395829  0.5512804  1.0784749
##   0.01    0.65       1.465136  0.5110213  1.1220110
##   0.01    0.70       1.573883  0.4692190  1.1687608
##   0.01    0.75       1.689740  0.4389957  1.2132226
##   0.01    0.80       1.780102  0.4198436  1.2473541
##   0.01    0.85       1.847676  0.4072679  1.2735888
##   0.01    0.90       1.895158  0.3978885  1.2922191
##   0.01    0.95       1.892530  0.3945499  1.2956496
##   0.01    1.00       1.828961  0.4207451  1.2763176
##   0.10    0.05       1.654123  0.5225578  1.3765373
##   0.10    0.10       1.494510  0.5872995  1.2341190
##   0.10    0.15       1.363707  0.5940295  1.1173360
##   0.10    0.20       1.275245  0.5948095  1.0487989
##   0.10    0.25       1.243258  0.5754255  1.0235543
##   0.10    0.30       1.252725  0.5633482  1.0096142
##   0.10    0.35       1.274283  0.5587453  1.0143182
##   0.10    0.40       1.298047  0.5550045  1.0222547
##   0.10    0.45       1.379930  0.5425303  1.0508806
##   0.10    0.50       1.509656  0.5358381  1.0919461
##   0.10    0.55       1.560430  0.5385821  1.1083561
##   0.10    0.60       1.588268  0.5424496  1.1159818
##   0.10    0.65       1.626385  0.5453324  1.1265429
##   0.10    0.70       1.659798  0.5464381  1.1371331
##   0.10    0.75       1.657991  0.5473027  1.1409491
##   0.10    0.80       1.688724  0.5632703  1.1593949
##   0.10    0.85       1.774131  0.5305561  1.2054930
##   0.10    0.90       1.857591  0.4911887  1.2419082
##   0.10    0.95       1.931609  0.4644963  1.2713769
##   0.10    1.00       1.986795  0.4538738  1.2922078
## 
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were fraction = 0.55 and lambda = 0.01.

The resampled \(R^2\) for the optimal model (fraction = 0.55, \(\lambda\) = 0.01) is 0.61.
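
The tuning profile across the grid can be plotted with caret's ggplot method (a sketch):

ggplot(enetFit1)  # resampled R^2 across fraction, one curve per lambda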

Evaluate the model on the test set.

enetValues2 <- data.frame(obs=y_test, pred=predict(enetFit1, X_imputed_test))
caret::defaultSummary(enetValues2)
##      RMSE  Rsquared       MAE 
## 1.0397350 0.6703975 0.8490078

The test-set \(R^2\) is 0.67, somewhat higher than the resampled \(R^2\) of 0.61, so the model generalizes well to the held-out data.
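
As a quick diagnostic, observed versus predicted yield on the test set (a sketch, base graphics):

plot(y_test, predict(enetFit1, X_imputed_test),
     xlab = "Observed yield", ylab = "Predicted yield")
abline(0, 1, lty = 2)  # perfect-prediction reference line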

varImp(enetFit1)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   64.94
## ManufacturingProcess36   63.16
## ManufacturingProcess17   61.88
## BiologicalMaterial06     61.58
## BiologicalMaterial12     60.23
## BiologicalMaterial03     50.89
## ManufacturingProcess09   48.26
## ManufacturingProcess06   47.05
## BiologicalMaterial02     46.06
## ManufacturingProcess31   44.71
## ManufacturingProcess29   42.29
## BiologicalMaterial08     38.43
## BiologicalMaterial04     38.28
## BiologicalMaterial01     38.10
## ManufacturingProcess11   36.79
## BiologicalMaterial11     36.50
## ManufacturingProcess30   33.26
## ManufacturingProcess02   32.48
## ManufacturingProcess33   31.30

6 of the top 10 predictors are manufacturing process variables, so process variables dominate the importance list.
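
caret can also plot the ranking directly (a sketch):

plot(varImp(enetFit1), top = 20)  # dotplot of the 20 most important predictors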

top8 <- c("ManufacturingProcess32", "BiologicalMaterial06", "ManufacturingProcess13", "ManufacturingProcess09",
          "ManufacturingProcess17", "BiologicalMaterial02", "BiologicalMaterial03", "ManufacturingProcess36")
top8_X_imputed <- X_imputed %>% 
  select(all_of(top8))
corrplot::corrplot(cor(top8_X_imputed), method = 'circle', type = "lower")
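
To relate these top predictors to the response itself, their correlations with Yield can be computed (a sketch; uses the imputed predictors and the raw Yield column):

cor(top8_X_imputed, ChemicalManufacturingProcess$Yield)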