library(AppliedPredictiveModeling)
library(pls)
library(caret)
library(tidyverse)
library(caTools)
library(imputeR)
library(RANN)
library(corrplot)
data(permeability)
str(permeability)
## num [1:165, 1] 12.52 1.12 19.41 1.73 1.68 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:165] "1" "2" "3" "4" ...
## ..$ : chr "permeability"
fp <- fingerprints[,which(caret::nearZeroVar(fingerprints, saveMetrics = TRUE)$nzv == FALSE)]
length(colnames(fp))
## [1] 388
388 predictors are left.
set.seed(2023)
sample <- sample(c(TRUE, FALSE), nrow(fp), replace=TRUE, prob=c(0.7,0.3))
fp_train <- fp[sample, ]
fp_test <- fp[!sample, ]
perm_train <- permeability[sample, ]
perm_test <- permeability[!sample, ]
Fit PLS model to the training data.
plsFit <- train(x=fp_train,
y=perm_train,
method='pls',
metric='Rsquared',
tuneLength=20,
trControl=trainControl(method='cv'),
preProcess=c('center', 'scale')
)
plsFit
## Partial Least Squares
##
## 112 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 103, 100, 100, 101, 100, 100, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.52985 0.4064140 10.622771
## 2 12.13852 0.5477037 8.895561
## 3 12.42477 0.5294471 9.624263
## 4 12.70108 0.5169363 10.033632
## 5 12.99927 0.4943773 10.018969
## 6 13.08905 0.4937630 9.544333
## 7 13.61761 0.4616061 9.930939
## 8 14.01740 0.4467908 10.478495
## 9 14.58091 0.4051257 11.013284
## 10 14.57757 0.4099755 11.154281
## 11 14.91762 0.3967600 11.386112
## 12 15.28398 0.3830693 11.615241
## 13 15.69304 0.3565963 11.856757
## 14 16.02992 0.3402901 11.933335
## 15 16.33094 0.3196311 12.213135
## 16 16.68416 0.3045479 12.562069
## 17 16.93708 0.2880668 12.784575
## 18 17.24787 0.2829286 12.968920
## 19 17.41798 0.2784598 13.051885
## 20 17.68560 0.2663404 13.426939
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 2.
5 latent variables are optimal. The corresponding \(R^2\) is 0.55.
y_pred <- predict(plsFit, fp_test)
plsValues1 <- data.frame(obs=perm_test, pred=y_pred)
caret::defaultSummary(plsValues1)
## RMSE Rsquared MAE
## 11.5827866 0.3432959 7.9126393
\(R^2\) for the test set is 0.26.
lmFit <- train(x=fp_train,
y=perm_train,
method='lm',
metric='Rsquared',
tuneLength=20,
trControl=trainControl(method='cv'),
preProcess=c('center', 'scale')
)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
lmValues1 <- data.frame(obs=perm_test, pred=predict(lmFit, fp_test))
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
caret::defaultSummary(lmValues1)
## RMSE Rsquared MAE
## 28.8481320 0.1502712 17.3058131
ridgeGrid <- data.frame(.lambda=seq(0.09,0.19,length=15))
ridgeRegFit <- train(fp_train, perm_train,
method="ridge",
tuneGrid = ridgeGrid,
trControl=trainControl(method='cv'),
preProc=c("center", "scale"))
ridgeRegFit
## Ridge Regression
##
## 112 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 101, 100, 101, 101, 101, 101, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.09000000 13.91542 0.4570979 10.71888
## 0.09714286 13.80724 0.4636023 10.62552
## 0.10428571 13.74813 0.4679485 10.57313
## 0.11142857 13.70987 0.4715927 10.53741
## 0.11857143 13.66342 0.4751519 10.49449
## 0.12571429 13.63209 0.4780563 10.46303
## 0.13285714 13.59427 0.4813051 10.42907
## 0.14000000 13.57259 0.4838577 10.40943
## 0.14714286 13.54868 0.4865985 10.38070
## 0.15428571 13.53189 0.4889904 10.35929
## 0.16142857 13.51768 0.4911542 10.33984
## 0.16857143 13.49583 0.4940736 10.32289
## 0.17571429 13.50365 0.4948727 10.31646
## 0.18285714 13.49655 0.4969546 10.29567
## 0.19000000 13.49523 0.4986061 10.28384
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 0.19.
ridgeRegValues1 <- data.frame(obs=perm_test, pred=predict(ridgeRegFit, fp_test))
caret::defaultSummary(ridgeRegValues1)
## RMSE Rsquared MAE
## 12.4433089 0.4758471 9.2618412
\(R^2\) is 0.34 for \(\lambda\) = 0.126.
enetGrid <- expand.grid(.lambda=c(0,0.01, 0.1),
.fraction=seq(0.05, 1, length=20))
enetFit <- train(fp_train, perm_train,
method="enet",
tuneGrid=enetGrid,
trControl = trainControl(method='cv'),
preProc = c("center", "scale"))
## Warning: model fit failed for Fold04: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold06: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
enetFit
## Elasticnet
##
## 112 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 101, 101, 101, 100, 102, 100, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0.00 0.05 13.49085 0.4390815 10.818087
## 0.00 0.10 13.52793 0.3931288 10.148836
## 0.00 0.15 13.49570 0.3829701 9.990216
## 0.00 0.20 13.54795 0.3783042 10.243401
## 0.00 0.25 13.66982 0.3784918 10.352493
## 0.00 0.30 13.99155 0.3659896 10.525835
## 0.00 0.35 14.38458 0.3537674 10.714959
## 0.00 0.40 14.70947 0.3421817 10.860545
## 0.00 0.45 15.00643 0.3347216 11.039789
## 0.00 0.50 15.32483 0.3287930 11.204226
## 0.00 0.55 15.63020 0.3236367 11.391611
## 0.00 0.60 15.87683 0.3186143 11.552603
## 0.00 0.65 16.07157 0.3149693 11.618335
## 0.00 0.70 16.28390 0.3118922 11.733622
## 0.00 0.75 16.57148 0.3054544 11.983895
## 0.00 0.80 16.83430 0.2992465 12.221619
## 0.00 0.85 17.10483 0.2931047 12.435122
## 0.00 0.90 17.37499 0.2874976 12.653240
## 0.00 0.95 17.64191 0.2832744 12.908339
## 0.00 1.00 17.88991 0.2797537 13.198035
## 0.01 0.05 22.51503 0.4426000 16.282726
## 0.01 0.10 33.36855 0.4144366 23.513085
## 0.01 0.15 44.44389 0.3954547 30.761194
## 0.01 0.20 55.27020 0.3810975 38.005037
## 0.01 0.25 66.16109 0.3643499 45.540471
## 0.01 0.30 77.32099 0.3444027 53.314684
## 0.01 0.35 88.36872 0.3300942 61.105363
## 0.01 0.40 99.39284 0.3176869 68.898317
## 0.01 0.45 110.32577 0.3094629 76.559295
## 0.01 0.50 121.14730 0.3073861 84.042323
## 0.01 0.55 131.92676 0.3049393 91.385017
## 0.01 0.60 142.74110 0.3034785 98.762239
## 0.01 0.65 153.56296 0.3015215 106.138732
## 0.01 0.70 164.42936 0.2974431 113.560364
## 0.01 0.75 175.40014 0.2916935 121.071819
## 0.01 0.80 186.52736 0.2862411 128.640321
## 0.01 0.85 197.66741 0.2832659 136.171302
## 0.01 0.90 208.54404 0.2810403 143.509120
## 0.01 0.95 219.37310 0.2788047 150.816385
## 0.01 1.00 230.19182 0.2754254 158.108170
## 0.10 0.05 12.93279 0.5130220 10.515766
## 0.10 0.10 12.29215 0.5145981 9.330316
## 0.10 0.15 12.15497 0.5123721 9.120304
## 0.10 0.20 12.37399 0.4904010 9.445715
## 0.10 0.25 12.61572 0.4755153 9.665847
## 0.10 0.30 12.79141 0.4696586 9.771812
## 0.10 0.35 12.96353 0.4639780 9.809157
## 0.10 0.40 13.16242 0.4563807 9.849483
## 0.10 0.45 13.39655 0.4471064 9.941345
## 0.10 0.50 13.55341 0.4431284 10.011498
## 0.10 0.55 13.67007 0.4413891 10.092751
## 0.10 0.60 13.78927 0.4395460 10.217036
## 0.10 0.65 13.89710 0.4375694 10.336767
## 0.10 0.70 14.00715 0.4352059 10.460078
## 0.10 0.75 14.10795 0.4332948 10.575282
## 0.10 0.80 14.17904 0.4339272 10.662894
## 0.10 0.85 14.24089 0.4343330 10.735624
## 0.10 0.90 14.28246 0.4350946 10.790848
## 0.10 0.95 14.32410 0.4356979 10.845036
## 0.10 1.00 14.36160 0.4365495 10.888850
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.15 and lambda = 0.1.
enetValues1 <- data.frame(obs=perm_test, pred=predict(enetFit, fp_test))
caret::defaultSummary(enetValues1)
## RMSE Rsquared MAE
## 10.9854253 0.4665241 7.5849687
\(R^2\) is 0.39, which is the best so far. I recommend this elastic net model to the laboratory.
data("ChemicalManufacturingProcess")
X <- ChemicalManufacturingProcess[2:58]
y <- ChemicalManufacturingProcess[1]
sum(is.na(X))
## [1] 106
Impute the missing values.
prep <- preProcess(X, method=c("knnImpute"))
prep
## Created from 152 samples and 57 variables
##
## Pre-processing:
## - centered (57)
## - ignored (0)
## - 5 nearest neighbor imputation (57)
## - scaled (57)
X_imputed <- predict(prep, X)
sum(is.na(X_imputed))
## [1] 0
Train-test split, 70-30
X_imputed_train <- X_imputed[sample, ]
X_imputed_test <- X_imputed[!sample, ]
y_train <- y[sample, ]
y_test <- y[!sample, ]
enetGrid <- expand.grid(.lambda=c(0,0.01, 0.1),
.fraction=seq(0.05, 1, length=20))
enetFit1 <- train(x=X_imputed_train,
y=y_train,
method="enet",
metric="Rsquared",
tuneGrid = enetGrid,
tuneLength=20,
trControl=trainControl(method='cv'),
preProcess=c('center', 'scale', 'nzv')
)
enetFit1
## Elasticnet
##
## 122 samples
## 57 predictor
##
## Pre-processing: centered (56), scaled (56), remove (1)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 109, 110, 109, 110, 110, 110, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0.00 0.05 1.217496 0.6008515 0.9980246
## 0.00 0.10 1.295821 0.5686964 1.0017868
## 0.00 0.15 1.400105 0.5562201 1.0313118
## 0.00 0.20 1.407037 0.5420843 1.0718490
## 0.00 0.25 1.531749 0.5164215 1.1572958
## 0.00 0.30 1.922847 0.4608039 1.2982429
## 0.00 0.35 2.172605 0.4461493 1.3909402
## 0.00 0.40 2.462178 0.4308700 1.4887419
## 0.00 0.45 2.773216 0.4239986 1.5820519
## 0.00 0.50 3.800187 0.3358725 1.9135588
## 0.00 0.55 4.677914 0.3235321 2.1823593
## 0.00 0.60 5.370345 0.3146906 2.3951199
## 0.00 0.65 6.002394 0.3067693 2.5923909
## 0.00 0.70 6.608647 0.3009509 2.7836815
## 0.00 0.75 7.194915 0.2952938 2.9692166
## 0.00 0.80 7.781423 0.2882713 3.1529841
## 0.00 0.85 8.397650 0.2790664 3.3446707
## 0.00 0.90 9.003324 0.2711198 3.5321354
## 0.00 0.95 9.521716 0.2645712 3.6929968
## 0.00 1.00 10.018380 0.2593721 3.8459100
## 0.01 0.05 1.516640 0.5849878 1.2543266
## 0.01 0.10 1.299864 0.5931283 1.0616577
## 0.01 0.15 1.231442 0.5750992 1.0095907
## 0.01 0.20 1.280832 0.5520518 1.0247302
## 0.01 0.25 1.316432 0.5491537 1.0296771
## 0.01 0.30 1.345982 0.5535104 1.0360158
## 0.01 0.35 1.367106 0.5638224 1.0335807
## 0.01 0.40 1.348014 0.5785550 1.0203630
## 0.01 0.45 1.306522 0.5928784 1.0072681
## 0.01 0.50 1.278121 0.6021565 1.0023196
## 0.01 0.55 1.307261 0.6057863 1.0278534
## 0.01 0.60 1.395829 0.5512804 1.0784749
## 0.01 0.65 1.465136 0.5110213 1.1220110
## 0.01 0.70 1.573883 0.4692190 1.1687608
## 0.01 0.75 1.689740 0.4389957 1.2132226
## 0.01 0.80 1.780102 0.4198436 1.2473541
## 0.01 0.85 1.847676 0.4072679 1.2735888
## 0.01 0.90 1.895158 0.3978885 1.2922191
## 0.01 0.95 1.892530 0.3945499 1.2956496
## 0.01 1.00 1.828961 0.4207451 1.2763176
## 0.10 0.05 1.654123 0.5225578 1.3765373
## 0.10 0.10 1.494510 0.5872995 1.2341190
## 0.10 0.15 1.363707 0.5940295 1.1173360
## 0.10 0.20 1.275245 0.5948095 1.0487989
## 0.10 0.25 1.243258 0.5754255 1.0235543
## 0.10 0.30 1.252725 0.5633482 1.0096142
## 0.10 0.35 1.274283 0.5587453 1.0143182
## 0.10 0.40 1.298047 0.5550045 1.0222547
## 0.10 0.45 1.379930 0.5425303 1.0508806
## 0.10 0.50 1.509656 0.5358381 1.0919461
## 0.10 0.55 1.560430 0.5385821 1.1083561
## 0.10 0.60 1.588268 0.5424496 1.1159818
## 0.10 0.65 1.626385 0.5453324 1.1265429
## 0.10 0.70 1.659798 0.5464381 1.1371331
## 0.10 0.75 1.657991 0.5473027 1.1409491
## 0.10 0.80 1.688724 0.5632703 1.1593949
## 0.10 0.85 1.774131 0.5305561 1.2054930
## 0.10 0.90 1.857591 0.4911887 1.2419082
## 0.10 0.95 1.931609 0.4644963 1.2713769
## 0.10 1.00 1.986795 0.4538738 1.2922078
##
## Rsquared was used to select the optimal model using the largest value.
## The final values used for the model were fraction = 0.55 and lambda = 0.01.
Resampled performance metric, \(R^2\), for the model fit is 0.59.
Evaluate model
enetValues2 <- data.frame(obs=y_test, pred=predict(enetFit1, X_imputed_test))
caret::defaultSummary(enetValues2)
## RMSE Rsquared MAE
## 1.0397350 0.6703975 0.8490078
The test set \(R^2\) = 0.58 for fraction = 0.4 and lambda = 0.1. This is close to the training set \(R^2\).
varImp(enetFit1)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 64.94
## ManufacturingProcess36 63.16
## ManufacturingProcess17 61.88
## BiologicalMaterial06 61.58
## BiologicalMaterial12 60.23
## BiologicalMaterial03 50.89
## ManufacturingProcess09 48.26
## ManufacturingProcess06 47.05
## BiologicalMaterial02 46.06
## ManufacturingProcess31 44.71
## ManufacturingProcess29 42.29
## BiologicalMaterial08 38.43
## BiologicalMaterial04 38.28
## BiologicalMaterial01 38.10
## ManufacturingProcess11 36.79
## BiologicalMaterial11 36.50
## ManufacturingProcess30 33.26
## ManufacturingProcess02 32.48
## ManufacturingProcess33 31.30
7 of the top 10 predictors are Manufacturing.
top8 <- c("ManufacturingProcess32","BiologicalMaterial06","ManufacturingProcess13", "ManufacturingProcess09",
"ManufacturingProcess17","BiologicalMaterial02","BiologicalMaterial03","ManufacturingProcess36")
top8_X_imputed <- X_imputed %>%
select(all_of(top8))
corrplot::corrplot(cor(top8_X_imputed), method = 'circle', type="lower")