#install.packages("AppliedPredictiveModeling")
library(AppliedPredictiveModeling)
#AppliedPredictiveModeling::getPackages(6)
data(permeability)
Answer: There are 1107 features, of which 719 are near 0 variables. So this leaves us with 1107-719, 388 variables which we can potentially use in the prediction.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(AppliedPredictiveModeling)
#AppliedPredictiveModeling::getPackages(6)
data(permeability)
dim(fingerprints)
## [1] 165 1107
length(nearZeroVar(fingerprints))
## [1] 719
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
set.seed(100)
# select only columns which are not nearZeroVariables
fingerprints_wo_0cols <- fingerprints[,-nearZeroVar(fingerprints)]
dim(fingerprints_wo_0cols)
## [1] 165 388
# create a training partition on the Y variable and store the rows selected in trainset
trainset <- createDataPartition(permeability, p=0.8, list=FALSE)
# create a cross-validation control
ctrl <- trainControl(method = "cv", number = 10)
# create training and testing X,Y data using rows as per the partition selected in "trainset" and on the reduced set columns (post filtering out near zeros).
fingerprints_training = fingerprints_wo_0cols[trainset,]
permeability_training = permeability[trainset,]
# by placing a "-" we select all rows of X & Y which are not the ones selected in training-set
fingerprints_testing = fingerprints_wo_0cols[-trainset,]
permeability_testing = permeability[-trainset,]
# Tune partial least squares
plsTune <- train(x = fingerprints_training,
y = permeability_training,
method = "pls",
metric='Rsquared',
tuneLength = 20,
# tuneGrid = expand.grid(ncomp = 1:25),
trControl = ctrl,
preProcess=c('center', 'scale')
)
plsTune
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 119, 121, 119, 120, 120, 120, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 12.89902 0.3366671 9.723079
## 2 11.57073 0.4528188 8.008647
## 3 11.74004 0.4540994 8.638716
## 4 11.93784 0.4435949 8.785540
## 5 11.72550 0.4550101 8.696446
## 6 11.50509 0.4693658 8.546633
## 7 11.39297 0.4871664 8.653850
## 8 11.13762 0.5051883 8.558643
## 9 11.18859 0.5013221 8.642897
## 10 11.29239 0.4985107 8.727017
## 11 11.47258 0.4836285 8.824869
## 12 11.42934 0.4880340 8.975986
## 13 11.74486 0.4702584 9.189660
## 14 12.08391 0.4502299 9.380611
## 15 12.27229 0.4497430 9.560355
## 16 12.53866 0.4429663 9.685105
## 17 12.57991 0.4380633 9.684354
## 18 12.60036 0.4371346 9.779651
## 19 12.65043 0.4323267 9.815696
## 20 12.94446 0.4207868 10.028232
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was ncomp = 8.
plot(plsTune$results$Rsquared)
The optimal number of latent variables is 9, which can be seen from the chart above.
plsTune$results[9,]
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 9 9 11.18859 0.5013221 8.642897 2.332769 0.2286991 1.788398
prediction_testing <- predict(object = plsTune, newdata = fingerprints_testing)
stats_testing <- postResample(pred = prediction_testing, obs = permeability_testing)
stats_testing
## RMSE Rsquared MAE
## 11.0223004 0.5357105 7.9250542
plot(permeability_testing~prediction_testing)
postResample(pred = prediction_testing, obs = permeability_testing)
## RMSE Rsquared MAE
## 11.0223004 0.5357105 7.9250542
Answer: It is interesting to note that the testing set R^2 is higher than training.
set.seed(100)
ridgeGrid <- data.frame(.lambda = seq(0.001, 0.5, length = 25))
ridgeTune <- train(x = fingerprints_training,
y = permeability_training,
method = "ridge",
metric='Rsquared',
tuneGrid = ridgeGrid,
trControl = ctrl,
preProc = c("center", "scale"))
ridgeTune
## Ridge Regression
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 120, 119, 121, 120, 119, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.00100000 8374.84921 0.1548766 4478.724793
## 0.02179167 13.69534 0.3602158 10.311567
## 0.04258333 12.90226 0.4045338 9.691977
## 0.06337500 12.41151 0.4322788 9.296068
## 0.08416667 12.17953 0.4486337 9.106024
## 0.10495833 12.04575 0.4597633 8.982475
## 0.12575000 11.94947 0.4692236 8.911641
## 0.14654167 11.89854 0.4758843 8.889335
## 0.16733333 11.87122 0.4815072 8.876062
## 0.18812500 11.86686 0.4859442 8.885368
## 0.20891667 11.87382 0.4899514 8.912888
## 0.22970833 11.88634 0.4933107 8.928839
## 0.25050000 11.91520 0.4962512 8.957070
## 0.27129167 11.95089 0.4986777 8.982170
## 0.29208333 11.99695 0.5007229 9.016407
## 0.31287500 12.04647 0.5027796 9.052256
## 0.33366667 12.10295 0.5045281 9.089425
## 0.35445833 12.16517 0.5060785 9.132745
## 0.37525000 12.23347 0.5074451 9.187169
## 0.39604167 12.30488 0.5087224 9.257600
## 0.41683333 12.38044 0.5098781 9.333471
## 0.43762500 12.46063 0.5109085 9.415252
## 0.45841667 12.54475 0.5118412 9.498672
## 0.47920833 12.63233 0.5127017 9.583252
## 0.50000000 12.72327 0.5134888 9.671607
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was lambda = 0.5.
Answer: Just looking at the R^2 in-sample I would select the ridge regression model , given the higher R^2 in-sample (52% vs 49% for PLS)
The models are quite similar in terms of R^2 i.e : the explanatory power is quite high as measured by 50%. However a lot of variance in Y is still unexplained. Until more variance is accounted for by a model I would continue withe the experiments.
``