library(AppliedPredictiveModeling)
library(caret)
library(tidyverse)
library(RANN)
library(corrplot)
data(permeability)
fp <- as.data.frame(fingerprints)
The matrix fingerprints contains the 1,107 binary molecular predictors for the 165 compounds, while permeability contains permeability response.
df <- fp %>% mutate(permeability = permeability)
ncol(df)-1
## [1] 1107
near0 <- nearZeroVar(fingerprints)
df <- df[-near0]
ncol(df)-1
## [1] 388
The fingerprints matrix has 719 predictors that have low frequencies. Filter them out leaves us with 388 predictors left for modeling.
set.seed(123)
split <- createDataPartition(df$permeability, times = 1, p = 0.8, list = FALSE)
train <- df[split, ]
test <- df[-split, ]
pls <- train(
permeability ~ ., data = train, method = "pls",
center = TRUE,
trControl = trainControl("cv", number = 10),
tuneLength = 25
)
pls
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 121, 121, 118, 119, 119, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.91024 0.2908286 10.752868
## 2 11.87088 0.4689347 8.684835
## 3 11.75764 0.4908305 8.904269
## 4 12.01773 0.4912720 9.367372
## 5 11.82378 0.5150347 9.055927
## 6 11.44294 0.5321660 8.498770
## 7 11.41370 0.5373659 8.523544
## 8 11.74730 0.5166471 8.903613
## 9 12.20011 0.4974678 9.304159
## 10 12.61005 0.4728003 9.539143
## 11 13.09291 0.4362551 9.890107
## 12 13.17487 0.4290470 9.912101
## 13 13.18343 0.4329855 9.897183
## 14 13.50906 0.4096387 10.151260
## 15 13.76110 0.3942498 10.249432
## 16 13.98671 0.3836236 10.437043
## 17 14.14000 0.3758722 10.550890
## 18 14.25007 0.3716504 10.575748
## 19 14.36508 0.3667410 10.694289
## 20 14.27893 0.3725028 10.664425
## 21 14.32757 0.3731680 10.760465
## 22 14.42754 0.3666868 10.809506
## 23 14.45381 0.3672322 10.785738
## 24 14.54159 0.3638389 10.764917
## 25 14.68244 0.3612149 10.792626
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 7.
The optimal ncomp is 7 with an estimated \(R^2\) of 0.537.
pls_predict <- predict(pls, test)
acc <- data.frame(obs = test$permeability, pred = pls_predict)
colnames(acc) <- c('obs','pred')
defaultSummary(acc)
## RMSE Rsquared MAE
## 11.9762238 0.3497131 8.4615399
The test estimate of \(R^2\) is 0.35.
set.seed(124)
lasso <- train(permeability ~ .,
data = train,
method='lasso',
metric='Rsquared',
tuneGrid=data.frame(.fraction = seq(0, 0.5, by=0.05)),
trControl=trainControl(method='cv'),
preProcess=c('center','scale')
)
lasso_predict <- predict(lasso, test)
acc <- data.frame(obs = test$permeability, pred = lasso_predict)
colnames(acc) <- c('obs','pred')
defaultSummary(acc)
## RMSE Rsquared MAE
## 11.0952801 0.3661505 7.3149745
set.seed(125)
ridge <- train(permeability ~ .,
data = train,
method='ridge',
metric='Rsquared',
tuneGrid=data.frame(.lambda = seq(0, 1, by=0.1)),
trControl=trainControl(method='cv'),
preProcess=c('center','scale')
)
ridge_predict <- predict(ridge, test)
acc <- data.frame(obs = test$permeability, pred = ridge_predict)
colnames(acc) <- c('obs','pred')
defaultSummary(acc)
## RMSE Rsquared MAE
## 19.9078007 0.3695863 14.7915075
All three models, pls, lasso, and ridge have similar \(R^2\) values. PLS however has the lowest \(R^2\).
data(ChemicalManufacturingProcess)
cmp <- data.frame(ChemicalManufacturingProcess)
The matrix processPredictors contains the 57 predictors (12 describing the input biological material and 45 describing the process predictors) for the 176 manufacturing runs. yield contains the percent yield for each run.
preProcess <- preProcess(cmp, method = c("knnImpute"))
df2 <- predict(preProcess, ChemicalManufacturingProcess)
near02 <- nearZeroVar(df2)
df2 <- df2[-near02]
split2 <- createDataPartition(df2$Yield, times = 1, p = 0.8, list = FALSE)
train2 <- df2[split, ]
test2 <- df2[-split, ]
pls2 <- train( Yield ~ ., data = train2, method = "pls",
center = TRUE,
trControl = trainControl("cv", number = 10),
tuneLength = 25
)
pls2
## Partial Least Squares
##
## 133 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 120, 120, 119, 120, 120, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 0.7366876 0.5030891 0.5997426
## 2 0.6590654 0.5768740 0.5265507
## 3 0.6331098 0.6100503 0.5197640
## 4 0.6318924 0.6221822 0.5208045
## 5 0.6293016 0.6368549 0.5196403
## 6 0.6294737 0.6412500 0.5169550
## 7 0.6250970 0.6436297 0.5212808
## 8 0.6280093 0.6417660 0.5242772
## 9 0.6470408 0.6245440 0.5306484
## 10 0.6566609 0.6144314 0.5334303
## 11 0.6573081 0.6141465 0.5361606
## 12 0.6662026 0.6092021 0.5420083
## 13 0.6733999 0.6083402 0.5429599
## 14 0.6868973 0.6013636 0.5541103
## 15 0.6975769 0.5994795 0.5621204
## 16 0.7017185 0.6019151 0.5660800
## 17 0.7145361 0.5942577 0.5695616
## 18 0.7411623 0.5870942 0.5820990
## 19 0.7779535 0.5766898 0.5948971
## 20 0.8199084 0.5672641 0.6076388
## 21 0.8739428 0.5523103 0.6277127
## 22 0.9283078 0.5363993 0.6483694
## 23 1.0015173 0.5211964 0.6768896
## 24 1.0998810 0.5038800 0.7133812
## 25 1.2074458 0.4945373 0.7479405
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 7.
The optimal ncomp is 4 with a \(R^2\) of 0.65.
pls2_predict <- predict(pls2, test2)
acc <- data.frame(obs = test2$Yield, pred = pls2_predict)
colnames(acc) <- c('obs','pred')
defaultSummary(acc)
## RMSE Rsquared MAE
## 0.8374605 0.3625748 0.6651504
varImp(pls2)
## pls variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess09 97.23
## ManufacturingProcess13 97.13
## ManufacturingProcess17 89.01
## ManufacturingProcess36 83.51
## BiologicalMaterial03 76.32
## BiologicalMaterial06 72.82
## BiologicalMaterial02 72.54
## ManufacturingProcess12 64.05
## ManufacturingProcess33 61.88
## BiologicalMaterial08 60.73
## BiologicalMaterial12 60.39
## ManufacturingProcess06 60.16
## ManufacturingProcess11 59.07
## BiologicalMaterial11 58.67
## BiologicalMaterial04 53.24
## BiologicalMaterial01 52.42
## ManufacturingProcess28 51.50
## ManufacturingProcess04 49.96
## ManufacturingProcess10 39.74
Of the top 20 predictors, it seems that there is a mix of both Manufacturing Process and Biological Material. However, the top five predictors are Manufacturing Processes.
df2 %>%
select(c('ManufacturingProcess32','ManufacturingProcess13','ManufacturingProcess09','ManufacturingProcess17','ManufacturingProcess36',
'BiologicalMaterial03', 'BiologicalMaterial06', 'BiologicalMaterial02', 'ManufacturingProcess12', 'BiologicalMaterial08', 'Yield')) %>%
cor() %>%
corrplot(method = 'circle')