data(tecator)
# ?tecator
str(absorp)
## num [1:215, 1:100] 2.62 2.83 2.58 2.82 2.79 ...
str(endpoints)
## num [1:215, 1:3] 60.5 46 71 72.8 58.3 44 44 69.3 61.4 61.4 ...
moisture = endpoints[,1]
fat = endpoints[,2]
protein = endpoints[,3]
# Split data
set.seed(123)
trainIndex <- createDataPartition(moisture, p = 0.8, list = FALSE)
absorp_df <- as.data.frame(absorp)
x_train <- absorp_df[trainIndex, ]
x_test <- absorp_df[-trainIndex, ]
y_train <- moisture[trainIndex]
y_test <- moisture[-trainIndex]
# Pre-process
preProc <- preProcess(x_train, method = c("center", "scale"))
x_train <- predict(preProc, x_train)
x_test <- predict(preProc, x_test)
# OLS
set.seed(100)
indx <- createFolds(y_train, returnTrain = TRUE)
ctrl <- trainControl(method = "cv", index = indx)
set.seed(100)
lmTune0 <- train(x = x_train, y = y_train,
method = "lm",
trControl = ctrl)
testResults <- data.frame(obs = y_test,
OLS = predict(lmTune0, newdata = x_test))
# PLS
set.seed(100)
plsTune <- train(x = x_train, y = y_train,
method = "pls",
tuneGrid = expand.grid(ncomp = 1:50),
trControl = ctrl)
testResults$PLS <- predict(plsTune, newdata = x_test)
# PCR
set.seed(100)
pcrTune <- train(x = x_train, y = y_train,
method = "pcr",
tuneGrid = expand.grid(ncomp = 1:50),
trControl = ctrl)
testResults$PCR <- predict(pcrTune, newdata = x_test)
# Ridge Regression
ridgeGrid <- expand.grid(lambda = seq(0, .1, length = 10))
set.seed(100)
ridgeTune <- train(x = x_train, y = y_train,
method = "ridge",
tuneGrid = ridgeGrid,
trControl = ctrl)
testResults$Ridge <- predict(ridgeTune, newdata = x_test)
# ENET
enetGrid <- expand.grid(lambda = c(0, 0.01, .1),
fraction = seq(.05, 1, length = 20))
set.seed(100)
enetTune <- train(x = x_train, y = y_train,
method = "enet",
tuneGrid = enetGrid,
trControl = ctrl)
testResults$ENET <- predict(enetTune, newdata = x_test)
# Best params
plsTune$bestTune
## ncomp
## 17 17
pcrTune$bestTune
## ncomp
## 28 28
ridgeTune$bestTune
## lambda
## 1 0
enetTune$bestTune
## fraction lambda
## 1 0.05 0
The optimal tuning parameters selected via cross-validation were:
PCR: ncomp = 17
PLS: ncomp = 28
Ridge: lambda = 0
Elastic Net: lambda = 0, fraction = 0.05
# Compare models
data.frame(rbind(OLS = postResample(pred = testResults$OLS, obs = y_test),
PLS = postResample(pred = testResults$PLS, obs = y_test),
PCR = postResample(pred = testResults$PCR, obs = y_test),
Ridge = postResample(pred = testResults$Ridge, obs = y_test),
ENET = postResample(pred = testResults$ENET, obs = y_test),
SVM = postResample(pred = testResults$SVM, obs = y_test),
KNN = postResample(pred = testResults$KNN, obs = y_test),
MARS = postResample(pred = testResults$MARS, obs = y_test),
NNet = postResample(pred = testResults$NN, obs = y_test),
NNetPCA = postResample(pred = testResults$NN_PCA, obs = y_test)))
## RMSE Rsquared MAE
## OLS 3.545768 0.8771040 1.5960313
## PLS 1.615357 0.9738681 1.1855205
## PCR 1.696345 0.9736020 1.2714619
## Ridge 3.545632 0.8771120 1.5959803
## ENET 1.431302 0.9799521 1.0447150
## SVM 2.588153 0.9245771 1.8895282
## KNN 7.587939 0.5543348 5.3562500
## MARS 1.971870 0.9569271 1.4368790
## NNet 0.561155 0.9966528 0.4408398
## NNetPCA 7.281670 0.3977038 5.6333212
Among all models tested, the neural network trained on the original predictors (NNet) had the best predictive performance, achieving the lowest RMSE (0.56), highest R2 (0.997), and lowest MAE (0.44). Models like KNN and the neural network trained on PCA-transformed predictors (NNetPCA) performed the worst, with high RMSE and low R2, indicating poor predictive ability. This suggests that, for this dataset, the neural network benefits from the full set of original predictors and that PCA pre-processing may have removed important information.
library(AppliedPredictiveModeling)
data(permeability)
str(fingerprints)
## num [1:165, 1:1107] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:165] "1" "2" "3" "4" ...
## ..$ : chr [1:1107] "X1" "X2" "X3" "X4" ...
str(permeability)
## num [1:165, 1] 12.52 1.12 19.41 1.73 1.68 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:165] "1" "2" "3" "4" ...
## ..$ : chr "permeability"
nzv <- nearZeroVar(fingerprints)
fingerprints_filtered <- fingerprints[, -nzv]
dim(fingerprints_filtered)
## [1] 165 388
388 predictors remain for modeling.
set.seed(123)
trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
trainX <- fingerprints_filtered[trainIndex, ]
trainY <- permeability[trainIndex]
testX <- fingerprints_filtered[-trainIndex, ]
testY <- permeability[-trainIndex]
# PLS
ctrl <- trainControl(method = "cv", number = 10)
plsFit <- train(
x = trainX,
y = trainY,
method = "pls",
preProcess = c("center", "scale"),
tuneLength = 20,
trControl = ctrl
)
plsFit
## Partial Least Squares
##
## 133 samples
## 388 predictors
##
## Pre-processing: centered (388), scaled (388)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 121, 121, 118, 119, 119, 119, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 13.31894 0.3442124 10.254018
## 2 11.78898 0.4830504 8.534741
## 3 11.98818 0.4792649 9.219285
## 4 12.04349 0.4923322 9.448926
## 5 11.79823 0.5193195 9.049121
## 6 11.53275 0.5335956 8.658301
## 7 11.64053 0.5229621 8.878265
## 8 11.86459 0.5144801 9.265252
## 9 11.98385 0.5188205 9.218594
## 10 12.55634 0.4808614 9.610747
## 11 12.69674 0.4758068 9.702325
## 12 13.01534 0.4538906 9.956623
## 13 13.12637 0.4367362 9.878017
## 14 13.44865 0.4140715 10.065088
## 15 13.60135 0.4034269 10.188150
## 16 13.79361 0.3943904 10.247160
## 17 14.00756 0.3845119 10.412776
## 18 14.18113 0.3711378 10.587027
## 19 14.25674 0.3703610 10.575726
## 20 14.33121 0.3723176 10.679764
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 6.
plsFit$bestTune
## ncomp
## 6 6
plsFit$results
## ncomp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 13.31894 0.3442124 10.254018 2.992891 0.2338804 2.391627
## 2 2 11.78898 0.4830504 8.534741 3.246743 0.1956815 2.070321
## 3 3 11.98818 0.4792649 9.219285 2.846426 0.1865983 2.036920
## 4 4 12.04349 0.4923322 9.448926 3.097630 0.2096670 2.365493
## 5 5 11.79823 0.5193195 9.049121 2.874689 0.2123931 2.210030
## 6 6 11.53275 0.5335956 8.658301 2.638831 0.1912053 2.007613
## 7 7 11.64053 0.5229621 8.878265 2.102121 0.1532401 1.627023
## 8 8 11.86459 0.5144801 9.265252 2.238164 0.1485683 1.727048
## 9 9 11.98385 0.5188205 9.218594 2.191476 0.1467131 1.685842
## 10 10 12.55634 0.4808614 9.610747 2.380454 0.1554564 1.762340
## 11 11 12.69674 0.4758068 9.702325 2.531396 0.1563200 1.858442
## 12 12 13.01534 0.4538906 9.956623 2.613228 0.1593663 1.944053
## 13 13 13.12637 0.4367362 9.878017 2.419911 0.1702133 1.685927
## 14 14 13.44865 0.4140715 10.065088 2.460808 0.1798710 1.754670
## 15 15 13.60135 0.4034269 10.188150 2.794828 0.1927099 1.861764
## 16 16 13.79361 0.3943904 10.247160 2.996247 0.2029007 2.082037
## 17 17 14.00756 0.3845119 10.412776 3.331556 0.2201622 2.252877
## 18 18 14.18113 0.3711378 10.587027 3.373322 0.2213465 2.269240
## 19 19 14.25674 0.3703610 10.575726 3.277469 0.2103426 2.249600
## 20 20 14.33121 0.3723176 10.679764 3.464429 0.2143669 2.335032
Optimal latent variables = 6
Corresponding R2 estimate = 0.5335956
testResults2 <- data.frame(obs = testY,
PLS = predict(plsFit, newdata = testX))
postResample(pred = testResults2$PLS, obs = testY)
## RMSE Rsquared MAE
## 12.3486900 0.3244542 8.2881075
R2 estimate = 3244542
# OLS
set.seed(100)
indx <- createFolds(trainY, returnTrain = TRUE)
set.seed(100)
lmTune2 <- train(x = trainX, y = trainY,
method = "lm",
trControl = ctrl)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning: predictions failed for Fold05: intercept=TRUE Error in qr.default(tR) : NA/NaN/Inf in foreign function call (arg 1)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
testResults2$OLS <- predict(lmTune2, newdata = testX)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
# PCR
set.seed(100)
pcrTune2 <- train(x = trainX, y = trainY,
method = "pcr",
tuneGrid = expand.grid(ncomp = 1:50),
trControl = ctrl)
testResults2$PCR <- predict(pcrTune2, newdata = testX)
# Ridge Regression
set.seed(100)
ridgeTune2 <- train(x = trainX, y = trainY,
method = "ridge",
tuneGrid = ridgeGrid,
trControl = ctrl)
## Warning: model fit failed for Fold02: lambda=0.00000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold03: lambda=0.00000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold04: lambda=0.00000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold10: lambda=0.00000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
testResults2$Ridge <- predict(ridgeTune2, newdata = testX)
# ENET
set.seed(100)
enetTune2 <- train(x = trainX, y = trainY,
method = "enet",
tuneGrid = enetGrid,
trControl = ctrl)
## Warning: model fit failed for Fold02: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold03: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold04: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold10: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
testResults2$ENET <- predict(enetTune2, newdata = testX)
data.frame(rbind(OLS = postResample(pred = testResults2$OLS, obs = testY),
PCR = postResample(pred = testResults2$PCR, obs = testY),
Ridge = postResample(pred = testResults2$Ridge, obs = testY),
ENET = postResample(pred = testResults2$ENET, obs = testY)))
## RMSE Rsquared MAE
## OLS 29.90647 0.07853504 19.332111
## PCR 12.03322 0.32942582 8.598332
## Ridge 12.84303 0.40044272 9.211695
## ENET 11.56584 0.40355503 7.592838
Among the linear models tested, Elastic Net (ENET) demonstrated the best overall predictive performance with the lowest RMSE (11.57), highest R2 (0.404), and lowest MAE (7.59). In contrast, Ordinary Least Squares (OLS) performed the worst, with the highest RMSE (29.91), highest MAE (19.33), and lowest R2 (0.08).
Although the ENET model achieved the best predictive performance among the linear models, its R2 of approximately 0.40 indicates that the model explains less than half of the variability in permeability. While this level of performance may be useful for identifying general trends, it is not sufficient to fully replace laboratory permeability experiments. However, the ENET model could still be valuable as a screening tool in early drug discovery. It can help prioritize compounds that are more likely to have favorable permeability, which could reduce the number of compounds that need to be tested experimentally and save time and resources. Laboratory experiments would still be necessary for final validation and confirmation. Overall, I would not recommend replacing the permeability experiment entirely, but the predictive model could serve as a complementary tool to guide experimental testing and improve efficiency in the drug development process.
# Neural Network
set.seed(100)
nnetGrid2 <- expand.grid(size = c(1, 3, 5),
decay = c(0, 0.01),
bag = FALSE)
ctrl2 <- trainControl(method = "cv", number = 5)
nnetTune2 <- train(x = trainX, y = trainY,
method = "avNNet",
tuneGrid = nnetGrid2,
trControl = ctrl2,
preProc = c("center", "scale", "pca"),
linout = TRUE,
trace = FALSE,
MaxNWts = 10000,
maxit = 300,
allowParallel = FALSE)
testResults2$NN <- predict(nnetTune2, newdata = testX)
# SVM
set.seed(100)
svmTune2 <- train(x = trainX, y = trainY,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 14,
trControl = ctrl)
testResults2$SVM <- predict(svmTune2, newdata = testX)
# MARS
set.seed(100)
marsTune2 <- train(x = trainX, y = trainY,
method = "earth",
tuneGrid = expand.grid(degree = 1, nprune = 2:38),
trControl = ctrl)
testResults2$MARS <- predict(marsTune2, newdata = testX)
# KNN
set.seed(100)
knnTune2 <- train(x = trainX, y = trainY,
method = "knn",
preProc = c("center", "scale"),
tuneGrid = data.frame(k = 1:20),
trControl = ctrl)
testResults2$KNN <- predict(knnTune2, newdata = testX)
data.frame(rbind(NN = postResample(pred = testResults2$NN, obs = testY),
SVM = postResample(pred = testResults2$SVM, obs = testY),
MARS = postResample(pred = testResults2$MARS, obs = testY),
KNN = postResample(pred = testResults2$KNN, obs = testY)))
## RMSE Rsquared MAE
## NN 11.23302 0.2975757 7.994713
## SVM 10.66573 0.4854155 6.844795
## MARS 11.71754 0.2541716 8.883862
## KNN 11.73373 0.2366438 8.099273
The SVM model outperformed the optimal linear model from Problem 2 with R2 of 0.49 and RMSE of 10.67. In comparison, the ENET model from Problem 2 achieved an R2 of approximately 0.40, indicating lower predictive accuracy. The improved performance of SVM and MARS suggests that the relationship between the molecular fingerprint predictors and permeability is likely nonlinear.
The SVM model achieved the best predictive performance among the models tested, with an R2 of approximately 0.49, yet it still explains only about half of the variability in permeability. While this represents an improvement over the linear models from Problem 2, a substantial amount of variation remains unexplained. Therefore, even the SVM model is not accurate enough to fully replace laboratory permeability experiments. However, the model could still be useful as a screening tool in early drug discovery to help prioritize compounds that are more likely to have favorable permeability. Overall, I would not recommend replacing the permeability experiment entirely, but the model could help guide experimental testing.