Problem 1

a) Start R and use these commands to load the data:

data(tecator)
# ?tecator
str(absorp)
##  num [1:215, 1:100] 2.62 2.83 2.58 2.82 2.79 ...
str(endpoints)
##  num [1:215, 1:3] 60.5 46 71 72.8 58.3 44 44 69.3 61.4 61.4 ...
moisture = endpoints[,1]
fat = endpoints[,2]
protein = endpoints[,3]

b) Split the data into a training and a test set the response of the percentage of protein, pre-process the data as appropriate.

# Split data
set.seed(123)
trainIndex <- createDataPartition(moisture, p = 0.8, list = FALSE)

absorp_df <- as.data.frame(absorp)
x_train <- absorp_df[trainIndex, ]
x_test  <- absorp_df[-trainIndex, ]
y_train <- moisture[trainIndex]
y_test  <- moisture[-trainIndex]


# Pre-process
preProc <- preProcess(x_train, method = c("center", "scale"))

x_train <- predict(preProc, x_train)
x_test  <- predict(preProc, x_test)

c) Build at least three models described Chapter 6: ordinary least squares, PCR, PLS, Ridge, and ENET. For those models with tuning parameters, what are the optimal values of the tuning parameter(s)?

# OLS
set.seed(100)
indx <- createFolds(y_train, returnTrain = TRUE)
ctrl <- trainControl(method = "cv", index = indx)

set.seed(100)
lmTune0 <- train(x = x_train, y = y_train,
                 method = "lm",
                 trControl = ctrl)

testResults <- data.frame(obs = y_test,
                          OLS = predict(lmTune0, newdata = x_test))

# PLS
set.seed(100)
plsTune <- train(x = x_train, y = y_train,
                 method = "pls",
                 tuneGrid = expand.grid(ncomp = 1:50),
                 trControl = ctrl)
testResults$PLS <- predict(plsTune, newdata = x_test)


# PCR
set.seed(100)
pcrTune <- train(x = x_train, y = y_train,
                 method = "pcr",
                 tuneGrid = expand.grid(ncomp = 1:50),
                 trControl = ctrl)
testResults$PCR <- predict(pcrTune, newdata = x_test)


# Ridge Regression
ridgeGrid <- expand.grid(lambda = seq(0, .1, length = 10))

set.seed(100)
ridgeTune <- train(x = x_train, y = y_train,
                   method = "ridge",
                   tuneGrid = ridgeGrid,
                   trControl = ctrl)
testResults$Ridge <- predict(ridgeTune, newdata = x_test)


# ENET
enetGrid <- expand.grid(lambda = c(0, 0.01, .1), 
                        fraction = seq(.05, 1, length = 20))
set.seed(100)
enetTune <- train(x = x_train, y = y_train,
                  method = "enet",
                  tuneGrid = enetGrid,
                  trControl = ctrl)
testResults$ENET <- predict(enetTune, newdata = x_test)


# Best params
plsTune$bestTune
##    ncomp
## 17    17
pcrTune$bestTune
##    ncomp
## 28    28
ridgeTune$bestTune
##   lambda
## 1      0
enetTune$bestTune
##   fraction lambda
## 1     0.05      0

The optimal tuning parameters selected via cross-validation were:

  • PCR: ncomp = 17

  • PLS: ncomp = 28

  • Ridge: lambda = 0

  • Elastic Net: lambda = 0, fraction = 0.05

d) Build nonlinear models in Chapter 7: SVM, neural network, MARS, and KNN models. Since neural networks are especially sensitive to highly correlated predictors, does pre-processing using PCA help the model? For those models with tuning parameters, what are the optimal values of the tuning parameter(s)?

# SVM
set.seed(100)
svmTune <- train(x = x_train, y = y_train,
                 method = "svmRadial",
                 preProc = c("center", "scale"),
                 tuneLength = 14,
                 trControl = ctrl)
testResults$SVM <- predict(svmTune, newdata = x_test)

# Neural Network
set.seed(100)
nnetGrid <- expand.grid(decay = c(0, 0.01, .1),
                        size = c(1, 3, 5, 7),
                        bag = FALSE)
nnetTune <- train(x = x_train, y = y_train,
                  method = "avNNet", 
                  tuneGrid = nnetGrid, 
                  trControl = ctrl, 
                  preProc = c("center", "scale"), 
                  linout = TRUE, 
                  trace = FALSE, 
                  MaxNWts = 13 * (ncol(x_train) + 1) + 13 + 1, 
                  maxit = 1000,
                  allowParallel = FALSE)
testResults$NN <- predict(nnetTune, newdata = x_test)

# MARS
set.seed(100)
marsTune <- train(x = x_train, y = y_train,
                  method = "earth",
                  tuneGrid = expand.grid(degree = 1, nprune = 2:38),
                  trControl = ctrl)
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
testResults$MARS <- predict(marsTune, newdata = x_test)

# KNN
set.seed(100)
knnTune <- train(x = x_train, y = y_train,
                 method = "knn",
                 preProc = c("center", "scale"),
                 tuneGrid = data.frame(k = 1:20),
                 trControl = ctrl)
testResults$KNN <- predict(knnTune, newdata = x_test)


# PCA preprocessing
preProc_pca <- preProcess(x_train, method = c("center","scale","pca"), thresh = 0.95)

x_train_pca <- predict(preProc_pca, newdata = x_train)
x_test_pca  <- predict(preProc_pca, newdata = x_test)

set.seed(100)
nnetPcaTune <- train(x = x_train_pca, y = y_train,
                     method = "avNNet",
                     tuneGrid = nnetGrid,
                     linout = TRUE,
                     trace = FALSE,
                     trControl = ctrl,
                     MaxNWts = 13 * (ncol(x_train) + 1) + 13 + 1, 
                     maxit = 1000,
                     allowParallel = FALSE
                     )

testResults$NN_PCA <- predict(nnetPcaTune, newdata = x_test_pca)


# Best params
svmTune$bestTune
##      sigma  C
## 9 0.149536 64
nnetTune$bestTune
##   size decay   bag
## 7    5  0.01 FALSE
marsTune$bestTune
##    nprune degree
## 16     17      1
knnTune$bestTune
##   k
## 1 1
nnetPcaTune$bestTune
##   size decay   bag
## 6    3  0.01 FALSE

The optimal tuning parameters selected via cross-validation were:

  • SVM: sigma = 0.08334105, C = 256

  • Neural network: size = 5, decay = 0.01

  • MARS: nprune = 17, degree = 1

  • KNN: k = 1

  • Neural net with PCA: size = 1, decay = 0.1

e) Which model from parts c) and d) has the best predictive ability? Is any model significantly better or worse than the others?

# Compare models
data.frame(rbind(OLS = postResample(pred = testResults$OLS, obs = y_test),
                 PLS = postResample(pred = testResults$PLS, obs = y_test),
                 PCR = postResample(pred = testResults$PCR, obs = y_test),
                 Ridge = postResample(pred = testResults$Ridge, obs = y_test),
                 ENET = postResample(pred = testResults$ENET, obs = y_test),
                 SVM = postResample(pred = testResults$SVM, obs = y_test),
                 KNN = postResample(pred = testResults$KNN, obs = y_test),
                 MARS = postResample(pred = testResults$MARS, obs = y_test),
                 NNet = postResample(pred = testResults$NN, obs = y_test),
                 NNetPCA = postResample(pred = testResults$NN_PCA, obs = y_test)))
##             RMSE  Rsquared       MAE
## OLS     3.545768 0.8771040 1.5960313
## PLS     1.615357 0.9738681 1.1855205
## PCR     1.696345 0.9736020 1.2714619
## Ridge   3.545632 0.8771120 1.5959803
## ENET    1.431302 0.9799521 1.0447150
## SVM     2.588153 0.9245771 1.8895282
## KNN     7.587939 0.5543348 5.3562500
## MARS    1.971870 0.9569271 1.4368790
## NNet    0.561155 0.9966528 0.4408398
## NNetPCA 7.281670 0.3977038 5.6333212

Among all models tested, the neural network trained on the original predictors (NNet) had the best predictive performance, achieving the lowest RMSE (0.56), highest R2 (0.997), and lowest MAE (0.44). Models like KNN and the neural network trained on PCA-transformed predictors (NNetPCA) performed the worst, with high RMSE and low R2, indicating poor predictive ability. This suggests that, for this dataset, the neural network benefits from the full set of original predictors and that PCA pre-processing may have removed important information.

Problem 2

a) Start R and use these commands to load the data

library(AppliedPredictiveModeling)
data(permeability)
str(fingerprints)
##  num [1:165, 1:1107] 0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:165] "1" "2" "3" "4" ...
##   ..$ : chr [1:1107] "X1" "X2" "X3" "X4" ...
str(permeability)
##  num [1:165, 1] 12.52 1.12 19.41 1.73 1.68 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:165] "1" "2" "3" "4" ...
##   ..$ : chr "permeability"

b) The fingerprint predictors indicate the presence or absence of substructures of a molecule and are often sparse meaning that relatively few of the molecules contain each substructure. Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

nzv <- nearZeroVar(fingerprints)
fingerprints_filtered <- fingerprints[, -nzv]
dim(fingerprints_filtered)
## [1] 165 388

388 predictors remain for modeling.

c) Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?

set.seed(123)

trainIndex <- createDataPartition(permeability, p = 0.8, list = FALSE)
trainX <- fingerprints_filtered[trainIndex, ]
trainY <- permeability[trainIndex]
testX <- fingerprints_filtered[-trainIndex, ]
testY <- permeability[-trainIndex]

# PLS
ctrl <- trainControl(method = "cv", number = 10)
plsFit <- train(
  x = trainX,
  y = trainY,
  method = "pls",
  preProcess = c("center", "scale"),
  tuneLength = 20,
  trControl = ctrl
)

plsFit
## Partial Least Squares 
## 
## 133 samples
## 388 predictors
## 
## Pre-processing: centered (388), scaled (388) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 121, 121, 118, 119, 119, 119, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE      
##    1     13.31894  0.3442124  10.254018
##    2     11.78898  0.4830504   8.534741
##    3     11.98818  0.4792649   9.219285
##    4     12.04349  0.4923322   9.448926
##    5     11.79823  0.5193195   9.049121
##    6     11.53275  0.5335956   8.658301
##    7     11.64053  0.5229621   8.878265
##    8     11.86459  0.5144801   9.265252
##    9     11.98385  0.5188205   9.218594
##   10     12.55634  0.4808614   9.610747
##   11     12.69674  0.4758068   9.702325
##   12     13.01534  0.4538906   9.956623
##   13     13.12637  0.4367362   9.878017
##   14     13.44865  0.4140715  10.065088
##   15     13.60135  0.4034269  10.188150
##   16     13.79361  0.3943904  10.247160
##   17     14.00756  0.3845119  10.412776
##   18     14.18113  0.3711378  10.587027
##   19     14.25674  0.3703610  10.575726
##   20     14.33121  0.3723176  10.679764
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 6.
plsFit$bestTune
##   ncomp
## 6     6
plsFit$results
##    ncomp     RMSE  Rsquared       MAE   RMSESD RsquaredSD    MAESD
## 1      1 13.31894 0.3442124 10.254018 2.992891  0.2338804 2.391627
## 2      2 11.78898 0.4830504  8.534741 3.246743  0.1956815 2.070321
## 3      3 11.98818 0.4792649  9.219285 2.846426  0.1865983 2.036920
## 4      4 12.04349 0.4923322  9.448926 3.097630  0.2096670 2.365493
## 5      5 11.79823 0.5193195  9.049121 2.874689  0.2123931 2.210030
## 6      6 11.53275 0.5335956  8.658301 2.638831  0.1912053 2.007613
## 7      7 11.64053 0.5229621  8.878265 2.102121  0.1532401 1.627023
## 8      8 11.86459 0.5144801  9.265252 2.238164  0.1485683 1.727048
## 9      9 11.98385 0.5188205  9.218594 2.191476  0.1467131 1.685842
## 10    10 12.55634 0.4808614  9.610747 2.380454  0.1554564 1.762340
## 11    11 12.69674 0.4758068  9.702325 2.531396  0.1563200 1.858442
## 12    12 13.01534 0.4538906  9.956623 2.613228  0.1593663 1.944053
## 13    13 13.12637 0.4367362  9.878017 2.419911  0.1702133 1.685927
## 14    14 13.44865 0.4140715 10.065088 2.460808  0.1798710 1.754670
## 15    15 13.60135 0.4034269 10.188150 2.794828  0.1927099 1.861764
## 16    16 13.79361 0.3943904 10.247160 2.996247  0.2029007 2.082037
## 17    17 14.00756 0.3845119 10.412776 3.331556  0.2201622 2.252877
## 18    18 14.18113 0.3711378 10.587027 3.373322  0.2213465 2.269240
## 19    19 14.25674 0.3703610 10.575726 3.277469  0.2103426 2.249600
## 20    20 14.33121 0.3723176 10.679764 3.464429  0.2143669 2.335032

Optimal latent variables = 6

Corresponding R2 estimate = 0.5335956

d) Predict the response for the test set. What is the test set estimate of R2?

testResults2 <- data.frame(obs = testY,
                          PLS = predict(plsFit, newdata = testX))
postResample(pred = testResults2$PLS, obs = testY)
##       RMSE   Rsquared        MAE 
## 12.3486900  0.3244542  8.2881075

R2 estimate = 3244542

e) Try building other models discussed in this chapter. Do any have better predictive performance?

# OLS
set.seed(100)
indx <- createFolds(trainY, returnTrain = TRUE)

set.seed(100)
lmTune2 <- train(x = trainX, y = trainY,
                 method = "lm",
                 trControl = ctrl)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning: predictions failed for Fold05: intercept=TRUE Error in qr.default(tR) : NA/NaN/Inf in foreign function call (arg 1)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
testResults2$OLS <- predict(lmTune2, newdata = testX)
## Warning in predict.lm(modelFit, newdata): prediction from rank-deficient fit;
## attr(*, "non-estim") has doubtful cases
# PCR
set.seed(100)
pcrTune2 <- train(x = trainX, y = trainY,
                 method = "pcr",
                 tuneGrid = expand.grid(ncomp = 1:50),
                 trControl = ctrl)
testResults2$PCR <- predict(pcrTune2, newdata = testX)


# Ridge Regression
set.seed(100)
ridgeTune2 <- train(x = trainX, y = trainY,
                   method = "ridge",
                   tuneGrid = ridgeGrid,
                   trControl = ctrl)
## Warning: model fit failed for Fold02: lambda=0.00000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold03: lambda=0.00000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold04: lambda=0.00000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold10: lambda=0.00000 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
testResults2$Ridge <- predict(ridgeTune2, newdata = testX)


# ENET
set.seed(100)
enetTune2 <- train(x = trainX, y = trainY,
                  method = "enet",
                  tuneGrid = enetGrid,
                  trControl = ctrl)
## Warning: model fit failed for Fold02: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold03: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold04: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning: model fit failed for Fold10: lambda=0.00, fraction=1 Error in if (zmin < gamhat) { : missing value where TRUE/FALSE needed
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
testResults2$ENET <- predict(enetTune2, newdata = testX)


data.frame(rbind(OLS = postResample(pred = testResults2$OLS, obs = testY),
                 PCR = postResample(pred = testResults2$PCR, obs = testY),
                 Ridge = postResample(pred = testResults2$Ridge, obs = testY),
                 ENET = postResample(pred = testResults2$ENET, obs = testY)))
##           RMSE   Rsquared       MAE
## OLS   29.90647 0.07853504 19.332111
## PCR   12.03322 0.32942582  8.598332
## Ridge 12.84303 0.40044272  9.211695
## ENET  11.56584 0.40355503  7.592838

Among the linear models tested, Elastic Net (ENET) demonstrated the best overall predictive performance with the lowest RMSE (11.57), highest R2 (0.404), and lowest MAE (7.59). In contrast, Ordinary Least Squares (OLS) performed the worst, with the highest RMSE (29.91), highest MAE (19.33), and lowest R2 (0.08).

f) Would you recommend any of your models to replace the permeability laboratory experiment?

Although the ENET model achieved the best predictive performance among the linear models, its R2 of approximately 0.40 indicates that the model explains less than half of the variability in permeability. While this level of performance may be useful for identifying general trends, it is not sufficient to fully replace laboratory permeability experiments. However, the ENET model could still be valuable as a screening tool in early drug discovery. It can help prioritize compounds that are more likely to have favorable permeability, which could reduce the number of compounds that need to be tested experimentally and save time and resources. Laboratory experiments would still be necessary for final validation and confirmation. Overall, I would not recommend replacing the permeability experiment entirely, but the predictive model could serve as a complementary tool to guide experimental testing and improve efficiency in the drug development process.

Problem 3

a) Which nonlinear regression model that we learned in Chapter 7 gives the optimal resampling and test set performance?

# Neural Network
set.seed(100)
nnetGrid2 <- expand.grid(size = c(1, 3, 5),
                         decay = c(0, 0.01),
                         bag = FALSE)
ctrl2 <- trainControl(method = "cv", number = 5)
nnetTune2 <- train(x = trainX, y = trainY,
                  method = "avNNet", 
                  tuneGrid = nnetGrid2, 
                  trControl = ctrl2, 
                  preProc = c("center", "scale", "pca"), 
                  linout = TRUE, 
                  trace = FALSE, 
                  MaxNWts = 10000, 
                  maxit = 300,
                  allowParallel = FALSE)
testResults2$NN <- predict(nnetTune2, newdata = testX)

# SVM
set.seed(100)
svmTune2 <- train(x = trainX, y = trainY,
                 method = "svmRadial",
                 preProc = c("center", "scale"),
                 tuneLength = 14,
                 trControl = ctrl)
testResults2$SVM <- predict(svmTune2, newdata = testX)

# MARS
set.seed(100)
marsTune2 <- train(x = trainX, y = trainY,
                  method = "earth",
                  tuneGrid = expand.grid(degree = 1, nprune = 2:38),
                  trControl = ctrl)
testResults2$MARS <- predict(marsTune2, newdata = testX)

# KNN
set.seed(100)
knnTune2 <- train(x = trainX, y = trainY,
                 method = "knn",
                 preProc = c("center", "scale"),
                 tuneGrid = data.frame(k = 1:20),
                 trControl = ctrl)
testResults2$KNN <- predict(knnTune2, newdata = testX)

b) Do any of the nonlinear models outperform the optimal linear model you previously developed in Problem 2? If so, what might this tell you about the underlying relationship between the predictors and the response?

data.frame(rbind(NN = postResample(pred = testResults2$NN, obs = testY),
                 SVM = postResample(pred = testResults2$SVM, obs = testY),
                 MARS = postResample(pred = testResults2$MARS, obs = testY),
                 KNN = postResample(pred = testResults2$KNN, obs = testY)))
##          RMSE  Rsquared      MAE
## NN   11.23302 0.2975757 7.994713
## SVM  10.66573 0.4854155 6.844795
## MARS 11.71754 0.2541716 8.883862
## KNN  11.73373 0.2366438 8.099273

The SVM model outperformed the optimal linear model from Problem 2 with R2 of 0.49 and RMSE of 10.67. In comparison, the ENET model from Problem 2 achieved an R2 of approximately 0.40, indicating lower predictive accuracy. The improved performance of SVM and MARS suggests that the relationship between the molecular fingerprint predictors and permeability is likely nonlinear.

c) Would you recommend any of the models you have developed to replace the permeability laboratory experiment?

The SVM model achieved the best predictive performance among the models tested, with an R2 of approximately 0.49, yet it still explains only about half of the variability in permeability. While this represents an improvement over the linear models from Problem 2, a substantial amount of variation remains unexplained. Therefore, even the SVM model is not accurate enough to fully replace laboratory permeability experiments. However, the model could still be useful as a screening tool in early drug discovery to help prioritize compounds that are more likely to have favorable permeability. Overall, I would not recommend replacing the permeability experiment entirely, but the model could help guide experimental testing.