Homework 2 Predictive Modeling

Author

Sara Nishiyama

Problem 1

library(caret)
Loading required package: ggplot2
Loading required package: lattice
data(tecator)
?tecator
str(absorp)
 num [1:215, 1:100] 2.62 2.83 2.58 2.82 2.79 ...
str(endpoints)
 num [1:215, 1:3] 60.5 46 71 72.8 58.3 44 44 69.3 61.4 61.4 ...
moisture=endpoints[,1]
fat=endpoints[,2]
protein=endpoints[,3]

B) Split data training/test set 80/20 split resulting in 174 samples for training and 41 samples for testing. Centering and scaling were applied to the 100 absorbance predictors, ensuring the IR spectroscopy data that consist of different scales doesnt disproportionately influence the model.

predictors<-absorp
protein<-endpoints[,3]
set.seed(123)
trainIndex<-createDataPartition(protein, p=0.8, list=FALSE)
trainX<-predictors[trainIndex, ]
trainY<-protein[trainIndex]
testX<-predictors[-trainIndex, ]
testY<-protein[-trainIndex]
ctrl<-trainControl(method="cv", number=10)
cat("training set size:", nrow(trainX), "samples\n")
training set size: 174 samples
cat("test set size:", nrow(testX), "samples\n")
test set size: 41 samples
cat("Number of predictors:", ncol(trainX), "\n")
Number of predictors: 100 
colnames(trainX)<-paste0("freq", 1:ncol(trainX))
colnames(testX)<-paste0("Freq", 1:ncol(testX))

C) Ordinary Least Squares (No Tunning Parameteres)

set.seed(123)
model_ols<-train(x=trainX, y=trainY,
                 method="lm",
                 trControl=ctrl, 
                 preProc=c("center", "scale"))

C) PCR ncomp20

set.seed(123)
model_pcr<-train(x=trainX, y=trainY,
                 method="pcr",
                 tuneLength=50,
                 trControl=ctrl,
                 preProc=c("center", "scale"))
print(model_pcr$bestTune)
   ncomp
20    20

C)PLS ncomp15

set.seed(123)
model_pls<-train(x=trainX, y=trainY,
                 method="pls",
                 tuneLength=50,
                 trControl=ctrl,
                 preProc=c("center", "scale"))
print(model_pls$bestTune)
   ncomp
15    15

C) Ridge Regrassion alpha 0 lambda 0.0526

set.seed(123)
model_ridge<-train(x=trainX, y=trainY,
                   method="glmnet",
                   tuneGrid=expand.grid(alpha=0,
                                        lambda=seq(0,1,length=20)),
                   trControl=ctrl,
                   preProc=c("center", "scale"))
print(model_ridge$bestTune)
  alpha     lambda
2     0 0.05263158

C)ENET alpha 0.089 lambda 0

set.seed(123)
model_enet<-train(x=trainX, y=trainY, 
                  method="glmnet",
                  tuneGrid=expand.grid(alpha=seq(0,1,length=10),
                                       lambda=seq(0,0.5, length=20)),
                  trControl=ctrl,
                  preProc=c("center","scale"))
print(model_enet$bestTune)
        alpha lambda
161 0.8888889      0

D) SVM

set.seed(123)
model_svm<-train(x=trainX, y=trainY, 
                 method="svmRadial",
                 tuneLength=10,
                 trControl=ctrl,
                 preProc=c("center", "scale"))
print(model_svm$bestTune)
      sigma  C
7 0.1923092 16

D)Neural Network

Yes PCA does help the nueral network.

set.seed(123)
model_nnet<-train(x=trainX, y=trainY,
                  method="nnet",
                  tuneGrid=expand.grid(size=c(1:5),decay=c(0,0.1,1)),
                  trControl=ctrl,
                  preProc=c("center", "scale", "pca"),
                  lineout=TRUE, trace=FALSE, maxit=500)
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
print(model_nnet$bestTune)
  size decay
1    1     0

D)MARS

set.seed(123)
model_mars<-train(x=trainX, y=trainY,
                  method="earth",
                  tuneGrid=expand.grid(degree=1:2, nprune=2:20),
                  trControl=ctrl)
Loading required package: earth
Loading required package: Formula
Loading required package: plotmo
Loading required package: plotrix
print(model_mars$bestTune)
   nprune degree
15     16      1

D) KNN

set.seed(123)
model_knn<-train(x=trainX, y=trainY,
                 method="knn",
                 tuneLength=10,
                 trControl=ctrl,
                 preProc=c("center","scale"))
print(model_knn$bestTune)
  k
2 7

E) which model is best?

all_models<-list(
  OLS=model_ols, PCR=model_pcr, PLS=model_pls,
  Ridge=model_ridge, ENET=model_enet,
  SVM=model_svm, NNET=model_nnet,
  MARS=model_mars, KNN=model_knn
)
results<-resamples(all_models)
summary(results)

Call:
summary.resamples(object = results)

Models: OLS, PCR, PLS, Ridge, ENET, SVM, NNET, MARS, KNN 
Number of resamples: 10 

MAE 
            Min.    1st Qu.     Median       Mean    3rd Qu.       Max. NA's
OLS    0.3755649  0.6177857  0.7137387  0.7803177  0.9326208  1.2884116    0
PCR    0.3468953  0.4230088  0.4641341  0.4659080  0.4915248  0.5690212    0
PLS    0.3921856  0.4261255  0.4823712  0.4766054  0.4891491  0.5956456    0
Ridge  0.8346081  1.1110247  1.2846377  1.2919543  1.5406462  1.6094924    0
ENET   0.5820637  0.8281065  0.9062255  0.9065488  0.9834219  1.1554332    0
SVM    0.7545651  1.2709061  1.3540686  1.3213403  1.4489355  1.6833443    0
NNET  16.5250000 16.5495098 16.6000000 16.6610174 16.6847222 16.9722222    0
MARS   0.3713135  0.7301951  0.7982749  0.7977330  0.9173724  1.1166270    0
KNN    1.6246849  1.7658336  2.0366939  2.0463251  2.3206752  2.4328373    0

RMSE 
            Min.    1st Qu.     Median       Mean    3rd Qu.       Max. NA's
OLS    0.4653571  0.8373194  0.9450299  1.1384127  1.2961756  2.5659682    0
PCR    0.4849923  0.5176268  0.5863863  0.5981827  0.6659495  0.7582914    0
PLS    0.4924363  0.5482592  0.5907226  0.6127603  0.6698472  0.7938063    0
Ridge  0.9365989  1.3711942  1.7587585  1.6596581  2.0465525  2.1035231    0
ENET   0.7150978  1.0044351  1.1391555  1.1146114  1.2447615  1.4543191    0
SVM    1.0419341  1.5702793  1.8899139  1.8301993  2.1208766  2.3996622    0
NNET  16.7974152 16.8498384 16.8788794 16.9358325 16.9491264 17.2607615    0
MARS   0.4687931  0.9130901  1.0617572  1.0077958  1.1086998  1.3425030    0
KNN    1.8847723  2.1529983  2.3169293  2.4128713  2.7582092  2.8280206    0

Rsquared 
           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
OLS   0.4496297 0.8377383 0.9244505 0.8615363 0.9371232 0.9799236    0
PCR   0.9429007 0.9634038 0.9682903 0.9658147 0.9740495 0.9800551    0
PLS   0.9369330 0.9608704 0.9691314 0.9644614 0.9716783 0.9773931    0
Ridge 0.4903439 0.6123857 0.7124328 0.7158218 0.8349411 0.9369174    0
ENET  0.8161293 0.8816382 0.8928496 0.8839335 0.8996568 0.9438155    0
SVM   0.4433408 0.5925672 0.6656731 0.6577083 0.7054214 0.8931931    0
NNET         NA        NA        NA       NaN        NA        NA   10
MARS  0.8288235 0.8540705 0.9094526 0.8986173 0.9301002 0.9754086    0
KNN   0.1877713 0.2534334 0.3814231 0.4234022 0.6178922 0.7018706    0
dotplot(results, metric="RMSE")

PCR and PLS performed the best with RMSE being 0.598 and 0.612. These models also have the highers RSquared values and lowest erros. The linear models outperformed the nonlinear models. Meaning that the relationship between predictors in linear. NNET was significantly worse than all other models.

Problem 2

library(AppliedPredictiveModeling)
library(caret)
data(permeability)
str(fingerprints)
 num [1:165, 1:1107] 0 0 0 0 0 0 0 0 0 0 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:165] "1" "2" "3" "4" ...
  ..$ : chr [1:1107] "X1" "X2" "X3" "X4" ...
str(permeability)
 num [1:165, 1] 12.52 1.12 19.41 1.73 1.68 ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:165] "1" "2" "3" "4" ...
  ..$ : chr "permeability"

B)NearZeroVar 388 predictors are left for modeling.

nzv_cols<-nearZeroVar(fingerprints)
filtered_fingerprints<-fingerprints[,-nzv_cols]
ncol(filtered_fingerprints)
[1] 388

C) PLS splitting data was split 80/20. PLS was tuned using 10-fold cross validation with centering and scaling. Optimal Latent variables ncomp=6. The model explains about 53% of the variability in permeability within the training data during cross-validation.

set.seed(123)
trainRow<-createDataPartition(permeability, p=0.8, list=FALSE)
trainX<-filtered_fingerprints[trainRow, ]
trainY<-permeability[trainRow]
testX<-filtered_fingerprints[-trainRow, ]
testY<-permeability[-trainRow]

ctrl<-trainControl(method="cv", number=10)
pls_model<-train(x=trainX, y=trainY,
                 method="pls",
                 tuneLength=20,
                 trControl=ctrl,
                 preProc=c("center", "scale"))
print(pls_model$bestTune)
  ncomp
6     6
max(pls_model$results$Rsquared)
[1] 0.5335956

D) when applying the trained PLS model to the unseen test set the performance dropped R^2=0.3245. This suggests some overfitting.

pls_pred<-predict(pls_model, testX)
postResample(pred=pls_pred, obs=testY)
      RMSE   Rsquared        MAE 
12.3486900  0.3244542  8.2881075 

E) Ridge Regression

set.seed(123)
ridge_model<-train(x=trainX, y=trainY, method="glmnet",
                   tuneGrid=expand.grid(alpha=0, lambda=seq(0,1,length=20)),
                   trControl=ctrl, preProc=c("center", "scale"))

E)PCR

set.seed(123)
pcr_model<-train(x=trainX, y=trainY, method="pcr",
                 tuneLength=20, trControl=ctrl, preProc=c("center", "scale"))

E)Comparison

Ridge Regression output the highest mean R^2 (0.5634) slightly outperforming pls and pcr.

results_p2<-resamples(list(PLS=pls_model, PCR=pcr_model, Ridge=ridge_model))
summary(results_p2)

Call:
summary.resamples(object = results_p2)

Models: PLS, PCR, Ridge 
Number of resamples: 10 

MAE 
          Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
PLS   5.940748 7.855195 8.101540 8.658301 8.587837 12.53980    0
PCR   7.283000 7.435242 8.217707 8.669652 9.707578 11.05651    0
Ridge 6.177802 7.808006 8.346997 8.465851 9.718546 10.28790    0

RMSE 
          Min.   1st Qu.   Median     Mean  3rd Qu.     Max. NA's
PLS   7.479049 10.662269 11.18215 11.53275 11.66271 16.49145    0
PCR   8.921171  9.200496 11.16664 11.46347 12.02834 16.53709    0
Ridge 8.361707  9.619727 10.72220 11.29799 12.96550 14.75129    0

Rsquared 
           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
PLS   0.2072535 0.3875424 0.5626977 0.5335956 0.6291758 0.8326590    0
PCR   0.1153226 0.3997770 0.6038702 0.5293304 0.7034825 0.8572556    0
Ridge 0.1814282 0.4535147 0.5723011 0.5633878 0.7525841 0.8960319    0

F) I would not recommend replacing the lab experiment with any of these models. the R^2 of .32 means that nearly 70% of the variation in permeability is not captured by the model. I would recommend using Ridge Regression as a screening tool to rank molecules to find the top molecules to be most permeable, it can be a cost saving filter but not a replacement.

Problem 3

a)Nonlinear models

svm

set.seed(123)
model_svm_p3<-train(x=trainX, y=trainY,
                    method="svmRadial",
                    tuneLength=10,
                    trControl=ctrl,
                    preProc=c("center", "scale"))

MARS

set.seed(123)
model_mars_p3<-train(x=trainX, y=trainY,
                     method="earth",
                     tuneGrid=expand.grid(degree=1:2, nprune=2:20),
                     trControl=ctrl)

KNN

set.seed(123)
model_knn_p3<-train(x=trainX, y=trainY,
                    method="knn",
                    tuneLength=10,
                    trControl=ctrl,
                    preProc=c("center", "scale"))

Neural Network

set.seed(123)
model_nnet_p3<-train(x=trainX, y=trainY,
                     method="nnet",
                     tuneGrid=expand.grid(size=c(1:5), decay=c(0,0.1,1)),
                     trControl=ctrl,
                     preProc=c("center", "scale", "pca"),
                     lineout=TRUE, trace=FALSE, maxit=500)
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.

The SVM (R ^ 2 = 0.5038) performed higher than the linear PLS model (R ^ 2 = 0.3245) it also provided the optimal resampling performance with a mean r^2 of 0.6557 and losest RMSEof 9.469.

results_p3<-resamples(list(SVM=model_svm_p3, MARS=model_mars_p3,
                            KNN=model_knn_p3, NNET=model_nnet_p3))
summary(results_p3)

Call:
summary.resamples(object = results_p3)

Models: SVM, MARS, KNN, NNET 
Number of resamples: 10 

MAE 
         Min.   1st Qu.    Median      Mean   3rd Qu.     Max. NA's
SVM  4.213428  5.252082  7.196579  6.979417  7.915720 10.35928    0
MARS 4.243920  6.225447  8.121751  8.036346  9.252100 12.91006    0
KNN  5.182738  6.767481  7.755186  8.121054  9.461309 11.15534    0
NNET 9.848929 11.360016 11.882262 11.783290 12.257277 13.45423    0

RMSE 
          Min.   1st Qu.   Median      Mean  3rd Qu.     Max. NA's
SVM   4.803695  7.864968  8.89035  9.469353 11.26622 14.64101    0
MARS  5.459105  8.136931 12.44591 11.619216 13.36864 19.23540    0
KNN   7.398072  9.880665 11.15714 11.705350 13.04052 18.19556    0
NNET 15.865754 19.179919 19.91049 19.851286 20.46818 23.50821    0

Rsquared 
             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
SVM  0.1927160111 0.5144728 0.7167798 0.6557369 0.7828028 0.9675302    0
MARS 0.0003492616 0.2927572 0.4320556 0.5109589 0.8431597 0.9132229    0
KNN  0.0090651218 0.3065722 0.5773361 0.5145225 0.7149048 0.9328090    0
NNET           NA        NA        NA       NaN        NA        NA   10
postResample(predict(model_svm_p3, testX), testY)
      RMSE   Rsquared        MAE 
10.2957152  0.5037946  6.5552744 

B)Linear Ridge Regression Mean Rsquared (Best Linear Model) = 0.5634

Nonlinear SVM (best Non Linear Model) mean Rsquared=0.6557

On the test set, the SVM significantly outperformed PLS model.

compare_all<-resamples(list(
  Linear_PLS=pls_model,
  Linear_Ridge=ridge_model,
  Nonlinear_SVM=model_svm_p3,
  Nonlinear_MARS=model_mars_p3
))
summary(compare_all)$statistics$Rsquared
                       Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
Linear_PLS     0.2072534541 0.3875424 0.5626977 0.5335956 0.6291758 0.8326590
Linear_Ridge   0.1814282098 0.4535147 0.5723011 0.5633878 0.7525841 0.8960319
Nonlinear_SVM  0.1927160111 0.5144728 0.7167798 0.6557369 0.7828028 0.9675302
Nonlinear_MARS 0.0003492616 0.2927572 0.4320556 0.5109589 0.8431597 0.9132229
               NA's
Linear_PLS        0
Linear_Ridge      0
Nonlinear_SVM     0
Nonlinear_MARS    0
pls_test<-postResample(predict(pls_model, testX), testY)
svm_test<-postResample(predict(model_svm_p3, testX), testY)
test_comparison<-rbind(PLS=pls_test, SVM=svm_test)
print(test_comparison)
        RMSE  Rsquared      MAE
PLS 12.34869 0.3244542 8.288107
SVM 10.29572 0.5037946 6.555274

C) I would not recommend replacing the lab experiment with any model. the SVM test RMSE is 10.29 (large margin of error) and the Rsquared of .50 means the model is “guessing” half of the outcomes.