Problem 1
Loading required package: ggplot2
Loading required package: lattice
data (tecator)
?tecator
str (absorp)
num [1:215, 1:100] 2.62 2.83 2.58 2.82 2.79 ...
num [1:215, 1:3] 60.5 46 71 72.8 58.3 44 44 69.3 61.4 61.4 ...
moisture= endpoints[,1 ]
fat= endpoints[,2 ]
protein= endpoints[,3 ]
B) Split data training/test set 80/20 split resulting in 174 samples for training and 41 samples for testing. Centering and scaling were applied to the 100 absorbance predictors, ensuring the IR spectroscopy data that consist of different scales doesnt disproportionately influence the model.
predictors<- absorp
protein<- endpoints[,3 ]
set.seed (123 )
trainIndex<- createDataPartition (protein, p= 0.8 , list= FALSE )
trainX<- predictors[trainIndex, ]
trainY<- protein[trainIndex]
testX<- predictors[- trainIndex, ]
testY<- protein[- trainIndex]
ctrl<- trainControl (method= "cv" , number= 10 )
cat ("training set size:" , nrow (trainX), "samples \n " )
training set size: 174 samples
cat ("test set size:" , nrow (testX), "samples \n " )
test set size: 41 samples
cat ("Number of predictors:" , ncol (trainX), " \n " )
Number of predictors: 100
colnames (trainX)<- paste0 ("freq" , 1 : ncol (trainX))
colnames (testX)<- paste0 ("Freq" , 1 : ncol (testX))
C) Ordinary Least Squares (No Tunning Parameteres)
set.seed (123 )
model_ols<- train (x= trainX, y= trainY,
method= "lm" ,
trControl= ctrl,
preProc= c ("center" , "scale" ))
C) PCR ncomp20
set.seed (123 )
model_pcr<- train (x= trainX, y= trainY,
method= "pcr" ,
tuneLength= 50 ,
trControl= ctrl,
preProc= c ("center" , "scale" ))
print (model_pcr$ bestTune)
C)PLS ncomp15
set.seed (123 )
model_pls<- train (x= trainX, y= trainY,
method= "pls" ,
tuneLength= 50 ,
trControl= ctrl,
preProc= c ("center" , "scale" ))
print (model_pls$ bestTune)
C) Ridge Regrassion alpha 0 lambda 0.0526
set.seed (123 )
model_ridge<- train (x= trainX, y= trainY,
method= "glmnet" ,
tuneGrid= expand.grid (alpha= 0 ,
lambda= seq (0 ,1 ,length= 20 )),
trControl= ctrl,
preProc= c ("center" , "scale" ))
print (model_ridge$ bestTune)
alpha lambda
2 0 0.05263158
C)ENET alpha 0.089 lambda 0
set.seed (123 )
model_enet<- train (x= trainX, y= trainY,
method= "glmnet" ,
tuneGrid= expand.grid (alpha= seq (0 ,1 ,length= 10 ),
lambda= seq (0 ,0.5 , length= 20 )),
trControl= ctrl,
preProc= c ("center" ,"scale" ))
print (model_enet$ bestTune)
alpha lambda
161 0.8888889 0
D) SVM
set.seed (123 )
model_svm<- train (x= trainX, y= trainY,
method= "svmRadial" ,
tuneLength= 10 ,
trControl= ctrl,
preProc= c ("center" , "scale" ))
print (model_svm$ bestTune)
D)Neural Network
Yes PCA does help the nueral network.
set.seed (123 )
model_nnet<- train (x= trainX, y= trainY,
method= "nnet" ,
tuneGrid= expand.grid (size= c (1 : 5 ),decay= c (0 ,0.1 ,1 )),
trControl= ctrl,
preProc= c ("center" , "scale" , "pca" ),
lineout= TRUE , trace= FALSE , maxit= 500 )
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
print (model_nnet$ bestTune)
D)MARS
set.seed (123 )
model_mars<- train (x= trainX, y= trainY,
method= "earth" ,
tuneGrid= expand.grid (degree= 1 : 2 , nprune= 2 : 20 ),
trControl= ctrl)
Loading required package: earth
Loading required package: Formula
Loading required package: plotmo
Loading required package: plotrix
print (model_mars$ bestTune)
D) KNN
set.seed (123 )
model_knn<- train (x= trainX, y= trainY,
method= "knn" ,
tuneLength= 10 ,
trControl= ctrl,
preProc= c ("center" ,"scale" ))
print (model_knn$ bestTune)
E) which model is best?
all_models<- list (
OLS= model_ols, PCR= model_pcr, PLS= model_pls,
Ridge= model_ridge, ENET= model_enet,
SVM= model_svm, NNET= model_nnet,
MARS= model_mars, KNN= model_knn
)
results<- resamples (all_models)
summary (results)
Call:
summary.resamples(object = results)
Models: OLS, PCR, PLS, Ridge, ENET, SVM, NNET, MARS, KNN
Number of resamples: 10
MAE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
OLS 0.3755649 0.6177857 0.7137387 0.7803177 0.9326208 1.2884116 0
PCR 0.3468953 0.4230088 0.4641341 0.4659080 0.4915248 0.5690212 0
PLS 0.3921856 0.4261255 0.4823712 0.4766054 0.4891491 0.5956456 0
Ridge 0.8346081 1.1110247 1.2846377 1.2919543 1.5406462 1.6094924 0
ENET 0.5820637 0.8281065 0.9062255 0.9065488 0.9834219 1.1554332 0
SVM 0.7545651 1.2709061 1.3540686 1.3213403 1.4489355 1.6833443 0
NNET 16.5250000 16.5495098 16.6000000 16.6610174 16.6847222 16.9722222 0
MARS 0.3713135 0.7301951 0.7982749 0.7977330 0.9173724 1.1166270 0
KNN 1.6246849 1.7658336 2.0366939 2.0463251 2.3206752 2.4328373 0
RMSE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
OLS 0.4653571 0.8373194 0.9450299 1.1384127 1.2961756 2.5659682 0
PCR 0.4849923 0.5176268 0.5863863 0.5981827 0.6659495 0.7582914 0
PLS 0.4924363 0.5482592 0.5907226 0.6127603 0.6698472 0.7938063 0
Ridge 0.9365989 1.3711942 1.7587585 1.6596581 2.0465525 2.1035231 0
ENET 0.7150978 1.0044351 1.1391555 1.1146114 1.2447615 1.4543191 0
SVM 1.0419341 1.5702793 1.8899139 1.8301993 2.1208766 2.3996622 0
NNET 16.7974152 16.8498384 16.8788794 16.9358325 16.9491264 17.2607615 0
MARS 0.4687931 0.9130901 1.0617572 1.0077958 1.1086998 1.3425030 0
KNN 1.8847723 2.1529983 2.3169293 2.4128713 2.7582092 2.8280206 0
Rsquared
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
OLS 0.4496297 0.8377383 0.9244505 0.8615363 0.9371232 0.9799236 0
PCR 0.9429007 0.9634038 0.9682903 0.9658147 0.9740495 0.9800551 0
PLS 0.9369330 0.9608704 0.9691314 0.9644614 0.9716783 0.9773931 0
Ridge 0.4903439 0.6123857 0.7124328 0.7158218 0.8349411 0.9369174 0
ENET 0.8161293 0.8816382 0.8928496 0.8839335 0.8996568 0.9438155 0
SVM 0.4433408 0.5925672 0.6656731 0.6577083 0.7054214 0.8931931 0
NNET NA NA NA NaN NA NA 10
MARS 0.8288235 0.8540705 0.9094526 0.8986173 0.9301002 0.9754086 0
KNN 0.1877713 0.2534334 0.3814231 0.4234022 0.6178922 0.7018706 0
dotplot (results, metric= "RMSE" )
PCR and PLS performed the best with RMSE being 0.598 and 0.612. These models also have the highers RSquared values and lowest erros. The linear models outperformed the nonlinear models. Meaning that the relationship between predictors in linear. NNET was significantly worse than all other models.
Problem 2
library (AppliedPredictiveModeling)
library (caret)
data (permeability)
str (fingerprints)
num [1:165, 1:1107] 0 0 0 0 0 0 0 0 0 0 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:165] "1" "2" "3" "4" ...
..$ : chr [1:1107] "X1" "X2" "X3" "X4" ...
num [1:165, 1] 12.52 1.12 19.41 1.73 1.68 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:165] "1" "2" "3" "4" ...
..$ : chr "permeability"
B)NearZeroVar 388 predictors are left for modeling.
nzv_cols<- nearZeroVar (fingerprints)
filtered_fingerprints<- fingerprints[,- nzv_cols]
ncol (filtered_fingerprints)
C) PLS splitting data was split 80/20. PLS was tuned using 10-fold cross validation with centering and scaling. Optimal Latent variables ncomp=6. The model explains about 53% of the variability in permeability within the training data during cross-validation.
set.seed (123 )
trainRow<- createDataPartition (permeability, p= 0.8 , list= FALSE )
trainX<- filtered_fingerprints[trainRow, ]
trainY<- permeability[trainRow]
testX<- filtered_fingerprints[- trainRow, ]
testY<- permeability[- trainRow]
ctrl<- trainControl (method= "cv" , number= 10 )
pls_model<- train (x= trainX, y= trainY,
method= "pls" ,
tuneLength= 20 ,
trControl= ctrl,
preProc= c ("center" , "scale" ))
print (pls_model$ bestTune)
max (pls_model$ results$ Rsquared)
D) when applying the trained PLS model to the unseen test set the performance dropped R^2=0.3245. This suggests some overfitting.
pls_pred<- predict (pls_model, testX)
postResample (pred= pls_pred, obs= testY)
RMSE Rsquared MAE
12.3486900 0.3244542 8.2881075
E) Ridge Regression
set.seed (123 )
ridge_model<- train (x= trainX, y= trainY, method= "glmnet" ,
tuneGrid= expand.grid (alpha= 0 , lambda= seq (0 ,1 ,length= 20 )),
trControl= ctrl, preProc= c ("center" , "scale" ))
E)PCR
set.seed (123 )
pcr_model<- train (x= trainX, y= trainY, method= "pcr" ,
tuneLength= 20 , trControl= ctrl, preProc= c ("center" , "scale" ))
E)Comparison
Ridge Regression output the highest mean R^2 (0.5634) slightly outperforming pls and pcr.
results_p2<- resamples (list (PLS= pls_model, PCR= pcr_model, Ridge= ridge_model))
summary (results_p2)
Call:
summary.resamples(object = results_p2)
Models: PLS, PCR, Ridge
Number of resamples: 10
MAE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
PLS 5.940748 7.855195 8.101540 8.658301 8.587837 12.53980 0
PCR 7.283000 7.435242 8.217707 8.669652 9.707578 11.05651 0
Ridge 6.177802 7.808006 8.346997 8.465851 9.718546 10.28790 0
RMSE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
PLS 7.479049 10.662269 11.18215 11.53275 11.66271 16.49145 0
PCR 8.921171 9.200496 11.16664 11.46347 12.02834 16.53709 0
Ridge 8.361707 9.619727 10.72220 11.29799 12.96550 14.75129 0
Rsquared
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
PLS 0.2072535 0.3875424 0.5626977 0.5335956 0.6291758 0.8326590 0
PCR 0.1153226 0.3997770 0.6038702 0.5293304 0.7034825 0.8572556 0
Ridge 0.1814282 0.4535147 0.5723011 0.5633878 0.7525841 0.8960319 0
F) I would not recommend replacing the lab experiment with any of these models. the R^2 of .32 means that nearly 70% of the variation in permeability is not captured by the model. I would recommend using Ridge Regression as a screening tool to rank molecules to find the top molecules to be most permeable, it can be a cost saving filter but not a replacement.
Problem 3
a)Nonlinear models
svm
set.seed (123 )
model_svm_p3<- train (x= trainX, y= trainY,
method= "svmRadial" ,
tuneLength= 10 ,
trControl= ctrl,
preProc= c ("center" , "scale" ))
MARS
set.seed (123 )
model_mars_p3<- train (x= trainX, y= trainY,
method= "earth" ,
tuneGrid= expand.grid (degree= 1 : 2 , nprune= 2 : 20 ),
trControl= ctrl)
KNN
set.seed (123 )
model_knn_p3<- train (x= trainX, y= trainY,
method= "knn" ,
tuneLength= 10 ,
trControl= ctrl,
preProc= c ("center" , "scale" ))
Neural Network
set.seed (123 )
model_nnet_p3<- train (x= trainX, y= trainY,
method= "nnet" ,
tuneGrid= expand.grid (size= c (1 : 5 ), decay= c (0 ,0.1 ,1 )),
trControl= ctrl,
preProc= c ("center" , "scale" , "pca" ),
lineout= TRUE , trace= FALSE , maxit= 500 )
Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
: There were missing values in resampled performance measures.
The SVM (R ^ 2 = 0.5038) performed higher than the linear PLS model (R ^ 2 = 0.3245) it also provided the optimal resampling performance with a mean r^2 of 0.6557 and losest RMSEof 9.469.
results_p3<- resamples (list (SVM= model_svm_p3, MARS= model_mars_p3,
KNN= model_knn_p3, NNET= model_nnet_p3))
summary (results_p3)
Call:
summary.resamples(object = results_p3)
Models: SVM, MARS, KNN, NNET
Number of resamples: 10
MAE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
SVM 4.213428 5.252082 7.196579 6.979417 7.915720 10.35928 0
MARS 4.243920 6.225447 8.121751 8.036346 9.252100 12.91006 0
KNN 5.182738 6.767481 7.755186 8.121054 9.461309 11.15534 0
NNET 9.848929 11.360016 11.882262 11.783290 12.257277 13.45423 0
RMSE
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
SVM 4.803695 7.864968 8.89035 9.469353 11.26622 14.64101 0
MARS 5.459105 8.136931 12.44591 11.619216 13.36864 19.23540 0
KNN 7.398072 9.880665 11.15714 11.705350 13.04052 18.19556 0
NNET 15.865754 19.179919 19.91049 19.851286 20.46818 23.50821 0
Rsquared
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
SVM 0.1927160111 0.5144728 0.7167798 0.6557369 0.7828028 0.9675302 0
MARS 0.0003492616 0.2927572 0.4320556 0.5109589 0.8431597 0.9132229 0
KNN 0.0090651218 0.3065722 0.5773361 0.5145225 0.7149048 0.9328090 0
NNET NA NA NA NaN NA NA 10
postResample (predict (model_svm_p3, testX), testY)
RMSE Rsquared MAE
10.2957152 0.5037946 6.5552744
B)Linear Ridge Regression Mean Rsquared (Best Linear Model) = 0.5634
Nonlinear SVM (best Non Linear Model) mean Rsquared=0.6557
On the test set, the SVM significantly outperformed PLS model.
compare_all<- resamples (list (
Linear_PLS= pls_model,
Linear_Ridge= ridge_model,
Nonlinear_SVM= model_svm_p3,
Nonlinear_MARS= model_mars_p3
))
summary (compare_all)$ statistics$ Rsquared
Min. 1st Qu. Median Mean 3rd Qu. Max.
Linear_PLS 0.2072534541 0.3875424 0.5626977 0.5335956 0.6291758 0.8326590
Linear_Ridge 0.1814282098 0.4535147 0.5723011 0.5633878 0.7525841 0.8960319
Nonlinear_SVM 0.1927160111 0.5144728 0.7167798 0.6557369 0.7828028 0.9675302
Nonlinear_MARS 0.0003492616 0.2927572 0.4320556 0.5109589 0.8431597 0.9132229
NA's
Linear_PLS 0
Linear_Ridge 0
Nonlinear_SVM 0
Nonlinear_MARS 0
pls_test<- postResample (predict (pls_model, testX), testY)
svm_test<- postResample (predict (model_svm_p3, testX), testY)
test_comparison<- rbind (PLS= pls_test, SVM= svm_test)
print (test_comparison)
RMSE Rsquared MAE
PLS 12.34869 0.3244542 8.288107
SVM 10.29572 0.5037946 6.555274
C) I would not recommend replacing the lab experiment with any model. the SVM test RMSE is 10.29 (large margin of error) and the Rsquared of .50 means the model is “guessing” half of the outcomes.