library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data(tecator)
?tecator
## starting httpd help server ...
## done
str(absorp)
## num [1:215, 1:100] 2.62 2.83 2.58 2.82 2.79 ...
str(endpoints)
## num [1:215, 1:3] 60.5 46 71 72.8 58.3 44 44 69.3 61.4 61.4 ...
moisture = endpoints[,1]
fat = endpoints[,2]
protein = endpoints[,3]
The results show that the PC1 already accounts for 98.62% of the total variance. First two account for 99.60%, first three accounts for 99.87%, and finally first four account for 99.99%. Meaning these predictors have a VERY high correlation to one another. The effective dimension is much much smaller than the original 100 values. 8 are needed for virtually 100%.
pca_model <- prcomp(absorp, center = TRUE, scale. = TRUE)
summary(pca_model)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 9.9311 0.9847 0.52851 0.33827 0.08038 0.05123 0.02681
## Proportion of Variance 0.9863 0.0097 0.00279 0.00114 0.00006 0.00003 0.00001
## Cumulative Proportion 0.9863 0.9960 0.99875 0.99990 0.99996 0.99999 0.99999
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.01961 0.008564 0.006739 0.004442 0.003361 0.001867
## Proportion of Variance 0.00000 0.000000 0.000000 0.000000 0.000000 0.000000
## Cumulative Proportion 1.00000 1.000000 1.000000 1.000000 1.000000 1.000000
## PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.001377 0.0009449 0.0008641 0.0007558 0.0006977
## Proportion of Variance 0.000000 0.0000000 0.0000000 0.0000000 0.0000000
## Cumulative Proportion 1.000000 1.0000000 1.0000000 1.0000000 1.0000000
## PC19 PC20 PC21 PC22 PC23
## Standard deviation 0.0005884 0.0004628 0.0003897 0.0003341 0.0003123
## Proportion of Variance 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## Cumulative Proportion 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.0002721 0.0002616 0.000211 0.0001954 0.0001857
## Proportion of Variance 0.0000000 0.0000000 0.000000 0.0000000 0.0000000
## Cumulative Proportion 1.0000000 1.0000000 1.000000 1.0000000 1.0000000
## PC29 PC30 PC31 PC32 PC33
## Standard deviation 0.0001729 0.0001656 0.0001539 0.0001473 0.0001392
## Proportion of Variance 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## Cumulative Proportion 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## PC34 PC35 PC36 PC37 PC38
## Standard deviation 0.0001339 0.0001269 0.0001082 0.000104 9.98e-05
## Proportion of Variance 0.0000000 0.0000000 0.0000000 0.000000 0.00e+00
## Cumulative Proportion 1.0000000 1.0000000 1.0000000 1.000000 1.00e+00
## PC39 PC40 PC41 PC42 PC43
## Standard deviation 9.081e-05 8.668e-05 8.026e-05 7.762e-05 7.36e-05
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.00e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.00e+00
## PC44 PC45 PC46 PC47 PC48
## Standard deviation 6.808e-05 6.541e-05 6.44e-05 5.897e-05 5.422e-05
## Proportion of Variance 0.000e+00 0.000e+00 0.00e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.00e+00 1.000e+00 1.000e+00
## PC49 PC50 PC51 PC52 PC53
## Standard deviation 5.027e-05 4.893e-05 4.608e-05 4.419e-05 4.037e-05
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC54 PC55 PC56 PC57 PC58 PC59
## Standard deviation 3.854e-05 3.8e-05 3.64e-05 3.497e-05 3.443e-05 3.264e-05
## Proportion of Variance 0.000e+00 0.0e+00 0.00e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.0e+00 1.00e+00 1.000e+00 1.000e+00 1.000e+00
## PC60 PC61 PC62 PC63 PC64
## Standard deviation 3.104e-05 3.04e-05 2.959e-05 2.844e-05 2.699e-05
## Proportion of Variance 0.000e+00 0.00e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.00e+00 1.000e+00 1.000e+00 1.000e+00
## PC65 PC66 PC67 PC68 PC69
## Standard deviation 2.586e-05 2.388e-05 2.364e-05 2.284e-05 2.173e-05
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC70 PC71 PC72 PC73 PC74
## Standard deviation 2.058e-05 1.997e-05 1.93e-05 1.854e-05 1.807e-05
## Proportion of Variance 0.000e+00 0.000e+00 0.00e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.00e+00 1.000e+00 1.000e+00
## PC75 PC76 PC77 PC78 PC79
## Standard deviation 1.728e-05 1.693e-05 1.612e-05 1.569e-05 1.516e-05
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC80 PC81 PC82 PC83 PC84
## Standard deviation 1.445e-05 1.408e-05 1.356e-05 1.275e-05 1.224e-05
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC85 PC86 PC87 PC88 PC89
## Standard deviation 1.178e-05 1.09e-05 1.045e-05 1.009e-05 9.396e-06
## Proportion of Variance 0.000e+00 0.00e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.00e+00 1.000e+00 1.000e+00 1.000e+00
## PC90 PC91 PC92 PC93 PC94
## Standard deviation 8.728e-06 8.27e-06 7.613e-06 6.83e-06 6.383e-06
## Proportion of Variance 0.000e+00 0.00e+00 0.000e+00 0.00e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.00e+00 1.000e+00 1.00e+00 1.000e+00
## PC95 PC96 PC97 PC98 PC99
## Standard deviation 5.946e-06 5.478e-06 4.826e-06 4.521e-06 4.164e-06
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC100
## Standard deviation 4.122e-06
## Proportion of Variance 0.000e+00
## Cumulative Proportion 1.000e+00
pve <- pca_model$sdev^2 / sum(pca_model$sdev^2)
cum_pve <- cumsum(pve)
which(cum_pve >= 0.95)[1]
## [1] 1
which(cum_pve >= 0.99)[1]
## [1] 2
The optimal tuning parameter for PCR was 25 and the optimal tuning parameter for PLS was 17.
set.seed(67)
trainIndex <- createDataPartition(moisture, p = 0.80, list = FALSE)
trainX <- absorp[trainIndex, ]
testX <- absorp[-trainIndex, ]
trainY <- moisture[trainIndex]
testY <- moisture[-trainIndex]
indx <- createFolds(trainY, returnTrain = TRUE)
ctrl <- trainControl(method = "cv", index = indx)
colnames(trainX) <- paste0("V", 1:ncol(trainX))
colnames(testX) <- paste0("V", 1:ncol(testX))
cat("\n----------------------------\n")
##
## ----------------------------
lmTune <- train(x = trainX, y = trainY,
method = "lm",
preProc = c("center", "scale"),
trControl = ctrl)
lmTune
## Linear Regression
##
## 175 samples
## 100 predictors
##
## Pre-processing: centered (100), scaled (100)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 157, 158, 156, 158, 159, 158, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 3.839397 0.8601396 2.027833
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
cat("\n----------------------------\n")
##
## ----------------------------
cat("\n----------------------------\n")
##
## ----------------------------
pcrTune <- train(x = trainX, y = trainY,
method = "pcr",
preProc = c("center", "scale"),
tuneLength = 30,
trControl = ctrl)
pcrTune
## Principal Component Analysis
##
## 175 samples
## 100 predictors
##
## Pre-processing: centered (100), scaled (100)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 157, 158, 156, 158, 159, 158, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 8.918516 0.2738412 7.446747
## 2 8.873592 0.2964492 7.254325
## 3 5.718315 0.7037865 4.615354
## 4 3.421404 0.8951783 2.811201
## 5 2.873770 0.9300424 2.359552
## 6 2.828363 0.9316329 2.346264
## 7 2.762611 0.9338621 2.276888
## 8 2.723571 0.9383551 2.226541
## 9 2.737587 0.9371441 2.234680
## 10 2.762930 0.9360845 2.262079
## 11 2.643292 0.9363912 2.113640
## 12 2.456207 0.9416103 1.934253
## 13 2.441096 0.9420819 1.913166
## 14 2.495958 0.9401713 1.948425
## 15 2.357023 0.9473157 1.859512
## 16 2.306905 0.9495149 1.788434
## 17 2.259419 0.9516102 1.765510
## 18 2.301545 0.9491246 1.783582
## 19 2.530310 0.9418761 1.861921
## 20 2.629929 0.9373754 1.918056
## 21 2.363114 0.9491115 1.722864
## 22 2.387254 0.9475752 1.730017
## 23 2.335965 0.9498549 1.700054
## 24 2.106572 0.9590438 1.603053
## 25 2.039725 0.9649313 1.518804
## 26 2.161037 0.9588776 1.573855
## 27 2.242274 0.9562550 1.608536
## 28 2.233768 0.9571140 1.597431
## 29 2.190263 0.9588667 1.592598
## 30 2.257868 0.9517397 1.596023
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 25.
cat("\n----------------------------\n")
##
## ----------------------------
plot(pcrTune, main = "PCR:RMSE vs # of components")
cat("\n----------------------------\n")
##
## ----------------------------
plsTune <- train(x = trainX, y = trainY,
method = "pls",
preProc = c("center", "scale"),
tuneLength = 30,
trControl = ctrl)
plsTune
## Partial Least Squares
##
## 175 samples
## 100 predictors
##
## Pre-processing: centered (100), scaled (100)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 157, 158, 156, 158, 159, 158, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 8.898517 0.2777581 7.426684
## 2 5.738113 0.7117791 4.484318
## 3 4.017755 0.8518553 3.222734
## 4 3.333640 0.9008473 2.735535
## 5 2.816489 0.9327490 2.324863
## 6 2.755151 0.9349325 2.279849
## 7 2.701453 0.9377714 2.210825
## 8 2.658599 0.9413408 2.161465
## 9 2.539368 0.9394499 2.054988
## 10 2.447510 0.9421967 1.901083
## 11 2.414457 0.9431232 1.896990
## 12 2.362995 0.9462841 1.858345
## 13 2.286316 0.9520588 1.757587
## 14 2.308996 0.9528490 1.736289
## 15 2.397282 0.9459794 1.683319
## 16 2.385624 0.9473760 1.616656
## 17 2.139900 0.9590627 1.542687
## 18 2.161794 0.9589688 1.525616
## 19 2.292204 0.9510776 1.567784
## 20 2.622930 0.9279935 1.666183
## 21 2.677956 0.9243340 1.672864
## 22 2.709574 0.9220305 1.675718
## 23 2.703456 0.9223025 1.678688
## 24 2.540745 0.9281428 1.601314
## 25 2.472579 0.9314009 1.588459
## 26 2.460867 0.9339831 1.537221
## 27 2.403697 0.9387607 1.470915
## 28 2.438613 0.9379271 1.532823
## 29 2.452226 0.9370948 1.515925
## 30 2.558352 0.9285030 1.529636
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 17.
cat("\n----------------------------\n")
##
## ----------------------------
plot(plsTune, main = "PLS: RMSE vs # of components")
cat("\n----------------------------\n")
##
## ----------------------------
cat("\nOptimal Tuning Parameters\n")
##
## Optimal Tuning Parameters
cat("PCR:", pcrTune$bestTune$ncomp,"\n")
## PCR: 25
cat("PLS:", plsTune$bestTune$ncomp,"\n")
## PLS: 17
It seems that PLS is the best predictor for this task.
It has the lowest RMSE at 1.7251, the highest R^2 at 0.9654, and the
lowest MAE at 1.2686.
Meanwhile ordinary least squares is the worse model. It has the highest
RMSE by far 2.3170 and the and the highest MAE by far at 1.5950
lm.pred <- predict(lmTune, testX)
pcr.pred <- predict(pcrTune, testX)
pls.pred <- predict(plsTune, testX)
results <- data.frame(rbind(
OLS = postResample(pred = lm.pred, obs = testY),
PCR = postResample(pred = pcr.pred, obs = testY),
PLS = postResample(pred = pls.pred, obs = testY)))
print(round(results, 4))
## RMSE Rsquared MAE
## OLS 2.3170 0.9546 1.5950
## PCR 1.7985 0.9624 1.2911
## PLS 1.7251 0.9654 1.2686
par(mfrow = c(2, 3))
for(mod in list(list(lm.pred,"OLS"), list(pcr.pred,"PCR"), list(pls.pred,"PLS"))){
plot(testY, mod[[1]], main = mod[[2]],
xlab = "observed moisture", ylab = "predicted moisture",
pch = 19, col = "blue")
abline(0, 1, col = "red", lwd = 2)
}
par(mfrow = c(1, 1))
I would use PLS as it has the lowest RMSE which means it has the most accurate predictions out of the three. PLS also has the lowest MAE which means it has the smallest average absolute error.