library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'caret'
## The following object is masked from 'package:pls':
##
## R2
data(tecator)
# ?tecator
str(absorp)
## num [1:215, 1:100] 2.62 2.83 2.58 2.82 2.79 ...
str(endpoints)
## num [1:215, 1:3] 60.5 46 71 72.8 58.3 44 44 69.3 61.4 61.4 ...
moisture = endpoints[,1]
fat = endpoints[,2]
protein = endpoints[,3]
absorp_df <- as.data.frame(absorp)
trans <- preProcess(absorp_df, method = c("BoxCox", "center", "scale", "pca"))
trans
## Created from 215 samples and 100 variables
##
## Pre-processing:
## - Box-Cox transformation (100)
## - centered (100)
## - ignored (0)
## - principal component signal extraction (100)
## - scaled (100)
##
## Lambda estimates for Box-Cox transformation:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.700 -1.600 -1.300 -1.303 -1.075 -0.900
##
## PCA needed 2 components to capture 95 percent of the variance
The first two principal components explain approximately 95% of the variance. Since this captures most of the variation in the predictors, the effective dimension is approximately 2.
# Split data
set.seed(123)
trainIndex <- createDataPartition(moisture, p = 0.8, list = FALSE)
x_train <- absorp_df[trainIndex, ]
x_test <- absorp_df[-trainIndex, ]
y_train <- moisture[trainIndex]
y_test <- moisture[-trainIndex]
# Pre-process
preProc <- preProcess(x_train, method = c("center", "scale"))
x_train <- predict(preProc, x_train)
x_test <- predict(preProc, x_test)
# OLS
set.seed(100)
indx <- createFolds(y_train, returnTrain = TRUE)
ctrl <- trainControl(method = "cv", index = indx)
set.seed(100)
lmTune0 <- train(x = x_train, y = y_train,
method = "lm",
trControl = ctrl)
testResults <- data.frame(obs = y_test,
Linear_Regression = predict(lmTune0, x_test))
# PLS
set.seed(100)
plsTune <- train(x = x_train, y = y_train,
method = "pls",
tuneGrid = expand.grid(ncomp = 1:50),
trControl = ctrl)
# PCR
set.seed(100)
pcrTune <- train(x = x_train, y = y_train,
method = "pcr",
tuneGrid = expand.grid(ncomp = 1:50),
trControl = ctrl)
# Ridge Regression
ridgeGrid <- expand.grid(lambda = seq(0, .1, length = 10))
set.seed(100)
ridgeTune <- train(x = x_train, y = y_train,
method = "ridge",
tuneGrid = ridgeGrid,
trControl = ctrl)
# ENET
enetGrid <- expand.grid(lambda = c(0, 0.01, .1),
fraction = seq(.05, 1, length = 20))
set.seed(100)
enetTune <- train(x = x_train, y = y_train,
method = "enet",
tuneGrid = enetGrid,
trControl = ctrl)
# Best params
plsTune$bestTune
## ncomp
## 17 17
pcrTune$bestTune
## ncomp
## 28 28
ridgeTune$bestTune
## lambda
## 1 0
enetTune$bestTune
## fraction lambda
## 1 0.05 0
The optimal tuning parameters selected via cross-validation were:
PCR: ncomp = 17
PLS: ncomp = 28
Ridge: lambda = 0
Elastic Net: lambda = 0, fraction = 0.05
R2 <- RMSE <-MAE <- numeric(0)
# OLS
testResults$OLS<- predict(lmTune0, x_test)
R2[1] = cor(testResults$OLS, y_test)^2
RMSE[1] = sqrt(mean((testResults$OLS - y_test)^2))
MAE[1] = mean(abs(testResults$OLS - y_test))
# PCR
testResults$PCR <- predict(pcrTune, x_test)
R2[2] = cor(testResults$PCR, y_test)^2
RMSE[2] = sqrt(mean((testResults$PCR - y_test)^2))
MAE[2] = mean(abs(testResults$PCR - y_test))
# PLS
testResults$PLS <- predict(plsTune, x_test)
R2[3] = cor(testResults$PLS, y_test)^2
RMSE[3] = sqrt(mean((testResults$PLS - y_test)^2))
MAE[3] = mean(abs(testResults$PLS - y_test))
# Ridge regression
testResults$Ridge <- predict(ridgeTune, x_test)
R2[4] = cor(testResults$Ridge, y_test)^2
RMSE[4] = sqrt(mean((testResults$Ridge - y_test)^2))
MAE[4] = mean(abs(testResults$Ridge - y_test))
# ENET regression
testResults$ENET <- predict(enetTune, x_test)
R2[5] = cor(testResults$ENET, y_test)^2
RMSE[5] = sqrt(mean((testResults$ENET - y_test)^2))
MAE[5] = mean(abs(testResults$ENET - y_test))
# Compare models
results = cbind(R2, RMSE, MAE)
row.names(results) = c("OLS", "PCR", "PLS", "Ridge", "ENET")
results
## R2 RMSE MAE
## OLS 0.8771040 3.545768 1.596031
## PCR 0.9736020 1.696345 1.271462
## PLS 0.9738681 1.615357 1.185521
## Ridge 0.8771120 3.545632 1.595980
## ENET 0.9799521 1.431302 1.044715
models <- list(
OLS = lmTune0,
PCR = pcrTune,
PLS = plsTune,
Ridge = ridgeTune,
ENET = enetTune
)
resamps <- resamples(models)
diffs <- diff(resamps)
summary(diffs)
##
## Call:
## summary.diff.resamples(object = diffs)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## MAE
## OLS PCR PLS Ridge ENET
## OLS 1.249e-01 1.723e-01 -3.341e-05 2.923e-01
## PCR 1.00 4.738e-02 -1.250e-01 1.673e-01
## PLS 1.00 1.00 -1.723e-01 1.200e-01
## Ridge 1.00 1.00 1.00 2.923e-01
## ENET 1.00 0.68 1.00 1.00
##
## RMSE
## OLS PCR PLS Ridge ENET
## OLS 4.503e-01 4.353e-01 -6.358e-05 6.219e-01
## PCR 0.8728 -1.505e-02 -4.504e-01 1.716e-01
## PLS 0.9888 1.0000 -4.353e-01 1.866e-01
## Ridge 1.0000 0.8727 0.9887 6.220e-01
## ENET 0.5034 1.0000 1.0000 0.5034
##
## Rsquared
## OLS PCR PLS Ridge ENET
## OLS -1.359e-02 -1.159e-02 2.997e-06 -2.261e-02
## PCR 1 2.001e-03 1.359e-02 -9.018e-03
## PLS 1 1 1.159e-02 -1.102e-02
## Ridge 1 1 1 -2.261e-02
## ENET 1 1 1 1
ENET had the lowest RMSE and MAE and the highest R2 on the test set. Based on these metrics, ENET appears to have the best predictive performance. Although ENET had the lowest test RMSE, pairwise comparisons of cross-validated results indicate that the performance differences were not statistically significant.
I would select ENET since it achieved the lowest RMSE and highest R2 on the test set, meaning it provides both strong predictive performance and better stability compared to the other models.