Exercise 3

a) Start R and use these commands to load the data:

library(caret)

## Loading required package: ggplot2

## 
## Attaching package: 'caret'

## The following object is masked from 'package:pls':
## 
##     R2

data(tecator)
# ?tecator
str(absorp)

##  num [1:215, 1:100] 2.62 2.83 2.58 2.82 2.79 ...

str(endpoints)

##  num [1:215, 1:3] 60.5 46 71 72.8 58.3 44 44 69.3 61.4 61.4 ...

moisture = endpoints[,1]
fat = endpoints[,2]
protein = endpoints[,3]

b) In this example the predictors are the measurements at the individual frequencies. Because the frequencies lie in a systematic order (850–1,050nm), the predictors have a high degree of correlation. Hence, the data lie in a smaller dimension than the total number of predictors (215). Use PCA to determine the effective dimension of these data. What is the effective dimension?

absorp_df <- as.data.frame(absorp)
trans <- preProcess(absorp_df, method = c("BoxCox", "center", "scale", "pca"))
trans

## Created from 215 samples and 100 variables
## 
## Pre-processing:
##   - Box-Cox transformation (100)
##   - centered (100)
##   - ignored (0)
##   - principal component signal extraction (100)
##   - scaled (100)
## 
## Lambda estimates for Box-Cox transformation:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -1.700  -1.600  -1.300  -1.303  -1.075  -0.900 
## 
## PCA needed 2 components to capture 95 percent of the variance

The first two principal components explain approximately 95% of the variance. Since this captures most of the variation in the predictors, the effective dimension is approximately 2.

c) Split the data into a training and a test set the response of the percentage of moisture, pre-process the data, and build at least three models described in this chapter (i.e., ordinary least squares, PCR, PLS, Ridge, and ENET). For those models with tuning parameters, what are the optimal values of the tuning parameter(s)?

# Split data
set.seed(123)
trainIndex <- createDataPartition(moisture, p = 0.8, list = FALSE)

x_train <- absorp_df[trainIndex, ]
x_test  <- absorp_df[-trainIndex, ]
y_train <- moisture[trainIndex]
y_test  <- moisture[-trainIndex]


# Pre-process
preProc <- preProcess(x_train, method = c("center", "scale"))

x_train <- predict(preProc, x_train)
x_test  <- predict(preProc, x_test)


# OLS
set.seed(100)
indx <- createFolds(y_train, returnTrain = TRUE)
ctrl <- trainControl(method = "cv", index = indx)

set.seed(100)
lmTune0 <- train(x = x_train, y = y_train,
                 method = "lm",
                 trControl = ctrl)

testResults <- data.frame(obs = y_test,
                          Linear_Regression = predict(lmTune0, x_test))


# PLS
set.seed(100)
plsTune <- train(x = x_train, y = y_train,
                 method = "pls",
                 tuneGrid = expand.grid(ncomp = 1:50),
                 trControl = ctrl)


# PCR
set.seed(100)
pcrTune <- train(x = x_train, y = y_train,
                 method = "pcr",
                 tuneGrid = expand.grid(ncomp = 1:50),
                 trControl = ctrl)


# Ridge Regression
ridgeGrid <- expand.grid(lambda = seq(0, .1, length = 10))

set.seed(100)
ridgeTune <- train(x = x_train, y = y_train,
                   method = "ridge",
                   tuneGrid = ridgeGrid,
                   trControl = ctrl)


# ENET
enetGrid <- expand.grid(lambda = c(0, 0.01, .1), 
                        fraction = seq(.05, 1, length = 20))
set.seed(100)
enetTune <- train(x = x_train, y = y_train,
                  method = "enet",
                  tuneGrid = enetGrid,
                  trControl = ctrl)


# Best params
plsTune$bestTune

##    ncomp
## 17    17

pcrTune$bestTune

##    ncomp
## 28    28

ridgeTune$bestTune

##   lambda
## 1      0

enetTune$bestTune

##   fraction lambda
## 1     0.05      0

The optimal tuning parameters selected via cross-validation were:

PCR: ncomp = 17
PLS: ncomp = 28
Ridge: lambda = 0
Elastic Net: lambda = 0, fraction = 0.05

d) Which model has the best predictive ability? Is any model significantly better or worse than the others?

R2 <- RMSE <-MAE <- numeric(0)

# OLS
testResults$OLS<- predict(lmTune0, x_test)
R2[1] = cor(testResults$OLS, y_test)^2
RMSE[1] = sqrt(mean((testResults$OLS - y_test)^2))
MAE[1] = mean(abs(testResults$OLS - y_test))

# PCR
testResults$PCR <- predict(pcrTune, x_test)
R2[2] = cor(testResults$PCR, y_test)^2
RMSE[2] = sqrt(mean((testResults$PCR - y_test)^2))
MAE[2] = mean(abs(testResults$PCR - y_test))

# PLS
testResults$PLS <- predict(plsTune, x_test)
R2[3] = cor(testResults$PLS, y_test)^2
RMSE[3] = sqrt(mean((testResults$PLS - y_test)^2))
MAE[3] = mean(abs(testResults$PLS - y_test))

# Ridge regression
testResults$Ridge <- predict(ridgeTune, x_test)
R2[4] = cor(testResults$Ridge, y_test)^2
RMSE[4] = sqrt(mean((testResults$Ridge - y_test)^2))
MAE[4] = mean(abs(testResults$Ridge - y_test))

# ENET regression
testResults$ENET <- predict(enetTune, x_test)
R2[5] = cor(testResults$ENET, y_test)^2
RMSE[5] = sqrt(mean((testResults$ENET - y_test)^2))
MAE[5] = mean(abs(testResults$ENET - y_test))


# Compare models
results = cbind(R2, RMSE, MAE)
row.names(results) = c("OLS", "PCR", "PLS", "Ridge", "ENET")
results

##              R2     RMSE      MAE
## OLS   0.8771040 3.545768 1.596031
## PCR   0.9736020 1.696345 1.271462
## PLS   0.9738681 1.615357 1.185521
## Ridge 0.8771120 3.545632 1.595980
## ENET  0.9799521 1.431302 1.044715

models <- list(
  OLS = lmTune0,
  PCR = pcrTune,
  PLS = plsTune,
  Ridge = ridgeTune,
  ENET = enetTune
)

resamps <- resamples(models)
diffs <- diff(resamps)
summary(diffs)

## 
## Call:
## summary.diff.resamples(object = diffs)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## MAE 
##       OLS  PCR        PLS        Ridge      ENET      
## OLS         1.249e-01  1.723e-01 -3.341e-05  2.923e-01
## PCR   1.00             4.738e-02 -1.250e-01  1.673e-01
## PLS   1.00 1.00                  -1.723e-01  1.200e-01
## Ridge 1.00 1.00       1.00                   2.923e-01
## ENET  1.00 0.68       1.00       1.00                 
## 
## RMSE 
##       OLS    PCR        PLS        Ridge      ENET      
## OLS           4.503e-01  4.353e-01 -6.358e-05  6.219e-01
## PCR   0.8728            -1.505e-02 -4.504e-01  1.716e-01
## PLS   0.9888 1.0000                -4.353e-01  1.866e-01
## Ridge 1.0000 0.8727     0.9887                 6.220e-01
## ENET  0.5034 1.0000     1.0000     0.5034               
## 
## Rsquared 
##       OLS PCR        PLS        Ridge      ENET      
## OLS       -1.359e-02 -1.159e-02  2.997e-06 -2.261e-02
## PCR   1               2.001e-03  1.359e-02 -9.018e-03
## PLS   1   1                      1.159e-02 -1.102e-02
## Ridge 1   1          1                     -2.261e-02
## ENET  1   1          1          1

ENET had the lowest RMSE and MAE and the highest R2 on the test set. Based on these metrics, ENET appears to have the best predictive performance. Although ENET had the lowest test RMSE, pairwise comparisons of cross-validated results indicate that the performance differences were not statistically significant.

e) Explain which model you would use for predicting the percentage of moisture of a sample.

I would select ENET since it achieved the lowest RMSE and highest R2 on the test set, meaning it provides both strong predictive performance and better stability compared to the other models.