library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
data(tecator)
?tecator
## starting httpd help server ...
## done
str(absorp)
## num [1:215, 1:100] 2.62 2.83 2.58 2.82 2.79 ...
str(endpoints)
## num [1:215, 1:3] 60.5 46 71 72.8 58.3 44 44 69.3 61.4 61.4 ...
moisture <- endpoints[,1]
fat <- endpoints[,2]
protein <- endpoints[,3]
Analysis: The tecator dataset consists of 100 absorbance values from 250 observations, used to predict fat, protein, and moisture content.
pca_result <- prcomp(absorp, center = TRUE, scale. = TRUE)
explained <- cumsum(pca_result$sdev^2) / sum(pca_result$sdev^2)
which(explained >= 0.99)[1]
## [1] 2
which(explained >= 0.999)[1]
## [1] 4
plot(explained, type = "b",
xlab = "PC",
ylab = "Variance",
main = "PCA on Absorbance Data")
Answer: Out of 100 original frequency measurements, PCA reveals that the first 2 PC’s capture 99% of the variance in the absorbance data, and 4 components explain 99.9%. The effective dimension of the data is approximately 2-4, reflecting the high degree of collinearity among adjacent IR frequencies.
#Moisture
set.seed(123)
trainIndex <- createDataPartition(moisture, p = 0.75, list = FALSE)
x_train <- absorp[trainIndex, ]
x_test <- absorp[-trainIndex, ]
y_train <- moisture[trainIndex]
y_test <- moisture[-trainIndex]
colnames(x_train) <- paste0("V", 1:100)
colnames(x_test) <- paste0("V", 1:100)
preProc <- preProcess(x_train, method = c("center", "scale", "pca"), thresh = 0.99)
x_train_pp <- predict(preProc, x_train)
x_test_pp <- predict(preProc, x_test)
train_df <- data.frame(moisture = y_train, x_train_pp)
test_df <- data.frame(moisture = y_test, x_test_pp)
ctrl <- trainControl(method = "cv", number = 10, seed = set.seed(123))
set.seed(123)
ols_model <- train(moisture ~ ., data = train_df,
method = "lm", trControl = ctrl)
Answer: OLS has no tuning parameters to optimize. With an RMSE of 9.06, OLS assumes a perfect linear relationship and cannot handles any complexity or noise in the data.
set.seed(123)
pcr_model <- train(x = x_train, y = y_train,
method = "pcr",
tuneGrid = data.frame(ncomp = 1:20),
trControl = ctrl,
preProcess = c("center", "scale"))
pcr_model$bestTune
## ncomp
## 20 20
Answer: PCR’s optimal value is 20. This is high considering 99% of variance was captured in 2 PC’s.
set.seed(123)
pls_model <- train(x = x_train, y = y_train,
method = "pls",
tuneGrid = data.frame(ncomp = 1:20),
trControl = ctrl,
preProcess = c("center","scale"))
pls_model$bestTune
## ncomp
## 17 17
Answer: PLS’s optimal value is 17, extracting the moisture relevant signal in fewer steps than all other models.
set.seed(123)
ridge_model <- train(x = x_train, y = y_train,
method = "ridge",
tuneGrid = data.frame(lambda = 10^seq(-4, 2, length = 50)),
trControl = ctrl,
preProcess = c("center","scale"))
ridge_model$bestTune
## lambda
## 1 1e-04
Answer: Optimal value is 0.0001. The model applies almost no shrinkage.
models <- list(OLS = ols_model, PCR = pcr_model,
PLS = pls_model, Ridge = ridge_model)
results <- sapply(models, function(m) {
preds <- predict(m, newdata = if(m$method == "lm") test_df else x_test)
RMSE(preds, y_test)
})
print(sort(results))
## PLS Ridge PCR OLS
## 2.369104 2.586292 2.659928 9.056101
resamps <- resamples(models)
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: OLS, PCR, PLS, Ridge
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## OLS 5.446855 5.658757 6.495583 6.512544 7.142210 7.886826 0
## PCR 1.066259 1.368132 1.553602 1.511833 1.661615 1.833062 0
## PLS 1.023076 1.228533 1.405257 1.360726 1.427443 1.663493 0
## Ridge 1.766060 1.963904 2.037103 2.100236 2.271645 2.561486 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## OLS 6.428505 6.954599 8.645614 8.247932 9.430920 9.798508 0
## PCR 1.308971 1.809206 2.037026 2.088590 2.322411 2.974439 0
## PLS 1.368628 1.612490 1.795787 1.795749 1.852478 2.508862 0
## Ridge 1.921949 2.335287 2.549805 2.577315 2.953836 3.052742 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## OLS 0.1078058 0.2265206 0.3901654 0.3549907 0.4409990 0.5540652 0
## PCR 0.9378243 0.9539646 0.9708617 0.9648029 0.9757082 0.9830365 0
## PLS 0.9469358 0.9715995 0.9745968 0.9724449 0.9783105 0.9868762 0
## Ridge 0.9227000 0.9507495 0.9543645 0.9525356 0.9598810 0.9656493 0
dotplot(resamps)
Answer: PLS achieves the best predictive performance with a test RMSE of 2.369 and cross-validated R^2 of 0.972. PCR is comparable and not significantly different than PLS. Ridge performs worse than PLS and PCR but far better than OLS, which has a test RMSE of 9.056 and a mean R^2 of only 0.355. OLS cannot handle severe mutlicollinearity in these data set.
Answer: I would recommend PLS for this applications due to several factors:
PLS was designed for multicollinear predictors and continuous responses, which is what we see in the IR data set
PLS typically achieves strong prediction with few components and produces an interpretable stable model.
PLS matches or slightly outperforms the other methods on this data set, with consistently low cross-validated RMSE.
Unlike OLS which requires dimension reduction as a pre step, PLS handles collinearity natively, making CV simpler and more relaible with the sample size available.