library(caret)
data(tecator)
library(pls)
library(elasticnet)
library(glmnet)
library(caret)
data(tecator)
# The absorb matrix contains the spectroscopy data
# Perform PCA
pca_result <- prcomp(absorp, center = TRUE, scale. = TRUE)
# Calculate the proportion of variance explained by each PC
variance_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
# Calculate cumulative variance explained
cumulative_variance <- cumsum(variance_explained)
# Find effective dimension at different thresholds
dim_95 <- which(cumulative_variance >= 0.95)[1]
dim_99 <- which(cumulative_variance >= 0.99)[1]
cat("Effective dimension (95% variance):", dim_95, "\n")
## Effective dimension (95% variance): 1
cat("Effective dimension (99% variance):", dim_99, "\n")
## Effective dimension (99% variance): 2
# Visualize with a scree plot
par(mfrow = c(1, 2))
# Scree plot
plot(1:20, variance_explained[1:20], type = "b", 
     xlab = "Principal Component", 
     ylab = "Proportion of Variance Explained",
     main = "Scree Plot")
# Cumulative variance plot
plot(1:20, cumulative_variance[1:20], type = "b",
     xlab = "Number of Components",
     ylab = "Cumulative Variance Explained",
     main = "Cumulative Variance")
abline(h = 0.95, col = "red", lty = 2)
abline(h = 0.99, col = "blue", lty = 2)
legend("bottomright", legend = c("95%", "99%"), 
       col = c("red", "blue"), lty = 2)
# Check dimensions
cat("Dimensions of absorp matrix:\n")
## Dimensions of absorp matrix:
print(dim(absorp))
## [1] 215 100
cat("This means", nrow(absorp), "samples with", ncol(absorp), "wavelength measurements\n\n")
## This means 215 samples with 100 wavelength measurements
cat("Dimensions of endpoints matrix:\n")
## Dimensions of endpoints matrix:
print(dim(endpoints))
## [1] 215   3
cat("This means", nrow(endpoints), "samples with", ncol(endpoints), "chemical composition measurements\n\n")
## This means 215 samples with 3 chemical composition measurements
# View first few rows
cat("First few rows of absorp (spectroscopy data):\n")
## First few rows of absorp (spectroscopy data):
print(head(absorp, 5))
##         [,1]    [,2]    [,3]    [,4]    [,5]    [,6]    [,7]    [,8]    [,9]
## [1,] 2.61776 2.61814 2.61859 2.61912 2.61981 2.62071 2.62186 2.62334 2.62511
## [2,] 2.83454 2.83871 2.84283 2.84705 2.85138 2.85587 2.86060 2.86566 2.87093
## [3,] 2.58284 2.58458 2.58629 2.58808 2.58996 2.59192 2.59401 2.59627 2.59873
## [4,] 2.82286 2.82460 2.82630 2.82814 2.83001 2.83192 2.83392 2.83606 2.83842
## [5,] 2.78813 2.78989 2.79167 2.79350 2.79538 2.79746 2.79984 2.80254 2.80553
##        [,10]   [,11]   [,12]   [,13]   [,14]   [,15]   [,16]   [,17]   [,18]
## [1,] 2.62722 2.62964 2.63245 2.63565 2.63933 2.64353 2.64825 2.65350 2.65937
## [2,] 2.87661 2.88264 2.88898 2.89577 2.90308 2.91097 2.91953 2.92873 2.93863
## [3,] 2.60131 2.60414 2.60714 2.61029 2.61361 2.61714 2.62089 2.62486 2.62909
## [4,] 2.84097 2.84374 2.84664 2.84975 2.85307 2.85661 2.86038 2.86437 2.86860
## [5,] 2.80890 2.81272 2.81704 2.82184 2.82710 2.83294 2.83945 2.84664 2.85458
##        [,19]   [,20]   [,21]   [,22]   [,23]   [,24]   [,25]   [,26]   [,27]
## [1,] 2.66585 2.67281 2.68008 2.68733 2.69427 2.70073 2.70684 2.71281 2.71914
## [2,] 2.94929 2.96072 2.97272 2.98493 2.99690 3.00833 3.01920 3.02990 3.04101
## [3,] 2.63361 2.63835 2.64330 2.64838 2.65354 2.65870 2.66375 2.66880 2.67383
## [4,] 2.87308 2.87789 2.88301 2.88832 2.89374 2.89917 2.90457 2.90991 2.91521
## [5,] 2.86331 2.87280 2.88291 2.89335 2.90374 2.91371 2.92305 2.93187 2.94060
##        [,28]   [,29]   [,30]   [,31]   [,32]   [,33]   [,34]   [,35]   [,36]
## [1,] 2.72628 2.73462 2.74416 2.75466 2.76568 2.77679 2.78790 2.79949 2.81225
## [2,] 3.05345 3.06777 3.08416 3.10221 3.12106 3.13983 3.15810 3.17623 3.19519
## [3,] 2.67892 2.68411 2.68937 2.69470 2.70012 2.70563 2.71141 2.71775 2.72490
## [4,] 2.92043 2.92565 2.93082 2.93604 2.94128 2.94658 2.95202 2.95777 2.96419
## [5,] 2.94986 2.96035 2.97241 2.98606 3.00097 3.01652 3.03220 3.04793 3.06413
##        [,37]   [,38]   [,39]   [,40]   [,41]   [,42]   [,43]   [,44]   [,45]
## [1,] 2.82706 2.84356 2.86106 2.87857 2.89497 2.90924 2.92085 2.93015 2.93846
## [2,] 3.21584 3.23747 3.25889 3.27835 3.29384 3.30362 3.30681 3.30393 3.29700
## [3,] 2.73344 2.74327 2.75433 2.76642 2.77931 2.79272 2.80649 2.82064 2.83541
## [4,] 2.97159 2.98045 2.99090 3.00284 3.01611 3.03048 3.04579 3.06194 3.07889
## [5,] 3.08153 3.10078 3.12185 3.14371 3.16510 3.18470 3.20140 3.21477 3.22544
##        [,46]   [,47]   [,48]   [,49]   [,50]   [,51]   [,52]   [,53]   [,54]
## [1,] 2.94771 2.96019 2.97831 3.00306 3.03506 3.07428 3.11963 3.16868 3.21771
## [2,] 3.28925 3.28409 3.28505 3.29326 3.30923 3.33267 3.36251 3.39661 3.43188
## [3,] 2.85121 2.86872 2.88905 2.91289 2.94088 2.97325 3.00946 3.04780 3.08554
## [4,] 3.09686 3.11629 3.13775 3.16217 3.19068 3.22376 3.26172 3.30379 3.34793
## [5,] 3.23505 3.24586 3.26027 3.28063 3.30889 3.34543 3.39019 3.44198 3.49800
##        [,55]   [,56]   [,57]   [,58]   [,59]   [,60]   [,61]   [,62]   [,63]
## [1,] 3.26254 3.29988 3.32847 3.34899 3.36342 3.37379 3.38152 3.38741 3.39164
## [2,] 3.46492 3.49295 3.51458 3.53004 3.54067 3.54797 3.55306 3.55675 3.55921
## [3,] 3.11947 3.14696 3.16677 3.17938 3.18631 3.18924 3.18950 3.18801 3.18498
## [4,] 3.39093 3.42920 3.45998 3.48227 3.49687 3.50558 3.51026 3.51221 3.51215
## [5,] 3.55407 3.60534 3.64789 3.68011 3.70272 3.71815 3.72863 3.73574 3.74059
##        [,64]   [,65]   [,66]   [,67]   [,68]   [,69]   [,70]   [,71]   [,72]
## [1,] 3.39418 3.39490 3.39366 3.39045 3.38541 3.37869 3.37041 3.36073 3.34979
## [2,] 3.56045 3.56034 3.55876 3.55571 3.55132 3.54585 3.53950 3.53235 3.52442
## [3,] 3.18039 3.17411 3.16611 3.15641 3.14512 3.13241 3.11843 3.10329 3.08714
## [4,] 3.51036 3.50682 3.50140 3.49398 3.48457 3.47333 3.46041 3.44595 3.43005
## [5,] 3.74357 3.74453 3.74336 3.73991 3.73418 3.72638 3.71676 3.70553 3.69289
##        [,73]   [,74]   [,75]   [,76]   [,77]   [,78]   [,79]   [,80]   [,81]
## [1,] 3.33769 3.32443 3.31013 3.29487 3.27891 3.26232 3.24542 3.22828 3.21080
## [2,] 3.51583 3.50668 3.49700 3.48683 3.47626 3.46552 3.45501 3.44481 3.43477
## [3,] 3.07014 3.05237 3.03393 3.01504 2.99569 2.97612 2.95642 2.93660 2.91667
## [4,] 3.41285 3.39450 3.37511 3.35482 3.33376 3.31204 3.28986 3.26730 3.24442
## [5,] 3.67900 3.66396 3.64785 3.63085 3.61305 3.59463 3.57582 3.55695 3.53796
##        [,82]   [,83]   [,84]   [,85]   [,86]   [,87]   [,88]   [,89]   [,90]
## [1,] 3.19287 3.17433 3.15503 3.13475 3.11339 3.09116 3.06850 3.04596 3.02393
## [2,] 3.42465 3.41419 3.40303 3.39082 3.37731 3.36265 3.34745 3.33245 3.31818
## [3,] 2.89655 2.87622 2.85563 2.83474 2.81361 2.79235 2.77113 2.75015 2.72956
## [4,] 3.22117 3.19757 3.17357 3.14915 3.12429 3.09908 3.07366 3.04825 3.02308
## [5,] 3.51880 3.49936 3.47938 3.45869 3.43711 3.41458 3.39129 3.36772 3.34450
##        [,91]   [,92]   [,93]   [,94]   [,95]   [,96]   [,97]   [,98]   [,99]
## [1,] 3.00247 2.98145 2.96072 2.94013 2.91978 2.89966 2.87964 2.85960 2.83940
## [2,] 3.30473 3.29186 3.27921 3.26655 3.25369 3.24045 3.22659 3.21181 3.19600
## [3,] 2.70934 2.68951 2.67009 2.65112 2.63262 2.61461 2.59718 2.58034 2.56404
## [4,] 2.99820 2.97367 2.94951 2.92576 2.90251 2.87988 2.85794 2.83672 2.81617
## [5,] 3.32201 3.30025 3.27907 3.25831 3.23784 3.21765 3.19766 3.17770 3.15770
##       [,100]
## [1,] 2.81920
## [2,] 3.17942
## [3,] 2.54816
## [4,] 2.79622
## [5,] 3.13753
cat("\n")
cat("First few rows of endpoints (moisture, fat, protein):\n")
## First few rows of endpoints (moisture, fat, protein):
print(head(endpoints, 5))
##      [,1] [,2] [,3]
## [1,] 60.5 22.5 16.7
## [2,] 46.0 40.1 13.5
## [3,] 71.0  8.4 20.5
## [4,] 72.8  5.9 20.7
## [5,] 58.3 25.5 15.5
cat("\n")
# Look at the endpoints more closely
cat("Summary statistics for endpoints:\n")
## Summary statistics for endpoints:
cat("(Columns are: moisture, fat, protein)\n")
## (Columns are: moisture, fat, protein)
print(summary(endpoints))
##        V1              V2              V3       
##  Min.   :39.30   Min.   : 0.90   Min.   :11.00  
##  1st Qu.:55.55   1st Qu.: 7.30   1st Qu.:15.35  
##  Median :65.70   Median :14.00   Median :18.70  
##  Mean   :63.20   Mean   :18.14   Mean   :17.68  
##  3rd Qu.:71.80   3rd Qu.:28.00   3rd Qu.:20.10  
##  Max.   :76.60   Max.   :49.10   Max.   :21.80
cat("\n")
# Additional useful information
cat("Range of values for each endpoint:\n")
## Range of values for each endpoint:
cat("Moisture: ", range(endpoints[,1]), "\n")
## Moisture:  39.3 76.6
cat("Fat: ", range(endpoints[,2]), "\n")
## Fat:  0.9 49.1
cat("Protein: ", range(endpoints[,3]), "\n")
## Protein:  11 21.8
# Prepare the data
# Use moisture (column 1) as the response for this example
X <- absorp
y <- endpoints[, 1]  # moisture
# Prepare the data with column names from the start
X <- as.data.frame(absorp)
colnames(X) <- paste0("V", 1:ncol(X))
y <- endpoints[, 1]  # moisture
# Create training and test sets (80/20 split)
set.seed(103025)
training_indices <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[training_indices, ]
y_train <- y[training_indices]
X_test <- X[-training_indices, ]
y_test <- y[-training_indices]
# Set up cross-validation
ctrl <- trainControl(method = "cv", number = 10)
# 2. PRINCIPAL COMPONENT REGRESSION (PCR)
pcr_model <- train(X_train, y_train,
                   method = "pcr",
                   trControl = ctrl,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)
cat("Optimal number of components for PCR:", pcr_model$bestTune$ncomp, "\n")
## Optimal number of components for PCR: 17
plot(pcr_model, main = "PCR Model")
# 3. PARTIAL LEAST SQUARES (PLS)
pls_model <- train(X_train, y_train,
                   method = "pls",
                   trControl = ctrl,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)
cat("Optimal number of components for PLS:", pls_model$bestTune$ncomp, "\n")
## Optimal number of components for PLS: 18
plot(pls_model, main = "PLS Model")
# 4. RIDGE REGRESSION
ridge_model <- train(X_train, y_train,
                     method = "ridge",
                     trControl = ctrl,
                     preProcess = c("center", "scale"),
                     tuneGrid = expand.grid(lambda = seq(0, 0.1, length = 20)))
cat("Optimal lambda for Ridge:", ridge_model$bestTune$lambda, "\n")
## Optimal lambda for Ridge: 0.005263158
plot(ridge_model, main = "Ridge Regression")
# 5. LASSO
lasso_model <- train(X_train, y_train,
                     method = "lasso",
                     trControl = ctrl,
                     preProcess = c("center", "scale"),
                     tuneGrid = expand.grid(fraction = seq(0.05, 1, length = 20)))
cat("Optimal fraction for Lasso:", lasso_model$bestTune$fraction, "\n")
## Optimal fraction for Lasso: 0.05
plot(lasso_model, main = "Lasso")
# 6. ELASTIC NET
enet_model <- train(X_train, y_train,
                    method = "enet",
                    trControl = ctrl,
                    preProcess = c("center", "scale"),
                    tuneGrid = expand.grid(
                      lambda = seq(0, 0.01, length = 10),
                      fraction = seq(0.05, 1, length = 10)))
cat("Optimal parameters for Elastic Net:\n")
## Optimal parameters for Elastic Net:
print(enet_model$bestTune)
##   fraction lambda
## 1     0.05      0
plot(enet_model, main = "Elastic Net")
# Compare all models
results <- resamples(list(
  PCR = pcr_model,
  PLS = pls_model,
  Ridge = ridge_model,
  Lasso = lasso_model,
  ElasticNet = enet_model
))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: PCR, PLS, Ridge, Lasso, ElasticNet 
## Number of resamples: 10 
## 
## MAE 
##                 Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## PCR        0.9907898 1.655433 1.750641 1.706735 1.894032 1.978650    0
## PLS        0.9950547 1.237408 1.525913 1.479060 1.694800 1.924520    0
## Ridge      1.8510309 2.082544 2.527221 2.426472 2.711722 3.042205    0
## Lasso      1.0783121 1.359751 1.513780 1.577265 1.750224 2.293645    0
## ElasticNet 0.9704648 1.239734 1.417753 1.530344 1.823465 2.231281    0
## 
## RMSE 
##                Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## PCR        1.131044 2.077892 2.260061 2.261940 2.400253 3.121051    0
## PLS        1.170039 1.801339 2.067001 2.158819 2.704045 3.102498    0
## Ridge      2.138349 2.405387 3.102519 2.943903 3.323940 3.869651    0
## Lasso      1.542059 1.729813 2.335174 2.388967 2.690583 4.055471    0
## ElasticNet 1.279046 1.592042 2.146024 2.201900 2.835488 3.309855    0
## 
## Rsquared 
##                 Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## PCR        0.9108570 0.9472697 0.9562315 0.9558933 0.9647052 0.9871226    0
## PLS        0.8513291 0.9500963 0.9638373 0.9537026 0.9791718 0.9838614    0
## Ridge      0.8470151 0.9192795 0.9250380 0.9284689 0.9432627 0.9839045    0
## Lasso      0.8525562 0.9417862 0.9528760 0.9481198 0.9711296 0.9789369    0
## ElasticNet 0.8992566 0.9180510 0.9615250 0.9514368 0.9818809 0.9855794    0
dotplot(results)
# Test set performance
models <- list(
  PCR = pcr_model,
  PLS = pls_model,
  Ridge = ridge_model,
  Lasso = lasso_model,
  ElasticNet = enet_model
)
test_results <- data.frame(
  Model = names(models),
  RMSE = numeric(length(models)),
  Rsquared = numeric(length(models))
)
for (i in seq_along(models)) {
  pred <- predict(models[[i]], X_test)
  test_results$RMSE[i] <- sqrt(mean((pred - y_test)^2))
  test_results$Rsquared[i] <- cor(pred, y_test)^2
}
print("Test Set Performance:")
## [1] "Test Set Performance:"
print(test_results)
##        Model     RMSE  Rsquared
## 1        PCR 1.959721 0.9590600
## 2        PLS 1.514225 0.9752885
## 3      Ridge 2.689994 0.9215908
## 4      Lasso 1.456885 0.9772255
## 5 ElasticNet 1.456885 0.9772255
OPTIMAL TUNING PARAMETERS
## PCR (Principal Component Regression):
##   Optimal number of components: 17
##   Cross-validated RMSE: 2.26194
## PLS (Partial Least Squares):
##   Optimal number of components: 18
##   Cross-validated RMSE: 2.158819
## Ridge Regression:
##   Optimal lambda: 0.005263158
##   Cross-validated RMSE: 2.943903
## Lasso:
##   Optimal fraction: 0.05
##   Cross-validated RMSE: 2.388967
## Elastic Net:
##   Optimal lambda: 0
##   Optimal fraction: 0.05
##   Cross-validated RMSE: 2.2019
## ========================================
## BEST MODEL (by CV RMSE):
## ========================================
## The model considered the best model is the one with the lowest RMSE score: PLS with a cross-validated RMSE of 2.158819
MODEL PERFORMANCE ANALYSIS
## All five regression models were evaluated using 10-fold cross-validation
## to predict moisture content from spectroscopy data. The results show
## clear performance differences:
## Cross-Validated RMSE:
##   1. PLS:         2.159  (BEST)
##   2. Elastic Net: 2.202  (+2.0%)
##   3. PCR:         2.262  (+4.8%)
##   4. Lasso:       2.389  (+10.7%)
##   5. Ridge:       2.944  (+36.4%)
## Key Findings:
## PLS emerged as the best model with the lowest prediction error (RMSE = 2.159).
## It uses 18 components to capture the underlying patterns in the data.
## Elastic Net performed nearly as well (only 2% worse), making it a viable
## alternative to PLS. The small difference suggests both methods effectively
## handle the high correlation in spectroscopy data.
## PCR was competitive but slightly worse than PLS (4.8% higher RMSE). Both
## methods use similar numbers of components (17-18), but PLS has an advantage
## because it considers the response variable when creating components, while
## PCR does not.
## Lasso performed moderately well but was 10.7% worse than PLS. Its aggressive
## variable selection approach may have removed useful predictors.
## Ridge Regression performed worst, with RMSE 36% higher than PLS. Ridge keeps
## all 100 highly correlated predictors, which hurts performance compared to
## dimension reduction methods like PLS and PCR that reduce to ~18 components.
## Recommendation: Use PLS with 18 components for predicting moisture content.
## It provides the best predictive performance and effectively handles the
## multicollinearity inherent in near-infrared spectroscopy data.
## ========================================
## PREDICTING FAT CONTENT
## Training models to predict fat content...
## 
## === FAT PREDICTION RESULTS ===
## Cross-Validated RMSE (Best to Worst):
##   1. Lasso        2.129 <- BEST
##   2. ElasticNet   2.308 (+8.4%)
##   3. PLS          2.322 (+9.1%)
##   4. PCR          2.365 (+11.1%)
##   5. Ridge        2.632 (+23.6%)
## 
## Optimal Tuning Parameters:
##   PCR components: 17
##   PLS components: 14
##   Ridge lambda: 0
##   Lasso fraction: 0.25
##   Elastic Net lambda: 0 , fraction: 0.2611111
## Test Set RMSE:
##   1. PLS          2.197 <- BEST
##   2. PCR          2.326
##   3. Lasso        5.601
##   4. ElasticNet   5.669
##   5. Ridge        6.534
## 
## Recommendation for Fat Prediction:
## Use Lasso for predicting fat content
## CV RMSE: 2.129
## Test RMSE: 5.601
Developing a model to predict permeability could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug. Molecular fingerprints are binary indicators that represent the presence (1) or absence (0) of specific chemical substructures in a molecule. They are commonly used in:
library(AppliedPredictiveModeling)
data(permeability)
## EXPLORATORY DATA ANALYSIS
## Permeability Dataset
## === DATASET DIMENSIONS ===
## Fingerprints matrix dimensions: 165 1107
##   - Number of compounds: 165
##   - Number of molecular descriptors: 1107
## Permeability vector length: 165
## === PERMEABILITY (RESPONSE VARIABLE) ===
## Summary statistics:
##   permeability  
##  Min.   : 0.06  
##  1st Qu.: 1.55  
##  Median : 4.91  
##  Mean   :12.24  
##  3rd Qu.:15.47  
##  Max.   :55.60
## 
## Standard deviation: 15.579
## Range: 55.54
# Check dimensions
cat("Dimensions of fingerprint matrix:\n")
## Dimensions of fingerprint matrix:
print(dim(fingerprints))
## [1]  165 1107
cat("This means", nrow(fingerprints), "samples with", ncol(fingerprints), "predictors\n")
## This means 165 samples with 1107 predictors
Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?
library(caret)
library(AppliedPredictiveModeling)
data(permeability)
nzv <- nearZeroVar(fingerprints)
fingerprints_filtered <- fingerprints[, -nzv]
ncol(fingerprints_filtered)  
## [1] 388
cat("The number of predictors left for modeling:", ncol(fingerprints_filtered))
## The number of predictors left for modeling: 388
Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?
library(caret)
library(AppliedPredictiveModeling)
library(caret)
library(pls)        # for pcr
library(elasticnet) # for ridge/lasso/enet
library(kknn)
## 
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
## 
##     contr.dummy
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
set.seed(103025)
ctrl <- trainControl(method = "cv", number = 10)
# Load data
data(permeability)
# Filter near-zero variance predictors
nzv <- nearZeroVar(fingerprints)
fingerprints_filtered <- fingerprints[, -nzv]
cat("Predictors after filtering:", ncol(fingerprints_filtered), "\n\n")
## Predictors after filtering: 388
# Split data into training and test sets (75/25)
set.seed(103125)
trainIndex <- createDataPartition(permeability, p = 0.75, list = FALSE)
trainX <- fingerprints_filtered[trainIndex, ]
trainY <- permeability[trainIndex]
testX <- fingerprints_filtered[-trainIndex, ]
testY <- permeability[-trainIndex]
cat("Training set:", nrow(trainX), "samples\n")
## Training set: 125 samples
cat("Test set:", nrow(testX), "samples\n\n")
## Test set: 40 samples
# Set up 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)
# Tune PLS model (preprocessing: center and scale)
set.seed(103025)
plsTune <- train(
  x = trainX,
  y = trainY,
  method = "pls",
  tuneLength = 20,
  trControl = ctrl,
  preProc = c("center", "scale")
)
# Get results
bestTune <- plsTune$bestTune
bestResults <- plsTune$results[plsTune$results$ncomp == bestTune$ncomp, ]
cat("1. Optimal number of latent variables:", bestTune$ncomp, "\n\n")
## 1. Optimal number of latent variables: 7
cat("2. Resampled R²:", round(bestResults$Rsquared, 4), "\n\n")
## 2. Resampled R²: 0.5212
cat("======================================\n")
## ======================================
What is the test set estimate of R2?
# Predict the response for the test set
testPred <- predict(plsTune, newdata = testX)
# Calculate test set R²
testR2 <- cor(testPred, testY)^2
# ANSWER
cat("Test set estimate of R²:", round(testR2, 4), "\n\n")
## Test set estimate of R²: 0.4724
library(caret)
library(pls)        # needed by method = "pcr"
library(glmnet)     # ridge/lasso/elastic net via glmnet
library(kknn)
library(kernlab)
# Reuse trainX, trainY f
set.seed(103025)
ctrl <- trainControl(method = "cv", number = 10)
# PCR
pcr_model <- train(trainX, trainY,
                   method = "pcr",
                   trControl = ctrl,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)
# Ridge (alpha = 0)
ridge_model <- train(trainX, trainY,
                     method = "glmnet",
                     trControl = ctrl,
                     preProcess = c("center", "scale"),
                     tuneGrid = expand.grid(alpha = 0,
                                            lambda = 10^seq(-4, 1, length = 50)))
# Lasso (alpha = 1)
lasso_model <- train(trainX, trainY,
                     method = "glmnet",
                     trControl = ctrl,
                     preProcess = c("center", "scale"),
                     tuneGrid = expand.grid(alpha = 1,
                                            lambda = 10^seq(-4, 1, length = 50)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Elastic Net (search alpha in [0,1] and lambda)
enet_model <- train(trainX, trainY,
                    method = "glmnet",
                    trControl = ctrl,
                    preProcess = c("center", "scale"),
                    tuneLength = 20)  # lets caret pick alpha & lambda
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# KNN
knn_model <- train(trainX, trainY,
                   method = "knn",
                   trControl = ctrl,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)
# SVM (RBF)
svm_model <- train(trainX, trainY,
                   method = "svmRadial",
                   trControl = ctrl,
                   preProcess = c("center", "scale"),
                   tuneLength = 10)
# Compare with your existing PLS model object: plsTune
results <- resamples(list(
  PLS = plsTune,
  PCR = pcr_model,
  Ridge = ridge_model,
  Lasso = lasso_model,
  ElasticNet = enet_model,
  KNN = knn_model,
  SVM = svm_model
))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: PLS, PCR, Ridge, Lasso, ElasticNet, KNN, SVM 
## Number of resamples: 10 
## 
## MAE 
##                Min.  1st Qu.   Median     Mean   3rd Qu.      Max. NA's
## PLS        5.855166 7.671185 8.374878 8.819935 10.395264 12.232113    0
## PCR        6.478052 7.349715 8.886026 8.936486  9.847171 12.295670    0
## Ridge      6.067736 7.319335 8.336143 8.605020  9.342998 12.349564    0
## Lasso      4.168337 6.448881 7.736902 8.105491  9.835276 12.484209    0
## ElasticNet 6.077117 6.675149 7.634973 7.705983  8.601024  9.607781    0
## KNN        4.471095 6.082174 7.956981 8.385917 10.117267 13.761968    0
## SVM        5.538656 6.595833 7.517188 7.563386  8.156856 10.131174    0
## 
## RMSE 
##                Min.   1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## PLS        7.129567  9.433234 11.39016 11.29200 12.94590 16.54612    0
## PCR        8.037053 10.161534 11.06334 11.96948 14.23587 16.60493    0
## Ridge      8.283846  9.164638 12.18091 11.62923 12.77967 15.96915    0
## Lasso      5.554426 10.672216 11.81544 11.36815 13.14913 16.33687    0
## ElasticNet 8.530177  9.119431 11.03678 10.87942 12.14213 14.26285    0
## KNN        6.665764  7.582388 10.41034 11.73250 15.25848 18.84733    0
## SVM        7.233803  9.722672 10.72060 10.43967 11.16724 12.69882    0
## 
## Rsquared 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## PLS        0.080111817 0.4387388 0.5599021 0.5211719 0.6322325 0.8229828    0
## PCR        0.141356694 0.3518307 0.3820989 0.4562220 0.6234931 0.8204790    0
## Ridge      0.119162090 0.2991567 0.5790568 0.5017807 0.6534809 0.8146635    0
## Lasso      0.131695219 0.2994707 0.4844250 0.5243114 0.7702345 0.9220318    0
## ElasticNet 0.302124171 0.3836284 0.5328731 0.5723251 0.7386761 0.8775188    0
## KNN        0.009837283 0.1747413 0.4901745 0.4648761 0.7428649 0.9107852    0
## SVM        0.190436679 0.5147241 0.5918350 0.5851556 0.6655141 0.8661903    0
dotplot(results)
## Question 6.2e Extract the Best Model from Comparison
# Extract best model from comparison
cv_results <- data.frame(
  Model = c("PLS", "PCR", "Ridge", "Lasso", "ElasticNet", "KNN", "SVM"),
  CV_RMSE = c(
    min(plsTune$results$RMSE),
    min(pcr_model$results$RMSE),
    min(ridge_model$results$RMSE),
    min(lasso_model$results$RMSE),
    min(enet_model$results$RMSE),
    min(knn_model$results$RMSE),
    min(svm_model$results$RMSE)
  )
)
cv_results <- cv_results[order(cv_results$CV_RMSE), ]
cat("\n=== MODEL RANKINGS (by Cross-Validated RMSE) ===\n\n")
## 
## === MODEL RANKINGS (by Cross-Validated RMSE) ===
print(cv_results)
##        Model  CV_RMSE
## 7        SVM 10.43967
## 5 ElasticNet 10.87942
## 1        PLS 11.29200
## 4      Lasso 11.36815
## 3      Ridge 11.62923
## 6        KNN 11.73250
## 2        PCR 11.96948
cat("\nBest Model:", cv_results$Model[1], "\n")
## 
## Best Model: SVM
cat("Answer to (e): ", 
    ifelse(cv_results$Model[1] == "PLS", 
           "No other model performs better than PLS",
           paste(cv_results$Model[1], "performs better than PLS")), 
    "\n")
## Answer to (e):  SVM performs better than PLS
cat("Test R²:", round(testR2, 4), "\n\n")
## Test R²: 0.4724
if (testR2 >= 0.70) {
  cat("YES - Strong performance. Could replace lab experiments.\n")
} else if (testR2 >= 0.50) {
  cat("PARTIAL - Use for screening only. Confirm with lab.\n")
  cat("\nRationale: Model explains only ~", round(testR2*100, 0), 
      "% of variance.\n", sep="")
  cat("Too much uncertainty for full replacement, but useful for\n")
  cat("initial filtering before expensive lab tests.\n")
} else {
  cat("NO - Too weak to replace lab experiments.\n")
}
## NO - Too weak to replace lab experiments.