Linear Regression and Its’s Cousins

1 Question 6.1a: Load the data

library(caret)
data(tecator)
library(pls)
library(elasticnet)
library(glmnet)

2 Question 6.1b PCA Analysis

library(caret)
data(tecator)

# The absorb matrix contains the spectroscopy data
# Perform PCA
pca_result <- prcomp(absorp, center = TRUE, scale. = TRUE)

# Calculate the proportion of variance explained by each PC
variance_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)

# Calculate cumulative variance explained
cumulative_variance <- cumsum(variance_explained)

# Find effective dimension at different thresholds
dim_95 <- which(cumulative_variance >= 0.95)[1]
dim_99 <- which(cumulative_variance >= 0.99)[1]

cat("Effective dimension (95% variance):", dim_95, "\n")

## Effective dimension (95% variance): 1

cat("Effective dimension (99% variance):", dim_99, "\n")

## Effective dimension (99% variance): 2

# Visualize with a scree plot
par(mfrow = c(1, 2))

# Scree plot
plot(1:20, variance_explained[1:20], type = "b", 
     xlab = "Principal Component", 
     ylab = "Proportion of Variance Explained",
     main = "Scree Plot")

# Cumulative variance plot
plot(1:20, cumulative_variance[1:20], type = "b",
     xlab = "Number of Components",
     ylab = "Cumulative Variance Explained",
     main = "Cumulative Variance")
abline(h = 0.95, col = "red", lty = 2)
abline(h = 0.99, col = "blue", lty = 2)
legend("bottomright", legend = c("95%", "99%"), 
       col = c("red", "blue"), lty = 2)

3 Question 6.1c Explore the Datasset

# Check dimensions
cat("Dimensions of absorp matrix:\n")

## Dimensions of absorp matrix:

print(dim(absorp))

## [1] 215 100

cat("This means", nrow(absorp), "samples with", ncol(absorp), "wavelength measurements\n\n")

## This means 215 samples with 100 wavelength measurements

cat("Dimensions of endpoints matrix:\n")

## Dimensions of endpoints matrix:

print(dim(endpoints))

## [1] 215   3

cat("This means", nrow(endpoints), "samples with", ncol(endpoints), "chemical composition measurements\n\n")

## This means 215 samples with 3 chemical composition measurements

# View first few rows
cat("First few rows of absorp (spectroscopy data):\n")

## First few rows of absorp (spectroscopy data):

print(head(absorp, 5))

##         [,1]    [,2]    [,3]    [,4]    [,5]    [,6]    [,7]    [,8]    [,9]
## [1,] 2.61776 2.61814 2.61859 2.61912 2.61981 2.62071 2.62186 2.62334 2.62511
## [2,] 2.83454 2.83871 2.84283 2.84705 2.85138 2.85587 2.86060 2.86566 2.87093
## [3,] 2.58284 2.58458 2.58629 2.58808 2.58996 2.59192 2.59401 2.59627 2.59873
## [4,] 2.82286 2.82460 2.82630 2.82814 2.83001 2.83192 2.83392 2.83606 2.83842
## [5,] 2.78813 2.78989 2.79167 2.79350 2.79538 2.79746 2.79984 2.80254 2.80553
##        [,10]   [,11]   [,12]   [,13]   [,14]   [,15]   [,16]   [,17]   [,18]
## [1,] 2.62722 2.62964 2.63245 2.63565 2.63933 2.64353 2.64825 2.65350 2.65937
## [2,] 2.87661 2.88264 2.88898 2.89577 2.90308 2.91097 2.91953 2.92873 2.93863
## [3,] 2.60131 2.60414 2.60714 2.61029 2.61361 2.61714 2.62089 2.62486 2.62909
## [4,] 2.84097 2.84374 2.84664 2.84975 2.85307 2.85661 2.86038 2.86437 2.86860
## [5,] 2.80890 2.81272 2.81704 2.82184 2.82710 2.83294 2.83945 2.84664 2.85458
##        [,19]   [,20]   [,21]   [,22]   [,23]   [,24]   [,25]   [,26]   [,27]
## [1,] 2.66585 2.67281 2.68008 2.68733 2.69427 2.70073 2.70684 2.71281 2.71914
## [2,] 2.94929 2.96072 2.97272 2.98493 2.99690 3.00833 3.01920 3.02990 3.04101
## [3,] 2.63361 2.63835 2.64330 2.64838 2.65354 2.65870 2.66375 2.66880 2.67383
## [4,] 2.87308 2.87789 2.88301 2.88832 2.89374 2.89917 2.90457 2.90991 2.91521
## [5,] 2.86331 2.87280 2.88291 2.89335 2.90374 2.91371 2.92305 2.93187 2.94060
##        [,28]   [,29]   [,30]   [,31]   [,32]   [,33]   [,34]   [,35]   [,36]
## [1,] 2.72628 2.73462 2.74416 2.75466 2.76568 2.77679 2.78790 2.79949 2.81225
## [2,] 3.05345 3.06777 3.08416 3.10221 3.12106 3.13983 3.15810 3.17623 3.19519
## [3,] 2.67892 2.68411 2.68937 2.69470 2.70012 2.70563 2.71141 2.71775 2.72490
## [4,] 2.92043 2.92565 2.93082 2.93604 2.94128 2.94658 2.95202 2.95777 2.96419
## [5,] 2.94986 2.96035 2.97241 2.98606 3.00097 3.01652 3.03220 3.04793 3.06413
##        [,37]   [,38]   [,39]   [,40]   [,41]   [,42]   [,43]   [,44]   [,45]
## [1,] 2.82706 2.84356 2.86106 2.87857 2.89497 2.90924 2.92085 2.93015 2.93846
## [2,] 3.21584 3.23747 3.25889 3.27835 3.29384 3.30362 3.30681 3.30393 3.29700
## [3,] 2.73344 2.74327 2.75433 2.76642 2.77931 2.79272 2.80649 2.82064 2.83541
## [4,] 2.97159 2.98045 2.99090 3.00284 3.01611 3.03048 3.04579 3.06194 3.07889
## [5,] 3.08153 3.10078 3.12185 3.14371 3.16510 3.18470 3.20140 3.21477 3.22544
##        [,46]   [,47]   [,48]   [,49]   [,50]   [,51]   [,52]   [,53]   [,54]
## [1,] 2.94771 2.96019 2.97831 3.00306 3.03506 3.07428 3.11963 3.16868 3.21771
## [2,] 3.28925 3.28409 3.28505 3.29326 3.30923 3.33267 3.36251 3.39661 3.43188
## [3,] 2.85121 2.86872 2.88905 2.91289 2.94088 2.97325 3.00946 3.04780 3.08554
## [4,] 3.09686 3.11629 3.13775 3.16217 3.19068 3.22376 3.26172 3.30379 3.34793
## [5,] 3.23505 3.24586 3.26027 3.28063 3.30889 3.34543 3.39019 3.44198 3.49800
##        [,55]   [,56]   [,57]   [,58]   [,59]   [,60]   [,61]   [,62]   [,63]
## [1,] 3.26254 3.29988 3.32847 3.34899 3.36342 3.37379 3.38152 3.38741 3.39164
## [2,] 3.46492 3.49295 3.51458 3.53004 3.54067 3.54797 3.55306 3.55675 3.55921
## [3,] 3.11947 3.14696 3.16677 3.17938 3.18631 3.18924 3.18950 3.18801 3.18498
## [4,] 3.39093 3.42920 3.45998 3.48227 3.49687 3.50558 3.51026 3.51221 3.51215
## [5,] 3.55407 3.60534 3.64789 3.68011 3.70272 3.71815 3.72863 3.73574 3.74059
##        [,64]   [,65]   [,66]   [,67]   [,68]   [,69]   [,70]   [,71]   [,72]
## [1,] 3.39418 3.39490 3.39366 3.39045 3.38541 3.37869 3.37041 3.36073 3.34979
## [2,] 3.56045 3.56034 3.55876 3.55571 3.55132 3.54585 3.53950 3.53235 3.52442
## [3,] 3.18039 3.17411 3.16611 3.15641 3.14512 3.13241 3.11843 3.10329 3.08714
## [4,] 3.51036 3.50682 3.50140 3.49398 3.48457 3.47333 3.46041 3.44595 3.43005
## [5,] 3.74357 3.74453 3.74336 3.73991 3.73418 3.72638 3.71676 3.70553 3.69289
##        [,73]   [,74]   [,75]   [,76]   [,77]   [,78]   [,79]   [,80]   [,81]
## [1,] 3.33769 3.32443 3.31013 3.29487 3.27891 3.26232 3.24542 3.22828 3.21080
## [2,] 3.51583 3.50668 3.49700 3.48683 3.47626 3.46552 3.45501 3.44481 3.43477
## [3,] 3.07014 3.05237 3.03393 3.01504 2.99569 2.97612 2.95642 2.93660 2.91667
## [4,] 3.41285 3.39450 3.37511 3.35482 3.33376 3.31204 3.28986 3.26730 3.24442
## [5,] 3.67900 3.66396 3.64785 3.63085 3.61305 3.59463 3.57582 3.55695 3.53796
##        [,82]   [,83]   [,84]   [,85]   [,86]   [,87]   [,88]   [,89]   [,90]
## [1,] 3.19287 3.17433 3.15503 3.13475 3.11339 3.09116 3.06850 3.04596 3.02393
## [2,] 3.42465 3.41419 3.40303 3.39082 3.37731 3.36265 3.34745 3.33245 3.31818
## [3,] 2.89655 2.87622 2.85563 2.83474 2.81361 2.79235 2.77113 2.75015 2.72956
## [4,] 3.22117 3.19757 3.17357 3.14915 3.12429 3.09908 3.07366 3.04825 3.02308
## [5,] 3.51880 3.49936 3.47938 3.45869 3.43711 3.41458 3.39129 3.36772 3.34450
##        [,91]   [,92]   [,93]   [,94]   [,95]   [,96]   [,97]   [,98]   [,99]
## [1,] 3.00247 2.98145 2.96072 2.94013 2.91978 2.89966 2.87964 2.85960 2.83940
## [2,] 3.30473 3.29186 3.27921 3.26655 3.25369 3.24045 3.22659 3.21181 3.19600
## [3,] 2.70934 2.68951 2.67009 2.65112 2.63262 2.61461 2.59718 2.58034 2.56404
## [4,] 2.99820 2.97367 2.94951 2.92576 2.90251 2.87988 2.85794 2.83672 2.81617
## [5,] 3.32201 3.30025 3.27907 3.25831 3.23784 3.21765 3.19766 3.17770 3.15770
##       [,100]
## [1,] 2.81920
## [2,] 3.17942
## [3,] 2.54816
## [4,] 2.79622
## [5,] 3.13753

cat("\n")

cat("First few rows of endpoints (moisture, fat, protein):\n")

## First few rows of endpoints (moisture, fat, protein):

print(head(endpoints, 5))

##      [,1] [,2] [,3]
## [1,] 60.5 22.5 16.7
## [2,] 46.0 40.1 13.5
## [3,] 71.0  8.4 20.5
## [4,] 72.8  5.9 20.7
## [5,] 58.3 25.5 15.5

cat("\n")

# Look at the endpoints more closely
cat("Summary statistics for endpoints:\n")

## Summary statistics for endpoints:

cat("(Columns are: moisture, fat, protein)\n")

## (Columns are: moisture, fat, protein)

print(summary(endpoints))

##        V1              V2              V3       
##  Min.   :39.30   Min.   : 0.90   Min.   :11.00  
##  1st Qu.:55.55   1st Qu.: 7.30   1st Qu.:15.35  
##  Median :65.70   Median :14.00   Median :18.70  
##  Mean   :63.20   Mean   :18.14   Mean   :17.68  
##  3rd Qu.:71.80   3rd Qu.:28.00   3rd Qu.:20.10  
##  Max.   :76.60   Max.   :49.10   Max.   :21.80

cat("\n")

# Additional useful information
cat("Range of values for each endpoint:\n")

## Range of values for each endpoint:

cat("Moisture: ", range(endpoints[,1]), "\n")

## Moisture:  39.3 76.6

cat("Fat: ", range(endpoints[,2]), "\n")

## Fat:  0.9 49.1

cat("Protein: ", range(endpoints[,3]), "\n")

## Protein:  11 21.8

4 Question 6.1c(i) Develop Models

# Prepare the data
# Use moisture (column 1) as the response for this example
X <- absorp
y <- endpoints[, 1]  # moisture

# Prepare the data with column names from the start
X <- as.data.frame(absorp)
colnames(X) <- paste0("V", 1:ncol(X))
y <- endpoints[, 1]  # moisture

# Create training and test sets (80/20 split)
set.seed(103025)
training_indices <- createDataPartition(y, p = 0.8, list = FALSE)

X_train <- X[training_indices, ]
y_train <- y[training_indices]
X_test <- X[-training_indices, ]
y_test <- y[-training_indices]

# Set up cross-validation
ctrl <- trainControl(method = "cv", number = 10)


# 2. PRINCIPAL COMPONENT REGRESSION (PCR)
pcr_model <- train(X_train, y_train,
                   method = "pcr",
                   trControl = ctrl,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)

cat("Optimal number of components for PCR:", pcr_model$bestTune$ncomp, "\n")

## Optimal number of components for PCR: 17

plot(pcr_model, main = "PCR Model")

# 3. PARTIAL LEAST SQUARES (PLS)
pls_model <- train(X_train, y_train,
                   method = "pls",
                   trControl = ctrl,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)

cat("Optimal number of components for PLS:", pls_model$bestTune$ncomp, "\n")

## Optimal number of components for PLS: 18

plot(pls_model, main = "PLS Model")

# 4. RIDGE REGRESSION
ridge_model <- train(X_train, y_train,
                     method = "ridge",
                     trControl = ctrl,
                     preProcess = c("center", "scale"),
                     tuneGrid = expand.grid(lambda = seq(0, 0.1, length = 20)))

cat("Optimal lambda for Ridge:", ridge_model$bestTune$lambda, "\n")

## Optimal lambda for Ridge: 0.005263158

plot(ridge_model, main = "Ridge Regression")

# 5. LASSO
lasso_model <- train(X_train, y_train,
                     method = "lasso",
                     trControl = ctrl,
                     preProcess = c("center", "scale"),
                     tuneGrid = expand.grid(fraction = seq(0.05, 1, length = 20)))

cat("Optimal fraction for Lasso:", lasso_model$bestTune$fraction, "\n")

## Optimal fraction for Lasso: 0.05

plot(lasso_model, main = "Lasso")

# 6. ELASTIC NET
enet_model <- train(X_train, y_train,
                    method = "enet",
                    trControl = ctrl,
                    preProcess = c("center", "scale"),
                    tuneGrid = expand.grid(
                      lambda = seq(0, 0.01, length = 10),
                      fraction = seq(0.05, 1, length = 10)))

cat("Optimal parameters for Elastic Net:\n")

## Optimal parameters for Elastic Net:

print(enet_model$bestTune)

##   fraction lambda
## 1     0.05      0

plot(enet_model, main = "Elastic Net")

# Compare all models
results <- resamples(list(
  PCR = pcr_model,
  PLS = pls_model,
  Ridge = ridge_model,
  Lasso = lasso_model,
  ElasticNet = enet_model
))

summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: PCR, PLS, Ridge, Lasso, ElasticNet 
## Number of resamples: 10 
## 
## MAE 
##                 Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## PCR        0.9907898 1.655433 1.750641 1.706735 1.894032 1.978650    0
## PLS        0.9950547 1.237408 1.525913 1.479060 1.694800 1.924520    0
## Ridge      1.8510309 2.082544 2.527221 2.426472 2.711722 3.042205    0
## Lasso      1.0783121 1.359751 1.513780 1.577265 1.750224 2.293645    0
## ElasticNet 0.9704648 1.239734 1.417753 1.530344 1.823465 2.231281    0
## 
## RMSE 
##                Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## PCR        1.131044 2.077892 2.260061 2.261940 2.400253 3.121051    0
## PLS        1.170039 1.801339 2.067001 2.158819 2.704045 3.102498    0
## Ridge      2.138349 2.405387 3.102519 2.943903 3.323940 3.869651    0
## Lasso      1.542059 1.729813 2.335174 2.388967 2.690583 4.055471    0
## ElasticNet 1.279046 1.592042 2.146024 2.201900 2.835488 3.309855    0
## 
## Rsquared 
##                 Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## PCR        0.9108570 0.9472697 0.9562315 0.9558933 0.9647052 0.9871226    0
## PLS        0.8513291 0.9500963 0.9638373 0.9537026 0.9791718 0.9838614    0
## Ridge      0.8470151 0.9192795 0.9250380 0.9284689 0.9432627 0.9839045    0
## Lasso      0.8525562 0.9417862 0.9528760 0.9481198 0.9711296 0.9789369    0
## ElasticNet 0.8992566 0.9180510 0.9615250 0.9514368 0.9818809 0.9855794    0

dotplot(results)

# Test set performance
models <- list(
  PCR = pcr_model,
  PLS = pls_model,
  Ridge = ridge_model,
  Lasso = lasso_model,
  ElasticNet = enet_model
)

test_results <- data.frame(
  Model = names(models),
  RMSE = numeric(length(models)),
  Rsquared = numeric(length(models))
)

for (i in seq_along(models)) {
  pred <- predict(models[[i]], X_test)
  test_results$RMSE[i] <- sqrt(mean((pred - y_test)^2))
  test_results$Rsquared[i] <- cor(pred, y_test)^2
}

print("Test Set Performance:")

## [1] "Test Set Performance:"

print(test_results)

##        Model     RMSE  Rsquared
## 1        PCR 1.959721 0.9590600
## 2        PLS 1.514225 0.9752885
## 3      Ridge 2.689994 0.9215908
## 4      Lasso 1.456885 0.9772255
## 5 ElasticNet 1.456885 0.9772255

5 Question 6.1c(ii) Optimal Values of the Tuning Parameter

OPTIMAL TUNING PARAMETERS

## PCR (Principal Component Regression):

##   Optimal number of components: 17

##   Cross-validated RMSE: 2.26194

## PLS (Partial Least Squares):

##   Optimal number of components: 18

##   Cross-validated RMSE: 2.158819

## Ridge Regression:

##   Optimal lambda: 0.005263158

##   Cross-validated RMSE: 2.943903

## Lasso:

##   Optimal fraction: 0.05

##   Cross-validated RMSE: 2.388967

## Elastic Net:

##   Optimal lambda: 0

##   Optimal fraction: 0.05

##   Cross-validated RMSE: 2.2019

## ========================================

## BEST MODEL (by CV RMSE):

## ========================================

## The model considered the best model is the one with the lowest RMSE score: PLS with a cross-validated RMSE of 2.158819

6 Question 6.1d Model Performance Analysis

MODEL PERFORMANCE ANALYSIS

## All five regression models were evaluated using 10-fold cross-validation

## to predict moisture content from spectroscopy data. The results show

## clear performance differences:

## Cross-Validated RMSE:

##   1. PLS:         2.159  (BEST)

##   2. Elastic Net: 2.202  (+2.0%)

##   3. PCR:         2.262  (+4.8%)

##   4. Lasso:       2.389  (+10.7%)

##   5. Ridge:       2.944  (+36.4%)

## Key Findings:

## PLS emerged as the best model with the lowest prediction error (RMSE = 2.159).

## It uses 18 components to capture the underlying patterns in the data.

## Elastic Net performed nearly as well (only 2% worse), making it a viable

## alternative to PLS. The small difference suggests both methods effectively

## handle the high correlation in spectroscopy data.

## PCR was competitive but slightly worse than PLS (4.8% higher RMSE). Both

## methods use similar numbers of components (17-18), but PLS has an advantage

## because it considers the response variable when creating components, while

## PCR does not.

## Lasso performed moderately well but was 10.7% worse than PLS. Its aggressive

## variable selection approach may have removed useful predictors.

## Ridge Regression performed worst, with RMSE 36% higher than PLS. Ridge keeps

## all 100 highly correlated predictors, which hurts performance compared to

## dimension reduction methods like PLS and PCR that reduce to ~18 components.

## Recommendation: Use PLS with 18 components for predicting moisture content.

## It provides the best predictive performance and effectively handles the

## multicollinearity inherent in near-infrared spectroscopy data.

## ========================================

7 Question 6.1e Predicting Fat Content

## PREDICTING FAT CONTENT

## Training models to predict fat content...

## 
## === FAT PREDICTION RESULTS ===

## Cross-Validated RMSE (Best to Worst):

##   1. Lasso        2.129 <- BEST
##   2. ElasticNet   2.308 (+8.4%)
##   3. PLS          2.322 (+9.1%)
##   4. PCR          2.365 (+11.1%)
##   5. Ridge        2.632 (+23.6%)

## 
## Optimal Tuning Parameters:

##   PCR components: 17

##   PLS components: 14

##   Ridge lambda: 0

##   Lasso fraction: 0.25

##   Elastic Net lambda: 0 , fraction: 0.2611111

## Test Set RMSE:

##   1. PLS          2.197 <- BEST
##   2. PCR          2.326
##   3. Lasso        5.601
##   4. ElasticNet   5.669
##   5. Ridge        6.534

## 
## Recommendation for Fat Prediction:

## Use Lasso for predicting fat content

## CV RMSE: 2.129

## Test RMSE: 5.601

8 Question 6.2 Develop a Model to Predict Permeability

Developing a model to predict permeability could save significant resources for a pharmaceutical company, while at the same time more rapidly identifying molecules that have a sufficient permeability to become a drug. Molecular fingerprints are binary indicators that represent the presence (1) or absence (0) of specific chemical substructures in a molecule. They are commonly used in:

Drug discovery
Chemical similarity analysis
Quantitative structure-activity relationship (QSAR) modeling
Solubility prediction

library(AppliedPredictiveModeling)
data(permeability)

9 Question 6.2a EDA on Permeability Data

## EXPLORATORY DATA ANALYSIS

## Permeability Dataset

## === DATASET DIMENSIONS ===

## Fingerprints matrix dimensions: 165 1107

##   - Number of compounds: 165

##   - Number of molecular descriptors: 1107

## Permeability vector length: 165

## === PERMEABILITY (RESPONSE VARIABLE) ===

## Summary statistics:

##   permeability  
##  Min.   : 0.06  
##  1st Qu.: 1.55  
##  Median : 4.91  
##  Mean   :12.24  
##  3rd Qu.:15.47  
##  Max.   :55.60

## 
## Standard deviation: 15.579

## Range: 55.54

10 Question 6.2a(i) Explore the Dataset

# Check dimensions
cat("Dimensions of fingerprint matrix:\n")

## Dimensions of fingerprint matrix:

print(dim(fingerprints))

## [1]  165 1107

cat("This means", nrow(fingerprints), "samples with", ncol(fingerprints), "predictors\n")

## This means 165 samples with 1107 predictors

11 Filter out Predictors

Filter out the predictors that have low frequencies using the nearZeroVar function from the caret package. How many predictors are left for modeling?

12 Dataset Description:

Permeability data (AppliedPredictiveModeling package)
165 compounds
1,107 binary fingerprint predictors
Ratio: 6.7 predictors per sample (high-dimensional problem)

library(caret)
library(AppliedPredictiveModeling)
data(permeability)

nzv <- nearZeroVar(fingerprints)
fingerprints_filtered <- fingerprints[, -nzv]
ncol(fingerprints_filtered)

## [1] 388

cat("The number of predictors left for modeling:", ncol(fingerprints_filtered))

## The number of predictors left for modeling: 388

13 Question 6.2c Prepare the Data for Modeling

Split the data into a training and a test set, pre-process the data, and tune a PLS model. How many latent variables are optimal and what is the corresponding resampled estimate of R2?

library(caret)
library(AppliedPredictiveModeling)
library(caret)
library(pls)        # for pcr
library(elasticnet) # for ridge/lasso/enet
library(kknn)

## 
## Attaching package: 'kknn'

## The following object is masked from 'package:caret':
## 
##     contr.dummy

library(kernlab)

## 
## Attaching package: 'kernlab'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

set.seed(103025)
ctrl <- trainControl(method = "cv", number = 10)


# Load data
data(permeability)

# Filter near-zero variance predictors
nzv <- nearZeroVar(fingerprints)
fingerprints_filtered <- fingerprints[, -nzv]

cat("Predictors after filtering:", ncol(fingerprints_filtered), "\n\n")

## Predictors after filtering: 388

# Split data into training and test sets (75/25)
set.seed(103125)
trainIndex <- createDataPartition(permeability, p = 0.75, list = FALSE)

trainX <- fingerprints_filtered[trainIndex, ]
trainY <- permeability[trainIndex]
testX <- fingerprints_filtered[-trainIndex, ]
testY <- permeability[-trainIndex]

cat("Training set:", nrow(trainX), "samples\n")

## Training set: 125 samples

cat("Test set:", nrow(testX), "samples\n\n")

## Test set: 40 samples

# Set up 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)

# Tune PLS model (preprocessing: center and scale)
set.seed(103025)
plsTune <- train(
  x = trainX,
  y = trainY,
  method = "pls",
  tuneLength = 20,
  trControl = ctrl,
  preProc = c("center", "scale")
)

# Get results
bestTune <- plsTune$bestTune
bestResults <- plsTune$results[plsTune$results$ncomp == bestTune$ncomp, ]


cat("1. Optimal number of latent variables:", bestTune$ncomp, "\n\n")

## 1. Optimal number of latent variables: 7

cat("2. Resampled R²:", round(bestResults$Rsquared, 4), "\n\n")

## 2. Resampled R²: 0.5212

cat("======================================\n")

## ======================================

14 Question 6.2d Predict the Response for the Test Set.

What is the test set estimate of R2?

# Predict the response for the test set
testPred <- predict(plsTune, newdata = testX)

# Calculate test set R²
testR2 <- cor(testPred, testY)^2

# ANSWER

cat("Test set estimate of R²:", round(testR2, 4), "\n\n")

## Test set estimate of R²: 0.4724

15 Question 6.2e Showing Which Model Is Best

library(caret)
library(pls)        # needed by method = "pcr"
library(glmnet)     # ridge/lasso/elastic net via glmnet
library(kknn)
library(kernlab)

# Reuse trainX, trainY f
set.seed(103025)
ctrl <- trainControl(method = "cv", number = 10)

# PCR
pcr_model <- train(trainX, trainY,
                   method = "pcr",
                   trControl = ctrl,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)

# Ridge (alpha = 0)
ridge_model <- train(trainX, trainY,
                     method = "glmnet",
                     trControl = ctrl,
                     preProcess = c("center", "scale"),
                     tuneGrid = expand.grid(alpha = 0,
                                            lambda = 10^seq(-4, 1, length = 50)))

# Lasso (alpha = 1)
lasso_model <- train(trainX, trainY,
                     method = "glmnet",
                     trControl = ctrl,
                     preProcess = c("center", "scale"),
                     tuneGrid = expand.grid(alpha = 1,
                                            lambda = 10^seq(-4, 1, length = 50)))

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.

# Elastic Net (search alpha in [0,1] and lambda)
enet_model <- train(trainX, trainY,
                    method = "glmnet",
                    trControl = ctrl,
                    preProcess = c("center", "scale"),
                    tuneLength = 20)  # lets caret pick alpha & lambda

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.

# KNN
knn_model <- train(trainX, trainY,
                   method = "knn",
                   trControl = ctrl,
                   preProcess = c("center", "scale"),
                   tuneLength = 20)

# SVM (RBF)
svm_model <- train(trainX, trainY,
                   method = "svmRadial",
                   trControl = ctrl,
                   preProcess = c("center", "scale"),
                   tuneLength = 10)

# Compare with your existing PLS model object: plsTune
results <- resamples(list(
  PLS = plsTune,
  PCR = pcr_model,
  Ridge = ridge_model,
  Lasso = lasso_model,
  ElasticNet = enet_model,
  KNN = knn_model,
  SVM = svm_model
))

summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: PLS, PCR, Ridge, Lasso, ElasticNet, KNN, SVM 
## Number of resamples: 10 
## 
## MAE 
##                Min.  1st Qu.   Median     Mean   3rd Qu.      Max. NA's
## PLS        5.855166 7.671185 8.374878 8.819935 10.395264 12.232113    0
## PCR        6.478052 7.349715 8.886026 8.936486  9.847171 12.295670    0
## Ridge      6.067736 7.319335 8.336143 8.605020  9.342998 12.349564    0
## Lasso      4.168337 6.448881 7.736902 8.105491  9.835276 12.484209    0
## ElasticNet 6.077117 6.675149 7.634973 7.705983  8.601024  9.607781    0
## KNN        4.471095 6.082174 7.956981 8.385917 10.117267 13.761968    0
## SVM        5.538656 6.595833 7.517188 7.563386  8.156856 10.131174    0
## 
## RMSE 
##                Min.   1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## PLS        7.129567  9.433234 11.39016 11.29200 12.94590 16.54612    0
## PCR        8.037053 10.161534 11.06334 11.96948 14.23587 16.60493    0
## Ridge      8.283846  9.164638 12.18091 11.62923 12.77967 15.96915    0
## Lasso      5.554426 10.672216 11.81544 11.36815 13.14913 16.33687    0
## ElasticNet 8.530177  9.119431 11.03678 10.87942 12.14213 14.26285    0
## KNN        6.665764  7.582388 10.41034 11.73250 15.25848 18.84733    0
## SVM        7.233803  9.722672 10.72060 10.43967 11.16724 12.69882    0
## 
## Rsquared 
##                   Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## PLS        0.080111817 0.4387388 0.5599021 0.5211719 0.6322325 0.8229828    0
## PCR        0.141356694 0.3518307 0.3820989 0.4562220 0.6234931 0.8204790    0
## Ridge      0.119162090 0.2991567 0.5790568 0.5017807 0.6534809 0.8146635    0
## Lasso      0.131695219 0.2994707 0.4844250 0.5243114 0.7702345 0.9220318    0
## ElasticNet 0.302124171 0.3836284 0.5328731 0.5723251 0.7386761 0.8775188    0
## KNN        0.009837283 0.1747413 0.4901745 0.4648761 0.7428649 0.9107852    0
## SVM        0.190436679 0.5147241 0.5918350 0.5851556 0.6655141 0.8661903    0

dotplot(results)

## Question 6.2e Extract the Best Model from Comparison

# Extract best model from comparison
cv_results <- data.frame(
  Model = c("PLS", "PCR", "Ridge", "Lasso", "ElasticNet", "KNN", "SVM"),
  CV_RMSE = c(
    min(plsTune$results$RMSE),
    min(pcr_model$results$RMSE),
    min(ridge_model$results$RMSE),
    min(lasso_model$results$RMSE),
    min(enet_model$results$RMSE),
    min(knn_model$results$RMSE),
    min(svm_model$results$RMSE)
  )
)

cv_results <- cv_results[order(cv_results$CV_RMSE), ]

cat("\n=== MODEL RANKINGS (by Cross-Validated RMSE) ===\n\n")

## 
## === MODEL RANKINGS (by Cross-Validated RMSE) ===

print(cv_results)

##        Model  CV_RMSE
## 7        SVM 10.43967
## 5 ElasticNet 10.87942
## 1        PLS 11.29200
## 4      Lasso 11.36815
## 3      Ridge 11.62923
## 6        KNN 11.73250
## 2        PCR 11.96948

cat("\nBest Model:", cv_results$Model[1], "\n")

## 
## Best Model: SVM

cat("Answer to (e): ", 
    ifelse(cv_results$Model[1] == "PLS", 
           "No other model performs better than PLS",
           paste(cv_results$Model[1], "performs better than PLS")), 
    "\n")

## Answer to (e):  SVM performs better than PLS

16 Question 6.2f Can the Model Replace Lab Experiments?

cat("Test R²:", round(testR2, 4), "\n\n")

## Test R²: 0.4724

if (testR2 >= 0.70) {
  cat("YES - Strong performance. Could replace lab experiments.\n")
} else if (testR2 >= 0.50) {
  cat("PARTIAL - Use for screening only. Confirm with lab.\n")
  cat("\nRationale: Model explains only ~", round(testR2*100, 0), 
      "% of variance.\n", sep="")
  cat("Too much uncertainty for full replacement, but useful for\n")
  cat("initial filtering before expensive lab tests.\n")
} else {
  cat("NO - Too weak to replace lab experiments.\n")
}

## NO - Too weak to replace lab experiments.