Homework 7

library(tidyverse)
library(tidymodels)
library(pls)          # CRAN: partial least squares
library(AppliedPredictiveModeling)
library(ranger)       # random forest
library(kernlab)      # SVM

(a) Load Data

data(permeability)

full_data <- as_tibble(fingerprints) %>%
  mutate(permeability = as.numeric(permeability))

cat("Fingerprints dimensions:", dim(fingerprints), "\n")

## Fingerprints dimensions: 165 1107

cat("Permeability length:", length(permeability), "\n")

## Permeability length: 165

(b) Filter Near-Zero Variance Predictors

nzv_recipe <- recipe(permeability ~ ., data = full_data) %>%
  step_nzv(all_predictors())

nzv_prepped <- prep(nzv_recipe)

n_original  <- ncol(fingerprints)
n_remaining <- nzv_prepped$var_info %>%
  filter(role == "predictor") %>%
  nrow()

cat("Original predictors: ", n_original, "\n")

## Original predictors:  1107

cat("Remaining predictors:", n_remaining, "\n")

## Remaining predictors: 1107

cat("Removed:             ", n_original - n_remaining, "\n")

## Removed:              0

(c) Split, Pre-process, and Tune PLS

set.seed(614)
split      <- initial_split(full_data, prop = 0.75, strata = permeability)
train_data <- training(split)
test_data  <- testing(split)

base_recipe <- recipe(permeability ~ ., data = train_data) %>%
  step_nzv(all_predictors()) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors())

prepped     <- prep(base_recipe, training = train_data)
train_baked <- bake(prepped, new_data = NULL)
test_baked  <- bake(prepped, new_data = test_data)

X_train <- train_baked %>% select(-permeability) %>% as.matrix()
y_train <- train_baked$permeability
X_test  <- test_baked  %>% select(-permeability) %>% as.matrix()
y_test  <- test_baked$permeability

# Tune number of components via 10-fold CV using the pls package
set.seed(614)
pls_cv <- plsr(
  y_train ~ X_train,
  ncomp     = 20,
  validation = "CV",
  segments  = 10
)

# Pick optimal ncomp by minimising RMSEP
rmsep_vals    <- RMSEP(pls_cv)$val[1, 1, ]   # intercept-only excluded at [1,1,]
optimal_ncomp <- which.min(rmsep_vals) - 1    # subtract 1 for the intercept component

cat("Optimal latent variables:", optimal_ncomp, "\n")

## Optimal latent variables: 2

# Resampled R^2 from CV
cv_pred      <- pls_cv$validation$pred[, 1, optimal_ncomp]
resampled_r2 <- cor(y_train, cv_pred)^2
cat("Resampled R^2 (PLS):", round(resampled_r2, 4), "\n")

## Resampled R^2 (PLS): 0.3565

(d) Predict on Test Set

pls_final <- plsr(
  y_train ~ X_train,
  ncomp = optimal_ncomp
)

pls_pred <- predict(pls_final, newdata = X_test, ncomp = optimal_ncomp) %>%
  drop() %>%
  as.numeric()

pls_results <- tibble(truth = as.numeric(y_test), estimate = pls_pred)
pls_r2      <- rsq(pls_results, truth = truth, estimate = estimate)

cat("Test set R^2 (PLS):", round(pls_r2$.estimate, 4), "\n")

## Test set R^2 (PLS): 0.4231

(e) Other Models

Random Forest

set.seed(614)
folds <- vfold_cv(train_data, v = 5, repeats = 1, strata = permeability)

rf_spec <- rand_forest(trees = 500, mtry = tune(), min_n = tune()) %>%
  set_engine("ranger", num.threads = parallel::detectCores()) %>%
  set_mode("regression")

rf_wf <- workflow() %>%
  add_recipe(base_recipe) %>%
  add_model(rf_spec)

set.seed(614)
rf_tune <- tune_grid(
  rf_wf,
  resamples = folds,
  grid      = 5,
  metrics   = metric_set(rsq, rmse)
)

best_rf  <- select_best(rf_tune, metric = "rsq")
rf_final <- rf_wf %>%
  finalize_workflow(best_rf) %>%
  fit(train_data)

rf_preds <- predict(rf_final, test_data) %>%
  bind_cols(test_data %>% select(permeability)) %>%
  rename(truth = permeability, estimate = .pred)

rf_r2 <- rsq(rf_preds, truth = truth, estimate = estimate)
cat("Test set R^2 (Random Forest):", round(rf_r2$.estimate, 4), "\n")

## Test set R^2 (Random Forest): 0.6294

show_notes(rf_tune)

## Great job! No notes to show.

SVM (Radial Basis)

svm_spec <- svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  set_engine("kernlab") %>%
  set_mode("regression")

svm_wf <- workflow() %>%
  add_recipe(base_recipe) %>%
  add_model(svm_spec)

set.seed(614)
svm_tune <- tune_grid(
  svm_wf,
  resamples = folds,
  grid      = 10,
  metrics   = metric_set(rsq, rmse)
)

best_svm  <- select_best(svm_tune, metric = "rsq")
svm_final <- svm_wf %>%
  finalize_workflow(best_svm) %>%
  fit(train_data)

svm_preds <- predict(svm_final, test_data) %>%
  bind_cols(test_data %>% select(permeability)) %>%
  rename(truth = permeability, estimate = .pred)

svm_r2 <- rsq(svm_preds, truth = truth, estimate = estimate)
cat("Test set R^2 (SVM):", round(svm_r2$.estimate, 4), "\n")

## Test set R^2 (SVM): 0.6099

Summary

tibble(
  Model   = c("PLS", "Random Forest", "SVM (RBF)"),
  Test_R2 = c(
    round(pls_r2$.estimate, 4),
    round(rf_r2$.estimate, 4),
    round(svm_r2$.estimate, 4)
  )
) %>%
  arrange(desc(Test_R2)) %>%
  knitr::kable(caption = "Test Set R^2 by Model")

Test Set R^2 by Model
Model	Test_R2
Random Forest	0.6294
SVM (RBF)	0.6099
PLS	0.4231

(f) Recommendation

# The best-performing model (typically Random Forest or SVM) is recommended

Conclusion

Based on the models, Random Forest likely gives the best test R² (around 0.60+), outperforming both PLS and SVM. Honestly, I would recommend using it as a first-pass screen. It won’t replace the lab, but it’s good enough to rule out bad candidates early and save a lot of time and money. PLS is a decent lightweight option if we need something simpler and more interpretative.

Homework 7 - 6.2