library(tidyverse)
library(tidymodels)
library(pls) # CRAN: partial least squares
library(AppliedPredictiveModeling)
library(ranger) # random forest
library(kernlab) # SVM
data(permeability)
full_data <- as_tibble(fingerprints) %>%
mutate(permeability = as.numeric(permeability))
cat("Fingerprints dimensions:", dim(fingerprints), "\n")
## Fingerprints dimensions: 165 1107
cat("Permeability length:", length(permeability), "\n")
## Permeability length: 165
nzv_recipe <- recipe(permeability ~ ., data = full_data) %>%
step_nzv(all_predictors())
nzv_prepped <- prep(nzv_recipe)
n_original <- ncol(fingerprints)
n_remaining <- nzv_prepped$var_info %>%
filter(role == "predictor") %>%
nrow()
cat("Original predictors: ", n_original, "\n")
## Original predictors: 1107
cat("Remaining predictors:", n_remaining, "\n")
## Remaining predictors: 1107
cat("Removed: ", n_original - n_remaining, "\n")
## Removed: 0
set.seed(614)
split <- initial_split(full_data, prop = 0.75, strata = permeability)
train_data <- training(split)
test_data <- testing(split)
base_recipe <- recipe(permeability ~ ., data = train_data) %>%
step_nzv(all_predictors()) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors())
prepped <- prep(base_recipe, training = train_data)
train_baked <- bake(prepped, new_data = NULL)
test_baked <- bake(prepped, new_data = test_data)
X_train <- train_baked %>% select(-permeability) %>% as.matrix()
y_train <- train_baked$permeability
X_test <- test_baked %>% select(-permeability) %>% as.matrix()
y_test <- test_baked$permeability
# Tune number of components via 10-fold CV using the pls package
set.seed(614)
pls_cv <- plsr(
y_train ~ X_train,
ncomp = 20,
validation = "CV",
segments = 10
)
# Pick optimal ncomp by minimising RMSEP
rmsep_vals <- RMSEP(pls_cv)$val[1, 1, ] # intercept-only excluded at [1,1,]
optimal_ncomp <- which.min(rmsep_vals) - 1 # subtract 1 for the intercept component
cat("Optimal latent variables:", optimal_ncomp, "\n")
## Optimal latent variables: 2
# Resampled R^2 from CV
cv_pred <- pls_cv$validation$pred[, 1, optimal_ncomp]
resampled_r2 <- cor(y_train, cv_pred)^2
cat("Resampled R^2 (PLS):", round(resampled_r2, 4), "\n")
## Resampled R^2 (PLS): 0.3565
pls_final <- plsr(
y_train ~ X_train,
ncomp = optimal_ncomp
)
pls_pred <- predict(pls_final, newdata = X_test, ncomp = optimal_ncomp) %>%
drop() %>%
as.numeric()
pls_results <- tibble(truth = as.numeric(y_test), estimate = pls_pred)
pls_r2 <- rsq(pls_results, truth = truth, estimate = estimate)
cat("Test set R^2 (PLS):", round(pls_r2$.estimate, 4), "\n")
## Test set R^2 (PLS): 0.4231
set.seed(614)
folds <- vfold_cv(train_data, v = 5, repeats = 1, strata = permeability)
rf_spec <- rand_forest(trees = 500, mtry = tune(), min_n = tune()) %>%
set_engine("ranger", num.threads = parallel::detectCores()) %>%
set_mode("regression")
rf_wf <- workflow() %>%
add_recipe(base_recipe) %>%
add_model(rf_spec)
set.seed(614)
rf_tune <- tune_grid(
rf_wf,
resamples = folds,
grid = 5,
metrics = metric_set(rsq, rmse)
)
best_rf <- select_best(rf_tune, metric = "rsq")
rf_final <- rf_wf %>%
finalize_workflow(best_rf) %>%
fit(train_data)
rf_preds <- predict(rf_final, test_data) %>%
bind_cols(test_data %>% select(permeability)) %>%
rename(truth = permeability, estimate = .pred)
rf_r2 <- rsq(rf_preds, truth = truth, estimate = estimate)
cat("Test set R^2 (Random Forest):", round(rf_r2$.estimate, 4), "\n")
## Test set R^2 (Random Forest): 0.6294
show_notes(rf_tune)
## Great job! No notes to show.
svm_spec <- svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
set_engine("kernlab") %>%
set_mode("regression")
svm_wf <- workflow() %>%
add_recipe(base_recipe) %>%
add_model(svm_spec)
set.seed(614)
svm_tune <- tune_grid(
svm_wf,
resamples = folds,
grid = 10,
metrics = metric_set(rsq, rmse)
)
best_svm <- select_best(svm_tune, metric = "rsq")
svm_final <- svm_wf %>%
finalize_workflow(best_svm) %>%
fit(train_data)
svm_preds <- predict(svm_final, test_data) %>%
bind_cols(test_data %>% select(permeability)) %>%
rename(truth = permeability, estimate = .pred)
svm_r2 <- rsq(svm_preds, truth = truth, estimate = estimate)
cat("Test set R^2 (SVM):", round(svm_r2$.estimate, 4), "\n")
## Test set R^2 (SVM): 0.6099
tibble(
Model = c("PLS", "Random Forest", "SVM (RBF)"),
Test_R2 = c(
round(pls_r2$.estimate, 4),
round(rf_r2$.estimate, 4),
round(svm_r2$.estimate, 4)
)
) %>%
arrange(desc(Test_R2)) %>%
knitr::kable(caption = "Test Set R^2 by Model")
| Model | Test_R2 |
|---|---|
| Random Forest | 0.6294 |
| SVM (RBF) | 0.6099 |
| PLS | 0.4231 |
# The best-performing model (typically Random Forest or SVM) is recommended
Based on the models, Random Forest likely gives the best test R² (around 0.60+), outperforming both PLS and SVM. Honestly, I would recommend using it as a first-pass screen. It won’t replace the lab, but it’s good enough to rule out bad candidates early and save a lot of time and money. PLS is a decent lightweight option if we need something simpler and more interpretative.