0.1 Packages

need <- c("tidyverse","caret","AppliedPredictiveModeling","pls","glmnet","ranger","recipes")
for (p in need){
  if (!requireNamespace(p, quietly = TRUE)) install.packages(p, repos = "https://cloud.r-project.org")
  suppressPackageStartupMessages(library(p, character.only = TRUE))
}

1 Problem 6.2 — Predicting Permeability

1.1 (a) Load the data

data("permeability", package = "AppliedPredictiveModeling")
stopifnot(exists("fingerprints"), exists("permeability"))
dim(fingerprints)

## [1]  165 1107

Interpretation: The dataset contains 165 samples and 1,107 molecular fingerprint predictors used to estimate compound permeability.

1.2 (b) Filter near-zero-variance predictors

nzv_idx <- nearZeroVar(as.data.frame(fingerprints))
X_filt  <- fingerprints[, -nzv_idx, drop = FALSE]
c(original = ncol(fingerprints), filtered = ncol(X_filt), removed = length(nzv_idx))

## original filtered  removed 
##     1107      388      719

Interpretation: Predictors with minimal variability were removed to improve model efficiency and prevent overfitting.

1.3 (c) Split and tune a PLS model

set.seed(62417)
idx <- createDataPartition(permeability, p = 0.75, list = FALSE)
X_train <- X_filt[idx, , drop = FALSE]; y_train <- permeability[idx]
X_test  <- X_filt[-idx, , drop = FALSE]; y_test <- permeability[-idx]

ctrl <- trainControl(method = "repeatedcv", number = 5, repeats = 2)

pls_fit <- train(x = X_train, y = y_train,
                 method = "pls",
                 preProcess = c("center","scale"),
                 tuneLength = 30,
                 trControl = ctrl,
                 metric = "Rsquared")

plot(pls_fit)

Interpretation: The PLS model identifies the optimal number of latent components that maximize cross-validated R² for accurate prediction.

1.4 (d) Evaluate PLS on test data

pls_pred <- predict(pls_fit, X_test)
r2_pls   <- caret::R2(y_test, pls_pred)
rmse_pls <- caret::RMSE(y_test, pls_pred)
c(R2 = r2_pls, RMSE = rmse_pls)

##         R2       RMSE 
##  0.3122834 15.1844193

Interpretation: The R² value measures how much of the variance in permeability is explained by the model, while RMSE shows average prediction error.

1.5 (e) Try Elastic Net and Random Forest

glmnet_fit <- train(x = X_train, y = y_train,
                    method = "glmnet",
                    preProcess = c("center","scale"),
                    tuneLength = 25,
                    trControl = ctrl,
                    metric = "Rsquared")

rf_fit <- train(x = X_train, y = y_train,
                method = "ranger",
                tuneLength = 10,
                trControl = ctrl,
                metric = "Rsquared",
                importance = "permutation")

resamps <- resamples(list(PLS = pls_fit, ENet = glmnet_fit, RF = rf_fit))

bwplot(resamps, metric = "Rsquared")

Interpretation: Comparing R² across models helps determine whether nonlinear or regularized approaches outperform the PLS baseline.

1.6 (f) Test-set comparison

perf_62 <- tibble(
  Model = c("PLS","ENet","RF"),
  R2    = c(caret::R2(y_test, predict(pls_fit,   X_test)),
            caret::R2(y_test, predict(glmnet_fit,X_test)),
            caret::R2(y_test, predict(rf_fit,    X_test))),
  RMSE  = c(caret::RMSE(y_test, predict(pls_fit,   X_test)),
            caret::RMSE(y_test, predict(glmnet_fit,X_test)),
            caret::RMSE(y_test, predict(rf_fit,    X_test)))
) %>% arrange(desc(R2))
head(perf_62, 3)

Interpretation: The model with the highest R² and lowest RMSE provides the best predictive performance; often, Random Forest or Elastic Net achieves slightly better accuracy.

2 Problem 6.3 — Chemical Manufacturing Process (Yield)

2.1 (a) Load the data safely

if (!requireNamespace("AppliedPredictiveModeling", quietly = TRUE)) {
  install.packages("AppliedPredictiveModeling", repos = "https://cloud.r-project.org")
}
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess", package = "AppliedPredictiveModeling")

processPredictors <- ChemicalManufacturingProcess[, setdiff(names(ChemicalManufacturingProcess), "Yield")]
yield <- ChemicalManufacturingProcess$Yield

dim(processPredictors)

## [1] 176  57

Interpretation: The dataset includes 176 samples with 57 predictors capturing both biological and process-related measurements affecting product yield.

2.2 (b) Impute missing values and standardize

chem_df <- as_tibble(processPredictors) %>% mutate(Yield = yield)

chem_rec <- recipe(Yield ~ ., data = chem_df) %>%
  step_impute_bag(all_predictors()) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors())

chem_prep  <- prep(chem_rec)
chem_ready <- bake(chem_prep, new_data = NULL)

sum(is.na(chem_ready))

## [1] 0

Interpretation: Missing values were imputed and predictors standardized to ensure consistent scaling and improve model convergence.

2.3 (c) Split and tune an Elastic Net model

set.seed(62417)
idx2      <- createDataPartition(chem_ready$Yield, p = 0.75, list = FALSE)
train_dat <- chem_ready[idx2, ]
test_dat  <- chem_ready[-idx2, ]

ctrl2 <- trainControl(method = "repeatedcv", number = 5, repeats = 3)

enet_fit <- train(Yield ~ .,
                  data = train_dat,
                  method = "glmnet",
                  trControl = ctrl2,
                  metric = "RMSE",
                  tuneLength = 50)

plot(enet_fit)

Interpretation: Cross-validation finds the best mix of L1 and L2 regularization (alpha and lambda) that minimizes RMSE and balances bias-variance tradeoff.

2.4 (d) Evaluate on test data

enet_pred <- predict(enet_fit, newdata = test_dat)
r2_enet   <- caret::R2(test_dat$Yield, enet_pred)
rmse_enet <- caret::RMSE(test_dat$Yield, enet_pred)
c(R2 = r2_enet, RMSE = rmse_enet)

##        R2      RMSE 
## 0.7301318 1.0259774

Interpretation: The model’s test performance reflects how well it generalizes to unseen manufacturing runs, using R² and RMSE as key indicators.

2.5 (e) Variable importance

vi_tbl <- varImp(enet_fit, scale = TRUE)$importance %>%
  rownames_to_column("Predictor") %>%
  arrange(desc(Overall))
head(vi_tbl, 10)

Interpretation: The top variables indicate which process and biological parameters have the strongest influence on product yield.

2.6 (f) Relationships between top predictors and Yield

top6 <- vi_tbl %>% slice_max(Overall, n = 6) %>% pull(Predictor)
chem_plot <- ChemicalManufacturingProcess %>%
  select(all_of(top6), Yield) %>%
  pivot_longer(cols = all_of(top6), names_to = "Predictor", values_to = "Value")

ggplot(chem_plot, aes(Value, Yield)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(~ Predictor, scales = "free_x") +
  labs(title = "Yield vs. Top Predictors", x = "Predictor Value", y = "Yield (%)")

Interpretation: Scatterplots reveal how each top predictor correlates with yield, showing trends that could guide process improvements.

DATA 624 — Kuhn & Johnson Chapter 6: Problems 6.2 & 6.3

Sabina Baraili

November 01, 2025