need <- c("tidyverse","caret","AppliedPredictiveModeling","pls","glmnet","ranger","recipes")
for (p in need) {
  if (!requireNamespace(p, quietly = TRUE)) install.packages(p, repos = "https://cloud.r-project.org")
  suppressPackageStartupMessages(library(p, character.only = TRUE))
}
data("permeability", package = "AppliedPredictiveModeling")
stopifnot(exists("fingerprints"), exists("permeability"))
dim(fingerprints)
## [1] 165 1107
Interpretation: The dataset contains 165 samples and 1,107 molecular fingerprint predictors used to estimate compound permeability.
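As an optional sanity check (this sketch assumes the fingerprints are 0/1 substructure indicators, as described in the exercise), the matrix can be tabulated to confirm it is binary and to gauge how sparse it is.
# Assumption: fingerprint columns are binary presence/absence indicators.
table(as.vector(fingerprints))   # should show only 0s and 1s
mean(fingerprints == 1)          # overall fraction of "on" bits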
nzv_idx <- nearZeroVar(as.data.frame(fingerprints))
X_filt <- fingerprints[, -nzv_idx, drop = FALSE]
c(original = ncol(fingerprints), filtered = ncol(X_filt), removed = length(nzv_idx))
## original filtered removed
## 1107 388 719
Interpretation: Predictors with minimal variability were removed to improve model efficiency and prevent overfitting.
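To see why so many columns were flagged, nearZeroVar() can also return its diagnostic statistics; this is a quick supplementary look using saveMetrics = TRUE on the same data.
# saveMetrics = TRUE returns the frequency-ratio and percent-unique statistics
# that nearZeroVar() uses to flag near-zero-variance predictors.
nzv_metrics <- nearZeroVar(as.data.frame(fingerprints), saveMetrics = TRUE)
head(nzv_metrics[nzv_metrics$nzv, ], 5)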
set.seed(62417)
idx <- createDataPartition(permeability, p = 0.75, list = FALSE)
X_train <- X_filt[idx, , drop = FALSE]; y_train <- permeability[idx]
X_test <- X_filt[-idx, , drop = FALSE]; y_test <- permeability[-idx]
ctrl <- trainControl(method = "repeatedcv", number = 5, repeats = 2)
pls_fit <- train(x = X_train, y = y_train,
                 method = "pls",
                 preProcess = c("center","scale"),
                 tuneLength = 30,
                 trControl = ctrl,
                 metric = "Rsquared")
plot(pls_fit)
Interpretation: The PLS tuning curve identifies the optimal number of latent components, i.e., the value that maximizes cross-validated R².
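The selected number of components and its resampled performance can also be read directly from the fitted caret object rather than off the plot.
# Chosen tuning value and the corresponding cross-validated metrics.
pls_fit$bestTune
subset(pls_fit$results, ncomp == pls_fit$bestTune$ncomp)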
pls_pred <- predict(pls_fit, X_test)
r2_pls <- caret::R2(pls_pred, y_test)
rmse_pls <- caret::RMSE(pls_pred, y_test)
c(R2 = r2_pls, RMSE = rmse_pls)
## R2 RMSE
## 0.3122834 15.1844193
Interpretation: The test-set R² (≈ 0.31) measures how much of the variability in permeability the model captures, while the RMSE (≈ 15.2) gives the typical size of the prediction error in the units of the response.
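For reference, both metrics can be reproduced by hand: caret's default R2 is the squared Pearson correlation between predictions and observations, and RMSE is the root mean squared prediction error.
# Manual check of the test-set metrics.
cor(pls_pred, y_test)^2             # R-squared as squared correlation
sqrt(mean((pls_pred - y_test)^2))   # RMSE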
glmnet_fit <- train(x = X_train, y = y_train,
                    method = "glmnet",
                    preProcess = c("center","scale"),
                    tuneLength = 25,
                    trControl = ctrl,
                    metric = "Rsquared")
rf_fit <- train(x = X_train, y = y_train,
                method = "ranger",
                tuneLength = 10,
                trControl = ctrl,
                metric = "Rsquared",
                importance = "permutation")
resamps <- resamples(list(PLS = pls_fit, ENet = glmnet_fit, RF = rf_fit))
bwplot(resamps, metric = "Rsquared")
Interpretation: Comparing resampled R² across models helps determine whether the regularized (elastic net) or nonlinear (random forest) approaches outperform the PLS baseline.
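Beyond the boxplot, the resampling results can be summarized numerically, and diff() runs pairwise comparisons of the resampled metrics between the three models.
# Numeric summary of the resampling distributions and pairwise model comparisons.
summary(resamps)
summary(diff(resamps))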
perf_62 <- tibble(
  Model = c("PLS","ENet","RF"),
  R2   = c(caret::R2(predict(pls_fit,    X_test), y_test),
           caret::R2(predict(glmnet_fit, X_test), y_test),
           caret::R2(predict(rf_fit,     X_test), y_test)),
  RMSE = c(caret::RMSE(predict(pls_fit,    X_test), y_test),
           caret::RMSE(predict(glmnet_fit, X_test), y_test),
           caret::RMSE(predict(rf_fit,     X_test), y_test))
) %>% arrange(desc(R2))
head(perf_62, 3)
Interpretation: The model with the highest test-set R² and lowest RMSE provides the best predictive performance; in comparisons like this one, the random forest or elastic net model often achieves slightly better accuracy than the PLS baseline.
if (!requireNamespace("AppliedPredictiveModeling", quietly = TRUE)) {
  install.packages("AppliedPredictiveModeling", repos = "https://cloud.r-project.org")
}
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess", package = "AppliedPredictiveModeling")
processPredictors <- ChemicalManufacturingProcess[, setdiff(names(ChemicalManufacturingProcess), "Yield")]
yield <- ChemicalManufacturingProcess$Yield
dim(processPredictors)
## [1] 176 57
Interpretation: The dataset includes 176 samples with 57 predictors capturing both biological and process-related measurements affecting product yield.
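A rough breakdown of the predictor types can be obtained from the column names (this assumes the standard APM naming convention of BiologicalMaterialXX and ManufacturingProcessXX).
# Count biological vs. process predictors by column-name prefix (assumed naming).
table(ifelse(grepl("^Biological", names(processPredictors)), "Biological", "Process"))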
chem_df <- as_tibble(processPredictors) %>% mutate(Yield = yield)
chem_rec <- recipe(Yield ~ ., data = chem_df) %>%
  step_impute_bag(all_predictors()) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors())
chem_prep <- prep(chem_rec)
chem_ready <- bake(chem_prep, new_data = NULL)
sum(is.na(chem_ready))
## [1] 0
Interpretation: Missing values were imputed and predictors standardized to ensure consistent scaling and improve model convergence.
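For comparison, the raw predictor matrix does contain missing values before imputation; this sketch counts them and lists the most affected columns.
# Missing values in the raw (pre-imputation) predictors.
sum(is.na(processPredictors))
head(sort(colSums(is.na(processPredictors)), decreasing = TRUE), 5)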
set.seed(62417)
idx2 <- createDataPartition(chem_ready$Yield, p = 0.75, list = FALSE)
train_dat <- chem_ready[idx2, ]
test_dat <- chem_ready[-idx2, ]
ctrl2 <- trainControl(method = "repeatedcv", number = 5, repeats = 3)
enet_fit <- train(Yield ~ .,
                  data = train_dat,
                  method = "glmnet",
                  trControl = ctrl2,
                  metric = "RMSE",
                  tuneLength = 50)
plot(enet_fit)
Interpretation: Cross-validation selects the mix of L1 and L2 regularization (alpha and lambda) that minimizes RMSE and balances the bias-variance tradeoff.
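The winning regularization settings can be inspected directly instead of being read off the tuning plot.
# Selected alpha (L1/L2 mixing) and lambda (penalty strength).
enet_fit$bestTune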
enet_pred <- predict(enet_fit, newdata = test_dat)
r2_enet <- caret::R2(enet_pred, test_dat$Yield)
rmse_enet <- caret::RMSE(enet_pred, test_dat$Yield)
c(R2 = r2_enet, RMSE = rmse_enet)
## R2 RMSE
## 0.7301318 1.0259774
Interpretation: The test-set R² (≈ 0.73) and RMSE (≈ 1.03) indicate how well the elastic net generalizes to unseen manufacturing runs.
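Comparing the held-out metrics with the resampled training estimates is a quick check for overly optimistic cross-validation results.
# Resampled (training) performance of the final elastic net model.
getTrainPerf(enet_fit)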
vi_tbl <- varImp(enet_fit, scale = TRUE)$importance %>%
  rownames_to_column("Predictor") %>%
  arrange(desc(Overall))
head(vi_tbl, 10)
Interpretation: The top variables indicate which process and biological parameters have the strongest influence on product yield.
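A quick tally of whether process or biological variables dominate the top of the ranking (again assuming the standard APM predictor names) can make this explicit.
# Classify the ten most important predictors by name prefix (assumed naming).
vi_tbl %>%
  slice_max(Overall, n = 10) %>%
  count(Type = ifelse(grepl("^Manufacturing", Predictor), "Process", "Biological"))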
top6 <- vi_tbl %>% slice_max(Overall, n = 6) %>% pull(Predictor)
chem_plot <- ChemicalManufacturingProcess %>%
  select(all_of(top6), Yield) %>%
  pivot_longer(cols = all_of(top6), names_to = "Predictor", values_to = "Value")
ggplot(chem_plot, aes(Value, Yield)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_wrap(~ Predictor, scales = "free_x") +
  labs(title = "Yield vs. Top Predictors", x = "Predictor Value", y = "Yield (%)")
Interpretation: Scatterplots reveal how each top predictor correlates with yield, showing trends that could guide process improvements.
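As a numeric companion to the plots, pairwise correlations between the top predictors and yield (computed on the raw, unimputed data) summarize the direction and strength of each relationship.
# Correlation of each top predictor with yield, handling missing values pairwise.
cor(ChemicalManufacturingProcess[, top6], yield, use = "pairwise.complete.obs")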