library(tidyverse)
library(tidymodels)
library(ranger)
library(vip)
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
full_data <- ChemicalManufacturingProcess %>%
as_tibble() %>%
mutate(Yield = as.numeric(Yield))
cat("Dimensions:", dim(full_data), "\n")
## Dimensions: 176 58
cat("Missing values:", sum(is.na(full_data)), "\n")
## Missing values: 106
# Using k-nearest neighbors imputation
impute_recipe <- recipe(Yield ~ ., data = full_data) %>%
step_impute_knn(all_predictors(), neighbors = 5)
cat("Missing values after imputation will be filled via KNN\n")
## Missing values after imputation will be filled via KNN
set.seed(614)
split <- initial_split(full_data, prop = 0.75)
train_data <- training(split)
test_data <- testing(split)
set.seed(614)
folds <- vfold_cv(train_data, v = 5, repeats = 1)
base_recipe <- recipe(Yield ~ ., data = train_data) %>%
step_impute_knn(all_predictors(), neighbors = 5) %>%
step_nzv(all_predictors()) %>%
step_center(all_predictors()) %>%
step_scale(all_predictors())
rf_spec <- rand_forest(trees = 500, mtry = tune(), min_n = tune()) %>%
set_engine("ranger", importance = "impurity",
num.threads = parallel::detectCores()) %>%
set_mode("regression")
rf_wf <- workflow() %>%
add_recipe(base_recipe) %>%
add_model(rf_spec)
set.seed(614)
rf_tune <- tune_grid(
rf_wf,
resamples = folds,
grid = 5,
metrics = metric_set(rsq, rmse)
)
best_rf <- select_best(rf_tune, metric = "rsq")
cat("Optimal mtry:", best_rf$mtry, "\n")
## Optimal mtry: 28
cat("Optimal min_n:", best_rf$min_n, "\n")
## Optimal min_n: 2
rf_tune %>%
collect_metrics() %>%
filter(.metric == "rsq") %>%
arrange(desc(mean)) %>%
slice(1) %>%
pull(mean) %>%
round(4) %>%
cat("Resampled R^2 (Random Forest):", ., "\n")
## Resampled R^2 (Random Forest): 0.5897
rf_final <- rf_wf %>%
finalize_workflow(best_rf) %>%
fit(train_data)
rf_preds <- predict(rf_final, test_data) %>%
bind_cols(test_data %>% select(Yield)) %>%
mutate(Yield = as.numeric(Yield)) %>%
rename(truth = Yield, estimate = .pred)
rf_r2 <- rsq(rf_preds, truth = truth, estimate = estimate)
rf_rmse <- rmse(rf_preds, truth = truth, estimate = estimate)
cat("Test set R^2 (Random Forest):", round(rf_r2$.estimate, 4), "\n")
## Test set R^2 (Random Forest): 0.4534
cat("Test set RMSE (Random Forest):", round(rf_rmse$.estimate, 4), "\n")
## Test set RMSE (Random Forest): 1.239
rf_final %>%
extract_fit_parsnip() %>%
vip(num_features = 20) +
labs(title = "Top 20 Most Important Predictors") +
theme_minimal()
rf_final %>%
extract_fit_parsnip() %>%
vi() %>%
slice(1:20) %>%
mutate(Type = if_else(str_starts(Variable, "Biological"), "Biological", "Process")) %>%
knitr::kable(caption = "Top 20 Predictors by Variable Importance")
| Variable | Importance | Type |
|---|---|---|
| ManufacturingProcess32 | 129.399688 | Process |
| ManufacturingProcess31 | 37.721956 | Process |
| ManufacturingProcess36 | 33.761082 | Process |
| BiologicalMaterial06 | 30.832379 | Biological |
| BiologicalMaterial12 | 22.540848 | Biological |
| ManufacturingProcess09 | 13.777945 | Process |
| BiologicalMaterial03 | 13.277844 | Biological |
| BiologicalMaterial11 | 12.419928 | Biological |
| ManufacturingProcess17 | 10.662864 | Process |
| ManufacturingProcess13 | 10.481283 | Process |
| BiologicalMaterial02 | 9.741585 | Biological |
| ManufacturingProcess30 | 7.851748 | Process |
| ManufacturingProcess33 | 7.185713 | Process |
| BiologicalMaterial05 | 5.837120 | Biological |
| ManufacturingProcess11 | 5.786268 | Process |
| ManufacturingProcess21 | 5.579435 | Process |
| ManufacturingProcess28 | 5.417350 | Process |
| ManufacturingProcess06 | 5.396140 | Process |
| ManufacturingProcess27 | 5.240396 | Process |
| BiologicalMaterial08 | 5.236670 | Biological |
# Get top 6 predictor names
top_vars <- rf_final %>%
extract_fit_parsnip() %>%
vi() %>%
slice(1:6) %>%
pull(Variable)
# Bake training data for plotting
train_baked <- base_recipe %>%
prep(training = train_data) %>%
bake(new_data = train_data)
# Plot each top predictor against Yield
train_baked %>%
select(Yield, all_of(top_vars)) %>%
pivot_longer(-Yield, names_to = "Predictor", values_to = "Value") %>%
ggplot(aes(x = Value, y = Yield)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "loess", se = TRUE, color = "steelblue") +
facet_wrap(~ Predictor, scales = "free_x") +
labs(
title = "Yield vs Top 6 Predictors",
x = "Predictor Value",
y = "Yield"
) +
theme_minimal()
The top predictors are a mix of biological and process variables, so neither type fully dominates.
ManufacturingProcess09 shows a clear positive trend: increasing it should directly improve yield.
ManufacturingProcess31 and 32 show negative or non-linear relationships.
BiologicalMaterial06 and 12 matter but can’t be controlled, so they’re more useful for screening raw material quality .
ManufacturingProcess36 looks noisy with little clear trend.
Overall, focusing on ManufacturingProcess09 as a controllable lever seems like the most practical path to squeezing out that 1% yield improvement.