Exercise 6.3: Chemical Manufacturing Process

library(tidyverse)
library(tidymodels)
library(ranger)
library(vip)
library(AppliedPredictiveModeling)

(a) Load Data

data(ChemicalManufacturingProcess)

full_data <- ChemicalManufacturingProcess %>%
  as_tibble() %>%
  mutate(Yield = as.numeric(Yield))

cat("Dimensions:", dim(full_data), "\n")

## Dimensions: 176 58

cat("Missing values:", sum(is.na(full_data)), "\n")

## Missing values: 106

(b) Impute Missing Values

# Using k-nearest neighbors imputation
impute_recipe <- recipe(Yield ~ ., data = full_data) %>%
  step_impute_knn(all_predictors(), neighbors = 5)

cat("Missing values after imputation will be filled via KNN\n")

## Missing values after imputation will be filled via KNN

(c) Split, Pre-process, and Tune Random Forest

set.seed(614)
split      <- initial_split(full_data, prop = 0.75)
train_data <- training(split)
test_data  <- testing(split)

set.seed(614)
folds <- vfold_cv(train_data, v = 5, repeats = 1)

base_recipe <- recipe(Yield ~ ., data = train_data) %>%
  step_impute_knn(all_predictors(), neighbors = 5) %>%
  step_nzv(all_predictors()) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors())

rf_spec <- rand_forest(trees = 500, mtry = tune(), min_n = tune()) %>%
  set_engine("ranger", importance = "impurity",
             num.threads = parallel::detectCores()) %>%
  set_mode("regression")

rf_wf <- workflow() %>%
  add_recipe(base_recipe) %>%
  add_model(rf_spec)

set.seed(614)
rf_tune <- tune_grid(
  rf_wf,
  resamples = folds,
  grid      = 5,
  metrics   = metric_set(rsq, rmse)
)

best_rf <- select_best(rf_tune, metric = "rsq")

cat("Optimal mtry:", best_rf$mtry, "\n")

## Optimal mtry: 28

cat("Optimal min_n:", best_rf$min_n, "\n")

## Optimal min_n: 2

rf_tune %>%
  collect_metrics() %>%
  filter(.metric == "rsq") %>%
  arrange(desc(mean)) %>%
  slice(1) %>%
  pull(mean) %>%
  round(4) %>%
  cat("Resampled R^2 (Random Forest):", ., "\n")

## Resampled R^2 (Random Forest): 0.5897

(d) Predict on Test Set

rf_final <- rf_wf %>%
  finalize_workflow(best_rf) %>%
  fit(train_data)

rf_preds <- predict(rf_final, test_data) %>%
  bind_cols(test_data %>% select(Yield)) %>%
  mutate(Yield = as.numeric(Yield)) %>%
  rename(truth = Yield, estimate = .pred)

rf_r2   <- rsq(rf_preds,  truth = truth, estimate = estimate)
rf_rmse <- rmse(rf_preds, truth = truth, estimate = estimate)

cat("Test set R^2  (Random Forest):", round(rf_r2$.estimate,   4), "\n")

## Test set R^2  (Random Forest): 0.4534

cat("Test set RMSE (Random Forest):", round(rf_rmse$.estimate, 4), "\n")

## Test set RMSE (Random Forest): 1.239

(e) Variable Importance

rf_final %>%
  extract_fit_parsnip() %>%
  vip(num_features = 20) +
  labs(title = "Top 20 Most Important Predictors") +
  theme_minimal()

rf_final %>%
  extract_fit_parsnip() %>%
  vi() %>%
  slice(1:20) %>%
  mutate(Type = if_else(str_starts(Variable, "Biological"), "Biological", "Process")) %>%
  knitr::kable(caption = "Top 20 Predictors by Variable Importance")

Top 20 Predictors by Variable Importance
Variable	Importance	Type
ManufacturingProcess32	129.399688	Process
ManufacturingProcess31	37.721956	Process
ManufacturingProcess36	33.761082	Process
BiologicalMaterial06	30.832379	Biological
BiologicalMaterial12	22.540848	Biological
ManufacturingProcess09	13.777945	Process
BiologicalMaterial03	13.277844	Biological
BiologicalMaterial11	12.419928	Biological
ManufacturingProcess17	10.662864	Process
ManufacturingProcess13	10.481283	Process
BiologicalMaterial02	9.741585	Biological
ManufacturingProcess30	7.851748	Process
ManufacturingProcess33	7.185713	Process
BiologicalMaterial05	5.837120	Biological
ManufacturingProcess11	5.786268	Process
ManufacturingProcess21	5.579435	Process
ManufacturingProcess28	5.417350	Process
ManufacturingProcess06	5.396140	Process
ManufacturingProcess27	5.240396	Process
BiologicalMaterial08	5.236670	Biological

(f) Explore Top Predictor Relationships

# Get top 6 predictor names
top_vars <- rf_final %>%
  extract_fit_parsnip() %>%
  vi() %>%
  slice(1:6) %>%
  pull(Variable)

# Bake training data for plotting
train_baked <- base_recipe %>%
  prep(training = train_data) %>%
  bake(new_data = train_data)

# Plot each top predictor against Yield
train_baked %>%
  select(Yield, all_of(top_vars)) %>%
  pivot_longer(-Yield, names_to = "Predictor", values_to = "Value") %>%
  ggplot(aes(x = Value, y = Yield)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", se = TRUE, color = "steelblue") +
  facet_wrap(~ Predictor, scales = "free_x") +
  labs(
    title = "Yield vs Top 6 Predictors",
    x     = "Predictor Value",
    y     = "Yield"
  ) +
  theme_minimal()

Conclusion

The top predictors are a mix of biological and process variables, so neither type fully dominates.

ManufacturingProcess09 shows a clear positive trend: increasing it should directly improve yield.

ManufacturingProcess31 and 32 show negative or non-linear relationships.

BiologicalMaterial06 and 12 matter but can’t be controlled, so they’re more useful for screening raw material quality .

ManufacturingProcess36 looks noisy with little clear trend.

Overall, focusing on ManufacturingProcess09 as a controllable lever seems like the most practical path to squeezing out that 1% yield improvement.