library(tidyverse)
library(tidymodels)
library(ranger)
library(vip)
library(AppliedPredictiveModeling)

(a) Load Data

data(ChemicalManufacturingProcess)

full_data <- ChemicalManufacturingProcess %>%
  as_tibble() %>%
  mutate(Yield = as.numeric(Yield))

cat("Dimensions:", dim(full_data), "\n")
## Dimensions: 176 58
cat("Missing values:", sum(is.na(full_data)), "\n")
## Missing values: 106

(b) Impute Missing Values

# Using k-nearest neighbors imputation
impute_recipe <- recipe(Yield ~ ., data = full_data) %>%
  step_impute_knn(all_predictors(), neighbors = 5)

cat("Missing values after imputation will be filled via KNN\n")
## Missing values after imputation will be filled via KNN

(c) Split, Pre-process, and Tune Random Forest

set.seed(614)
split      <- initial_split(full_data, prop = 0.75)
train_data <- training(split)
test_data  <- testing(split)

set.seed(614)
folds <- vfold_cv(train_data, v = 5, repeats = 1)
base_recipe <- recipe(Yield ~ ., data = train_data) %>%
  step_impute_knn(all_predictors(), neighbors = 5) %>%
  step_nzv(all_predictors()) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors())
rf_spec <- rand_forest(trees = 500, mtry = tune(), min_n = tune()) %>%
  set_engine("ranger", importance = "impurity",
             num.threads = parallel::detectCores()) %>%
  set_mode("regression")

rf_wf <- workflow() %>%
  add_recipe(base_recipe) %>%
  add_model(rf_spec)

set.seed(614)
rf_tune <- tune_grid(
  rf_wf,
  resamples = folds,
  grid      = 5,
  metrics   = metric_set(rsq, rmse)
)

best_rf <- select_best(rf_tune, metric = "rsq")

cat("Optimal mtry:", best_rf$mtry, "\n")
## Optimal mtry: 28
cat("Optimal min_n:", best_rf$min_n, "\n")
## Optimal min_n: 2
rf_tune %>%
  collect_metrics() %>%
  filter(.metric == "rsq") %>%
  arrange(desc(mean)) %>%
  slice(1) %>%
  pull(mean) %>%
  round(4) %>%
  cat("Resampled R^2 (Random Forest):", ., "\n")
## Resampled R^2 (Random Forest): 0.5897

(d) Predict on Test Set

rf_final <- rf_wf %>%
  finalize_workflow(best_rf) %>%
  fit(train_data)

rf_preds <- predict(rf_final, test_data) %>%
  bind_cols(test_data %>% select(Yield)) %>%
  mutate(Yield = as.numeric(Yield)) %>%
  rename(truth = Yield, estimate = .pred)

rf_r2   <- rsq(rf_preds,  truth = truth, estimate = estimate)
rf_rmse <- rmse(rf_preds, truth = truth, estimate = estimate)

cat("Test set R^2  (Random Forest):", round(rf_r2$.estimate,   4), "\n")
## Test set R^2  (Random Forest): 0.4534
cat("Test set RMSE (Random Forest):", round(rf_rmse$.estimate, 4), "\n")
## Test set RMSE (Random Forest): 1.239

(e) Variable Importance

rf_final %>%
  extract_fit_parsnip() %>%
  vip(num_features = 20) +
  labs(title = "Top 20 Most Important Predictors") +
  theme_minimal()

rf_final %>%
  extract_fit_parsnip() %>%
  vi() %>%
  slice(1:20) %>%
  mutate(Type = if_else(str_starts(Variable, "Biological"), "Biological", "Process")) %>%
  knitr::kable(caption = "Top 20 Predictors by Variable Importance")
Top 20 Predictors by Variable Importance
Variable Importance Type
ManufacturingProcess32 129.399688 Process
ManufacturingProcess31 37.721956 Process
ManufacturingProcess36 33.761082 Process
BiologicalMaterial06 30.832379 Biological
BiologicalMaterial12 22.540848 Biological
ManufacturingProcess09 13.777945 Process
BiologicalMaterial03 13.277844 Biological
BiologicalMaterial11 12.419928 Biological
ManufacturingProcess17 10.662864 Process
ManufacturingProcess13 10.481283 Process
BiologicalMaterial02 9.741585 Biological
ManufacturingProcess30 7.851748 Process
ManufacturingProcess33 7.185713 Process
BiologicalMaterial05 5.837120 Biological
ManufacturingProcess11 5.786268 Process
ManufacturingProcess21 5.579435 Process
ManufacturingProcess28 5.417350 Process
ManufacturingProcess06 5.396140 Process
ManufacturingProcess27 5.240396 Process
BiologicalMaterial08 5.236670 Biological

(f) Explore Top Predictor Relationships

# Get top 6 predictor names
top_vars <- rf_final %>%
  extract_fit_parsnip() %>%
  vi() %>%
  slice(1:6) %>%
  pull(Variable)

# Bake training data for plotting
train_baked <- base_recipe %>%
  prep(training = train_data) %>%
  bake(new_data = train_data)

# Plot each top predictor against Yield
train_baked %>%
  select(Yield, all_of(top_vars)) %>%
  pivot_longer(-Yield, names_to = "Predictor", values_to = "Value") %>%
  ggplot(aes(x = Value, y = Yield)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", se = TRUE, color = "steelblue") +
  facet_wrap(~ Predictor, scales = "free_x") +
  labs(
    title = "Yield vs Top 6 Predictors",
    x     = "Predictor Value",
    y     = "Yield"
  ) +
  theme_minimal()

Conclusion

The top predictors are a mix of biological and process variables, so neither type fully dominates.

ManufacturingProcess09 shows a clear positive trend: increasing it should directly improve yield.

ManufacturingProcess31 and 32 show negative or non-linear relationships.

BiologicalMaterial06 and 12 matter but can’t be controlled, so they’re more useful for screening raw material quality .

ManufacturingProcess36 looks noisy with little clear trend.

Overall, focusing on ManufacturingProcess09 as a controllable lever seems like the most practical path to squeezing out that 1% yield improvement.