Setup and Data Preparation

Replicate the same preprocessing steps from Exercise 6.3.

library(tidyverse)
library(caret)
library(AppliedPredictiveModeling)
library(e1071)       # SVM
library(kernlab)     # KSVM
library(earth)       # MARS
library(nnet)        # Neural net
library(randomForest)
library(gbm)
library(Cubist)
library(party)
library(doParallel)


cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)
data(ChemicalManufacturingProcess)

cmp <- ChemicalManufacturingProcess
dim(cmp)
## [1] 176  58

Imputation

# Impute missing values with median via preProcess 
pre_impute <- preProcess(cmp, method = "medianImpute")
cmp_imputed <- predict(pre_impute, cmp)

# Verify no missing values remain
sum(is.na(cmp_imputed))
## [1] 0

Remove Near-Zero Variance Predictors

nzv_cols <- nearZeroVar(cmp_imputed)
if (length(nzv_cols) > 0) {
  cmp_clean <- cmp_imputed[, -nzv_cols]
} else {
  cmp_clean <- cmp_imputed
}

cat("Dimensions after NZV removal:", dim(cmp_clean), "\n")
## Dimensions after NZV removal: 176 57

Train / Test Split

set.seed(42)

y    <- cmp_clean$Yield
X    <- cmp_clean %>% select(-Yield)

train_idx  <- createDataPartition(y, p = 0.80, list = FALSE)

X_train <- X[train_idx, ]
X_test  <- X[-train_idx, ]
y_train <- y[train_idx]
y_test  <- y[-train_idx]

Pre-Processing (Center, Scale)

pre_proc <- preProcess(X_train, method = c("center", "scale"))

X_train_pp <- predict(pre_proc, X_train)
X_test_pp  <- predict(pre_proc, X_test)

train_df <- X_train_pp %>% mutate(Yield = y_train)
test_df  <- X_test_pp  %>% mutate(Yield = y_test)

(a) Train Nonlinear Regression Models

We use 10-fold cross-validation repeated 5 times for resampling.

ctrl <- trainControl(
  method    = "repeatedcv",
  number    = 10,
  repeats   = 5,
  allowParallel = TRUE
)

MARS (Multivariate Adaptive Regression Splines)

set.seed(42)
mars_grid <- expand.grid(degree = 1:2, nprune = seq(2, 30, by = 2))

mars_fit <- train(
  Yield ~ .,
  data      = train_df,
  method    = "earth",
  tuneGrid  = mars_grid,
  trControl = ctrl
)

mars_fit$bestTune
##    nprune degree
## 20     10      2

Support Vector Machine (Radial)

set.seed(42)
svm_fit <- train(
  Yield ~ .,
  data      = train_df,
  method    = "svmRadial",
  tuneLength = 14,
  trControl  = ctrl
)

svm_fit$bestTune
##        sigma  C
## 7 0.01489057 16

K-Nearest Neighbors

set.seed(42)
knn_fit <- train(
  Yield ~ .,
  data      = train_df,
  method    = "knn",
  tuneGrid  = data.frame(k = 1:20),
  trControl = ctrl
)

knn_fit$bestTune
##   k
## 4 4

Neural Network (averaged)

set.seed(42)
nnet_grid <- expand.grid(
  size  = c(3, 5, 7),
  decay = c(0.001, 0.01, 0.1),
  bag   = FALSE
)

nnet_fit <- train(
  Yield ~ .,
  data      = train_df,
  method    = "avNNet",
  tuneGrid  = nnet_grid,
  trControl = ctrl,
  linout    = TRUE,
  trace     = FALSE,
  maxit     = 200,
  repeats   = 3
)

nnet_fit$bestTune
##   size decay   bag
## 1    3 0.001 FALSE

Compare Resampling Performance

resamp <- resamples(list(
  MARS  = mars_fit,
  SVM   = svm_fit,
  KNN   = knn_fit,
  NNet  = nnet_fit
))

summary(resamp)
## 
## Call:
## summary.resamples(object = resamp)
## 
## Models: MARS, SVM, KNN, NNet 
## Number of resamples: 50 
## 
## MAE 
##           Min.   1st Qu.    Median      Mean   3rd Qu.     Max. NA's
## MARS 0.5614058 0.7317273 0.8987451 0.8891809 1.0154430 1.389358    0
## SVM  0.5864215 0.7886887 0.8540282 0.8917068 0.9807905 1.442299    0
## KNN  0.7450000 0.9084524 1.0354464 1.0236367 1.1175833 1.368750    0
## NNet 0.7448082 0.9919217 1.2290316 1.3448509 1.6528606 2.350983    0
## 
## RMSE 
##           Min.   1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## MARS 0.6457921 0.9090172 1.053665 1.108660 1.241799 1.806462    0
## SVM  0.6771665 0.9899905 1.073766 1.094051 1.169537 1.535768    0
## KNN  0.8979373 1.1100770 1.280754 1.259417 1.376238 1.695101    0
## NNet 0.9454561 1.2440237 1.544879 1.754621 2.155358 3.797201    0
## 
## Rsquared 
##            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## MARS 0.31172268 0.5687068 0.6793332 0.6663900 0.7893585 0.9403164    0
## SVM  0.30118304 0.5938421 0.6735069 0.6535080 0.7203748 0.8605378    0
## KNN  0.12698505 0.4227078 0.5659275 0.5313215 0.6474769 0.8226659    0
## NNet 0.01817544 0.2018569 0.4314195 0.3906245 0.5447891 0.7261486    0
bwplot(resamp, metric = "RMSE",
       main = "Resampling RMSE Comparison")

bwplot(resamp, metric = "Rsquared",
       main = "Resampling R-squared Comparison")

MARS and SVM are neck and neck with RMSE around 1.0 and R² near 0.6.

R² box spanning nearly 0 to 0.9 and the outlier past 3 on RMSE tells you it was sensitive to the small sample size. SVM wins on consistency.

Test Set Performance

compute_test_metrics <- function(fit, X_test_pp, y_test) {
  preds <- predict(fit, newdata = X_test_pp)
  postResample(pred = preds, obs = y_test)
}

test_results <- map_dfr(
  list(MARS = mars_fit, SVM = svm_fit, KNN = knn_fit, NNet = nnet_fit),
  compute_test_metrics,
  X_test_pp = X_test_pp,
  y_test    = y_test,
  .id       = "Model"
)

test_results %>%
  arrange(RMSE) %>%
  knitr::kable(digits = 4, caption = "Test Set Performance")
Test Set Performance
Model RMSE Rsquared MAE
SVM 1.2704 0.6180 0.9170
KNN 1.3586 0.5533 0.9645
MARS 1.3790 0.5772 1.0178
NNet 1.6113 0.4328 1.2499
best_model_name <- test_results %>%
  arrange(RMSE) %>%
  slice(1) %>%
  pull(Model)

cat("Best nonlinear model by test RMSE:", best_model_name, "\n")
## Best nonlinear model by test RMSE: SVM

(b) Variable Importance

Identify the optimal nonlinear model and extract variable importance.

# Programmatically select the best fit object
model_list <- list(MARS = mars_fit, SVM = svm_fit, KNN = knn_fit, NNet = nnet_fit)
best_nl_fit <- model_list[[best_model_name]]

Top 20 Predictors - Optimal Nonlinear Model

vi_nl <- varImp(best_nl_fit, scale = TRUE)

plot(vi_nl, top = 20,
     main = paste("Variable Importance -", best_model_name))

The top 5 is process-heavy, but biological variables show up steadily from rank 5 onward.

vi_nl_df <- vi_nl$importance %>%
  rownames_to_column("Predictor") %>%
  arrange(desc(Overall)) %>%
  mutate(Rank_NL = row_number(),
         Type    = if_else(str_detect(Predictor, "^Biological"), "Biological", "Process"))

vi_nl_df %>%
  slice(1:20) %>%
  knitr::kable(digits = 2, caption = paste("Top 20 Predictors -", best_model_name))
Top 20 Predictors - SVM
Predictor Overall Rank_NL Type
ManufacturingProcess32 100.00 1 Process
ManufacturingProcess13 93.82 2 Process
ManufacturingProcess09 89.93 3 Process
ManufacturingProcess17 88.20 4 Process
BiologicalMaterial06 82.61 5 Biological
BiologicalMaterial03 79.44 6 Biological
BiologicalMaterial12 72.36 7 Biological
ManufacturingProcess36 71.91 8 Process
ManufacturingProcess06 68.21 9 Process
ManufacturingProcess31 55.57 10 Process
ManufacturingProcess11 51.61 11 Process
BiologicalMaterial02 50.34 12 Biological
BiologicalMaterial11 48.53 13 Biological
BiologicalMaterial09 44.76 14 Biological
ManufacturingProcess30 43.47 15 Process
BiologicalMaterial08 40.24 16 Biological
ManufacturingProcess29 38.22 17 Process
ManufacturingProcess33 38.08 18 Process
BiologicalMaterial04 36.92 19 Biological
ManufacturingProcess25 35.83 20 Process
vi_nl_df %>%
  slice(1:10) %>%
  count(Type) %>%
  knitr::kable(caption = "Top 10 predictors: Biological vs Process")
Top 10 predictors: Biological vs Process
Type n
Biological 3
Process 7

Compare to Top 10 from Optimal Linear Model

vi_lm <- varImp(lm_fit, scale = TRUE)

vi_lm_df <- vi_lm$importance %>%
  rownames_to_column("Predictor") %>%
  arrange(desc(Overall)) %>%
  mutate(Rank_Linear = row_number())

# Join top 10 from each
top10_nl  <- vi_nl_df  %>% slice(1:10) %>% select(Predictor, Rank_NL)
top10_lm  <- vi_lm_df  %>% slice(1:10) %>% select(Predictor, Rank_Linear)

comparison <- full_join(top10_nl, top10_lm, by = "Predictor") %>%
  arrange(coalesce(Rank_NL, 99L))

comparison %>%
  knitr::kable(caption = "Top 10 Predictors: Nonlinear vs Linear Model")

(c) Explore Relationships for Unique Predictors

unique_to_nl <- comparison %>%
  filter(is.na(Rank_Linear)) %>%
  pull(Predictor)

cat("Predictors unique to nonlinear top 10:\n")
print(unique_to_nl)

Partial Dependence Plots

For MARS, partial dependence is available directly. For SVM or NNet, use ICEbox or pdp.

library(pdp)

top_predictors <- vi_nl_df %>% slice(1:10) %>% pull(Predictor)

# Plot partial dependence for each top predictor
for (pred in top_predictors[1:4]) {
  p <- partial(best_nl_fit,
               pred.var = pred,
               train    = train_df,
               plot     = FALSE) %>%
    autoplot() +
    labs(title  = paste("Partial Dependence:", pred),
         x      = pred,
         y      = "Predicted Yield") +
    theme_minimal()
  print(p)
}

Scatter Plots: Unique Predictors vs Yield

scatter_preds <- vi_nl_df %>% slice(1:6) %>% pull(Predictor)

train_df %>%
  select(Yield, all_of(scatter_preds)) %>%
  pivot_longer(-Yield, names_to = "Predictor", values_to = "Value") %>%
  ggplot(aes(x = Value, y = Yield)) +
  geom_point(alpha = 0.4, color = "steelblue") +
  geom_smooth(method = "loess", se = TRUE, color = "firebrick") +
  facet_wrap(~Predictor, scales = "free_x", ncol = 3) +
  labs(title = "Top Predictors vs Yield (Nonlinear Model)",
       x     = "Predictor Value",
       y     = "Yield") +
  theme_minimal()

ManufacturingProcess13 and MP17 both drop sharply at higher values. BM03 and BM06 peak in the middle range then taper off. ManufacturingProcess32 has a slight U-shape. None of these are relationships a linear model could capture well, which explains why SVM won.

Conclusion

SVM is the best model. Process variables, especially MP32, are the primary yield drivers, but biological material quality cannot be ignored. The scatter plots reveal that several key predictors have optimal operating windows rather than simple linear relationships.