Homework 8

Setup and Data Preparation

Replicate the same preprocessing steps from Exercise 6.3.

library(tidyverse)
library(caret)
library(AppliedPredictiveModeling)
library(e1071)       # SVM
library(kernlab)     # KSVM
library(earth)       # MARS
library(nnet)        # Neural net
library(randomForest)
library(gbm)
library(Cubist)
library(party)
library(doParallel)


cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)

data(ChemicalManufacturingProcess)

cmp <- ChemicalManufacturingProcess
dim(cmp)

## [1] 176  58

Imputation

# Impute missing values with median via preProcess 
pre_impute <- preProcess(cmp, method = "medianImpute")
cmp_imputed <- predict(pre_impute, cmp)

# Verify no missing values remain
sum(is.na(cmp_imputed))

## [1] 0

Remove Near-Zero Variance Predictors

nzv_cols <- nearZeroVar(cmp_imputed)
if (length(nzv_cols) > 0) {
  cmp_clean <- cmp_imputed[, -nzv_cols]
} else {
  cmp_clean <- cmp_imputed
}

cat("Dimensions after NZV removal:", dim(cmp_clean), "\n")

## Dimensions after NZV removal: 176 57

Train / Test Split

set.seed(42)

y    <- cmp_clean$Yield
X    <- cmp_clean %>% select(-Yield)

train_idx  <- createDataPartition(y, p = 0.80, list = FALSE)

X_train <- X[train_idx, ]
X_test  <- X[-train_idx, ]
y_train <- y[train_idx]
y_test  <- y[-train_idx]

Pre-Processing (Center, Scale)

pre_proc <- preProcess(X_train, method = c("center", "scale"))

X_train_pp <- predict(pre_proc, X_train)
X_test_pp  <- predict(pre_proc, X_test)

train_df <- X_train_pp %>% mutate(Yield = y_train)
test_df  <- X_test_pp  %>% mutate(Yield = y_test)

(a) Train Nonlinear Regression Models

We use 10-fold cross-validation repeated 5 times for resampling.

ctrl <- trainControl(
  method    = "repeatedcv",
  number    = 10,
  repeats   = 5,
  allowParallel = TRUE
)

MARS (Multivariate Adaptive Regression Splines)

set.seed(42)
mars_grid <- expand.grid(degree = 1:2, nprune = seq(2, 30, by = 2))

mars_fit <- train(
  Yield ~ .,
  data      = train_df,
  method    = "earth",
  tuneGrid  = mars_grid,
  trControl = ctrl
)

mars_fit$bestTune

##    nprune degree
## 20     10      2

Support Vector Machine (Radial)

set.seed(42)
svm_fit <- train(
  Yield ~ .,
  data      = train_df,
  method    = "svmRadial",
  tuneLength = 14,
  trControl  = ctrl
)

svm_fit$bestTune

##        sigma  C
## 7 0.01489057 16

K-Nearest Neighbors

set.seed(42)
knn_fit <- train(
  Yield ~ .,
  data      = train_df,
  method    = "knn",
  tuneGrid  = data.frame(k = 1:20),
  trControl = ctrl
)

knn_fit$bestTune

##   k
## 4 4

Neural Network (averaged)

set.seed(42)
nnet_grid <- expand.grid(
  size  = c(3, 5, 7),
  decay = c(0.001, 0.01, 0.1),
  bag   = FALSE
)

nnet_fit <- train(
  Yield ~ .,
  data      = train_df,
  method    = "avNNet",
  tuneGrid  = nnet_grid,
  trControl = ctrl,
  linout    = TRUE,
  trace     = FALSE,
  maxit     = 200,
  repeats   = 3
)

nnet_fit$bestTune

##   size decay   bag
## 1    3 0.001 FALSE

Compare Resampling Performance

resamp <- resamples(list(
  MARS  = mars_fit,
  SVM   = svm_fit,
  KNN   = knn_fit,
  NNet  = nnet_fit
))

summary(resamp)

## 
## Call:
## summary.resamples(object = resamp)
## 
## Models: MARS, SVM, KNN, NNet 
## Number of resamples: 50 
## 
## MAE 
##           Min.   1st Qu.    Median      Mean   3rd Qu.     Max. NA's
## MARS 0.5614058 0.7317273 0.8987451 0.8891809 1.0154430 1.389358    0
## SVM  0.5864215 0.7886887 0.8540282 0.8917068 0.9807905 1.442299    0
## KNN  0.7450000 0.9084524 1.0354464 1.0236367 1.1175833 1.368750    0
## NNet 0.7448082 0.9919217 1.2290316 1.3448509 1.6528606 2.350983    0
## 
## RMSE 
##           Min.   1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## MARS 0.6457921 0.9090172 1.053665 1.108660 1.241799 1.806462    0
## SVM  0.6771665 0.9899905 1.073766 1.094051 1.169537 1.535768    0
## KNN  0.8979373 1.1100770 1.280754 1.259417 1.376238 1.695101    0
## NNet 0.9454561 1.2440237 1.544879 1.754621 2.155358 3.797201    0
## 
## Rsquared 
##            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## MARS 0.31172268 0.5687068 0.6793332 0.6663900 0.7893585 0.9403164    0
## SVM  0.30118304 0.5938421 0.6735069 0.6535080 0.7203748 0.8605378    0
## KNN  0.12698505 0.4227078 0.5659275 0.5313215 0.6474769 0.8226659    0
## NNet 0.01817544 0.2018569 0.4314195 0.3906245 0.5447891 0.7261486    0

bwplot(resamp, metric = "RMSE",
       main = "Resampling RMSE Comparison")

bwplot(resamp, metric = "Rsquared",
       main = "Resampling R-squared Comparison")

MARS and SVM are neck and neck with RMSE around 1.0 and R² near 0.6.

R² box spanning nearly 0 to 0.9 and the outlier past 3 on RMSE tells you it was sensitive to the small sample size. SVM wins on consistency.

Test Set Performance

compute_test_metrics <- function(fit, X_test_pp, y_test) {
  preds <- predict(fit, newdata = X_test_pp)
  postResample(pred = preds, obs = y_test)
}

test_results <- map_dfr(
  list(MARS = mars_fit, SVM = svm_fit, KNN = knn_fit, NNet = nnet_fit),
  compute_test_metrics,
  X_test_pp = X_test_pp,
  y_test    = y_test,
  .id       = "Model"
)

test_results %>%
  arrange(RMSE) %>%
  knitr::kable(digits = 4, caption = "Test Set Performance")

Test Set Performance
Model	RMSE	Rsquared	MAE
SVM	1.2704	0.6180	0.9170
KNN	1.3586	0.5533	0.9645
MARS	1.3790	0.5772	1.0178
NNet	1.6113	0.4328	1.2499

best_model_name <- test_results %>%
  arrange(RMSE) %>%
  slice(1) %>%
  pull(Model)

cat("Best nonlinear model by test RMSE:", best_model_name, "\n")

## Best nonlinear model by test RMSE: SVM

(b) Variable Importance

Identify the optimal nonlinear model and extract variable importance.

# Programmatically select the best fit object
model_list <- list(MARS = mars_fit, SVM = svm_fit, KNN = knn_fit, NNet = nnet_fit)
best_nl_fit <- model_list[[best_model_name]]

Top 20 Predictors - Optimal Nonlinear Model

vi_nl <- varImp(best_nl_fit, scale = TRUE)

plot(vi_nl, top = 20,
     main = paste("Variable Importance -", best_model_name))

The top 5 is process-heavy, but biological variables show up steadily from rank 5 onward.

vi_nl_df <- vi_nl$importance %>%
  rownames_to_column("Predictor") %>%
  arrange(desc(Overall)) %>%
  mutate(Rank_NL = row_number(),
         Type    = if_else(str_detect(Predictor, "^Biological"), "Biological", "Process"))

vi_nl_df %>%
  slice(1:20) %>%
  knitr::kable(digits = 2, caption = paste("Top 20 Predictors -", best_model_name))

Top 20 Predictors - SVM
Predictor	Overall	Rank_NL	Type
ManufacturingProcess32	100.00	1	Process
ManufacturingProcess13	93.82	2	Process
ManufacturingProcess09	89.93	3	Process
ManufacturingProcess17	88.20	4	Process
BiologicalMaterial06	82.61	5	Biological
BiologicalMaterial03	79.44	6	Biological
BiologicalMaterial12	72.36	7	Biological
ManufacturingProcess36	71.91	8	Process
ManufacturingProcess06	68.21	9	Process
ManufacturingProcess31	55.57	10	Process
ManufacturingProcess11	51.61	11	Process
BiologicalMaterial02	50.34	12	Biological
BiologicalMaterial11	48.53	13	Biological
BiologicalMaterial09	44.76	14	Biological
ManufacturingProcess30	43.47	15	Process
BiologicalMaterial08	40.24	16	Biological
ManufacturingProcess29	38.22	17	Process
ManufacturingProcess33	38.08	18	Process
BiologicalMaterial04	36.92	19	Biological
ManufacturingProcess25	35.83	20	Process

vi_nl_df %>%
  slice(1:10) %>%
  count(Type) %>%
  knitr::kable(caption = "Top 10 predictors: Biological vs Process")

Top 10 predictors: Biological vs Process
Type	n
Biological	3
Process	7

Compare to Top 10 from Optimal Linear Model

vi_lm <- varImp(lm_fit, scale = TRUE)

vi_lm_df <- vi_lm$importance %>%
  rownames_to_column("Predictor") %>%
  arrange(desc(Overall)) %>%
  mutate(Rank_Linear = row_number())

# Join top 10 from each
top10_nl  <- vi_nl_df  %>% slice(1:10) %>% select(Predictor, Rank_NL)
top10_lm  <- vi_lm_df  %>% slice(1:10) %>% select(Predictor, Rank_Linear)

comparison <- full_join(top10_nl, top10_lm, by = "Predictor") %>%
  arrange(coalesce(Rank_NL, 99L))

comparison %>%
  knitr::kable(caption = "Top 10 Predictors: Nonlinear vs Linear Model")

(c) Explore Relationships for Unique Predictors

unique_to_nl <- comparison %>%
  filter(is.na(Rank_Linear)) %>%
  pull(Predictor)

cat("Predictors unique to nonlinear top 10:\n")
print(unique_to_nl)

Partial Dependence Plots

For MARS, partial dependence is available directly. For SVM or NNet, use ICEbox or pdp.

library(pdp)

top_predictors <- vi_nl_df %>% slice(1:10) %>% pull(Predictor)

# Plot partial dependence for each top predictor
for (pred in top_predictors[1:4]) {
  p <- partial(best_nl_fit,
               pred.var = pred,
               train    = train_df,
               plot     = FALSE) %>%
    autoplot() +
    labs(title  = paste("Partial Dependence:", pred),
         x      = pred,
         y      = "Predicted Yield") +
    theme_minimal()
  print(p)
}

Scatter Plots: Unique Predictors vs Yield

scatter_preds <- vi_nl_df %>% slice(1:6) %>% pull(Predictor)

train_df %>%
  select(Yield, all_of(scatter_preds)) %>%
  pivot_longer(-Yield, names_to = "Predictor", values_to = "Value") %>%
  ggplot(aes(x = Value, y = Yield)) +
  geom_point(alpha = 0.4, color = "steelblue") +
  geom_smooth(method = "loess", se = TRUE, color = "firebrick") +
  facet_wrap(~Predictor, scales = "free_x", ncol = 3) +
  labs(title = "Top Predictors vs Yield (Nonlinear Model)",
       x     = "Predictor Value",
       y     = "Yield") +
  theme_minimal()

ManufacturingProcess13 and MP17 both drop sharply at higher values. BM03 and BM06 peak in the middle range then taper off. ManufacturingProcess32 has a slight U-shape. None of these are relationships a linear model could capture well, which explains why SVM won.

Conclusion

SVM is the best model. Process variables, especially MP32, are the primary yield drivers, but biological material quality cannot be ignored. The scatter plots reveal that several key predictors have optimal operating windows rather than simple linear relationships.

Homework 8 - 7.5