Replicate the same preprocessing steps from Exercise 6.3.
library(tidyverse)
library(caret)
library(AppliedPredictiveModeling)
library(e1071) # SVM
library(kernlab) # KSVM
library(earth) # MARS
library(nnet) # Neural net
library(randomForest)
library(gbm)
library(Cubist)
library(party)
library(doParallel)
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)
data(ChemicalManufacturingProcess)
cmp <- ChemicalManufacturingProcess
dim(cmp)
## [1] 176 58
# Impute missing values with median via preProcess
pre_impute <- preProcess(cmp, method = "medianImpute")
cmp_imputed <- predict(pre_impute, cmp)
# Verify no missing values remain
sum(is.na(cmp_imputed))
## [1] 0
nzv_cols <- nearZeroVar(cmp_imputed)
if (length(nzv_cols) > 0) {
cmp_clean <- cmp_imputed[, -nzv_cols]
} else {
cmp_clean <- cmp_imputed
}
cat("Dimensions after NZV removal:", dim(cmp_clean), "\n")
## Dimensions after NZV removal: 176 57
set.seed(42)
y <- cmp_clean$Yield
X <- cmp_clean %>% select(-Yield)
train_idx <- createDataPartition(y, p = 0.80, list = FALSE)
X_train <- X[train_idx, ]
X_test <- X[-train_idx, ]
y_train <- y[train_idx]
y_test <- y[-train_idx]
pre_proc <- preProcess(X_train, method = c("center", "scale"))
X_train_pp <- predict(pre_proc, X_train)
X_test_pp <- predict(pre_proc, X_test)
train_df <- X_train_pp %>% mutate(Yield = y_train)
test_df <- X_test_pp %>% mutate(Yield = y_test)
We use 10-fold cross-validation repeated 5 times for resampling.
ctrl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 5,
allowParallel = TRUE
)
set.seed(42)
mars_grid <- expand.grid(degree = 1:2, nprune = seq(2, 30, by = 2))
mars_fit <- train(
Yield ~ .,
data = train_df,
method = "earth",
tuneGrid = mars_grid,
trControl = ctrl
)
mars_fit$bestTune
## nprune degree
## 20 10 2
set.seed(42)
svm_fit <- train(
Yield ~ .,
data = train_df,
method = "svmRadial",
tuneLength = 14,
trControl = ctrl
)
svm_fit$bestTune
## sigma C
## 7 0.01489057 16
set.seed(42)
knn_fit <- train(
Yield ~ .,
data = train_df,
method = "knn",
tuneGrid = data.frame(k = 1:20),
trControl = ctrl
)
knn_fit$bestTune
## k
## 4 4
set.seed(42)
nnet_grid <- expand.grid(
size = c(3, 5, 7),
decay = c(0.001, 0.01, 0.1),
bag = FALSE
)
nnet_fit <- train(
Yield ~ .,
data = train_df,
method = "avNNet",
tuneGrid = nnet_grid,
trControl = ctrl,
linout = TRUE,
trace = FALSE,
maxit = 200,
repeats = 3
)
nnet_fit$bestTune
## size decay bag
## 1 3 0.001 FALSE
resamp <- resamples(list(
MARS = mars_fit,
SVM = svm_fit,
KNN = knn_fit,
NNet = nnet_fit
))
summary(resamp)
##
## Call:
## summary.resamples(object = resamp)
##
## Models: MARS, SVM, KNN, NNet
## Number of resamples: 50
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.5614058 0.7317273 0.8987451 0.8891809 1.0154430 1.389358 0
## SVM 0.5864215 0.7886887 0.8540282 0.8917068 0.9807905 1.442299 0
## KNN 0.7450000 0.9084524 1.0354464 1.0236367 1.1175833 1.368750 0
## NNet 0.7448082 0.9919217 1.2290316 1.3448509 1.6528606 2.350983 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.6457921 0.9090172 1.053665 1.108660 1.241799 1.806462 0
## SVM 0.6771665 0.9899905 1.073766 1.094051 1.169537 1.535768 0
## KNN 0.8979373 1.1100770 1.280754 1.259417 1.376238 1.695101 0
## NNet 0.9454561 1.2440237 1.544879 1.754621 2.155358 3.797201 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.31172268 0.5687068 0.6793332 0.6663900 0.7893585 0.9403164 0
## SVM 0.30118304 0.5938421 0.6735069 0.6535080 0.7203748 0.8605378 0
## KNN 0.12698505 0.4227078 0.5659275 0.5313215 0.6474769 0.8226659 0
## NNet 0.01817544 0.2018569 0.4314195 0.3906245 0.5447891 0.7261486 0
bwplot(resamp, metric = "RMSE",
main = "Resampling RMSE Comparison")
bwplot(resamp, metric = "Rsquared",
main = "Resampling R-squared Comparison")
MARS and SVM are neck and neck with RMSE around 1.0 and R² near
0.6.
R² box spanning nearly 0 to 0.9 and the outlier past 3 on RMSE tells you it was sensitive to the small sample size. SVM wins on consistency.
compute_test_metrics <- function(fit, X_test_pp, y_test) {
preds <- predict(fit, newdata = X_test_pp)
postResample(pred = preds, obs = y_test)
}
test_results <- map_dfr(
list(MARS = mars_fit, SVM = svm_fit, KNN = knn_fit, NNet = nnet_fit),
compute_test_metrics,
X_test_pp = X_test_pp,
y_test = y_test,
.id = "Model"
)
test_results %>%
arrange(RMSE) %>%
knitr::kable(digits = 4, caption = "Test Set Performance")
| Model | RMSE | Rsquared | MAE |
|---|---|---|---|
| SVM | 1.2704 | 0.6180 | 0.9170 |
| KNN | 1.3586 | 0.5533 | 0.9645 |
| MARS | 1.3790 | 0.5772 | 1.0178 |
| NNet | 1.6113 | 0.4328 | 1.2499 |
best_model_name <- test_results %>%
arrange(RMSE) %>%
slice(1) %>%
pull(Model)
cat("Best nonlinear model by test RMSE:", best_model_name, "\n")
## Best nonlinear model by test RMSE: SVM
Identify the optimal nonlinear model and extract variable importance.
# Programmatically select the best fit object
model_list <- list(MARS = mars_fit, SVM = svm_fit, KNN = knn_fit, NNet = nnet_fit)
best_nl_fit <- model_list[[best_model_name]]
vi_nl <- varImp(best_nl_fit, scale = TRUE)
plot(vi_nl, top = 20,
main = paste("Variable Importance -", best_model_name))
The top 5 is process-heavy, but biological variables show up steadily from rank 5 onward.
vi_nl_df <- vi_nl$importance %>%
rownames_to_column("Predictor") %>%
arrange(desc(Overall)) %>%
mutate(Rank_NL = row_number(),
Type = if_else(str_detect(Predictor, "^Biological"), "Biological", "Process"))
vi_nl_df %>%
slice(1:20) %>%
knitr::kable(digits = 2, caption = paste("Top 20 Predictors -", best_model_name))
| Predictor | Overall | Rank_NL | Type |
|---|---|---|---|
| ManufacturingProcess32 | 100.00 | 1 | Process |
| ManufacturingProcess13 | 93.82 | 2 | Process |
| ManufacturingProcess09 | 89.93 | 3 | Process |
| ManufacturingProcess17 | 88.20 | 4 | Process |
| BiologicalMaterial06 | 82.61 | 5 | Biological |
| BiologicalMaterial03 | 79.44 | 6 | Biological |
| BiologicalMaterial12 | 72.36 | 7 | Biological |
| ManufacturingProcess36 | 71.91 | 8 | Process |
| ManufacturingProcess06 | 68.21 | 9 | Process |
| ManufacturingProcess31 | 55.57 | 10 | Process |
| ManufacturingProcess11 | 51.61 | 11 | Process |
| BiologicalMaterial02 | 50.34 | 12 | Biological |
| BiologicalMaterial11 | 48.53 | 13 | Biological |
| BiologicalMaterial09 | 44.76 | 14 | Biological |
| ManufacturingProcess30 | 43.47 | 15 | Process |
| BiologicalMaterial08 | 40.24 | 16 | Biological |
| ManufacturingProcess29 | 38.22 | 17 | Process |
| ManufacturingProcess33 | 38.08 | 18 | Process |
| BiologicalMaterial04 | 36.92 | 19 | Biological |
| ManufacturingProcess25 | 35.83 | 20 | Process |
vi_nl_df %>%
slice(1:10) %>%
count(Type) %>%
knitr::kable(caption = "Top 10 predictors: Biological vs Process")
| Type | n |
|---|---|
| Biological | 3 |
| Process | 7 |
vi_lm <- varImp(lm_fit, scale = TRUE)
vi_lm_df <- vi_lm$importance %>%
rownames_to_column("Predictor") %>%
arrange(desc(Overall)) %>%
mutate(Rank_Linear = row_number())
# Join top 10 from each
top10_nl <- vi_nl_df %>% slice(1:10) %>% select(Predictor, Rank_NL)
top10_lm <- vi_lm_df %>% slice(1:10) %>% select(Predictor, Rank_Linear)
comparison <- full_join(top10_nl, top10_lm, by = "Predictor") %>%
arrange(coalesce(Rank_NL, 99L))
comparison %>%
knitr::kable(caption = "Top 10 Predictors: Nonlinear vs Linear Model")
unique_to_nl <- comparison %>%
filter(is.na(Rank_Linear)) %>%
pull(Predictor)
cat("Predictors unique to nonlinear top 10:\n")
print(unique_to_nl)
For MARS, partial dependence is available directly. For SVM or NNet, use ICEbox or pdp.
library(pdp)
top_predictors <- vi_nl_df %>% slice(1:10) %>% pull(Predictor)
# Plot partial dependence for each top predictor
for (pred in top_predictors[1:4]) {
p <- partial(best_nl_fit,
pred.var = pred,
train = train_df,
plot = FALSE) %>%
autoplot() +
labs(title = paste("Partial Dependence:", pred),
x = pred,
y = "Predicted Yield") +
theme_minimal()
print(p)
}
scatter_preds <- vi_nl_df %>% slice(1:6) %>% pull(Predictor)
train_df %>%
select(Yield, all_of(scatter_preds)) %>%
pivot_longer(-Yield, names_to = "Predictor", values_to = "Value") %>%
ggplot(aes(x = Value, y = Yield)) +
geom_point(alpha = 0.4, color = "steelblue") +
geom_smooth(method = "loess", se = TRUE, color = "firebrick") +
facet_wrap(~Predictor, scales = "free_x", ncol = 3) +
labs(title = "Top Predictors vs Yield (Nonlinear Model)",
x = "Predictor Value",
y = "Yield") +
theme_minimal()
ManufacturingProcess13 and MP17 both drop sharply at higher values. BM03 and BM06 peak in the middle range then taper off. ManufacturingProcess32 has a slight U-shape. None of these are relationships a linear model could capture well, which explains why SVM won.
SVM is the best model. Process variables, especially MP32, are the primary yield drivers, but biological material quality cannot be ignored. The scatter plots reveal that several key predictors have optimal operating windows rather than simple linear relationships.