Packages

library(mlbench)
library(caret)
library(earth)
library(kernlab)
library(nnet)
library(ggplot2)
library(dplyr)
library(tidyr)

Exercise 7.2

Data Generation

set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

cat("Training obs:", nrow(trainingData$x),
    " | Test obs:", nrow(testData$x),
    " | Predictors:", ncol(trainingData$x), "\n")

Training obs: 200 | Test obs: 5000 | Predictors: 10

cat("Response range:", round(range(trainingData$y), 2), "\n")

Response range: 3.56 28.38

The training set (n = 200) is intentionally small so model choice matters. The large test set (n = 5,000) gives a precise estimate of true generalisation error.

Exploratory Look

cor_vals <- sapply(trainingData$x, \(xj) cor(xj, trainingData$y))
cor_df <- data.frame(
  Predictor   = names(cor_vals),
  Correlation = cor_vals,
  Role = ifelse(names(cor_vals) %in% paste0("X", 1:5),
                "Informative", "Noise")
)

ggplot(cor_df, aes(x = reorder(Predictor, abs(Correlation)),
                   y = Correlation, fill = Role)) +
  geom_col(width = 0.6) +
  coord_flip() +
  scale_fill_manual(values = c(Informative = "#2166ac", Noise = "#d73027")) +
  labs(
    title    = "Linear Correlation of Each Predictor with the Response",
    subtitle = "X3 appears weak because its relationship is quadratic, not linear",
    x = NULL, y = "Pearson r", fill = NULL
  ) +
  theme_minimal(base_size = 13)

The correlation chart highlights an important subtlety. X1, X2, X4, and X5 each show modest but visible linear correlations with the response, consistent with their direct contributions in the true model. X3, however, registers near-zero correlation despite being a genuine driver of the outcome. The reason is that its effect is a symmetric U-shape: values of X3 near zero and values near one both push the response upward by roughly equal amounts, so the positive and negative associations cancel out when averaged into a single correlation coefficient. Low correlation does not mean low importance here — it means low linear importance, which is exactly the limitation that motivates using nonlinear models.

Feature Plots

fp_df  <- data.frame(trainingData$x, y = trainingData$y)
fp_long <- pivot_longer(fp_df, cols = -y,
                        names_to = "Predictor", values_to = "x")

noise_vars <- paste0("X", 6:10)
info_vars  <- paste0("X", 1:5)
fp_long$Predictor <- factor(fp_long$Predictor,
                             levels = c(noise_vars, info_vars))

ggplot(fp_long, aes(x = x, y = y)) +
  geom_point(shape = 1, colour = "#4393c3", size = 1.2, alpha = 0.7) +
  facet_wrap(~ Predictor, nrow = 2, dir = "h") +
  labs(title    = "Scatter Plots: Each Predictor vs. Response",
       subtitle = "Top row: noise (X6-X10) | Bottom row: informative (X1-X5)",
       x = "Feature", y = "y") +
  theme_bw(base_size = 12) +
  theme(strip.background = element_rect(fill = "grey90"),
        strip.text = element_text(face = "bold"))

The top row (X6-X10) shows pure scatter with no structure. The bottom row (X1-X5) looks similarly patchy — and that is the central lesson. The signal is real but largely undetectable by eye or linear correlation, which motivates flexible nonlinear models.

Resampling Control

All four models use the same 10-fold cross-validation so results are directly comparable.

ctrl <- trainControl(method = "cv", number = 10, verboseIter = FALSE)

Model 1: K-Nearest Neighbours

KNN requires centring and scaling because it is distance-based.

set.seed(7)
knn_72 <- train(
  x          = trainingData$x,
  y          = trainingData$y,
  method     = "knn",
  preProc    = c("center", "scale"),
  tuneLength = 10,
  trControl  = ctrl
)
.tmp <- knn_72

knitr::kable(
  knn_72$results[, c("k","RMSE","Rsquared","MAE")] |>
    mutate(across(where(is.numeric), \(x) round(x, 3))),
  caption = paste("KNN cross-validation results. Best k =", knn_72$bestTune$k),
  
)

KNN cross-validation results. Best k = 11
k	RMSE	Rsquared	MAE
5	3.141	0.629	2.533
7	3.078	0.656	2.484
9	3.080	0.672	2.488
11	3.016	0.712	2.437
13	3.024	0.723	2.464
15	3.061	0.722	2.479
17	3.059	0.735	2.475
19	3.090	0.735	2.512
21	3.091	0.743	2.509
23	3.138	0.741	2.558

ggplot(knn_72) +
  labs(title = "KNN: CV RMSE vs. Number of Neighbours") +
  theme_minimal()

Model 2: MARS

MARS fits piecewise linear basis functions and prunes them via GCV, giving built-in variable selection. We allow degree-2 (interaction) terms.

set.seed(7)
mars_72 <- train(
  x         = trainingData$x,
  y         = trainingData$y,
  method    = "earth",
  tuneGrid  = expand.grid(degree = 1:2, nprune = 2:20),
  trControl = ctrl
)
.tmp <- mars_72

knitr::kable(
  mars_72$results[, c("degree","nprune","RMSE","Rsquared","MAE")] |>
    mutate(across(where(is.numeric), \(x) round(x, 3))) |>
    arrange(RMSE) |> head(10),
  caption = paste("MARS cross-validation results (top 10 by RMSE). Best: degree =",
                  mars_72$bestTune$degree, ", nprune =", mars_72$bestTune$nprune),
  
)

MARS cross-validation results (top 10 by RMSE). Best: degree = 2 , nprune = 14
degree	nprune	RMSE	Rsquared	MAE
2	14	1.309	0.935	1.038
2	15	1.310	0.936	1.036
2	16	1.325	0.934	1.044
2	17	1.325	0.934	1.047
2	18	1.326	0.934	1.047
2	19	1.326	0.934	1.047
2	20	1.326	0.934	1.047
2	13	1.328	0.932	1.057
2	11	1.332	0.931	1.070
2	12	1.346	0.932	1.078

plot(mars_72, main = "MARS: CV RMSE Across Tuning Grid")

Model 3: Neural Network

Regularised with weight decay; linear output unit for regression.

set.seed(7)
nnet_72 <- train(
  x         = trainingData$x,
  y         = trainingData$y,
  method    = "nnet",
  preProc   = c("center", "scale"),
  tuneGrid  = expand.grid(
    size  = c(3, 5, 7, 10),
    decay = c(0.001, 0.01, 0.1)
  ),
  linout    = TRUE,
  trace     = FALSE,
  maxit     = 500,
  trControl = ctrl
)
.tmp <- nnet_72

knitr::kable(
  nnet_72$results[, c("size","decay","RMSE","Rsquared","MAE")] |>
    mutate(across(where(is.numeric), \(x) round(x, 3))) |>
    arrange(RMSE),
  caption = paste("Neural network cross-validation results. Best: size =",
                  nnet_72$bestTune$size, ", decay =", nnet_72$bestTune$decay),
  
)

Neural network cross-validation results. Best: size = 5 , decay = 0.01
size	decay	RMSE	Rsquared	MAE
5	0.010	2.421	0.765	1.947
3	0.100	2.462	0.745	1.970
3	0.001	2.474	0.754	1.981
3	0.010	2.523	0.751	1.930
5	0.100	2.634	0.738	2.066
5	0.001	2.790	0.715	2.185
7	0.010	2.877	0.719	2.167
7	0.100	2.905	0.684	2.300
10	0.100	3.187	0.654	2.658
7	0.001	3.390	0.621	2.541
10	0.010	3.505	0.603	2.738
10	0.001	3.747	0.552	3.022

Model 4: Support Vector Machine (Radial Basis Kernel)

The Support Vector Machine with a radial basis function kernel finds patterns by measuring similarity between data points rather than fitting explicit curves. The cost parameter controls how strictly the model must fit the training data — lower values allow more errors in exchange for a smoother fit, while higher values push the model to fit every point more closely at the risk of overfitting. The sigma parameter controls how far each training point’s influence reaches — small values mean only very nearby points matter, large values spread influence more broadly across the predictor space.

set.seed(7)
svm_72 <- train(
  x          = trainingData$x,
  y          = trainingData$y,
  method     = "svmRadial",
  preProc    = c("center", "scale"),
  tuneLength = 10,
  trControl  = ctrl
)
.tmp <- svm_72

knitr::kable(
  svm_72$results[, c("C","RMSE","Rsquared","MAE")] |>
    mutate(across(where(is.numeric), \(x) round(x, 3))),
  caption = paste("SVM cross-validation results. Best C =", svm_72$bestTune$C),
  
)

SVM cross-validation results. Best C = 16
C	RMSE	Rsquared	MAE
0.25	2.502	0.803	1.996
0.50	2.234	0.824	1.763
1.00	2.062	0.846	1.612
2.00	1.959	0.862	1.510
4.00	1.910	0.870	1.483
8.00	1.892	0.873	1.480
16.00	1.891	0.873	1.487
32.00	1.891	0.873	1.487
64.00	1.891	0.873	1.487
128.00	1.891	0.873	1.487

plot(svm_72, scales = list(x = list(log = 2)),
     main = "SVM: CV RMSE vs. Cost Parameter")

Test-Set Performance Comparison

models_72 <- list(KNN = knn_72, MARS = mars_72, NNet = nnet_72, SVM = svm_72)

perf_72 <- lapply(models_72, \(m) {
  p <- predict(m, newdata = testData$x)
  as.data.frame(t(postResample(pred = p, obs = testData$y)))
}) |>
  bind_rows(.id = "Model") |>
  mutate(across(where(is.numeric), \(x) round(x, 3))) |>
  arrange(RMSE)

knitr::kable(perf_72, row.names = FALSE,
             caption = "Test-set performance (n = 5,000)",
             )

Test-set performance (n = 5,000)
Model	RMSE	Rsquared	MAE
MARS	1.172	0.945	0.932
SVM	2.073	0.826	1.574
NNet	2.501	0.766	1.897
KNN	3.122	0.669	2.496

ggplot(perf_72, aes(x = reorder(Model, RMSE), y = RMSE, fill = Model)) +
  geom_col(width = 0.55, show.legend = FALSE) +
  geom_text(aes(label = RMSE), vjust = -0.4, fontface = "bold", size = 4.5) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title    = "Test-Set RMSE by Model",
    subtitle = "Lower is better; noise floor near 1 (the simulation noise SD)",
    x = NULL, y = "RMSE"
  ) +
  theme_minimal(base_size = 13)

Part (a): Which Models Give the Best Performance?

MARS achieves the lowest test RMSE (1.172), followed by SVM (2.073), NNet (2.501), and KNN (3.122).

MARS wins for structural reasons: the true data-generating function is a sum of smooth low-dimensional terms (a sine interaction, a quadratic, two linears), exactly the kind of structure that MARS’s piecewise polynomial basis represents efficiently. The best-tuned model used degree = 2 (allowing interactions between predictors) and 14 pruned terms.

SVM is the second-best model. Its RBF kernel can represent arbitrary smooth functions, but unlike MARS it has no mechanism for discarding noise predictors, so the five uninformative dimensions add estimation variance.

KNN performs worst. Euclidean distance is computed across all 10 predictors; the five noise dimensions dilute the signal from the informative ones, so even the optimal k gives a poor bias-variance trade-off.

Part (b): Does MARS Select the Informative Predictors?

vi_72 <- varImp(mars_72)$importance
vi_72$Predictor <- rownames(vi_72)

# varImp drops zero-importance predictors — rebuild with all 10 explicitly
all_preds <- data.frame(Predictor = paste0("X", 1:10), Overall = 0)
all_preds$Overall <- ifelse(
  all_preds$Predictor %in% vi_72$Predictor,
  vi_72$Overall[match(all_preds$Predictor, vi_72$Predictor)], 0
)
all_preds$Role <- ifelse(all_preds$Predictor %in% paste0("X", 1:5),
                         "Informative (X1-X5)", "Noise (X6-X10)")

# Hardcode exact order: noise group at bottom, informative sorted by importance
desired_order <- c("X6","X7","X8","X9","X10","X3","X5","X2","X4","X1")
all_preds$Predictor <- factor(all_preds$Predictor, levels = desired_order)

ggplot(all_preds, aes(x = Predictor, y = Overall, fill = Role)) +
  geom_col(width = 0.6) +
  coord_flip() +
  scale_fill_manual(
    values = c("Informative (X1-X5)" = "#2166ac",
               "Noise (X6-X10)"      = "#d73027")
  ) +
  labs(
    title    = "MARS Correctly Identifies Informative Predictors and Discards Noise",
    subtitle = "All five noise predictors (X6-X10) receive exactly zero importance",
    x = NULL, y = "Importance (scaled 0-100)", fill = NULL
  ) +
  theme_minimal(base_size = 13)

vi_72 <- arrange(all_preds, desc(Overall))

vi_table <- vi_72 |>
  select(Predictor, Type = Role, Importance = Overall) |>
  mutate(
    Type       = ifelse(grepl("Informative", Type), "Informative", "Noise"),
    Importance = round(Importance, 1)
  )

knitr::kable(vi_table, row.names = FALSE,
             caption = "MARS variable importance for the Friedman data",
             )

MARS variable importance for the Friedman data
Predictor	Type	Importance
X1	Informative	100.0
X4	Informative	75.2
X2	Informative	48.7
X5	Informative	15.5
X3	Informative	0.0
X6	Noise	0.0
X7	Noise	0.0
X8	Noise	0.0
X9	Noise	0.0
X10	Noise	0.0

MARS perfectly separates signal from noise. All five noise predictors receive zero importance, meaning MARS’s pruning algorithm found no basis functions involving X6 through X10 that improved the model enough to retain — they were discarded entirely.

Among the informative predictors, the ranking is intuitive. X1 scores highest because it appears in the dominant sine interaction term, and its marginal contribution is amplified by the fact that MARS must build multiple hinge functions to approximate a sine wave. X4 ranks second with 75%, consistent with its linear coefficient of 10. X2 ranks third despite also appearing in the sine interaction — the asymmetry between X1 and X2 arises because MARS builds interactions sequentially and attributes more credit to the first variable encountered. X5 ranks fourth at 16%, reflecting its smaller linear coefficient of 5.

X3 is the most instructive case. It contributes a genuine quadratic effect to the response, yet receives zero importance here. The reason is that its term is centred at 0.5 and opens upward symmetrically, which means its influence on the response looks flat when viewed marginally. With only 200 training observations, the GCV criterion did not find enough evidence to retain a basis function for it. This is not a failure of MARS — it is a reminder that variable importance scores reflect what the model learned from this particular sample, not the ground truth, and that small datasets can cause even real effects to go undetected.

Exercise 7.5

Problem Statement

Exercise 6.3 describes a pharmaceutical chemical manufacturing process: 176 production runs, 57 predictors (12 biological material measurements + 45 process parameters), response = percent product yield. A 1% yield improvement is worth approximately $100,000 per batch.

Tasks (using the same imputation, split, and pre-processing as Exercise 6.3):

(a) Which nonlinear model gives the best resampling and test-set performance? (b) Which predictors are most important? Compare to the linear model’s top 10. (c) For predictors unique to the nonlinear model, explore their relationship with yield.

Data Loading and Imputation

library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)

yield75 <- ChemicalManufacturingProcess$Yield
preds75 <- ChemicalManufacturingProcess[, -1]

cat("Dimensions:", nrow(ChemicalManufacturingProcess), "x",
    ncol(ChemicalManufacturingProcess), "\n")

Dimensions: 176 x 58

cat("Missing values in predictors:", sum(is.na(preds75)), "\n")

Missing values in predictors: 106

cat("Yield -- mean:", round(mean(yield75), 2),
    "  sd:", round(sd(yield75), 2),
    "  range:", round(range(yield75), 2), "\n")

Yield – mean: 40.18 sd: 1.85 range: 35.25 46.34

na_df <- data.frame(
  Predictor = names(preds75),
  Missing   = colSums(is.na(preds75))
) |> filter(Missing > 0) |> arrange(desc(Missing))

if (nrow(na_df) > 0) {
  ggplot(na_df, aes(x = reorder(Predictor, Missing), y = Missing)) +
    geom_col(fill = "#d73027", width = 0.6) +
    coord_flip() +
    labs(title    = "Predictors with Missing Values",
         subtitle = "All missing values are in process predictors — consistent with sensor gaps",
         x = NULL, y = "Count of missing cells") +
    theme_minimal(base_size = 11) +
    theme(axis.text.y = element_text(size = 8))
}

pp_impute <- preProcess(preds75, method = "medianImpute")
preds_imp <- predict(pp_impute, preds75)
cat("Missing values after imputation:", sum(is.na(preds_imp)), "\n")

Missing values after imputation: 0

Train / Test Split

set.seed(100)
trainIdx <- createDataPartition(yield75, p = 0.8, list = FALSE)

trX <- preds_imp[ trainIdx, ];  teX <- preds_imp[-trainIdx, ]
trY <- yield75[trainIdx];       teY <- yield75[-trainIdx]

cat("Training:", nrow(trX), " | Test:", nrow(teX), "\n")

Training: 144 | Test: 32

Resampling Control

ctrl75 <- trainControl(method = "cv", number = 10, verboseIter = FALSE)

Model 1: KNN

set.seed(7)
knn_75 <- train(trX, trY,
                method     = "knn",
                preProc    = c("center", "scale"),
                tuneLength = 10,
                trControl  = ctrl75)
.tmp <- knn_75

knitr::kable(
  knn_75$results[, c("k","RMSE","Rsquared","MAE")] |>
    mutate(across(where(is.numeric), \(x) round(x, 3))),
  caption = paste("KNN cross-validation results. Best k =", knn_75$bestTune$k),
  
)

KNN cross-validation results. Best k = 5
k	RMSE	Rsquared	MAE
5	1.392	0.489	1.132
7	1.403	0.479	1.151
9	1.434	0.451	1.185
11	1.419	0.471	1.182
13	1.426	0.474	1.168
15	1.449	0.461	1.191
17	1.446	0.471	1.196
19	1.453	0.468	1.191
21	1.464	0.469	1.191
23	1.474	0.463	1.198

Model 2: MARS

set.seed(7)
mars_75 <- train(trX, trY,
                 method    = "earth",
                 tuneGrid  = expand.grid(degree = 1:2, nprune = 2:20),
                 trControl = ctrl75)
.tmp <- mars_75

knitr::kable(
  mars_75$results[, c("degree","nprune","RMSE","Rsquared","MAE")] |>
    mutate(across(where(is.numeric), \(x) round(x, 3))) |>
    arrange(RMSE) |> head(10),
  caption = paste("MARS cross-validation results (top 10 by RMSE). Best: degree =",
                  mars_75$bestTune$degree, ", nprune =", mars_75$bestTune$nprune),
  
)

MARS cross-validation results (top 10 by RMSE). Best: degree = 1 , nprune = 5
degree	nprune	RMSE	Rsquared	MAE
1	5	1.198	0.612	0.975
2	4	1.203	0.626	0.996
1	4	1.210	0.607	0.998
1	3	1.218	0.602	0.978
1	7	1.222	0.588	0.988
1	6	1.225	0.585	0.997
2	6	1.227	0.597	0.997
1	8	1.245	0.589	1.006
2	5	1.256	0.588	1.017
1	9	1.268	0.564	1.029

Model 3: Neural Network

set.seed(7)
nnet_75 <- train(trX, trY,
                 method    = "nnet",
                 preProc   = c("center", "scale"),
                 tuneGrid  = expand.grid(
                   size  = c(3, 5, 7),
                   decay = c(0.001, 0.01, 0.1)
                 ),
                 linout    = TRUE,
                 trace     = FALSE,
                 maxit     = 500,
                 trControl = ctrl75)
.tmp <- nnet_75

knitr::kable(
  nnet_75$results[, c("size","decay","RMSE","Rsquared","MAE")] |>
    mutate(across(where(is.numeric), \(x) round(x, 3))) |>
    arrange(RMSE),
  caption = paste("Neural network cross-validation results. Best: size =",
                  nnet_75$bestTune$size, ", decay =", nnet_75$bestTune$decay),
  
)

Neural network cross-validation results. Best: size = 3 , decay = 0.01
size	decay	RMSE	Rsquared	MAE
3	0.010	2.079	0.430	1.655
3	0.001	2.230	0.382	1.745
5	0.100	2.443	0.365	1.844
5	0.001	2.473	0.360	1.989
7	0.010	2.540	0.225	2.045
5	0.010	2.558	0.226	1.928
7	0.100	2.905	0.312	2.128
3	0.100	3.193	0.193	2.269
7	0.001	3.850	0.271	2.979

Model 4: SVM

set.seed(7)
svm_75 <- train(trX, trY,
                method     = "svmRadial",
                preProc    = c("center", "scale"),
                tuneLength = 10,
                trControl  = ctrl75)
.tmp <- svm_75

knitr::kable(
  svm_75$results[, c("C","RMSE","Rsquared","MAE")] |>
    mutate(across(where(is.numeric), \(x) round(x, 3))),
  caption = paste("SVM cross-validation results. Best C =", svm_75$bestTune$C),
  
)

SVM cross-validation results. Best C = 8
C	RMSE	Rsquared	MAE
0.25	1.473	0.478	1.184
0.50	1.342	0.540	1.084
1.00	1.257	0.595	1.014
2.00	1.214	0.613	0.979
4.00	1.203	0.622	0.970
8.00	1.171	0.640	0.938
16.00	1.171	0.640	0.938
32.00	1.171	0.640	0.938
64.00	1.171	0.640	0.938
128.00	1.171	0.640	0.938

Part (a): Best Resampling and Test-Set Performance

models_75 <- list(KNN = knn_75, MARS = mars_75, NNet = nnet_75, SVM = svm_75)

perf_75 <- lapply(models_75, \(m) {
  p <- predict(m, newdata = teX)
  as.data.frame(t(postResample(pred = p, obs = teY)))
}) |>
  bind_rows(.id = "Model") |>
  mutate(across(where(is.numeric), \(x) round(x, 3))) |>
  arrange(RMSE)

knitr::kable(perf_75, row.names = FALSE,
             caption = "Test-set performance: Chemical Manufacturing Process",
             )

Test-set performance: Chemical Manufacturing Process
Model	RMSE	Rsquared	MAE
SVM	1.053	0.599	0.835
MARS	1.097	0.563	0.850
KNN	1.196	0.486	0.943
NNet	2.117	0.237	1.576

resamps <- resamples(models_75)
bwplot(resamps, metric = "RMSE",
       main = "10-Fold CV RMSE Distribution by Model")

ggplot(perf_75, aes(x = reorder(Model, RMSE), y = RMSE, fill = Model)) +
  geom_col(width = 0.55, show.legend = FALSE) +
  geom_text(aes(label = RMSE), vjust = -0.4, fontface = "bold", size = 4.5) +
  scale_fill_brewer(palette = "Set1") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title    = "Test-Set RMSE by Model",
    subtitle = "Chemical Manufacturing Process data",
    x = NULL, y = "RMSE"
  ) +
  theme_minimal(base_size = 13)

SVM edges out MARS on the test set (RMSE 1.053 vs. 1.097), though the margin is narrow and both models perform well — R² above 0.56 on a response with a range of only 5.6 percentage points is meaningful predictive accuracy. KNN is competitive at 1.196, which is surprising given its poor CV performance and suggests the test split happened to favor it. The neural network is clearly the weakest model at 2.117, more than double the error of the top two.

The SVM result is worth examining carefully. Its CV RMSE of 1.171 was the best of any model, and it held that advantage on the test set — suggesting genuine generalisation rather than luck of the split. The RBF kernel’s ability to model smooth, nonlinear surfaces fits this data well: manufacturing process variables tend to have continuous, well-behaved relationships with yield rather than the sharp discontinuities that would favor MARS hinge functions.

MARS finishing second rather than first is consistent with the CV results, where degree = 1 was optimal. When MARS is constrained to additive main effects without interactions, it loses some of its structural advantage over SVM. The 57-predictor space also gives MARS fewer opportunities to find the clean piecewise patterns that made it dominant in the 10-predictor Friedman simulation.

The neural network’s poor performance is unsurprising and mirrors what the CV results already showed: RMSE ranging from 2.08 to 3.85 across the tuning grid signals that the model is highly sensitive to initialisation and hyperparameter choice, and 144 observations is simply too few to train a stable network with 57 correlated inputs even with weight decay.

Part (b): Variable Importance — Nonlinear vs. Linear Model

set.seed(7)
lm_75 <- train(trX, trY,
               method    = "lm",
               preProc   = c("center", "scale"),
               trControl = ctrl75)
.tmp <- lm_75

get_top10 <- function(model, label) {
  vi <- varImp(model)$importance
  vi$Predictor <- rownames(vi)
  vi$Model <- label
  vi$Type  <- ifelse(grepl("^Biological", vi$Predictor),
                     "Biological", "Manufacturing Process")
  out <- vi |> arrange(desc(Overall)) |> head(10)
  out$Rank <- seq_len(nrow(out))
  out
}

# For MARS pull from full importance list to always get 10 entries
full_mars_vi <- varImp(mars_75)$importance
full_mars_vi$Predictor <- rownames(full_mars_vi)
full_mars_vi$Model <- "MARS"
full_mars_vi$Type  <- ifelse(grepl("^Biological", full_mars_vi$Predictor),
                              "Biological", "Manufacturing Process")
full_mars_vi <- full_mars_vi |> arrange(desc(Overall)) |> head(10)
full_mars_vi$Rank <- seq_len(nrow(full_mars_vi))
top10_mars <- full_mars_vi
top10_lm   <- get_top10(lm_75, "Linear")

bind_rows(top10_mars, top10_lm) |>
  ggplot(aes(x = reorder(Predictor, Overall), y = Overall, fill = Type)) +
  geom_col(width = 0.65) +
  coord_flip() +
  facet_wrap(~Model, scales = "free") +
  scale_fill_manual(
    values = c("Biological"           = "#4dac26",
               "Manufacturing Process" = "#d01c8b")
  ) +
  labs(
    title = "Top 10 Variable Importances: MARS vs. Linear Model",
    x = NULL, y = "Importance (scaled 0-100)", fill = NULL
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

top10_mars_names <- top10_mars$Predictor
top10_lm_names   <- top10_lm$Predictor

shared        <- intersect(top10_mars_names, top10_lm_names)
unique_mars   <- setdiff(top10_mars_names,   top10_lm_names)
unique_linear <- setdiff(top10_lm_names,     top10_mars_names)

cat("Shared by both top-10 lists:\n ",
    paste(shared, collapse = "\n  "), "\n\n")

Shared by both top-10 lists: ManufacturingProcess32 ManufacturingProcess13

cat("Unique to MARS top-10:\n ",
    paste(unique_mars, collapse = "\n  "), "\n\n")

Unique to MARS top-10: ManufacturingProcess09

cat("Unique to linear top-10:\n ",
    paste(unique_linear, collapse = "\n  "), "\n")

Unique to linear top-10: ManufacturingProcess33 ManufacturingProcess28 ManufacturingProcess37 ManufacturingProcess07 BiologicalMaterial05 ManufacturingProcess04 ManufacturingProcess16 BiologicalMaterial11

for (nm in c("MARS", "Linear")) {
  top10 <- if (nm == "MARS") top10_mars else top10_lm
  cat(nm, "-- Biological:", sum(top10$Type == "Biological"),
      " | Process:", sum(top10$Type == "Manufacturing Process"), "\n")
}

MARS – Biological: 0 | Process: 3 Linear – Biological: 2 | Process: 8

Key findings for Part (b):

Both models agree that ManufacturingProcess32 and ManufacturingProcess13 are important drivers of yield — their presence in both top-10 lists gives high confidence these are genuine signal rather than model-specific artifacts. ManufacturingProcess32 is the dominant predictor in both, consistent with Chapter 6’s PLS results where it ranked first by a clear margin.

The two models diverge substantially beyond that shared core. The linear model spreads importance across eight process variables and two biological predictors (BiologicalMaterial05 and BiologicalMaterial11), reflecting its tendency to distribute credit across correlated predictors. MARS, by contrast, produces a strikingly parsimonious solution — just three predictors, all process variables — with ManufacturingProcess09 appearing in MARS but not the linear model’s top 10. This concentration of importance is a feature of MARS’s basis-function pruning: it retains only the predictors that contribute meaningfully to reducing prediction error, discarding everything else.

The absence of biological predictors from MARS’s top 10 does not mean raw material quality is irrelevant. It means that on this dataset, process parameters explain yield variation more efficiently in a piecewise linear framework. The two biological predictors the linear model identifies (BiologicalMaterial05 and BiologicalMaterial11) likely have gradual, linear relationships with yield that MARS absorbed into its intercept or found too weak to justify a hinge function.

The practical implication remains the same: process predictors are controllable, biological ones are not. The convergence of both models on ManufacturingProcess32 makes it the highest-priority target for process optimisation.

Part (c): Relationships Unique to the MARS Model

Predictors that appear in MARS’s top 10 but not in the linear model’s top 10 are the most likely candidates for nonlinear behaviour — that is precisely why MARS found them important while linear regression did not.

n_unique <- length(unique_mars)
n_cols   <- min(n_unique, 2)
n_rows   <- ceiling(n_unique / n_cols)
par(mfrow = c(max(1, n_rows), max(1, n_cols)),
    mar = c(4.5, 4.5, 3.5, 1.5), mgp = c(2.5, 0.8, 0))

for (v in unique_mars) {
  xv     <- trX[[v]]
  df_tmp <- data.frame(x = xv, y = trY)
  lo     <- loess(y ~ x, data = df_tmp, span = 0.8)
  xgr    <- seq(min(xv), max(xv), length.out = 120)
  yhat   <- predict(lo, newdata = data.frame(x = xgr))
  ok     <- !is.na(yhat)

  plot(xv, trY,
       pch = 16, col = "#2166ac55", cex = 0.9,
       xlab = v, ylab = "Yield (%)", main = v)
  lines(xgr[ok], yhat[ok], col = "#d73027", lwd = 2.5)
  abline(h = mean(trY), lty = 2, col = "grey50", lwd = 1)
  legend("topright", legend = c("Loess fit", "Mean yield"),
         col = c("#d73027","grey50"), lty = 1:2, bty = "n", cex = 0.85)
}

par(mfrow = c(1, 1))

Interpretation:

The loess smoother (red) reveals the nature of the relationship that the linear model captured only partially.

ManufacturingProcess09 shows a positive, approximately monotone relationship with yield — runs where MP09 is higher tend to produce higher yield. The loess curve is roughly linear across most of the range but flattens slightly at higher values, suggesting diminishing returns beyond a certain point. This mild nonlinearity is enough for MARS to detect and model with a hinge function, but subtle enough that the linear model assigns it moderate importance rather than elevating it to the top 10.

The practical implication is that increasing ManufacturingProcess09 toward the upper end of its observed range is associated with above-average yield, but the returns diminish beyond that point. Engineers should target this range without pushing the parameter to extremes where the relationship flattens. At approximately $100,000 per 1% yield gain, even a modest upward shift in ManufacturingProcess09 toward its optimal operating range could produce meaningful financial return.

sessionInfo()

R version 4.5.0 (2025-04-11) Platform: aarch64-apple-darwin20 Running under: macOS Sonoma 14.3

Matrix products: default BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1

locale: [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York tzcode source: internal

attached base packages: [1] stats graphics grDevices utils datasets methods base

other attached packages: [1] AppliedPredictiveModeling_1.1-7 tidyr_1.3.1
[3] dplyr_1.1.4 nnet_7.3-20
[5] kernlab_0.9-33 earth_5.3.5
[7] plotmo_3.7.0 plotrix_3.8-14
[9] Formula_1.2-5 caret_7.0-1
[11] lattice_0.22-6 ggplot2_4.0.0.9000
[13] mlbench_2.1-7

loaded via a namespace (and not attached): [1] tidyselect_1.2.1 timeDate_4052.112 farver_2.1.2
[4] S7_0.2.0 fastmap_1.2.0 pROC_1.19.0.1
[7] digest_0.6.37 rpart_4.1.24 timechange_0.3.0
[10] lifecycle_1.0.4 cluster_2.1.8.1 survival_3.8-3
[13] magrittr_2.0.4 compiler_4.5.0 rlang_1.1.6
[16] sass_0.4.10 tools_4.5.0 yaml_2.3.10
[19] data.table_1.17.8 knitr_1.50 labeling_0.4.3
[22] plyr_1.8.9 RColorBrewer_1.1-3 withr_3.0.2
[25] purrr_1.1.0 grid_4.5.0 stats4_4.5.0
[28] future_1.68.0 globals_0.18.0 scales_1.4.0
[31] iterators_1.0.14 MASS_7.3-65 ellipse_0.5.0
[34] cli_3.6.5 rmarkdown_2.30 generics_0.1.4
[37] rstudioapi_0.17.1 future.apply_1.20.1 reshape2_1.4.4
[40] cachem_1.1.0 stringr_1.5.2 splines_4.5.0
[43] parallel_4.5.0 vctrs_0.6.5 hardhat_1.4.2
[46] Matrix_1.7-3 jsonlite_2.0.0 listenv_0.10.0
[49] foreach_1.5.2 gower_1.0.2 jquerylib_0.1.4
[52] recipes_1.3.1 glue_1.8.0 parallelly_1.46.0
[55] codetools_0.2-20 lubridate_1.9.4 stringi_1.8.7
[58] gtable_0.3.6 rpart.plot_3.1.4 tibble_3.3.0
[61] CORElearn_1.57.3.1 pillar_1.11.1 htmltools_0.5.8.1
[64] ipred_0.9-15 lava_1.8.2 R6_2.6.1
[67] evaluate_1.0.5 bslib_0.9.0 class_7.3-23
[70] Rcpp_1.1.0 nlme_3.1-168 prodlim_2025.04.28
[73] xfun_0.53 ModelMetrics_1.2.2.2 pkgconfig_2.0.3

Nonliner Regression Model

Chapter 7 Homework Problems

Kerwin Charles

2026-04-20

Packages

Exercise 7.2

Data Generation

Exploratory Look

Feature Plots

Resampling Control

Model 1: K-Nearest Neighbours

Model 2: MARS

Model 3: Neural Network

Model 4: Support Vector Machine (Radial Basis Kernel)

Test-Set Performance Comparison

Part (a): Which Models Give the Best Performance?

Part (b): Does MARS Select the Informative Predictors?

Exercise 7.5

Problem Statement

Data Loading and Imputation

Train / Test Split

Resampling Control

Model 1: KNN

Model 2: MARS

Model 3: Neural Network

Model 4: SVM

Part (a): Best Resampling and Test-Set Performance

Part (b): Variable Importance — Nonlinear vs. Linear Model

Part (c): Relationships Unique to the MARS Model