library(mlbench)
library(caret)
library(earth)
set.seed(200)
## mlbench.friedman1 creates a list with a vector 'y' and a
## matrix of predictors 'x':
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame.
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## Also simulate a large test set to estimate the true
## error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
knnModel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
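The resampled RMSE profile across k can also be inspected visually; a minimal sketch using caret's plot method for train objects:
# Plot resampled RMSE against the number of neighbors k
plot(knnModel)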
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
MARS model
# Tune MARS model
marsModel <- train(x = trainingData$x,
y = trainingData$y,
method = "earth",
tuneLength = 10,
preProc = c("center", "scale"))
marsModel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 4.383438 0.2405683 3.597961
## 3 3.645469 0.4745962 2.930453
## 4 2.727602 0.7035031 2.184240
## 6 2.331605 0.7835496 1.833420
## 7 1.976830 0.8421599 1.562591
## 9 1.804342 0.8683110 1.410395
## 10 1.787676 0.8711960 1.386944
## 12 1.821005 0.8670619 1.419893
## 13 1.858688 0.8617344 1.445459
## 15 1.871033 0.8607099 1.457618
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 1.
# Check MARS variable importance
varImp(marsModel)
## earth variable importance
##
## Overall
## X1 100.00
## X4 82.78
## X2 64.18
## X5 40.21
## X3 28.14
## X6 0.00
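Note that caret's default grid held degree constant at 1, so only additive terms were considered. As a sketch, second-order interactions could be explored with an explicit grid, mirroring the approach used later for the chemical manufacturing data; marsGrid and marsModel2 are illustrative names:
# Sketch: tune both the number of retained terms and the interaction degree
marsGrid <- expand.grid(degree = 1:2, nprune = 2:20)
marsModel2 <- train(x = trainingData$x,
                    y = trainingData$y,
                    method = "earth",
                    tuneGrid = marsGrid,
                    preProc = c("center", "scale"))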
Random Forest
# Tune Random Forest
rfModel <- train(x = trainingData$x,
y = trainingData$y,
method = "rf",
tuneLength = 10,
preProc = c("center", "scale"))
## note: only 9 unique complexity parameters in default grid. Truncating the grid to 9 .
rfModel
## Random Forest
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 2.921880 0.7820552 2.427825
## 3 2.714851 0.7854612 2.252894
## 4 2.634192 0.7752214 2.184914
## 5 2.602575 0.7647143 2.149024
## 6 2.596046 0.7553125 2.139150
## 7 2.603531 0.7450543 2.144592
## 8 2.620741 0.7360489 2.152775
## 9 2.631700 0.7298234 2.157534
## 10 2.646274 0.7238240 2.168685
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 6.
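As a quick check (a sketch, not run here), the random forest importance ranking can be compared against the MARS ranking above:
# Compare the random forest importance ranking with the MARS ranking
varImp(rfModel)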
# Compare all models
results <- resamples(list(knn=knnModel, mars=marsModel, rf=rfModel))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: knn, mars, rf
## Number of resamples: 25
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## knn 2.049281 2.507832 2.614543 2.567787 2.715750 2.855385 0
## mars 1.148826 1.285676 1.389701 1.386944 1.505222 1.607913 0
## rf 1.590189 1.961529 2.127290 2.139150 2.296508 2.504643 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## knn 2.677712 3.076907 3.205878 3.183130 3.358475 3.505082 0
## mars 1.471054 1.653905 1.813675 1.787676 1.899500 2.241137 0
## rf 2.037460 2.383600 2.603670 2.596046 2.794775 2.992339 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## knn 0.4872237 0.6024717 0.6407126 0.6425367 0.6733349 0.7549410 0
## mars 0.8179101 0.8520345 0.8715922 0.8711960 0.8971392 0.9214815 0
## rf 0.5084187 0.7439288 0.7639507 0.7553125 0.7895671 0.8627742 0
bwplot(results)
Based on the bootstrap resampling results (25 resamples), MARS (Multivariate Adaptive Regression Splines) outperforms both Random Forest (RF) and k-NN on Friedman's benchmark dataset, achieving the lowest prediction errors (mean MAE = 1.39, RMSE = 1.79) and the highest explained variance (R^2 = 0.87). MARS excels because it directly models the nonlinear, additive structure of the simulation, while RF (mean R^2 = 0.76) and k-NN (mean R^2 = 0.64) are less precise. Additionally, MARS correctly prioritizes the informative predictors (X1–X5) over the noise variables (X6–X10), as confirmed by the variable importance analysis. For this problem, MARS is the best choice for both accuracy and interpretability, followed by RF, while k-NN trails substantially.
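The resampling ranking can be confirmed on the large simulated test set, just as was done for k-NN above; a minimal sketch (marsTestPred and rfTestPred are illustrative names):
# Held-out test performance for MARS and random forest
marsTestPred <- predict(marsModel, newdata = testData$x)
rfTestPred <- predict(rfModel, newdata = testData$x)
postResample(pred = marsTestPred, obs = testData$y)
postResample(pred = rfTestPred, obs = testData$y)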
# Load libraries
library(AppliedPredictiveModeling)
library(caret)
library(gbm)
library(earth) # For MARS
library(randomForest) # For Random Forest
library(kernlab) # For SVM
library(glmnet) # For Ridge/Lasso (comparison)
# Load and impute data (same as 6.3)
data(ChemicalManufacturingProcess)
processed <- preProcess(ChemicalManufacturingProcess,
method = c("knnImpute", "center", "scale", "nzv"))
imputedData <- predict(processed, ChemicalManufacturingProcess)
# Split data (80% train, 20% test)
set.seed(1225)
trainIndex <- createDataPartition(imputedData$Yield, p = 0.8, list = FALSE)
trainData <- imputedData[trainIndex, ]
testData <- imputedData[-trainIndex, ]
# Prepare x/y for models
x_train <- model.matrix(Yield ~ ., trainData)[, -1] # Remove intercept
y_train <- trainData$Yield
x_test <- model.matrix(Yield ~ ., testData)[, -1]
y_test <- testData$Yield
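Before fitting, it is worth a quick sanity check that the imputation left no missing values and that the train and test matrices have matching columns; a minimal sketch:
# Sanity checks: matching columns, no remaining NAs
dim(x_train); dim(x_test)
stopifnot(identical(colnames(x_train), colnames(x_test)), !anyNA(x_train))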
MARS
set.seed(111)
marsModel <- train(
x = x_train, y = y_train,
method = "earth",
tuneGrid = expand.grid(degree = 1:2, nprune = 10:20), # Tune interaction depth & terms
trControl = trainControl(method = "cv", number = 10),
metric = "Rsquared"
)
# Predict and evaluate
marsPred <- predict(marsModel, x_test)
postResample(marsPred, y_test) # Test RMSE, R²
## RMSE Rsquared MAE
## 0.5904763 0.7159729 0.4774057
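A quick visual diagnostic (a sketch) is to plot observed against predicted yield on the test set:
# Observed vs. predicted yield; points near the 45-degree line
# indicate accurate predictions
plot(y_test, marsPred, xlab = "Observed Yield", ylab = "Predicted Yield")
abline(0, 1, col = "red")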
Random Forest
set.seed(222)
rfModel <- train(
x = x_train, y = y_train,
method = "rf",
tuneLength = 10, # Auto-tune mtry (number of variables per split)
trControl = trainControl(method = "cv", number = 10),
metric = "Rsquared"
)
# Predict and evaluate
rfPred <- predict(rfModel, x_test)
postResample(rfPred, y_test)
## RMSE Rsquared MAE
## 0.7390801 0.6233859 0.5623155
SVM with Radial Kernel
set.seed(333)
svmModel <- train(
x = x_train, y = y_train,
method = "svmRadial",
tuneLength = 10, # Auto-tune sigma and C
trControl = trainControl(method = "cv", number = 10),
metric = "Rsquared"
)
# Predict and evaluate
svmPred <- predict(svmModel, x_test)
postResample(svmPred, y_test)
## RMSE Rsquared MAE
## 0.7091104 0.6057279 0.5225374
Gradient Boosting Machines
set.seed(444)
gbmModel <- train(
x = x_train, y = y_train,
method = "gbm",
tuneLength = 10, # Auto-tune interaction depth, shrinkage, n.trees
trControl = trainControl(method = "cv", number = 10),
metric = "Rsquared",
verbose = FALSE
)
# Predict and evaluate
gbmPred <- predict(gbmModel, x_test)
postResample(gbmPred, y_test)
## RMSE Rsquared MAE
## 0.6402972 0.6920954 0.4432237
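The GBM grid tunes n.trees, interaction.depth, and shrinkage simultaneously; the winning combination and the tuning profile can be inspected (a sketch):
# Inspect the selected GBM hyperparameters and the tuning profile
gbmModel$bestTune
plot(gbmModel)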
# Collect all models
models <- list(
MARS = marsModel,
RandomForest = rfModel,
SVM = svmModel,
GBM = gbmModel
)
# Resample results
resamples <- resamples(models)
summary(resamples)
##
## Call:
## summary.resamples(object = resamples)
##
## Models: MARS, RandomForest, SVM, GBM
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.2865557 0.4063530 0.4771704 0.4936886 0.5350459 0.7406345 0
## RandomForest 0.3040437 0.4572309 0.4951039 0.4819018 0.5258466 0.5793859 0
## SVM 0.3374369 0.4115908 0.4810048 0.4839656 0.5375805 0.6371607 0
## GBM 0.3102461 0.3853619 0.4371421 0.4605217 0.5469722 0.6559649 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.3703093 0.4711157 0.5727605 0.6193414 0.6788472 0.9838125 0
## RandomForest 0.4029738 0.5648227 0.6137333 0.6035744 0.6684476 0.7213218 0
## SVM 0.4303769 0.5267674 0.6088384 0.5980011 0.6827474 0.7556912 0
## GBM 0.4453346 0.4846367 0.5524158 0.5715051 0.6319819 0.7702760 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.2919032 0.5935036 0.7102777 0.6445895 0.7690368 0.8657793 0
## RandomForest 0.3632040 0.5969392 0.6546133 0.6568381 0.7509035 0.8746550 0
## SVM 0.3025769 0.5803465 0.6344732 0.6283128 0.6958743 0.8206000 0
## GBM 0.5556341 0.6194403 0.6544292 0.6601056 0.6828267 0.7859701 0
# Plot performance
bwplot(resamples, metric = "Rsquared") # Compare R^2
dotplot(resamples, metric = "RMSE") # Compare RMSE
Based on the resampling results, MARS emerges as the best-performing model, achieving the highest median R^2 (0.71) and strong predictive accuracy (median RMSE = 0.573, MAE = 0.477); it also posts the best held-out test-set performance (RMSE = 0.590, R^2 = 0.716). Its ability to model nonlinear relationships and interactions while remaining interpretable makes it well suited to optimizing the manufacturing process and diagnosing yield issues. Random Forest and GBM also perform well, with GBM showing the lowest median resampled RMSE (0.552), making it a viable alternative for error-sensitive applications. SVM trails the other models and is less recommended here. For most scenarios, MARS strikes the best balance between accuracy and interpretability, though GBM may be preferred when minimizing prediction error is critical; further tuning or ensemble approaches could improve results.
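Since each model's test-set predictions were already computed above, the held-out metrics can be tabulated in one place; a minimal sketch (testPerf is an illustrative name):
# Held-out test-set performance for all four models
testPerf <- rbind(
  MARS = postResample(marsPred, y_test),
  RandomForest = postResample(rfPred, y_test),
  SVM = postResample(svmPred, y_test),
  GBM = postResample(gbmPred, y_test)
)
testPerf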
Which predictors are most important in the optimal nonlinear regression model?
# MARS variable importance
varImp_mars <- varImp(marsModel)
varImp_mars
## earth variable importance
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 58.12
## ManufacturingProcess09 58.12
## ManufacturingProcess28 37.10
## ManufacturingProcess17 20.73
## ManufacturingProcess39 14.50
## ManufacturingProcess06 10.23
## ManufacturingProcess22 6.27
## BiologicalMaterial09 0.00
plot(varImp_mars, top = 10, main = "Top 10 Predictors (MARS)")
Do either the biological or process variables dominate the list?
Process variables clearly dominate: eight of the nine predictors listed are manufacturing process variables (led by ManufacturingProcess32), while the lone biological variable, BiologicalMaterial09, sits at the bottom of the list with zero importance.
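This can be verified programmatically; a sketch counting variable types among the ranked predictors, using the name prefixes in the data:
# Count process vs. biological variables in the importance ranking
imp_names <- rownames(varImp_mars$importance)
table(ifelse(grepl("^Manufacturing", imp_names), "Process", "Biological"))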
How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
The top ten predictors from MARS and from the optimal linear model share two key variables: ManufacturingProcess32 and ManufacturingProcess17. Notably, ManufacturingProcess32 is the most important predictor in both models. Each list also includes one biological variable, though a different one in each case.
Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model.
library(ggplot2) # needed for the plots below
# Extract the top 10 predictor names, ordered by importance
# (the rownames of varImp()$importance are not guaranteed to be sorted)
imp <- varImp_mars$importance
top_mars_predictors <- rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:10]
top_mars_predictors <- top_mars_predictors[!is.na(top_mars_predictors)] # Drop NAs if fewer than 10 rows
# Compare with the linear model's top predictors
# (replace with the actual top predictors from the Exercise 6.3 linear model)
top_linear_predictors <- c("ManufacturingProcess32", "ManufacturingProcess17") # Example
unique_mars_predictors <- setdiff(top_mars_predictors, top_linear_predictors)
# Check if any predictors are left to plot
if (length(unique_mars_predictors) > 0) {
# Plot relationships
for (predictor in unique_mars_predictors) {
if (predictor %in% colnames(trainData)) { # Verify the column exists
p <- ggplot(trainData, aes(x = .data[[predictor]], y = Yield)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "loess", se = FALSE, color = "red") +
labs(title = paste("Yield vs", predictor)) +
theme_minimal()
print(p)
} else {
warning(paste("Predictor", predictor, "not found in trainData."))
}
}
} else {
message("No unique predictors found in MARS compared to the linear model.")
}
Several of the process variables exhibit nonlinear relationships with yield. For instance, some trace an inverted-U shape, indicating that yield peaks within an intermediate operating range of the process parameter. Others show threshold effects, where yield drops off beyond certain predictor values. The lone biological variable among these predictors shows a nearly saturating relationship: yield improves as the value increases up to a point, after which returns diminish.
Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
Yes. Most of the predictors unique to the nonlinear model are process variables, confirming their central role in optimizing yield. The biological variables appear to cap the potential gains: their effect on yield tends to plateau beyond a certain input level, so further improvement must come from the process settings.