library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(earth)
## Warning: package 'earth' was built under R version 4.5.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.5.3
## Loading required package: plotrix
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.5.3
library(mlbench)
library(caret)
library(earth)
#Data Setup
set.seed(33)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
ctrl <- trainControl(method = "cv", number = 10)
#KNN
set.seed(33)
knnModel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
#MARS
set.seed(33)
marsGrid <- expand.grid(degree = 1:2, nprune = 2:20)
marsModel <- train(x = trainingData$x,
y = trainingData$y,
method = "earth",
tuneGrid = marsGrid,
trControl = ctrl)
#SVM
set.seed(33)
svmModel <- train(x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
#Neural Net
set.seed(100)
nnetGrid <- expand.grid(decay = c(0.001, 0.01, 0.1),
size = c(3, 5, 7, 9),
bag = FALSE)
nnetModel <- train(x = trainingData$x,
y = trainingData$y,
method = "avNNet",
preProc = c("center", "scale"),
tuneGrid = nnetGrid,
trControl = ctrl,
linout = TRUE,
trace = FALSE,
maxit = 500)
## Warning: executing %dopar% sequentially: no parallel backend registered
#Test set performance
knnPred <- predict(knnModel, newdata = testData$x)
marsPred <- predict(marsModel, newdata = testData$x)
svmPred <- predict(svmModel, newdata = testData$x)
nnetPred <- predict(nnetModel, newdata = testData$x)
print("KNN:")
## [1] "KNN:"
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.3011563 0.6234257 2.6620977
print("MARS:")
## [1] "MARS:"
postResample(pred = marsPred, obs = testData$y)
## RMSE Rsquared MAE
## 1.2509198 0.9382589 0.9866210
print("SVM:")
## [1] "SVM:"
postResample(pred = svmPred, obs = testData$y)
## RMSE Rsquared MAE
## 2.1173900 0.8235628 1.6354301
print("Neural Network:")
## [1] "Neural Network:"
postResample(pred = nnetPred, obs = testData$y)
## RMSE Rsquared MAE
## 2.239208 0.804767 1.741402
#Resampling comparison
results72 <- resamples(list(KNN = knnModel,
MARS = marsModel,
SVM = svmModel,
NNet = nnetModel))
summary(results72)
##
## Call:
## summary.resamples(object = results72)
##
## Models: KNN, MARS, SVM, NNet
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## KNN 2.0069169 2.1285479 2.7124153 2.660437 3.1832812 3.260846 0
## MARS 0.7346996 0.8179392 0.8449122 0.923298 0.9467612 1.329858 0
## SVM 1.3334841 1.5108691 1.6732570 1.721501 1.9071270 2.321440 0
## NNet 1.6493585 2.0037492 2.2109942 2.120498 2.2911850 2.366499 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## KNN 2.545593 2.861350 3.274685 3.331571 3.901246 3.944874 0
## MARS 0.932403 1.034885 1.124725 1.185287 1.272397 1.572700 0
## SVM 1.791818 1.910918 2.095583 2.278516 2.587654 3.228947 0
## NNet 2.137799 2.471859 2.762515 2.705187 2.873451 3.340309 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## KNN 0.1866384 0.5384763 0.6595620 0.5922978 0.6956518 0.8391479 0
## MARS 0.8919287 0.9351798 0.9472037 0.9435176 0.9653268 0.9701527 0
## SVM 0.5313604 0.7904223 0.8420503 0.7854612 0.8526209 0.8918653 0
## NNet 0.5671787 0.6633024 0.7315382 0.7160616 0.7754582 0.8317577 0
dotplot(results72)
#Does MARS select the informative predictors?
varImp(marsModel)
## earth variable importance
##
## Overall
## X4 100.00
## X5 76.63
## X1 75.64
## X2 75.64
## X3 45.15
## X6 0.00
Four models were tuned on the Friedman simulation data: KNN, MARS, SVM, and averaged neural network. MARS performed best on the test set with an RMSE of 1.25 and R-squared of 0.938, compared to SVM (2.12, 0.824), neural net (2.24, 0.805), and KNN (3.30, 0.623). The cross-validated resampling results followed the same ordering (mean CV RMSE: MARS = 1.19, SVM = 2.28, NNet = 2.71, KNN = 3.33), suggesting the test set performance is reliable and not a result of overfitting.
MARS also correctly identified all five informative predictors (X1–X5) while assigning zero importance to all five noise variables, which were pruned out during model building. X4 ranked highest in importance, which makes sense given that it enters the data-generating equation linearly with the largest coefficient. This shows that MARS’s pruning step can effectively act as variable selection, which is useful in situations where many predictors may not be relevant.
data(ChemicalManufacturingProcess)
#My 6.3 setup
preProcValues <- preProcess(ChemicalManufacturingProcess[, -1], method = "knnImpute")
imputedPredictors <- predict(preProcValues, ChemicalManufacturingProcess[, -1])
yield <- ChemicalManufacturingProcess$Yield
set.seed(33)
trainingRows <- createDataPartition(yield, p = .80, list = FALSE)
trainX <- imputedPredictors[trainingRows, ]
trainY <- yield[trainingRows]
testX <- imputedPredictors[-trainingRows, ]
testY <- yield[-trainingRows]
ctrl <- trainControl(method = "cv", number = 10)
#(a) Tune nonlinear models
#MARS
set.seed(33)
marsGrid <- expand.grid(degree = 1:2, nprune = 2:20)
marsModel75 <- train(x = trainX, y = trainY,
method = "earth",
tuneGrid = marsGrid,
trControl = ctrl)
#KNN
set.seed(33)
knnModel75 <- train(x = trainX, y = trainY,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
#SVM
set.seed(33)
svmModel75 <- train(x = trainX, y = trainY,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
#Neural Net
set.seed(33)
nnetGrid <- expand.grid(decay = c(0.001, 0.01, 0.1),
size = c(3, 5, 7, 9),
bag = FALSE)
nnetModel75 <- train(x = trainX, y = trainY,
method = "avNNet",
preProc = c("center", "scale"),
tuneGrid = nnetGrid,
trControl = ctrl,
linout = TRUE,
trace = FALSE,
maxit = 500)
#Resampling comparison
results75 <- resamples(list(MARS = marsModel75,
KNN = knnModel75,
SVM = svmModel75,
NNet = nnetModel75))
summary(results75)
##
## Call:
## summary.resamples(object = results75)
##
## Models: MARS, KNN, SVM, NNet
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.5536536 0.7118428 0.9129273 0.9221322 1.1093251 1.361527 0
## KNN 0.6632308 0.9100619 1.0697931 1.0590787 1.2181758 1.463733 0
## SVM 0.5755508 0.7492297 0.9098863 0.8725819 0.9475043 1.157290 0
## NNet 0.8648911 1.1699173 1.2044642 1.2660025 1.2950209 1.802105 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.7190588 0.9380544 1.150167 1.136608 1.299271 1.608751 0
## KNN 0.7897877 1.0352175 1.249501 1.288481 1.607863 1.771908 0
## SVM 0.6928392 0.9507232 1.097299 1.092268 1.260006 1.493649 0
## NNet 1.0838085 1.3122896 1.438034 1.519525 1.566797 2.241267 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.4880814 0.5255717 0.6702196 0.6502840 0.7309585 0.8465750 0
## KNN 0.1121056 0.3529570 0.6016394 0.5240282 0.6348658 0.8249039 0
## SVM 0.4210179 0.6187406 0.6955334 0.6916037 0.7665908 0.8929511 0
## NNet 0.1436109 0.4090370 0.5283339 0.4931983 0.6228983 0.7700667 0
dotplot(results75)
#Test set performance for each
print("MARS:")
## [1] "MARS:"
postResample(pred = predict(marsModel75, testX), obs = testY)
## RMSE Rsquared MAE
## 1.2706708 0.5370908 1.0494852
print("KNN:")
## [1] "KNN:"
postResample(pred = predict(knnModel75, testX), obs = testY)
## RMSE Rsquared MAE
## 1.3526511 0.4678742 1.0934375
print('SVM:')
## [1] "SVM:"
postResample(pred = predict(svmModel75, testX), obs = testY)
## RMSE Rsquared MAE
## 1.1189263 0.6517518 0.8122573
print('Neural Network:')
## [1] "Neural Network:"
postResample(pred = predict(nnetModel75, testX), obs = testY)
## RMSE Rsquared MAE
## 1.289633 0.611707 1.026073
SVM provided the best performance on both resampling and the test set. It achieved a mean CV RMSE of 1.09 and a test set RMSE of 1.12 (R-squared =0.652), compared to MARS (CV RMSE = 1.14, test RMSE = 1.27), NNet (1.52, 1.29), and KNN (1.29, 1.35). The dotplot shows overlapping intervals between SVM and MARS, so the difference is not large, but SVM was consistently better across both of the evaluations.
#(b) Variable importance of best model
bestModel75 <- svmModel75
imp75 <- varImp(bestModel75, scale = FALSE)
plot(imp75, top = 20)
top10_nonlinear <- rownames(imp75$importance)[
order(imp75$importance[,1], decreasing = TRUE)
][1:10]
top10_pls <- c("ManufacturingProcess32", "ManufacturingProcess13",
"ManufacturingProcess09", "ManufacturingProcess36",
"ManufacturingProcess17", "BiologicalMaterial02",
"ManufacturingProcess11", "BiologicalMaterial06",
"ManufacturingProcess06", "ManufacturingProcess33")
unique_to_nonlinear <- setdiff(top10_nonlinear, top10_pls)
cat("Predictors unique to nonlinear model top 10:\n")
## Predictors unique to nonlinear model top 10:
print(unique_to_nonlinear)
## [1] "BiologicalMaterial03" "BiologicalMaterial12" "ManufacturingProcess31"
print(top10_nonlinear)
## [1] "ManufacturingProcess32" "ManufacturingProcess13" "ManufacturingProcess17"
## [4] "BiologicalMaterial06" "BiologicalMaterial03" "BiologicalMaterial02"
## [7] "ManufacturingProcess36" "ManufacturingProcess09" "BiologicalMaterial12"
## [10] "ManufacturingProcess31"
varImp(bestModel75)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 98.59
## ManufacturingProcess17 82.03
## BiologicalMaterial06 81.46
## BiologicalMaterial03 77.65
## BiologicalMaterial02 77.57
## ManufacturingProcess36 77.43
## ManufacturingProcess09 76.08
## BiologicalMaterial12 69.73
## ManufacturingProcess31 63.81
## ManufacturingProcess06 59.19
## ManufacturingProcess11 58.53
## BiologicalMaterial04 56.79
## ManufacturingProcess29 55.41
## ManufacturingProcess33 50.84
## BiologicalMaterial08 49.65
## ManufacturingProcess30 49.54
## BiologicalMaterial11 48.02
## BiologicalMaterial01 40.63
## BiologicalMaterial09 39.12
The most important predictors in the SVM model are ManufacturingProcess32 and ManufacturingProcess13, which rank highest by a noticeable margin. Overall, the process variables dominate the list, which lines up with what I found in the PLS model from exercise 6.3. Multiple of the top ten predictors are actually shared between the two models, so there is a good amount of agreement on which variables matter most. The main differences are that the SVM model brings in BiologicalMaterial03, BiologicalMaterial12, and ManufacturingProcess31, while ManufacturingProcess11, ManufacturingProcess06, and ManufacturingProcess33 fall out of the top ten compared to PLS. Biological predictors do show up a bit more in the SVM top ten, taking four of the ten spots versus three in PLS, but process variables still dominate both models.
#(c)Plots for unique predictors
par(mfrow = c(1, 1))
for (pred in unique_to_nonlinear) {
plot(trainX[[pred]], trainY,
xlab = pred,
ylab = "Yield",
main = paste("Yield vs", pred),
pch = 16, col = "steelblue")
lines(lowess(trainX[[pred]], trainY), col = "red", lwd = 2)
}
The three predictors unique to the SVM model are BiologicalMaterial03, BiologicalMaterial12, and ManufacturingProcess31. Both of the biological material predictors show a similar pattern: yield increases as the predictor value rises, but the relationship levels off and even turns slightly downward at higher values. This kind of nonlinear relationship would not be well captured by a linear model like PLS, which could explain why these predictors did not make the top ten in 6.3 despite being genuinely related to yield. In practice, this suggests there may be a sweet spot in the quality of the incoming biological material, where beyond a certain point, higher values do not continue to actually improve yield.
ManufacturingProcess31 is different because yield increases steadily as ManufacturingProcess31 approaches zero from the left, then drops sharply just before zero, and flattens out after. This suggests there is an optimal range just below zero where yield peaks, and that running the process at exactly zero or above is actually associated with lower yield. Since this is a process variable that can be controlled, this is potentially actionable. For example, operators could aim to keep this parameter in that range just below zero to maximize yield. A linear model would miss this threshold behavior entirely, which explains why it didn’t appear in the PLS top ten. This nonlinear model’s ability to detect these kinds of curved relationships is what distinguishes it from PLS here because a linear model would either miss these predictors or misrepresent their relationship with yield.