Generating the data exactly as shown in the problem
library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
#Looking at the data
featurePlot(trainingData$x, trainingData$y)
Training the models
KNN
knnModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10
)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
knnResults <- postResample(pred = knnPred, obs = testData$y)
cat("\nKNN Test Performance:\n"); print(knnResults)
##
## KNN Test Performance:
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
Support Vector Machines (Radial Basis Function)
svmModel <- train(x = trainingData$x, y = trainingData$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "cv"))
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.505383 0.8031869 1.999381
## 0.50 2.290725 0.8103140 1.829703
## 1.00 2.105086 0.8302040 1.677851
## 2.00 2.014620 0.8418576 1.598814
## 4.00 1.965196 0.8491165 1.567327
## 8.00 1.927649 0.8538945 1.542267
## 16.00 1.924262 0.8545293 1.539275
## 32.00 1.924262 0.8545293 1.539275
## 64.00 1.924262 0.8545293 1.539275
## 128.00 1.924262 0.8545293 1.539275
##
## Tuning parameter 'sigma' was held constant at a value of 0.06802164
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06802164 and C = 16.
plot(svmModel, scales = list(x = list(log = 2)))
svmPred <- predict(svmModel, newdata = testData$x)
svmResults <- postResample(pred = svmPred, obs = testData$y)
cat("\nSVM Test Performance:\n"); print(svmResults)
##
## SVM Test Performance:
## RMSE Rsquared MAE
## 2.0864652 0.8236735 1.5854649
Neural Network
nnetModel <- train(x = trainingData$x, y = trainingData$y,
method = "nnet",
preProc = c("center", "scale"),
linout = TRUE, # Linear output for regression
trace = FALSE,
tuneLength = 10,
trControl = trainControl(method = "cv"))
nnetModel
## Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 1 0.0000000000 2.633279 0.7283286 2.120217
## 1 0.0001000000 2.900115 0.6467165 2.360940
## 1 0.0002371374 2.396309 0.7791912 1.901589
## 1 0.0005623413 2.780397 0.6870115 2.233051
## 1 0.0013335214 2.445179 0.7653185 1.934351
## 1 0.0031622777 2.494515 0.7527796 2.006299
## 1 0.0074989421 2.386075 0.7801513 1.892076
## 1 0.0177827941 2.392012 0.7782854 1.903058
## 1 0.0421696503 2.391308 0.7786657 1.893639
## 1 0.1000000000 2.400189 0.7768265 1.898185
## 3 0.0000000000 2.387793 0.7728491 1.873876
## 3 0.0001000000 2.440011 0.7682545 1.934907
## 3 0.0002371374 2.522110 0.7490157 2.011777
## 3 0.0005623413 2.449060 0.7558299 2.015613
## 3 0.0013335214 2.328025 0.7775499 1.864046
## 3 0.0031622777 2.398795 0.7753683 1.854101
## 3 0.0074989421 2.609818 0.7306142 2.063122
## 3 0.0177827941 2.386876 0.7670830 1.910688
## 3 0.0421696503 2.348159 0.7892324 1.869168
## 3 0.1000000000 2.486285 0.7582800 1.978378
## 5 0.0000000000 2.646703 0.7502254 2.097796
## 5 0.0001000000 2.510570 0.7571416 2.031122
## 5 0.0002371374 2.688755 0.7332813 2.095918
## 5 0.0005623413 2.682778 0.7341635 2.127294
## 5 0.0013335214 2.711187 0.7455543 2.141667
## 5 0.0031622777 2.564999 0.7577619 2.069637
## 5 0.0074989421 2.632349 0.7544988 2.050032
## 5 0.0177827941 2.536531 0.7550486 1.966269
## 5 0.0421696503 2.581026 0.7673985 2.017268
## 5 0.1000000000 2.176375 0.8165038 1.742318
## 7 0.0000000000 2.774144 0.7066533 2.221055
## 7 0.0001000000 3.026673 0.6654769 2.391277
## 7 0.0002371374 3.459070 0.6335929 2.601472
## 7 0.0005623413 2.936064 0.7016427 2.303490
## 7 0.0013335214 2.982957 0.6816505 2.247375
## 7 0.0031622777 2.990882 0.6838274 2.421988
## 7 0.0074989421 2.899642 0.7195327 2.249124
## 7 0.0177827941 2.729599 0.7234758 2.171069
## 7 0.0421696503 3.348929 0.6241268 2.486973
## 7 0.1000000000 3.328893 0.6465184 2.446462
## 9 0.0000000000 3.058557 0.6743119 2.550405
## 9 0.0001000000 3.483733 0.6045605 2.737500
## 9 0.0002371374 3.347573 0.6226467 2.556725
## 9 0.0005623413 3.083017 0.6796748 2.419472
## 9 0.0013335214 3.279535 0.5994149 2.692880
## 9 0.0031622777 3.593053 0.6288529 2.636225
## 9 0.0074989421 3.561847 0.6352796 2.731522
## 9 0.0177827941 3.196158 0.6358670 2.581625
## 9 0.0421696503 3.459481 0.6284756 2.636621
## 9 0.1000000000 2.863687 0.7133674 2.312240
## 11 0.0000000000 3.399657 0.6141205 2.795511
## 11 0.0001000000 3.254411 0.6349458 2.594708
## 11 0.0002371374 3.408067 0.6249642 2.854058
## 11 0.0005623413 3.452369 0.5988766 2.768006
## 11 0.0013335214 3.679091 0.5406090 2.889770
## 11 0.0031622777 3.166633 0.6389032 2.650011
## 11 0.0074989421 3.622868 0.5687058 2.897295
## 11 0.0177827941 3.585259 0.5700386 2.798334
## 11 0.0421696503 3.178816 0.6517175 2.574439
## 11 0.1000000000 3.069972 0.6754721 2.476047
## 13 0.0000000000 3.819798 0.5561931 3.171339
## 13 0.0001000000 3.575755 0.5883873 2.972471
## 13 0.0002371374 3.658245 0.5932256 2.918970
## 13 0.0005623413 3.257107 0.6296140 2.614488
## 13 0.0013335214 3.279264 0.6493529 2.644173
## 13 0.0031622777 3.313546 0.6520617 2.747977
## 13 0.0074989421 3.341877 0.6085338 2.686905
## 13 0.0177827941 3.228235 0.6244170 2.686000
## 13 0.0421696503 3.636838 0.5665346 2.905189
## 13 0.1000000000 2.717291 0.7196020 2.214065
## 15 0.0000000000 3.212517 0.6733055 2.605846
## 15 0.0001000000 3.095634 0.6616336 2.481403
## 15 0.0002371374 3.618893 0.6003143 2.885508
## 15 0.0005623413 3.748686 0.5612102 2.983532
## 15 0.0013335214 3.501006 0.5825447 2.788876
## 15 0.0031622777 3.110343 0.6406271 2.480439
## 15 0.0074989421 3.402707 0.6187874 2.695783
## 15 0.0177827941 3.290526 0.6416083 2.680549
## 15 0.0421696503 3.272191 0.6163545 2.600350
## 15 0.1000000000 2.689449 0.7221497 2.127840
## 17 0.0000000000 3.451505 0.6328619 2.809879
## 17 0.0001000000 3.649799 0.5718014 3.008216
## 17 0.0002371374 3.325010 0.6263377 2.768150
## 17 0.0005623413 3.531470 0.5778854 2.758704
## 17 0.0013335214 3.491189 0.6066510 2.791058
## 17 0.0031622777 3.470810 0.6112536 2.830853
## 17 0.0074989421 3.294250 0.6399250 2.694743
## 17 0.0177827941 3.021618 0.6756008 2.412193
## 17 0.0421696503 2.890368 0.7162036 2.342703
## 17 0.1000000000 2.781787 0.7317085 2.263080
## 19 0.0000000000 3.248772 0.6367844 2.642332
## 19 0.0001000000 3.460997 0.5878201 2.698230
## 19 0.0002371374 3.296080 0.6359887 2.694911
## 19 0.0005623413 3.011337 0.6919807 2.469314
## 19 0.0013335214 3.311350 0.6164769 2.679284
## 19 0.0031622777 3.098876 0.6743997 2.482737
## 19 0.0074989421 3.153581 0.6666430 2.498318
## 19 0.0177827941 3.091203 0.6580394 2.415431
## 19 0.0421696503 2.550630 0.7696222 2.072981
## 19 0.1000000000 2.507666 0.7525353 1.920137
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 5 and decay = 0.1.
plot(nnetModel)
nnetPred <- predict(nnetModel, newdata = testData$x)
nnetResults <- postResample(pred = nnetPred, obs = testData$y)
cat("\nNeural Net Test Performance:\n"); print(nnetResults)
##
## Neural Net Test Performance:
## RMSE Rsquared MAE
## 2.3255984 0.7970049 1.7840774
MARS
marsModel <- train(x = trainingData$x, y = trainingData$y,
method = "earth", # earth implements MARS
tuneGrid = expand.grid(degree = 1:2, nprune = 2:20),
trControl = trainControl(method = "cv"))
## Loading required package: earth
## Warning: package 'earth' was built under R version 4.5.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.5.3
## Loading required package: plotrix
marsModel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.429910 0.2134636 3.626535
## 1 3 3.540029 0.5050779 2.822962
## 1 4 2.643738 0.7300110 2.100277
## 1 5 2.403045 0.7736763 1.910643
## 1 6 2.247706 0.8010015 1.778977
## 1 7 1.855825 0.8676690 1.423783
## 1 8 1.717744 0.8843792 1.336747
## 1 9 1.707344 0.8836528 1.317357
## 1 10 1.666426 0.8910829 1.312938
## 1 11 1.659434 0.8925892 1.304616
## 1 12 1.661005 0.8955928 1.314674
## 1 13 1.676936 0.8940788 1.314838
## 1 14 1.682871 0.8934731 1.325360
## 1 15 1.682871 0.8934731 1.325360
## 1 16 1.682871 0.8934731 1.325360
## 1 17 1.682871 0.8934731 1.325360
## 1 18 1.682871 0.8934731 1.325360
## 1 19 1.682871 0.8934731 1.325360
## 1 20 1.682871 0.8934731 1.325360
## 2 2 4.429910 0.2134636 3.626535
## 2 3 3.540029 0.5050779 2.822962
## 2 4 2.643738 0.7300110 2.100277
## 2 5 2.426617 0.7696635 1.928653
## 2 6 2.288169 0.7946240 1.818823
## 2 7 1.803699 0.8737778 1.388634
## 2 8 1.639712 0.8933071 1.253688
## 2 9 1.538273 0.9105295 1.188271
## 2 10 1.449948 0.9206157 1.152142
## 2 11 1.374458 0.9276781 1.090676
## 2 12 1.344642 0.9295932 1.055110
## 2 13 1.360976 0.9288580 1.054223
## 2 14 1.348315 0.9299003 1.050563
## 2 15 1.348151 0.9301725 1.067773
## 2 16 1.362602 0.9294904 1.077448
## 2 17 1.354003 0.9299707 1.072649
## 2 18 1.354003 0.9299707 1.072649
## 2 19 1.354003 0.9299707 1.072649
## 2 20 1.354003 0.9299707 1.072649
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 12 and degree = 2.
plot(marsModel)
marsPred <- predict(marsModel, newdata = testData$x)
marsResults <- postResample(pred = marsPred, obs = testData$y)
cat("\nMARS Test Performance:\n"); print(marsResults)
##
## MARS Test Performance:
## RMSE Rsquared MAE
## 1.2803060 0.9335241 1.0168673
# Which predictors did MARS select?
cat("\nMARS Variable Importance:\n")
##
## MARS Variable Importance:
print(varImp(marsModel))
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.33
## X2 48.88
## X5 15.63
## X3 0.00
# Inspect the final model's selected terms directly
cat("\nMARS Final Model Summary:\n")
##
## MARS Final Model Summary:
summary(marsModel$finalModel)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=2,
## nprune=12)
##
## coefficients
## (Intercept) 22.050690
## h(0.621722-X1) -15.001651
## h(X1-0.621722) 10.878737
## h(0.601063-X2) -18.830135
## h(0.447442-X3) 9.940077
## h(X3-0.606015) 12.999390
## h(0.734892-X4) -9.877554
## h(X4-0.734892) 10.414930
## h(0.850094-X5) -5.604897
## h(X1-0.621722) * h(X2-0.295997) -43.245766
## h(0.649253-X1) * h(0.601063-X2) 26.218297
##
## Selected 11 of 18 terms, and 5 of 10 predictors (nprune=12)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6-unused, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 8 2
## GCV 1.747495 RSS 264.5358 GRSq 0.929051 RSq 0.9457576
results <- rbind(
KNN = knnResults,
MARS = marsResults,
SVM = svmResults,
NeuralNet = nnetResults
)
cat("\n====== Test Set Performance Summary ======\n")
##
## ====== Test Set Performance Summary ======
print(round(results, 4))
## RMSE Rsquared MAE
## KNN 3.2041 0.6820 2.5683
## MARS 1.2803 0.9335 1.0169
## SVM 2.0865 0.8237 1.5855
## NeuralNet 2.3256 0.7970 1.7841
This clearly shows that MARS is better than the other models for solving complex problem with non linearities. With an RMSE of 1.17 MARS is able to approximate the data better and ignore noise. SVM is about to find local parten which helped it with this dataset but it seemed to struggle with noise. Surprisingly neural network while power really struggled in this problem. My guess is Neural network work best with dealing with very large dataset and our training sample was to small likely causing overfitting. KNN did horrible in this example. The different 10 dimensions means the nearest neighbors aren’t actually very close. I believe the book called this the curse of dimensionality.
Checking for which variable was used
varImp(marsModel)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.33
## X2 48.88
## X5 15.63
## X3 0.00
MARS partially recovers the informative predictor and correctly identified X1, X2, X4, and X5 and assigned zero importantance to X6-x10. However it misses X3.
#Preparing the 6.3 data
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.5.3
library(caret)
data(ChemicalManufacturingProcess)
imputedData <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
df <- predict(imputedData, ChemicalManufacturingProcess)
nzv <- nearZeroVar(df)
df <- df[, -nzv]
trainIndex <- createDataPartition(df$Yield, p = 0.8, list = FALSE)
trainX <- df[trainIndex, -1]
trainY <- df[trainIndex, 1]
testX <- df[-trainIndex, -1]
testY <- df[-trainIndex, 1]
#7.5a
Training Nonlinear models
ctrl <- trainControl(method = "cv", number = 10)
#MARS
marsFit <- train(trainX, trainY, method = "earth",
tuneGrid = expand.grid(degree = 1:2, nprune = 2:20),
trControl = ctrl)
#Support Vector Machines
svmFit <- train(trainX, trainY, method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
#Neural Networks
nnetGrid <- expand.grid(decay = c(0, 0.01, .1), size = c(1, 3, 5, 7))
nnetFit <- train(trainX, trainY, method = "nnet",
preProc = c("center", "scale"),
linout = TRUE, trace = FALSE,
tuneGrid = nnetGrid,
trControl = ctrl)
#K-Nearest Neighbors
knnFit <- train(trainX, trainY, method = "knn",
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
Comparing performance
results <- resamples(list(MARS = marsFit, SVM = svmFit, NNet = nnetFit, KNN = knnFit))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: MARS, SVM, NNet, KNN
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.2681787 0.4334683 0.4656678 0.4912756 0.5660747 0.7359436 0
## SVM 0.3178002 0.4400961 0.4867296 0.4803069 0.5571141 0.6010852 0
## NNet 0.4558031 0.4927717 0.5558848 0.5659285 0.6402397 0.6984537 0
## KNN 0.3711397 0.4758758 0.5546613 0.5618666 0.6657035 0.7271860 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.3488000 0.5172355 0.6030923 0.6077869 0.7159956 0.8481650 0
## SVM 0.3751377 0.5089186 0.5849666 0.5775243 0.6494196 0.7406841 0
## NNet 0.5959438 0.6441879 0.7096021 0.7093700 0.7910187 0.8230797 0
## KNN 0.4627690 0.5721556 0.7367661 0.7003480 0.8225077 0.8710149 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## MARS 0.4457082 0.6099491 0.6613249 0.6732810 0.7498027 0.8689334 0
## SVM 0.5008566 0.6218478 0.6868930 0.6901925 0.7652377 0.8390663 0
## NNet 0.3887806 0.5868891 0.6105868 0.5977652 0.6588298 0.7529743 0
## KNN 0.1661089 0.4713121 0.5245911 0.5412308 0.6745439 0.8813785 0
# Check Test Set Performance
model_list <- list(marsFit, svmFit, nnetFit, knnFit)
names(model_list) <- c("MARS", "SVM", "NNet", "KNN")
test_perf <- lapply(model_list, function(m) {
preds <- predict(m, testX)
postResample(preds, testY)
})
do.call(rbind, test_perf)
## RMSE Rsquared MAE
## MARS 0.6841243 0.4948002 0.5489217
## SVM 0.6827468 0.5797122 0.5498012
## NNet 0.7675024 0.5562665 0.5751219
## KNN 0.7517560 0.3529534 0.5965325
MARS and SVM are almost identical with a Mean RMSE: MARS = 0.604 and SVM = 0.609, Mean MAE: MARS = 0.493 and SVM = 0.493, and Mean R²: SVM = 0.654 and MARS = 0.664. SVM is slightly ahead but the difference is so minor I’d go with whatever I felt more comfortable with.
nonlinear_imp <- varImp(svmFit) # or marsFit
print(nonlinear_imp)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess13 100.00
## ManufacturingProcess32 99.69
## BiologicalMaterial06 82.26
## ManufacturingProcess17 81.40
## ManufacturingProcess36 74.11
## BiologicalMaterial03 74.06
## BiologicalMaterial12 70.78
## ManufacturingProcess09 63.14
## BiologicalMaterial02 59.73
## ManufacturingProcess31 59.03
## ManufacturingProcess06 56.76
## BiologicalMaterial11 50.67
## ManufacturingProcess33 47.36
## ManufacturingProcess11 46.09
## ManufacturingProcess12 44.13
## BiologicalMaterial04 42.82
## ManufacturingProcess02 42.78
## BiologicalMaterial08 42.10
## ManufacturingProcess29 41.51
## BiologicalMaterial01 36.61
# Plot the top 20
plot(nonlinear_imp, top = 20, cex.names = 0.5)
There are 7 Manufacturing variables and 3 Biological variables. Manufacturing dominates the top 10 and this pattern continues even into the top 20.
library(ggplot2)
plot_df <- data.frame(trainX, Yield = trainY)
ggplot(plot_df, aes(x = ManufacturingProcess32, y = Yield)) +
geom_point(alpha = 0.5, color = "steelblue") +
geom_smooth(method = "loess", color = "red") +
labs(title = "Nonlinear Relationship Exploration",
x = "Manufacturing Process 32",
y = "Yield") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
The smoothed curve shows a clear nonlinear relationship with three phases:
Low values (−3 to −1): Yield rises steeply as ManufacturingProcess32 increases a strong positive association Mid values (−1 to +1): The curve flattens and yields stabilize and becomes relatively insensitive to further increases in the process variable High values (+1 to +2): The curve turns slightly downward suggesting that pushing this process variable too high may actually reduce yield
This S shaped curve pattern is the kind of relationship linear model couldn’t capture but SVM can. From a process stand point the plot shows there’s an optimal operating window for manufacturing process 32, and running the at very high levels not only won’t yield additional benefits but might actually be lower your result