7.2

Generating the data exactly as shown in the problem
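For reference, mlbench.friedman1 simulates the Friedman (1991) benchmark y = 10·sin(π·x1·x2) + 20·(x3 − 0.5)² + 10·x4 + 5·x5 + ε, with ε ~ N(0, sd²). Only the first five predictors carry signal; X6–X10 are pure noise, which matters for the variable-importance check later.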

library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

# Looking at the data
featurePlot(trainingData$x, trainingData$y)

Training the models

KNN

knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method     = "knn",
  preProc    = c("center", "scale"),
  tuneLength = 10
)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
knnResults <- postResample(pred = knnPred, obs = testData$y)
cat("\nKNN Test Performance:\n"); print(knnResults)
## 
## KNN Test Performance:
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461

Support Vector Machines (Radial Basis Function)

svmModel <- train(x = trainingData$x, y = trainingData$y,
  method = "svmRadial",
  preProc = c("center", "scale"),
  tuneLength = 10,
  trControl = trainControl(method = "cv"))

svmModel
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.505383  0.8031869  1.999381
##     0.50  2.290725  0.8103140  1.829703
##     1.00  2.105086  0.8302040  1.677851
##     2.00  2.014620  0.8418576  1.598814
##     4.00  1.965196  0.8491165  1.567327
##     8.00  1.927649  0.8538945  1.542267
##    16.00  1.924262  0.8545293  1.539275
##    32.00  1.924262  0.8545293  1.539275
##    64.00  1.924262  0.8545293  1.539275
##   128.00  1.924262  0.8545293  1.539275
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06802164
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06802164 and C = 16.
plot(svmModel, scales = list(x = list(log = 2)))

svmPred <- predict(svmModel, newdata = testData$x)
svmResults <- postResample(pred = svmPred, obs = testData$y)
cat("\nSVM Test Performance:\n"); print(svmResults)
## 
## SVM Test Performance:
##      RMSE  Rsquared       MAE 
## 2.0864652 0.8236735 1.5854649

Neural Network

nnetModel <- train(x = trainingData$x, y = trainingData$y,
  method = "nnet",
  preProc = c("center", "scale"),
  linout = TRUE, # Linear output for regression
  trace = FALSE,
  tuneLength = 10,
  trControl = trainControl(method = "cv"))

nnetModel
## Neural Network 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   size  decay         RMSE      Rsquared   MAE     
##    1    0.0000000000  2.633279  0.7283286  2.120217
##    1    0.0001000000  2.900115  0.6467165  2.360940
##    1    0.0002371374  2.396309  0.7791912  1.901589
##    1    0.0005623413  2.780397  0.6870115  2.233051
##    1    0.0013335214  2.445179  0.7653185  1.934351
##    1    0.0031622777  2.494515  0.7527796  2.006299
##    1    0.0074989421  2.386075  0.7801513  1.892076
##    1    0.0177827941  2.392012  0.7782854  1.903058
##    1    0.0421696503  2.391308  0.7786657  1.893639
##    1    0.1000000000  2.400189  0.7768265  1.898185
##    3    0.0000000000  2.387793  0.7728491  1.873876
##    3    0.0001000000  2.440011  0.7682545  1.934907
##    3    0.0002371374  2.522110  0.7490157  2.011777
##    3    0.0005623413  2.449060  0.7558299  2.015613
##    3    0.0013335214  2.328025  0.7775499  1.864046
##    3    0.0031622777  2.398795  0.7753683  1.854101
##    3    0.0074989421  2.609818  0.7306142  2.063122
##    3    0.0177827941  2.386876  0.7670830  1.910688
##    3    0.0421696503  2.348159  0.7892324  1.869168
##    3    0.1000000000  2.486285  0.7582800  1.978378
##    5    0.0000000000  2.646703  0.7502254  2.097796
##    5    0.0001000000  2.510570  0.7571416  2.031122
##    5    0.0002371374  2.688755  0.7332813  2.095918
##    5    0.0005623413  2.682778  0.7341635  2.127294
##    5    0.0013335214  2.711187  0.7455543  2.141667
##    5    0.0031622777  2.564999  0.7577619  2.069637
##    5    0.0074989421  2.632349  0.7544988  2.050032
##    5    0.0177827941  2.536531  0.7550486  1.966269
##    5    0.0421696503  2.581026  0.7673985  2.017268
##    5    0.1000000000  2.176375  0.8165038  1.742318
##    7    0.0000000000  2.774144  0.7066533  2.221055
##    7    0.0001000000  3.026673  0.6654769  2.391277
##    7    0.0002371374  3.459070  0.6335929  2.601472
##    7    0.0005623413  2.936064  0.7016427  2.303490
##    7    0.0013335214  2.982957  0.6816505  2.247375
##    7    0.0031622777  2.990882  0.6838274  2.421988
##    7    0.0074989421  2.899642  0.7195327  2.249124
##    7    0.0177827941  2.729599  0.7234758  2.171069
##    7    0.0421696503  3.348929  0.6241268  2.486973
##    7    0.1000000000  3.328893  0.6465184  2.446462
##    9    0.0000000000  3.058557  0.6743119  2.550405
##    9    0.0001000000  3.483733  0.6045605  2.737500
##    9    0.0002371374  3.347573  0.6226467  2.556725
##    9    0.0005623413  3.083017  0.6796748  2.419472
##    9    0.0013335214  3.279535  0.5994149  2.692880
##    9    0.0031622777  3.593053  0.6288529  2.636225
##    9    0.0074989421  3.561847  0.6352796  2.731522
##    9    0.0177827941  3.196158  0.6358670  2.581625
##    9    0.0421696503  3.459481  0.6284756  2.636621
##    9    0.1000000000  2.863687  0.7133674  2.312240
##   11    0.0000000000  3.399657  0.6141205  2.795511
##   11    0.0001000000  3.254411  0.6349458  2.594708
##   11    0.0002371374  3.408067  0.6249642  2.854058
##   11    0.0005623413  3.452369  0.5988766  2.768006
##   11    0.0013335214  3.679091  0.5406090  2.889770
##   11    0.0031622777  3.166633  0.6389032  2.650011
##   11    0.0074989421  3.622868  0.5687058  2.897295
##   11    0.0177827941  3.585259  0.5700386  2.798334
##   11    0.0421696503  3.178816  0.6517175  2.574439
##   11    0.1000000000  3.069972  0.6754721  2.476047
##   13    0.0000000000  3.819798  0.5561931  3.171339
##   13    0.0001000000  3.575755  0.5883873  2.972471
##   13    0.0002371374  3.658245  0.5932256  2.918970
##   13    0.0005623413  3.257107  0.6296140  2.614488
##   13    0.0013335214  3.279264  0.6493529  2.644173
##   13    0.0031622777  3.313546  0.6520617  2.747977
##   13    0.0074989421  3.341877  0.6085338  2.686905
##   13    0.0177827941  3.228235  0.6244170  2.686000
##   13    0.0421696503  3.636838  0.5665346  2.905189
##   13    0.1000000000  2.717291  0.7196020  2.214065
##   15    0.0000000000  3.212517  0.6733055  2.605846
##   15    0.0001000000  3.095634  0.6616336  2.481403
##   15    0.0002371374  3.618893  0.6003143  2.885508
##   15    0.0005623413  3.748686  0.5612102  2.983532
##   15    0.0013335214  3.501006  0.5825447  2.788876
##   15    0.0031622777  3.110343  0.6406271  2.480439
##   15    0.0074989421  3.402707  0.6187874  2.695783
##   15    0.0177827941  3.290526  0.6416083  2.680549
##   15    0.0421696503  3.272191  0.6163545  2.600350
##   15    0.1000000000  2.689449  0.7221497  2.127840
##   17    0.0000000000  3.451505  0.6328619  2.809879
##   17    0.0001000000  3.649799  0.5718014  3.008216
##   17    0.0002371374  3.325010  0.6263377  2.768150
##   17    0.0005623413  3.531470  0.5778854  2.758704
##   17    0.0013335214  3.491189  0.6066510  2.791058
##   17    0.0031622777  3.470810  0.6112536  2.830853
##   17    0.0074989421  3.294250  0.6399250  2.694743
##   17    0.0177827941  3.021618  0.6756008  2.412193
##   17    0.0421696503  2.890368  0.7162036  2.342703
##   17    0.1000000000  2.781787  0.7317085  2.263080
##   19    0.0000000000  3.248772  0.6367844  2.642332
##   19    0.0001000000  3.460997  0.5878201  2.698230
##   19    0.0002371374  3.296080  0.6359887  2.694911
##   19    0.0005623413  3.011337  0.6919807  2.469314
##   19    0.0013335214  3.311350  0.6164769  2.679284
##   19    0.0031622777  3.098876  0.6743997  2.482737
##   19    0.0074989421  3.153581  0.6666430  2.498318
##   19    0.0177827941  3.091203  0.6580394  2.415431
##   19    0.0421696503  2.550630  0.7696222  2.072981
##   19    0.1000000000  2.507666  0.7525353  1.920137
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 5 and decay = 0.1.
plot(nnetModel)

nnetPred <- predict(nnetModel, newdata = testData$x)
nnetResults <- postResample(pred = nnetPred, obs = testData$y)
cat("\nNeural Net Test Performance:\n"); print(nnetResults)
## 
## Neural Net Test Performance:
##      RMSE  Rsquared       MAE 
## 2.3255984 0.7970049 1.7840774

MARS

marsModel <- train(x = trainingData$x, y = trainingData$y,
  method = "earth", # earth implements MARS
  tuneGrid = expand.grid(degree = 1:2, nprune = 2:20),
  trControl = trainControl(method = "cv"))
## Loading required package: earth
## Warning: package 'earth' was built under R version 4.5.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.5.3
## Loading required package: plotrix
marsModel
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        2      4.429910  0.2134636  3.626535
##   1        3      3.540029  0.5050779  2.822962
##   1        4      2.643738  0.7300110  2.100277
##   1        5      2.403045  0.7736763  1.910643
##   1        6      2.247706  0.8010015  1.778977
##   1        7      1.855825  0.8676690  1.423783
##   1        8      1.717744  0.8843792  1.336747
##   1        9      1.707344  0.8836528  1.317357
##   1       10      1.666426  0.8910829  1.312938
##   1       11      1.659434  0.8925892  1.304616
##   1       12      1.661005  0.8955928  1.314674
##   1       13      1.676936  0.8940788  1.314838
##   1       14      1.682871  0.8934731  1.325360
##   1       15      1.682871  0.8934731  1.325360
##   1       16      1.682871  0.8934731  1.325360
##   1       17      1.682871  0.8934731  1.325360
##   1       18      1.682871  0.8934731  1.325360
##   1       19      1.682871  0.8934731  1.325360
##   1       20      1.682871  0.8934731  1.325360
##   2        2      4.429910  0.2134636  3.626535
##   2        3      3.540029  0.5050779  2.822962
##   2        4      2.643738  0.7300110  2.100277
##   2        5      2.426617  0.7696635  1.928653
##   2        6      2.288169  0.7946240  1.818823
##   2        7      1.803699  0.8737778  1.388634
##   2        8      1.639712  0.8933071  1.253688
##   2        9      1.538273  0.9105295  1.188271
##   2       10      1.449948  0.9206157  1.152142
##   2       11      1.374458  0.9276781  1.090676
##   2       12      1.344642  0.9295932  1.055110
##   2       13      1.360976  0.9288580  1.054223
##   2       14      1.348315  0.9299003  1.050563
##   2       15      1.348151  0.9301725  1.067773
##   2       16      1.362602  0.9294904  1.077448
##   2       17      1.354003  0.9299707  1.072649
##   2       18      1.354003  0.9299707  1.072649
##   2       19      1.354003  0.9299707  1.072649
##   2       20      1.354003  0.9299707  1.072649
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 12 and degree = 2.
plot(marsModel) 

marsPred <- predict(marsModel, newdata = testData$x)
marsResults <- postResample(pred = marsPred, obs = testData$y)
cat("\nMARS Test Performance:\n"); print(marsResults)
## 
## MARS Test Performance:
##      RMSE  Rsquared       MAE 
## 1.2803060 0.9335241 1.0168673
# Which predictors did MARS select?
cat("\nMARS Variable Importance:\n")
## 
## MARS Variable Importance:
print(varImp(marsModel))
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.33
## X2   48.88
## X5   15.63
## X3    0.00
# Inspect the final model's selected terms directly
cat("\nMARS Final Model Summary:\n")
## 
## MARS Final Model Summary:
summary(marsModel$finalModel)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=2,
##             nprune=12)
## 
##                                 coefficients
## (Intercept)                        22.050690
## h(0.621722-X1)                    -15.001651
## h(X1-0.621722)                     10.878737
## h(0.601063-X2)                    -18.830135
## h(0.447442-X3)                      9.940077
## h(X3-0.606015)                     12.999390
## h(0.734892-X4)                     -9.877554
## h(X4-0.734892)                     10.414930
## h(0.850094-X5)                     -5.604897
## h(X1-0.621722) * h(X2-0.295997)   -43.245766
## h(0.649253-X1) * h(0.601063-X2)    26.218297
## 
## Selected 11 of 18 terms, and 5 of 10 predictors (nprune=12)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6-unused, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 8 2
## GCV 1.747495    RSS 264.5358    GRSq 0.929051    RSq 0.9457576
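A note on reading the summary: h(·) is the MARS hinge function h(u) = max(0, u), so a term like h(0.621722 − X1) is nonzero only when X1 < 0.621722. Mirrored hinge pairs let the fitted surface change slope at each knot.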
results <- rbind(
  KNN        = knnResults,
  MARS       = marsResults,
  SVM        = svmResults,
  NeuralNet  = nnetResults
)

cat("\n====== Test Set Performance Summary ======\n")
## 
## ====== Test Set Performance Summary ======
print(round(results, 4))
##             RMSE Rsquared    MAE
## KNN       3.2041   0.6820 2.5683
## MARS      1.2803   0.9335 1.0169
## SVM       2.0865   0.8237 1.5855
## NeuralNet 2.3256   0.7970 1.7841

This clearly shows that MARS outperforms the other models on this problem, which mixes strong nonlinearities with noise predictors. With a test RMSE of 1.28, MARS approximates the underlying signal best while ignoring the noise. SVM is able to find local patterns, which helped it on this dataset, but it seemed to struggle more with the noise. Surprisingly, the neural network, while powerful, really struggled on this problem. My guess is that neural networks do best with very large datasets, and our training sample of 200 was too small, likely causing overfitting. KNN did the worst in this example: in 10 dimensions, the nearest neighbors aren’t actually very close, which I believe the book calls the curse of dimensionality.
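The curse-of-dimensionality point is easy to demonstrate numerically: as dimensions are added, the distance to the nearest neighbor balloons toward the average pairwise distance, so "nearest" stops meaning "similar". A minimal sketch of my own (not part of the exercise):

set.seed(1)
for (p in c(2, 10, 50)) {
  X <- matrix(runif(200 * p), ncol = p)   # 200 points in the unit hypercube
  d <- as.matrix(dist(X))                 # pairwise Euclidean distances
  diag(d) <- Inf                          # exclude self-distances
  cat(sprintf("p = %2d: mean nearest-neighbor dist = %.2f, mean pairwise dist = %.2f\n",
              p, mean(apply(d, 1, min)), mean(d[is.finite(d)])))
}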

Checking which variables were used

varImp(marsModel)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.33
## X2   48.88
## X5   15.63
## X3    0.00

MARS partially recovers the informative predictors: it correctly identified X1, X2, X4, and X5 and assigned zero importance to the noise predictors X6–X10. However, it effectively misses X3, which survives in two hinge terms of the final model but receives an importance of zero.
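The selected predictors can also be listed directly rather than read off the importance table; caret provides a predictors() helper for this (a quick check against the same fitted object):

# Predictors appearing in at least one retained MARS term
predictors(marsModel)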

7.5

# Preparing the 6.3 data

library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 4.5.3
library(caret)
data(ChemicalManufacturingProcess)
imputedData <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
df <- predict(imputedData, ChemicalManufacturingProcess)

nzv <- nearZeroVar(df)
df <- df[, -nzv]

trainIndex <- createDataPartition(df$Yield, p = 0.8, list = FALSE)
trainX <- df[trainIndex, -1] 
trainY <- df[trainIndex, 1]
testX <- df[-trainIndex, -1]
testY <- df[-trainIndex, 1]
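A quick sanity check on the prepared data (my own addition, not required by the exercise): imputation should leave no missing values, and the partition should be roughly 80/20.

anyNA(df)                 # expect FALSE after knnImpute
dim(trainX); dim(testX)   # row counts should split roughly 80/20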

7.5 a

Training the nonlinear models

ctrl <- trainControl(method = "cv", number = 10)

#MARS
marsFit <- train(trainX, trainY, method = "earth",
  tuneGrid = expand.grid(degree = 1:2, nprune = 2:20),
  trControl = ctrl)

#Support Vector Machines
svmFit <- train(trainX, trainY, method = "svmRadial",
  preProc = c("center", "scale"),
  tuneLength = 10,
  trControl = ctrl)

#Neural Networks
nnetGrid <- expand.grid(decay = c(0, 0.01, .1), size = c(1, 3, 5, 7))
nnetFit <- train(trainX, trainY, method = "nnet",
  preProc = c("center", "scale"),
  linout = TRUE, trace = FALSE,
  tuneGrid = nnetGrid,
  trControl = ctrl)

#K-Nearest Neighbors
knnFit <- train(trainX, trainY, method = "knn",
  preProc = c("center", "scale"),
  tuneLength = 10,
  trControl = ctrl)

Comparing performance

results <- resamples(list(MARS = marsFit, SVM = svmFit, NNet = nnetFit, KNN = knnFit))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: MARS, SVM, NNet, KNN 
## Number of resamples: 10 
## 
## MAE 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## MARS 0.2681787 0.4334683 0.4656678 0.4912756 0.5660747 0.7359436    0
## SVM  0.3178002 0.4400961 0.4867296 0.4803069 0.5571141 0.6010852    0
## NNet 0.4558031 0.4927717 0.5558848 0.5659285 0.6402397 0.6984537    0
## KNN  0.3711397 0.4758758 0.5546613 0.5618666 0.6657035 0.7271860    0
## 
## RMSE 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## MARS 0.3488000 0.5172355 0.6030923 0.6077869 0.7159956 0.8481650    0
## SVM  0.3751377 0.5089186 0.5849666 0.5775243 0.6494196 0.7406841    0
## NNet 0.5959438 0.6441879 0.7096021 0.7093700 0.7910187 0.8230797    0
## KNN  0.4627690 0.5721556 0.7367661 0.7003480 0.8225077 0.8710149    0
## 
## Rsquared 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## MARS 0.4457082 0.6099491 0.6613249 0.6732810 0.7498027 0.8689334    0
## SVM  0.5008566 0.6218478 0.6868930 0.6901925 0.7652377 0.8390663    0
## NNet 0.3887806 0.5868891 0.6105868 0.5977652 0.6588298 0.7529743    0
## KNN  0.1661089 0.4713121 0.5245911 0.5412308 0.6745439 0.8813785    0
# Check Test Set Performance
model_list <- list(marsFit, svmFit, nnetFit, knnFit)
names(model_list) <- c("MARS", "SVM", "NNet", "KNN")

test_perf <- lapply(model_list, function(m) {
  preds <- predict(m, testX)
  postResample(preds, testY)
})
do.call(rbind, test_perf)
##           RMSE  Rsquared       MAE
## MARS 0.6841243 0.4948002 0.5489217
## SVM  0.6827468 0.5797122 0.5498012
## NNet 0.7675024 0.5562665 0.5751219
## KNN  0.7517560 0.3529534 0.5965325

Across the resamples, MARS and SVM are almost identical: mean RMSE is 0.608 for MARS vs. 0.578 for SVM, mean MAE is 0.491 vs. 0.480, and mean R² is 0.673 vs. 0.690. On the held-out test set they are nearly tied on RMSE (0.684 vs. 0.683), while SVM has the higher R² (0.580 vs. 0.495). SVM is slightly ahead, but the difference is so minor I’d go with whichever model I felt more comfortable with.
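With the models this close, caret's diff() on the resamples object gives paired comparisons across the shared folds, which is a more honest tiebreaker than eyeballing the means (a minimal sketch):

modelDiffs <- diff(results)
summary(modelDiffs)   # fold-wise paired differences with adjusted p-values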

7.5 b

nonlinear_imp <- varImp(svmFit) # or marsFit
print(nonlinear_imp)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                        Overall
## ManufacturingProcess13  100.00
## ManufacturingProcess32   99.69
## BiologicalMaterial06     82.26
## ManufacturingProcess17   81.40
## ManufacturingProcess36   74.11
## BiologicalMaterial03     74.06
## BiologicalMaterial12     70.78
## ManufacturingProcess09   63.14
## BiologicalMaterial02     59.73
## ManufacturingProcess31   59.03
## ManufacturingProcess06   56.76
## BiologicalMaterial11     50.67
## ManufacturingProcess33   47.36
## ManufacturingProcess11   46.09
## ManufacturingProcess12   44.13
## BiologicalMaterial04     42.82
## ManufacturingProcess02   42.78
## BiologicalMaterial08     42.10
## ManufacturingProcess29   41.51
## BiologicalMaterial01     36.61
# Plot the top 20
plot(nonlinear_imp, top = 20, cex.names = 0.5)

Of the top 10 predictors, 6 are Manufacturing variables and 4 are Biological, with manufacturing processes holding the top two spots. Manufacturing dominates the top 10, and the pattern continues into the top 20 (12 manufacturing vs. 8 biological).
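The tally can be computed rather than counted by eye, using the importance table stored in the varImp object (a small sketch; assumes the standard varImp structure):

imp <- nonlinear_imp$importance
top20 <- rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:20]
table(ifelse(grepl("^Manufacturing", top20), "Manufacturing", "Biological"))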

7.5 c

library(ggplot2)
plot_df <- data.frame(trainX, Yield = trainY)

ggplot(plot_df, aes(x = ManufacturingProcess32, y = Yield)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_smooth(method = "loess", color = "red") + 
  labs(title = "Nonlinear Relationship Exploration",
       x = "Manufacturing Process 32",
       y = "Yield") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

The smoothed curve shows a clear nonlinear relationship with three phases:

- Low values (−3 to −1): yield rises steeply as ManufacturingProcess32 increases, a strong positive association.
- Mid values (−1 to +1): the curve flattens; yield stabilizes and becomes relatively insensitive to further increases in the process variable.
- High values (+1 to +2): the curve turns slightly downward, suggesting that pushing this process variable too high may actually reduce yield.

This S-shaped pattern is exactly the kind of relationship a linear model couldn’t capture but SVM can. From a process standpoint, the plot shows there’s an optimal operating window for ManufacturingProcess32: running it at very high levels not only yields no additional benefit but might actually lower the result.
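To put a rough number on that operating window, one can trace the SVM's predicted yield across the observed range of ManufacturingProcess32 while holding every other predictor at its training median, a crude one-variable partial-dependence sketch (my own illustration, not a definitive analysis):

# Vary ManufacturingProcess32 over its range; hold all other predictors at their medians
medians <- as.data.frame(lapply(trainX, median))
grid <- medians[rep(1, 50), ]
grid$ManufacturingProcess32 <- seq(min(trainX$ManufacturingProcess32),
                                   max(trainX$ManufacturingProcess32),
                                   length.out = 50)
predYield <- predict(svmFit, newdata = grid)
# Approximate location of the peak predicted yield (in standardized units)
grid$ManufacturingProcess32[which.max(predYield)]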