Load Libraries

library(tidyverse)
library(mlbench)
library(caret)
library(earth)

Generate Data

Friedman (1991) introduced a nonlinear simulation:

\[y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)\]

The predictors x1 through x5 are informative. x6 through x10 are non-informative noise.

set.seed(200)

trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- trainingData$x %>% data.frame()

testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- testData$x %>% data.frame()
featurePlot(trainingData$x, trainingData$y)

Feature Plot: X1 through X5 show clear relationships with the response. X6 through X10 are just noise, no pattern at all.

KNN Model

knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProc = c("center", "scale"),
  tuneLength = 10
)

knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
knnPerf <- postResample(pred = knnPred, obs = testData$y)
knnPerf
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461

MARS Model

marsGrid <- expand.grid(
  degree = 1:2,
  nprune = seq(2, 20, by = 2)
)

marsModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  tuneGrid = marsGrid
)

marsModel
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        2      4.383438  0.2405683  3.597961
##   1        4      2.727602  0.7035031  2.184240
##   1        6      2.331605  0.7835496  1.833420
##   1        8      1.870959  0.8585503  1.464551
##   1       10      1.787676  0.8711960  1.386944
##   1       12      1.821005  0.8670619  1.419893
##   1       14      1.862343  0.8623072  1.446050
##   1       16      1.875619  0.8597499  1.460975
##   1       18      1.879956  0.8591348  1.464279
##   1       20      1.879956  0.8591348  1.464279
##   2        2      4.383438  0.2405683  3.597961
##   2        4      2.730222  0.7028372  2.183075
##   2        6      2.336551  0.7823254  1.819513
##   2        8      1.890142  0.8550741  1.475571
##   2       10      1.592695  0.8961849  1.245305
##   2       12      1.426399  0.9169354  1.112717
##   2       14      1.417106  0.9196136  1.108673
##   2       16      1.465404  0.9150579  1.143336
##   2       18      1.460856  0.9160317  1.136890
##   2       20      1.460394  0.9161043  1.136405
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
plot(marsModel)

MARS Plot: RMSE drops as terms added, then flattens out. Degree-2 models tend to win because the true function has an interaction term (sin of pi * x1 * x2).

marsPred <- predict(marsModel, newdata = testData$x)
marsPerf <- postResample(pred = marsPred, obs = testData$y)
marsPerf
##      RMSE  Rsquared       MAE 
## 1.1722635 0.9448890 0.9324923
varImp(marsModel) %>% plot(main = "MARS Variable Importance")

MARS Variable Importance: X1 through X5 dominate. X6 through X10 get near-zero importance.

SVM (Radial Basis) Model

svmModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "svmRadial",
  preProc = c("center", "scale"),
  tuneLength = 10
)

svmModel
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  2.495040  0.7816921  1.987656
##     0.50  2.284073  0.7970033  1.808250
##     1.00  2.138541  0.8171060  1.681546
##     2.00  2.042548  0.8298213  1.598532
##     4.00  2.022622  0.8311882  1.580163
##     8.00  2.020573  0.8312937  1.577614
##    16.00  2.019380  0.8314497  1.577164
##    32.00  2.019380  0.8314497  1.577164
##    64.00  2.019380  0.8314497  1.577164
##   128.00  2.019380  0.8314497  1.577164
## 
## Tuning parameter 'sigma' was held constant at a value of 0.06670077
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06670077 and C = 16.
svmPred <- predict(svmModel, newdata = testData$x)
svmPerf <- postResample(pred = svmPred, obs = testData$y)
svmPerf
##      RMSE  Rsquared       MAE 
## 2.0829707 0.8242096 1.5826017

Neural Network Model

nnetGrid <- expand.grid(
  size = seq(1, 10, by = 2),
  decay = c(0, 0.001, 0.01, 0.1)
)

nnetModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "nnet",
  preProc = c("center", "scale"),
  tuneGrid = nnetGrid,
  linout = TRUE,
  trace = FALSE,
  maxit = 500
)

nnetModel
## Neural Network 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  RMSE      Rsquared   MAE     
##   1     0.000  2.684486  0.7057898  2.121664
##   1     0.001  2.676616  0.7033472  2.098347
##   1     0.010  2.511628  0.7438056  1.954489
##   1     0.100  2.522268  0.7409498  1.958387
##   3     0.000  3.298346  0.6409900  2.361192
##   3     0.001  3.105130  0.6367001  2.442877
##   3     0.010  2.893815  0.6776103  2.290386
##   3     0.100  2.593636  0.7329024  2.045915
##   5     0.000  6.236517  0.3727671  3.758907
##   5     0.001  3.577890  0.5687275  2.808683
##   5     0.010  3.499998  0.5870890  2.720500
##   5     0.100  3.017321  0.6572537  2.391736
##   7     0.000  7.376142  0.3807040  4.137385
##   7     0.001  4.339330  0.4958139  3.300207
##   7     0.010  3.604441  0.5879934  2.854082
##   7     0.100  3.449392  0.6018107  2.715443
##   9     0.000  5.376651  0.4818358  3.624401
##   9     0.001  3.953806  0.5288776  3.122270
##   9     0.010  3.880901  0.5494935  3.002392
##   9     0.100  3.382388  0.5976321  2.724585
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 0.01.
nnetPred <- predict(nnetModel, newdata = testData$x)
nnetPerf <- postResample(pred = nnetPred, obs = testData$y)
nnetPerf
##      RMSE  Rsquared       MAE 
## 2.6432552 0.7194546 2.0232008

Model Comparison

results <- list(
  KNN  = knnPerf,
  MARS = marsPerf,
  SVM  = svmPerf,
  NNET = nnetPerf
) %>%
  map_dfr(~ as_tibble(t(.x)), .id = "Model") %>%
  arrange(RMSE)

results %>% knitr::kable(digits = 4)
Model RMSE Rsquared MAE
MARS 1.1723 0.9449 0.9325
SVM 2.0830 0.8242 1.5826
NNET 2.6433 0.7195 2.0232
KNN 3.2041 0.6820 2.5683
results %>%
  pivot_longer(cols = c(RMSE, Rsquared), names_to = "Metric", values_to = "Value") %>%
  ggplot(aes(x = reorder(Model, -Value), y = Value, fill = Model)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ Metric, scales = "free_y") +
  labs(
    title = "Model Performance Comparison (Friedman 1)",
    x = NULL,
    y = NULL
  ) +
  theme_minimal(base_size = 14)

Comparison Chart: SVM and MARS typically come out on top here. KNN struggles with the dimensionality, and the neural net can be inconsistent.

Does MARS Select the Informative Predictors?

The informative predictors are X1 through X5. X6 through X10 are non-informative noise variables. MARS should assign high importance to X1 through X5 and low or zero importance to X6 through X10.

varImp(marsModel)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.24
## X2   48.74
## X5   15.53
## X3    0.00

Conclusion

MARS is the standout for this problem. It performs well and, unlike the other models, it also tells which predictors actually matter. That interpretability is a real advantage when you care about understanding the data, not just predicting well.