library(tidyverse)
library(mlbench)
library(caret)
library(earth)
Friedman (1991) introduced a nonlinear simulation:
\[y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)\]
The predictors x1 through x5 are informative. x6 through x10 are non-informative noise.
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- trainingData$x %>% data.frame()
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- testData$x %>% data.frame()
featurePlot(trainingData$x, trainingData$y)
Feature Plot: X1 through X5 show clear relationships with the response. X6 through X10 are just noise, no pattern at all.
knnModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10
)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
knnPerf <- postResample(pred = knnPred, obs = testData$y)
knnPerf
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
marsGrid <- expand.grid(
degree = 1:2,
nprune = seq(2, 20, by = 2)
)
marsModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "earth",
tuneGrid = marsGrid
)
marsModel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.383438 0.2405683 3.597961
## 1 4 2.727602 0.7035031 2.184240
## 1 6 2.331605 0.7835496 1.833420
## 1 8 1.870959 0.8585503 1.464551
## 1 10 1.787676 0.8711960 1.386944
## 1 12 1.821005 0.8670619 1.419893
## 1 14 1.862343 0.8623072 1.446050
## 1 16 1.875619 0.8597499 1.460975
## 1 18 1.879956 0.8591348 1.464279
## 1 20 1.879956 0.8591348 1.464279
## 2 2 4.383438 0.2405683 3.597961
## 2 4 2.730222 0.7028372 2.183075
## 2 6 2.336551 0.7823254 1.819513
## 2 8 1.890142 0.8550741 1.475571
## 2 10 1.592695 0.8961849 1.245305
## 2 12 1.426399 0.9169354 1.112717
## 2 14 1.417106 0.9196136 1.108673
## 2 16 1.465404 0.9150579 1.143336
## 2 18 1.460856 0.9160317 1.136890
## 2 20 1.460394 0.9161043 1.136405
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
plot(marsModel)
MARS Plot: RMSE drops as terms added, then flattens out. Degree-2 models tend to win because the true function has an interaction term (sin of pi * x1 * x2).
marsPred <- predict(marsModel, newdata = testData$x)
marsPerf <- postResample(pred = marsPred, obs = testData$y)
marsPerf
## RMSE Rsquared MAE
## 1.1722635 0.9448890 0.9324923
varImp(marsModel) %>% plot(main = "MARS Variable Importance")
MARS Variable Importance: X1 through X5 dominate. X6 through X10 get near-zero importance.
svmModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10
)
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.495040 0.7816921 1.987656
## 0.50 2.284073 0.7970033 1.808250
## 1.00 2.138541 0.8171060 1.681546
## 2.00 2.042548 0.8298213 1.598532
## 4.00 2.022622 0.8311882 1.580163
## 8.00 2.020573 0.8312937 1.577614
## 16.00 2.019380 0.8314497 1.577164
## 32.00 2.019380 0.8314497 1.577164
## 64.00 2.019380 0.8314497 1.577164
## 128.00 2.019380 0.8314497 1.577164
##
## Tuning parameter 'sigma' was held constant at a value of 0.06670077
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06670077 and C = 16.
svmPred <- predict(svmModel, newdata = testData$x)
svmPerf <- postResample(pred = svmPred, obs = testData$y)
svmPerf
## RMSE Rsquared MAE
## 2.0829707 0.8242096 1.5826017
nnetGrid <- expand.grid(
size = seq(1, 10, by = 2),
decay = c(0, 0.001, 0.01, 0.1)
)
nnetModel <- train(
x = trainingData$x,
y = trainingData$y,
method = "nnet",
preProc = c("center", "scale"),
tuneGrid = nnetGrid,
linout = TRUE,
trace = FALSE,
maxit = 500
)
nnetModel
## Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 1 0.000 2.684486 0.7057898 2.121664
## 1 0.001 2.676616 0.7033472 2.098347
## 1 0.010 2.511628 0.7438056 1.954489
## 1 0.100 2.522268 0.7409498 1.958387
## 3 0.000 3.298346 0.6409900 2.361192
## 3 0.001 3.105130 0.6367001 2.442877
## 3 0.010 2.893815 0.6776103 2.290386
## 3 0.100 2.593636 0.7329024 2.045915
## 5 0.000 6.236517 0.3727671 3.758907
## 5 0.001 3.577890 0.5687275 2.808683
## 5 0.010 3.499998 0.5870890 2.720500
## 5 0.100 3.017321 0.6572537 2.391736
## 7 0.000 7.376142 0.3807040 4.137385
## 7 0.001 4.339330 0.4958139 3.300207
## 7 0.010 3.604441 0.5879934 2.854082
## 7 0.100 3.449392 0.6018107 2.715443
## 9 0.000 5.376651 0.4818358 3.624401
## 9 0.001 3.953806 0.5288776 3.122270
## 9 0.010 3.880901 0.5494935 3.002392
## 9 0.100 3.382388 0.5976321 2.724585
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 0.01.
nnetPred <- predict(nnetModel, newdata = testData$x)
nnetPerf <- postResample(pred = nnetPred, obs = testData$y)
nnetPerf
## RMSE Rsquared MAE
## 2.6432552 0.7194546 2.0232008
results <- list(
KNN = knnPerf,
MARS = marsPerf,
SVM = svmPerf,
NNET = nnetPerf
) %>%
map_dfr(~ as_tibble(t(.x)), .id = "Model") %>%
arrange(RMSE)
results %>% knitr::kable(digits = 4)
| Model | RMSE | Rsquared | MAE |
|---|---|---|---|
| MARS | 1.1723 | 0.9449 | 0.9325 |
| SVM | 2.0830 | 0.8242 | 1.5826 |
| NNET | 2.6433 | 0.7195 | 2.0232 |
| KNN | 3.2041 | 0.6820 | 2.5683 |
results %>%
pivot_longer(cols = c(RMSE, Rsquared), names_to = "Metric", values_to = "Value") %>%
ggplot(aes(x = reorder(Model, -Value), y = Value, fill = Model)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ Metric, scales = "free_y") +
labs(
title = "Model Performance Comparison (Friedman 1)",
x = NULL,
y = NULL
) +
theme_minimal(base_size = 14)
Comparison Chart: SVM and MARS typically come out on top here. KNN struggles with the dimensionality, and the neural net can be inconsistent.
The informative predictors are X1 through X5. X6 through X10 are non-informative noise variables. MARS should assign high importance to X1 through X5 and low or zero importance to X6 through X10.
varImp(marsModel)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.24
## X2 48.74
## X5 15.53
## X3 0.00
MARS is the standout for this problem. It performs well and, unlike the other models, it also tells which predictors actually matter. That interpretability is a real advantage when you care about understanding the data, not just predicting well.