library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
## Fix the RNG state so the simulated data sets are reproducible.
set.seed(200)

## Simulate 200 training observations. mlbench.friedman1() returns a
## list with a response vector 'y' and a matrix of predictors 'x'.
trainingData <- mlbench.friedman1(n = 200, sd = 1)

## Store the predictors as a data frame instead of a matrix; one
## reason is that this gives the columns names.
trainingData[["x"]] <- data.frame(trainingData[["x"]])

## Take a first look at the predictors against the response
## (other exploratory methods would work as well).
featurePlot(x = trainingData[["x"]], y = trainingData[["y"]])

## Simulate a large test set (5000 observations) in the same way so
## that the true error rate can be estimated with good precision.
testData <- mlbench.friedman1(n = 5000, sd = 1)
testData[["x"]] <- data.frame(testData[["x"]])
## Tune a k-nearest-neighbours regression model on the training set.
## The predictors are centred and scaled before fitting, and
## tuneLength = 10 asks train() to evaluate ten candidate values of k.
## 'preProcess' is written out in full so the call does not depend on
## partial argument matching.
knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10
)
## Print the resampling results for each candidate k.
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
## Predict the simulated test set with the tuned kNN model.
knnPred <- predict(object = knnModel, newdata = testData[["x"]])
# MARS
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Tune a MARS model (method = "earth") on the training set.
## The grid holds 'degree' at 1 and varies 'nprune' from 2 to 38;
## no pre-processing is requested for this fit.
marsTune <- train(
  x = trainingData[["x"]],
  y = trainingData[["y"]],
  method = "earth",
  tuneGrid = expand.grid(degree = 1, nprune = 2:38)
)
## Print the resampling results for each value of nprune.
marsTune
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 4.383438 0.2405683 3.597961
## 3 3.645469 0.4745962 2.930453
## 4 2.727602 0.7035031 2.184240
## 5 2.449243 0.7611230 1.939231
## 6 2.331605 0.7835496 1.833420
## 7 1.976830 0.8421599 1.562591
## 8 1.870959 0.8585503 1.464551
## 9 1.804342 0.8683110 1.410395
## 10 1.787676 0.8711960 1.386944
## 11 1.790700 0.8707740 1.393076
## 12 1.821005 0.8670619 1.419893
## 13 1.858688 0.8617344 1.445459
## 14 1.862343 0.8623072 1.446050
## 15 1.871033 0.8607099 1.457618
## 16 1.875619 0.8597499 1.460975
## 17 1.879956 0.8591348 1.464279
## 18 1.879956 0.8591348 1.464279
## 19 1.879956 0.8591348 1.464279
## 20 1.879956 0.8591348 1.464279
## 21 1.879956 0.8591348 1.464279
## 22 1.879956 0.8591348 1.464279
## 23 1.879956 0.8591348 1.464279
## 24 1.879956 0.8591348 1.464279
## 25 1.879956 0.8591348 1.464279
## 26 1.879956 0.8591348 1.464279
## 27 1.879956 0.8591348 1.464279
## 28 1.879956 0.8591348 1.464279
## 29 1.879956 0.8591348 1.464279
## 30 1.879956 0.8591348 1.464279
## 31 1.879956 0.8591348 1.464279
## 32 1.879956 0.8591348 1.464279
## 33 1.879956 0.8591348 1.464279
## 34 1.879956 0.8591348 1.464279
## 35 1.879956 0.8591348 1.464279
## 36 1.879956 0.8591348 1.464279
## 37 1.879956 0.8591348 1.464279
## 38 1.879956 0.8591348 1.464279
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 1.
## Predict the simulated test set with the tuned MARS model.
MARSpred <- predict(object = marsTune, newdata = testData[["x"]])
# SVM
## Tune a radial-basis-function SVM on the training set.
## The predictors are centred and scaled before fitting, and
## tuneLength = 14 asks train() to evaluate fourteen candidate
## values of the cost parameter C.
svmTune <- train(
  x = trainingData[["x"]],
  y = trainingData[["y"]],
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneLength = 14
)
## Print the resampling results for each candidate C.
svmTune
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.525979 0.7804630 2.016014
## 0.50 2.293423 0.7960080 1.808878
## 1.00 2.156969 0.8112034 1.697751
## 2.00 2.081486 0.8226986 1.631756
## 4.00 2.050874 0.8270465 1.605588
## 8.00 2.046707 0.8280418 1.602151
## 16.00 2.046387 0.8281076 1.601595
## 32.00 2.046387 0.8281076 1.601595
## 64.00 2.046387 0.8281076 1.601595
## 128.00 2.046387 0.8281076 1.601595
## 256.00 2.046387 0.8281076 1.601595
## 512.00 2.046387 0.8281076 1.601595
## 1024.00 2.046387 0.8281076 1.601595
## 2048.00 2.046387 0.8281076 1.601595
##
## Tuning parameter 'sigma' was held constant at a value of 0.06529705
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06529705 and C = 16.
## Predict the simulated test set with the tuned SVM model.
SVMpred <- predict(object = svmTune, newdata = testData[["x"]])
## Test-set performance: postResample() compares each model's
## predictions with the observed test responses and reports
## RMSE, Rsquared and MAE.

## k-nearest neighbours
postResample(pred = knnPred, obs = testData[["y"]])
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461

## MARS
postResample(pred = MARSpred, obs = testData[["y"]])
## RMSE Rsquared MAE
## 1.776575 0.872700 1.358367

## SVM (radial basis function kernel)
postResample(pred = SVMpred, obs = testData[["y"]])
## RMSE Rsquared MAE
## 2.0792960 0.8247794 1.5796158
# Importance
## Print the variable importance scores of the tuned MARS model,
## i.e. which predictors the final model relies on.
varImp(marsTune)
## earth variable importance
##
## Overall
## X1 100.00
## X4 82.78
## X2 64.18
## X5 40.21
## X3 28.14
## X6 0.00
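Among the three models considered (kNN, MARS, and a radial-basis-function SVM), MARS achieved the best test-set performance, with the lowest RMSE (1.78) and MAE (1.36) and the highest R² (0.87). Its variable importance scores are non-zero only for X1–X5, so MARS successfully selected the informative predictors.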