library(mlbench)
## Warning: package 'mlbench' was built under R version 4.5.2
# Simulate the Friedman 1 benchmark data: 200 observations, noise sd = 1.
set.seed(123)
data <- mlbench.friedman1(200, sd = 1)
# data.frame() on the unnamed predictor matrix yields columns X1, X2, ...
df <- data.frame(data$x)
df$y <- data$y

# 80/20 train/test split. seq_len() is used instead of 1:nrow(df) so the index
# vector is well-defined even for an empty data frame, and floor() makes the
# integer sample size explicit rather than relying on implicit truncation.
set.seed(123)
trainIndex <- sample(seq_len(nrow(df)), floor(0.8 * nrow(df)))
# NOTE: `train` shares its name with caret::train(); calls such as train(...)
# still resolve to the function because R skips non-function objects there.
train <- df[trainIndex, ]
test <- df[-trainIndex, ]
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.3
## Loading required package: lattice
# Resampling scheme passed to each train() call: 5-fold cross-validation.
control <- trainControl(number = 5, method = "cv")

# k-nearest neighbours; caret evaluates 10 candidate values of k.
set.seed(123)
knn_model <- train(
  y ~ .,
  data = train,
  method = "knn",
  tuneLength = 10,
  trControl = control
)
# Support vector machine with a radial basis kernel; caret evaluates
# 10 candidate tuning values under the shared cross-validation scheme.
set.seed(123)
svm_model <- train(
  y ~ .,
  data = train,
  method = "svmRadial",
  tuneLength = 10,
  trControl = control
)
# Single-hidden-layer neural network (nnet) fitted to the numeric outcome y.
#
# linout = TRUE is forwarded through train()'s `...` to nnet::nnet() and makes
# the output unit linear. The nnet default is a logistic output unit, whose
# predictions are confined to [0, 1]; on an outcome of this scale that yields
# near-constant predictions, a very large RMSE and an undefined (NA) R-squared.
# Any NN metrics recorded from a fit without linout = TRUE must be regenerated.
#
# trace = FALSE is likewise forwarded to nnet::nnet() to silence the
# per-iteration optimisation log.
set.seed(123)
nnet_model <- train(y ~ ., data = train,
                    method = "nnet",
                    trControl = control,
                    linout = TRUE,
                    trace = FALSE,
                    tuneLength = 5)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Multivariate adaptive regression splines via the earth package; caret
# evaluates up to 10 candidate tuning values with the shared CV scheme.
set.seed(123)
mars_model <- train(
  y ~ .,
  data = train,
  method = "earth",
  tuneLength = 10,
  trControl = control
)
## Loading required package: earth
## Warning: package 'earth' was built under R version 4.5.2
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.5.2
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 4.5.2
# Show the terms, coefficients and fit statistics of the final earth model
# stored inside the caret train object.
summary(mars_model[["finalModel"]])
## Call: earth(x=matrix[160,10], y=c(5.599,13.92,2...), keepxy=TRUE, degree=1,
## nprune=8)
##
## coefficients
## (Intercept) 18.253152
## h(0.58175-X1) -10.144056
## h(0.5849-X2) -12.294953
## h(0.444527-X3) 11.811415
## h(X3-0.444527) 8.907508
## h(0.903401-X4) -10.326271
## h(X5-0.191312) 5.354069
## h(X8-0.828207) 8.785975
##
## Selected 8 of 20 terms, and 6 of 10 predictors (nprune=8)
## Termination condition: Reached nk 21
## Importance: X4, X2, X1, X5, X3, X8, X6-unused, X7-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 7 (additive model)
## GCV 2.733329 RSS 359.1765 GRSq 0.8761635 RSq 0.8970111
# Gather the cross-validation results of the four fitted models so their
# MAE, RMSE and R-squared distributions can be compared side by side.
# The list names become the model labels in the summary.
results <- resamples(
  list(kNN = knn_model, SVM = svm_model, NN = nnet_model, MARS = mars_model)
)

# Per-model summary statistics of each metric across the resamples.
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: kNN, SVM, NN, MARS
## Number of resamples: 5
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## kNN 2.029830 2.065465 2.192595 2.292289 2.353855 2.819702 0
## SVM 1.161699 1.513344 1.533736 1.474941 1.546174 1.619754 0
## NN 13.502103 13.641643 13.654174 13.641073 13.702464 13.704982 0
## MARS 1.251510 1.268957 1.290207 1.332457 1.420435 1.431177 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## kNN 2.534503 2.654364 2.701961 2.869894 2.923484 3.535158 0
## SVM 1.449625 1.825587 1.839818 1.845953 1.962907 2.151829 0
## NN 14.073605 14.362591 14.432178 14.416339 14.479334 14.733986 0
## MARS 1.537070 1.563061 1.690926 1.682290 1.724146 1.896249 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## kNN 0.5650286 0.6290567 0.6663214 0.6485648 0.6855466 0.6968708 0
## SVM 0.7518451 0.8367328 0.8420569 0.8491882 0.8998005 0.9155057 0
## NN NA NA NA NaN NA NA 5
## MARS 0.8044394 0.8646940 0.8819871 0.8732113 0.9036079 0.9113282 0
# Box-and-whisker plot of the resampled performance metrics for each model.
bwplot(results)
# Predict the held-out test set with each fitted model.
pred_knn <- predict(knn_model, newdata = test)
pred_svm <- predict(svm_model, newdata = test)
pred_nn <- predict(nnet_model, newdata = test)
pred_mars <- predict(mars_model, newdata = test)

# Test-set RMSE per model, one row per model in the same order as the labels.
data.frame(
  Model = c("kNN", "SVM", "NN", "MARS"),
  RMSE = vapply(
    list(pred_knn, pred_svm, pred_nn, pred_mars),
    function(pred) RMSE(pred, test$y),
    numeric(1)
  )
)
## Model RMSE
## 1 kNN 2.916709
## 2 SVM 1.870894
## 3 NN 14.303565
## 4 MARS 1.475404
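The models showed clear differences in performance. MARS performed best overall, with the lowest RMSE (about 1.68 in cross-validation and 1.48 on the test set) and the highest R² values, followed by SVM (about 1.85 and 1.87). The kNN model was less accurate (about 2.87 and 2.92), while the neural network performed very poorly, with an RMSE of about 14 and no computable R² in any resample. An undefined R² means the network's predictions were essentially constant, which points to a problem with how the network was fit (for example, an output unit that cannot reach the scale of y) rather than a fair measure of what a neural network can achieve on this data.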
Looking at the MARS model, it selected all five informative predictors (X1–X5), which are the only variables that drive the outcome in the Friedman 1 simulation. It also included one noise variable (X8), while the remaining noise variables (X6, X7, X9, and X10) were not used. This shows that MARS was able to identify the key predictors, with only a small amount of noise included.
Overall, MARS provided the best performance and was effective at identifying the most relevant variables, with SVM as a strong alternative.