library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(AppliedPredictiveModeling)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(nnet)
set.seed(200)
Friedman (1991) introduced several benchmark data sets create by simulation. One of these simulations used the following nonlinear equation to create data:
where the x values are random variables uniformly distributed
between [0, 1] (there are also 5 other non-informative variables also
created in the simulation). The package mlbench contains a function
called mlbench.friedman1 that simulates these data:
training_data <- mlbench.friedman1(200, sd = 1)
training_data$x <- data.frame(training_data$x)
featurePlot(training_data$x, training_data$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
test_data <- mlbench.friedman1(5000, sd = 1)
test_data$x <- data.frame(test_data$x)
ctrl <- trainControl(method = "cv", number = 10)
Tune several models on these data.
knn_model <- train(x = training_data$x,
y = training_data$y,
method = 'knn',
preProc = c("center", "scale"),
tuneLength = 10,
trControl = ctrl)
knn_model
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.305255 0.5721163 2.754163
## 7 3.149006 0.6185350 2.606181
## 9 3.088199 0.6674249 2.510615
## 11 3.071514 0.6867903 2.506869
## 13 3.067138 0.6963304 2.491396
## 15 3.120643 0.6955263 2.523599
## 17 3.107764 0.7108114 2.526434
## 19 3.124437 0.7189758 2.539485
## 21 3.156001 0.7136684 2.575904
## 23 3.179459 0.7172603 2.601060
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
knn_pred <- predict(knn_model, newdata = test_data$x)
pr_knn <- postResample(pred = knn_pred, obs = test_data$y)
compare <- data.frame(model = "knn",
r_squared = pr_knn[["Rsquared"]],
rmse = pr_knn[["RMSE"]],
mse = pr_knn[["RMSE"]]^2)
svm_model <- train(x = training_data$x,
y = training_data$y,
method = 'svmRadial',
preProc = c("center", "scale"),
tuneLength = 14,
trControl = ctrl)
svm_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.503872 0.7977922 1.994784
## 0.50 2.263709 0.8131959 1.792161
## 1.00 2.089745 0.8350487 1.640996
## 2.00 1.984750 0.8499042 1.540859
## 4.00 1.920550 0.8599271 1.475066
## 8.00 1.891624 0.8643280 1.455714
## 16.00 1.880946 0.8665123 1.463006
## 32.00 1.880885 0.8665323 1.463142
## 64.00 1.880885 0.8665323 1.463142
## 128.00 1.880885 0.8665323 1.463142
## 256.00 1.880885 0.8665323 1.463142
## 512.00 1.880885 0.8665323 1.463142
## 1024.00 1.880885 0.8665323 1.463142
## 2048.00 1.880885 0.8665323 1.463142
##
## Tuning parameter 'sigma' was held constant at a value of 0.06273868
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06273868 and C = 32.
svm_pred <- predict(svm_model, newdata = test_data$x)
pr_svm <- postResample(pred = svm_pred, obs = test_data$y)
svm_row <- data.frame(model = "svm",
r_squared = pr_svm[["Rsquared"]],
rmse = pr_svm[["RMSE"]],
mse = pr_svm[["RMSE"]]^2)
compare <- rbind(compare, svm_row)
mars_grid <- expand.grid(.degree = 1:2, .nprune = 2:38)
mars_model <- train(x = training_data$x,
y = training_data$y,
method = 'earth',
tuneGrid = mars_grid,
trControl = ctrl)
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
mars_model
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.336509 0.2780552 3.644285
## 1 3 3.473744 0.5268355 2.810284
## 1 4 2.585918 0.7307972 2.097935
## 1 5 2.281714 0.7932946 1.850226
## 1 6 2.220784 0.8080737 1.752481
## 1 7 1.778204 0.8755039 1.404480
## 1 8 1.694713 0.8831280 1.314474
## 1 9 1.677352 0.8870617 1.299383
## 1 10 1.637649 0.8939230 1.261202
## 1 11 1.585028 0.9016419 1.254184
## 1 12 1.589668 0.9023533 1.243048
## 1 13 1.636098 0.8963126 1.274448
## 1 14 1.667410 0.8928254 1.288750
## 1 15 1.665869 0.8929734 1.289378
## 1 16 1.665869 0.8929734 1.289378
## 1 17 1.665869 0.8929734 1.289378
## 1 18 1.665869 0.8929734 1.289378
## 1 19 1.665869 0.8929734 1.289378
## 1 20 1.665869 0.8929734 1.289378
## 1 21 1.665869 0.8929734 1.289378
## 1 22 1.665869 0.8929734 1.289378
## 1 23 1.665869 0.8929734 1.289378
## 1 24 1.665869 0.8929734 1.289378
## 1 25 1.665869 0.8929734 1.289378
## 1 26 1.665869 0.8929734 1.289378
## 1 27 1.665869 0.8929734 1.289378
## 1 28 1.665869 0.8929734 1.289378
## 1 29 1.665869 0.8929734 1.289378
## 1 30 1.665869 0.8929734 1.289378
## 1 31 1.665869 0.8929734 1.289378
## 1 32 1.665869 0.8929734 1.289378
## 1 33 1.665869 0.8929734 1.289378
## 1 34 1.665869 0.8929734 1.289378
## 1 35 1.665869 0.8929734 1.289378
## 1 36 1.665869 0.8929734 1.289378
## 1 37 1.665869 0.8929734 1.289378
## 1 38 1.665869 0.8929734 1.289378
## 2 2 4.336509 0.2780552 3.644285
## 2 3 3.473744 0.5268355 2.810284
## 2 4 2.585918 0.7307972 2.097935
## 2 5 2.263535 0.7970551 1.850627
## 2 6 2.269918 0.7979362 1.760438
## 2 7 1.769627 0.8768774 1.395560
## 2 8 1.642242 0.8946240 1.292210
## 2 9 1.423601 0.9191185 1.133361
## 2 10 1.398904 0.9219194 1.095466
## 2 11 1.365153 0.9265078 1.056961
## 2 12 1.334194 0.9309022 1.054664
## 2 13 1.340735 0.9311837 1.050705
## 2 14 1.302618 0.9355875 1.025773
## 2 15 1.313371 0.9349276 1.029788
## 2 16 1.287783 0.9379375 1.014472
## 2 17 1.287783 0.9379375 1.014472
## 2 18 1.287783 0.9379375 1.014472
## 2 19 1.287783 0.9379375 1.014472
## 2 20 1.287783 0.9379375 1.014472
## 2 21 1.287783 0.9379375 1.014472
## 2 22 1.287783 0.9379375 1.014472
## 2 23 1.287783 0.9379375 1.014472
## 2 24 1.287783 0.9379375 1.014472
## 2 25 1.287783 0.9379375 1.014472
## 2 26 1.287783 0.9379375 1.014472
## 2 27 1.287783 0.9379375 1.014472
## 2 28 1.287783 0.9379375 1.014472
## 2 29 1.287783 0.9379375 1.014472
## 2 30 1.287783 0.9379375 1.014472
## 2 31 1.287783 0.9379375 1.014472
## 2 32 1.287783 0.9379375 1.014472
## 2 33 1.287783 0.9379375 1.014472
## 2 34 1.287783 0.9379375 1.014472
## 2 35 1.287783 0.9379375 1.014472
## 2 36 1.287783 0.9379375 1.014472
## 2 37 1.287783 0.9379375 1.014472
## 2 38 1.287783 0.9379375 1.014472
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 16 and degree = 2.
mars_pred <- predict(mars_model, newdata = test_data$x)
pr_mars <- postResample(pred = mars_pred, obs = test_data$y)
mars_row <- data.frame(model = "mars",
r_squared = pr_mars[["Rsquared"]],
rmse = pr_mars[["RMSE"]],
mse = pr_mars[["RMSE"]]^2)
compare <- rbind(compare, mars_row)
nnet_grid <- expand.grid(.decay = c(0, 0.01, .1),
.size = c(1:10),
.bag = FALSE)
nnet_model <- train(x = training_data$x,
y = training_data$y,
method = 'avNNet',
tuneGrid = nnet_grid,
trControl = ctrl,
preProc = c("center", "scale"),
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(training_data$x) + 1) + 10 + 1,
maxit = 500)
## Warning: executing %dopar% sequentially: no parallel backend registered
nnet_model
## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 2.425781 0.7772900 1.912506
## 0.00 2 2.445790 0.7763799 1.918591
## 0.00 3 2.050823 0.8347940 1.595904
## 0.00 4 2.027251 0.8375237 1.602809
## 0.00 5 2.529135 0.7697625 1.860244
## 0.00 6 2.738552 0.7316011 2.092375
## 0.00 7 4.489358 0.5593220 2.651495
## 0.00 8 5.676330 0.4963704 3.272238
## 0.00 9 5.839399 0.4988938 3.508280
## 0.00 10 3.259136 0.6455203 2.209138
## 0.01 1 2.407830 0.7808729 1.912057
## 0.01 2 2.414800 0.7768646 1.905023
## 0.01 3 2.079379 0.8283189 1.630607
## 0.01 4 2.046872 0.8352091 1.627392
## 0.01 5 2.066268 0.8371940 1.625184
## 0.01 6 2.140136 0.8186529 1.748623
## 0.01 7 2.191099 0.8195851 1.678329
## 0.01 8 2.307624 0.7944878 1.813945
## 0.01 9 2.569384 0.7484895 2.043404
## 0.01 10 2.737772 0.7233009 2.203260
## 0.10 1 2.418022 0.7762852 1.897950
## 0.10 2 2.452679 0.7719314 1.914702
## 0.10 3 2.106233 0.8278623 1.630821
## 0.10 4 2.041037 0.8348755 1.608782
## 0.10 5 2.077090 0.8347308 1.664639
## 0.10 6 2.251557 0.8020515 1.793479
## 0.10 7 2.127261 0.8211299 1.651059
## 0.10 8 2.195091 0.8096029 1.735512
## 0.10 9 2.200882 0.8179188 1.756793
## 0.10 10 2.332269 0.7906976 1.813211
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 4, decay = 0 and bag = FALSE.
nnet_pred <- predict(nnet_model, newdata = test_data$x)
pr_nnet <- postResample(pred = nnet_pred, obs = test_data$y)
nnet_row <- data.frame(model = "nnet",
r_squared = pr_nnet[["Rsquared"]],
rmse = pr_nnet[["RMSE"]],
mse = pr_nnet[["RMSE"]]^2)
compare <- rbind(compare, nnet_row)
compare
## model r_squared rmse mse
## 1 knn 0.6747755 3.148156 9.910885
## 2 svm 0.8257365 2.073190 4.298119
## 3 mars 0.9471145 1.149250 1.320777
## 4 nnet 0.7692074 2.594235 6.730055
models <- list(SVM = svm_model, NNET = nnet_model,
KNN = knn_model, MARS = mars_model)
results <- sapply(models, function(m) {
preds <- predict(m, newdata = test_data$x)
RMSE(preds, test_data$y)
})
print(sort(results))
## MARS SVM NNET KNN
## 1.149250 2.073190 2.594235 3.148156
resamps <- resamples(models)
summary(resamps)
##
## Call:
## summary.resamples(object = resamps)
##
## Models: SVM, NNET, KNN, MARS
## Number of resamples: 10
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 1.156875 1.3117114 1.399348 1.463142 1.606072 1.865418 0
## NNET 1.268962 1.4215472 1.592861 1.602809 1.659504 2.229668 0
## KNN 1.834833 2.1471846 2.492130 2.491396 2.893954 3.005473 0
## MARS 0.860703 0.8903864 0.994729 1.014472 1.121062 1.236741 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 1.658470 1.719959 1.789232 1.880885 1.912734 2.385774 0
## NNET 1.501438 1.809560 1.969752 2.027251 2.028334 2.946942 0
## KNN 2.387649 2.844232 2.976898 3.067138 3.411783 3.646876 0
## MARS 1.059342 1.097373 1.260603 1.287783 1.347103 1.698515 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM 0.7798647 0.8423752 0.8731821 0.8665323 0.8997912 0.9295706 0
## NNET 0.7341464 0.8326077 0.8469449 0.8375237 0.8568889 0.9143177 0
## KNN 0.4963614 0.6105380 0.7121444 0.6963304 0.7896009 0.8407250 0
## MARS 0.8924392 0.9358747 0.9450349 0.9379375 0.9480275 0.9584190 0
dotplot(resamps)
Evaluation Analysis: In regards to the evaluation metrics shown above, the MARS model achieves the best overall performance, recording the lowest RMSE at 1.149250, the lowest MSE 1.320777 and the highest \(R^2\) value at 0.9471145. SVM performs marginally well with an \(R^2\) of 0.8257365 and an RMSE of 2.073190. NNET records a \(R2\) of 0.7692074 and RMSE of 2.594235 while KNN recorded the worst performance with an \(R^2\) of 0.6747755 and and RMSE of 3.148156.
par(mfrow = c(2,2), mar = c(3,3,2,1))
plot(test_data$y, knn_pred, xlab = "Actual", ylab = "Predicted", main = "KNN", col = "seagreen");abline(0,1, col = "red")
plot(test_data$y, svm_pred, xlab = "Actual", ylab = "Predicted", main = "SVM", col = "lightblue");abline(0,1, col = "red")
plot(test_data$y, mars_pred, xlab = "Actual", ylab = "Predicted", main = "MARS", col = "mediumpurple");abline(0,1, col = "red")
plot(test_data$y, nnet_pred, xlab = "Actual", ylab = "Predicted", main = "NNET", col = "wheat");abline(0,1, col = "red")
par(mfrow = c(1,1))
Predicted vs Actual Analysis: The red diagonal lines plotted above represent perfect prediction where \(prediction = actual\). MARS follows the diagonal most closely with minimal scatter, consistent with the evaluation metrics discussed previously. SVM and NNET show reasonable alignment along the diagonal but with lightly more spread while KNN shows the most deviation,with noticeable scatter suggesting that the model struggles to capture underlying relationship as well as the other 3 models, this is verified through the evaluation metrics above.
par(mfrow = c(2,2), mar = c(3,3,2,1))
plot(knn_pred, knn_pred - test_data$y, xlab = "Actual", ylab = "Predicted", main = "KNN", col = "seagreen");abline(h = 0, col = "red")
plot(svm_pred, svm_pred - test_data$y, xlab = "Actual", ylab = "Predicted", main = "SVM", col = "lightblue");abline(h = 0, col = "red")
plot(mars_pred, mars_pred - test_data$y, xlab = "Actual", ylab = "Predicted", main = "MARS", col = "mediumpurple");abline(h = 0, col = "red")
plot(nnet_pred, nnet_pred - test_data$y, xlab = "Actual", ylab = "Predicted", main = "NNET", col = "wheat");abline(h = 0, col = "red")
par(mfrow = c(1,1))
Residuals: The red horizontal line at zero represents no error. Mars residuals are captured in a narrow range, randomly scattered around zero, suggesting that it is a well fitted model. NNET and SVM closely follow with the same scattering while KNN displays the widest residual spread, showing some structure unlike the previous three models.
mars_imp <- varImp(mars_model)
mars_imp
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.23
## X2 48.73
## X5 15.52
## X3 0.00
plot(mars_imp, top = 20, main = "MARS Variable Importance")
summary(mars_model$finalModel)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=2,
## nprune=16)
##
## coefficients
## (Intercept) 20.378441
## h(0.621722-X1) -15.512132
## h(X1-0.621722) 9.177132
## h(0.601063-X2) -17.940676
## h(X2-0.601063) 10.064356
## h(X3-0.281766) 11.590022
## h(0.447442-X3) 14.641640
## h(X3-0.447442) -12.924806
## h(X3-0.606015) 13.416764
## h(0.734892-X4) -10.074386
## h(X4-0.734892) 9.687149
## h(0.850094-X5) -5.385762
## h(0.218266-X1) * h(X2-0.601063) -55.372637
## h(X1-0.218266) * h(X2-0.601063) -27.542831
## h(X1-0.621722) * h(X2-0.295997) -26.527403
## h(0.649253-X1) * h(0.601063-X2) 26.129827
##
## Selected 16 of 18 terms, and 5 of 10 predictors (nprune=16)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6-unused, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 11 4
## GCV 1.61518 RSS 210.6377 GRSq 0.934423 RSq 0.9568093
Mars Variable Importance: The MARS model used predictors X1-X5, with X6-X10 left unused. This does show that MARS performed well in automatic variable selection. The variable importance ranking matches with the true equations, where X1 and X2 contribute to the sin interaction term, X4 has a large linear coefficient of 10, X5 has a smaller coefficient of 5 and X3 has an importance score of 0.00 despite it still being present in the model, but contributing four hinge terms as seen above in the coefficients.