This analysis uses the Boston data set, which contains housing data: features of the houses and their prices.
library(MASS)
library(caret)
library(earth)
# Load the Boston data set and show the type and first values of each column
data("Boston")
str(object = Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
The goal is to predict the medv column, i.e. the median value.
# createDataPartition() draws a random sample, so fix the RNG seed first;
# otherwise the train/test split (and every metric computed from it) changes
# on each run. 101 is the same seed value used for the caret fits in this file.
set.seed(101)
# Row indices of the 80% of observations assigned to the training set
inTraining <- createDataPartition(Boston$medv, p = 0.80, list = FALSE)
# Training rows are the sampled indices; testing rows are everything else
training <- Boston[inTraining, ]
testing <- Boston[-inTraining, ]
earth: Multivariate Adaptive Regression Splines
https://cran.r-project.org/web/packages/earth/index.html
# Fit an earth (MARS) model of medv on all other columns of the training set
model <- earth(medv ~ ., data = training)
# Display the fitted model
model
## Selected 18 of 23 terms, and 9 of 13 predictors
## Termination condition: Reached nk 27
## Importance: rm, lstat, dis, nox, ptratio, tax, rad, indus, crim, zn-unused, ...
## Number of terms at each degree of interaction: 1 17 (additive model)
## GCV 11.94898 RSS 4062.771 GRSq 0.8640527 RSq 0.8858688
# Split the held-out rows into the observed outcome and the predictor columns
test.target <- testing[["medv"]]
test.features <- testing[, setdiff(names(testing), "medv")]
# Predict medv for the held-out rows with the fitted earth model
predictions <- predict(model, newdata = test.features)
# RMSE: square root of the mean squared prediction error on the test set
sqrt(mean((predictions - test.target)^2))
## [1] 3.026785
RMSE (root mean squared error) is the square root of the average squared difference between the values predicted by a model and the actual values. It estimates how accurately the model predicts the target value. Lower values of RMSE indicate a better fit.
# R-squared, computed here as the squared correlation between the observed
# and the predicted test-set values
cor(x = test.target, y = predictions)^2
## medv
## [1,] 0.8748419
The higher the R-squared, the better the model fits the data.
Using the caret package, we can fit several models and compare their results.
# Resampling scheme passed to every train() call in this file:
# 10-fold cross-validation repeated 3 times.
# Reference: https://www.rdocumentation.org/packages/caret/versions/6.0-92/topics/trainControl
control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)
# Each model below is fit on the same training data, with centered and scaled
# predictors, the shared `control` resampling scheme, and RMSE as the selection
# metric. The seed is reset to the same value before every fit.

# Support Vector Machine with a radial kernel (SVM)
set.seed(101)
fit.svm <- train(
  medv ~ .,
  data = training,
  method = "svmRadial",
  metric = "RMSE",
  preProcess = c("center", "scale"),
  trControl = control
)

# k-Nearest Neighbors (KNN)
set.seed(101)
fit.knn <- train(
  medv ~ .,
  data = training,
  method = "knn",
  metric = "RMSE",
  preProcess = c("center", "scale"),
  trControl = control
)

# Multivariate Adaptive Regression Splines (MARS) via the earth package
set.seed(101)
fit.mars <- train(
  medv ~ .,
  data = training,
  method = "earth",
  metric = "RMSE",
  preProcess = c("center", "scale"),
  trControl = control
)
# Reference: https://www.rdocumentation.org/packages/caret/versions/4.47/topics/train
# Gather the resampling results of the three fitted models under short labels
allresults <- resamples(
  list(
    svm = fit.svm,
    knn = fit.knn,
    mars = fit.mars
  )
)
# Summarise the resampled metrics for each model
summary(object = allresults)
##
## Call:
## summary.resamples(object = allresults)
##
## Models: svm, knn, mars
## Number of resamples: 30
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## svm 1.579409 1.999353 2.249341 2.355454 2.830854 3.304638 0
## knn 1.954146 2.578427 2.989756 2.991557 3.357250 4.250244 0
## mars 2.226606 2.326069 2.635505 2.705679 2.917898 3.848573 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## svm 2.085712 2.824973 3.585743 3.796691 4.494065 6.177018 0
## knn 2.644881 3.845461 4.398009 4.668636 5.799785 6.512648 0
## mars 2.703273 3.204444 3.741829 3.874780 4.244377 6.356789 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## svm 0.6857213 0.7867525 0.8597685 0.8411137 0.9002478 0.9482930 0
## knn 0.5262871 0.7399963 0.7887091 0.7644481 0.8276592 0.8958144 0
## mars 0.4999108 0.8191060 0.8437008 0.8292226 0.8776157 0.9228219 0
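Lower values of RMSE indicate a better fit.
Higher values of R-squared indicate a better fit. In the summary above, svm has the lowest mean RMSE and the highest mean R-squared of the three models.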
# Dot plot of the collected resampling results, for a visual comparison of
# the models
dotplot(allresults)