We will create four models (KNN, SVM, a neural network, and MARS) and evaluate their accuracy on the training and test data to see which model fits the data best.
library(mlbench)
library(kableExtra)
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
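For reference, mlbench.friedman1 simulates the Friedman benchmark regression problem: ten independent predictors are drawn uniformly on [0, 1], but only the first five enter the response,

y = 10 sin(pi * x1 * x2) + 20 * (x3 - 0.5)^2 + 10 * x4 + 5 * x5 + e,

where e is Gaussian noise with standard deviation given by the sd argument (1 here). The remaining five predictors are pure noise that the models below must learn to ignore.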
knn <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knn
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knn, newdata = testData$x)
knnAccuracy <- postResample(pred = knnPred, obs = testData$y)
svmRadial <- train(x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
tuneLength=10,
preProc = c("center", "scale"))
svmRadial
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.545335 0.7804647 2.015121
## 0.50 2.319786 0.7965148 1.830009
## 1.00 2.188349 0.8119636 1.726027
## 2.00 2.103655 0.8241314 1.655842
## 4.00 2.066879 0.8294322 1.631051
## 8.00 2.052681 0.8313929 1.623550
## 16.00 2.049867 0.8318312 1.621820
## 32.00 2.049867 0.8318312 1.621820
## 64.00 2.049867 0.8318312 1.621820
## 128.00 2.049867 0.8318312 1.621820
##
## Tuning parameter 'sigma' was held constant at a value of 0.06802164
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06802164 and C = 16.
svmRadialPred <- predict(svmRadial, newdata = testData$x)
svmRadialAccuracy <- postResample(pred = svmRadialPred, obs = testData$y)
nnetGrid <- expand.grid(.decay=c(0, 0.01, 0.1, 0.5, 0.9),
.size=c(1, 10, 15, 20),
.bag=FALSE)
nnet <- train(x = trainingData$x,
y = trainingData$y,
method = "avNNet",
tuneGrid = nnetGrid,
preProc = c("center", "scale"),
trace=FALSE,
linout=TRUE,
maxit=500)
## Warning: executing %dopar% sequentially: no parallel backend registered
nnet
## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 2.589902 0.7324403 2.014450
## 0.00 10 3.082909 0.6581005 2.345346
## 0.00 15 2.666684 0.7202739 2.135432
## 0.00 20 2.634394 0.7273102 2.116201
## 0.01 1 2.567190 0.7363972 1.993021
## 0.01 10 2.714509 0.7118994 2.173845
## 0.01 15 2.435366 0.7634558 1.935705
## 0.01 20 2.346376 0.7801847 1.850974
## 0.10 1 2.580129 0.7336990 2.000459
## 0.10 10 2.528971 0.7492960 2.003431
## 0.10 15 2.309856 0.7879857 1.823430
## 0.10 20 2.289300 0.7922572 1.799799
## 0.50 1 2.620985 0.7251648 2.034073
## 0.50 10 2.389468 0.7734132 1.893293
## 0.50 15 2.248817 0.7979988 1.778851
## 0.50 20 2.257951 0.7973906 1.768133
## 0.90 1 2.649162 0.7195330 2.057453
## 0.90 10 2.339031 0.7803865 1.849270
## 0.90 15 2.247236 0.7980673 1.774157
## 0.90 20 2.248629 0.7989807 1.770371
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 15, decay = 0.9 and bag = FALSE.
nnetPred <- predict(nnet, newdata = testData$x)
nnetAccuracy <- postResample(pred = nnetPred, obs = testData$y)
marsGrid <- expand.grid(.degree=1:2,
.nprune=2:20)
mars <- train(x = trainingData$x,
y = trainingData$y,
method = "earth",
tuneGrid = marsGrid,
preProc = c("center", "scale"))
## Loading required package: earth
## Warning: package 'earth' was built under R version 3.6.3
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
## Warning: package 'TeachingDemos' was built under R version 3.6.3
mars
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.416233 0.2187486 3.630918
## 1 3 3.716956 0.4495842 2.981710
## 1 4 2.829383 0.6783492 2.285456
## 1 5 2.500824 0.7495068 2.002628
## 1 6 2.377484 0.7721684 1.897799
## 1 7 1.964112 0.8402449 1.540566
## 1 8 1.849850 0.8590078 1.446960
## 1 9 1.760287 0.8734284 1.379551
## 1 10 1.745434 0.8758361 1.358502
## 1 11 1.721573 0.8784527 1.333126
## 1 12 1.741003 0.8757510 1.341457
## 1 13 1.762562 0.8729306 1.355525
## 1 14 1.779852 0.8704986 1.376884
## 1 15 1.796118 0.8682289 1.386179
## 1 16 1.801970 0.8673854 1.392551
## 1 17 1.801970 0.8673854 1.392551
## 1 18 1.801970 0.8673854 1.392551
## 1 19 1.801970 0.8673854 1.392551
## 1 20 1.801970 0.8673854 1.392551
## 2 2 4.421087 0.2136318 3.622047
## 2 3 3.738888 0.4424544 3.004778
## 2 4 2.878704 0.6647839 2.315845
## 2 5 2.556082 0.7359640 2.039618
## 2 6 2.448842 0.7590378 1.941403
## 2 7 2.076809 0.8217323 1.631062
## 2 8 1.919449 0.8477097 1.506739
## 2 9 1.750995 0.8737507 1.383005
## 2 10 1.589003 0.8961730 1.262151
## 2 11 1.503925 0.9077370 1.182204
## 2 12 1.459184 0.9131975 1.139692
## 2 13 1.465062 0.9130346 1.140011
## 2 14 1.440347 0.9155789 1.123394
## 2 15 1.462849 0.9131866 1.141915
## 2 16 1.477622 0.9110707 1.148694
## 2 17 1.479198 0.9110804 1.152683
## 2 18 1.480900 0.9108744 1.151674
## 2 19 1.475717 0.9116473 1.146616
## 2 20 1.475717 0.9116473 1.146616
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
marsPred <- predict(mars, newdata = testData$x)
marsAccuracy <- postResample(pred = marsPred, obs = testData$y)
Examining the resampling summaries above, MARS has the best accuracy on the training data. The table below summarises accuracy on the test data; based on it, MARS also has the best accuracy of the four models we created.
accuracies <- rbind(marsAccuracy, svmRadialAccuracy, knnAccuracy, nnetAccuracy)
rownames(accuracies) <- c("MARS", "SVM", "KNN", "NeuralNet")
accuracies %>%
  kable() %>%
  kable_styling()
|           |     RMSE |  Rsquared |      MAE |
|-----------|---------:|----------:|---------:|
| MARS      | 1.277999 | 0.9338365 | 1.014707 |
| SVM       | 2.086465 | 0.8236735 | 1.585465 |
| KNN       | 3.204060 | 0.6819919 | 2.568346 |
| NeuralNet | 1.894755 | 0.8561760 | 1.441820 |
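The values in this table come from caret's postResample, which reports RMSE, R-squared (the squared correlation between predictions and observations), and MAE. As a quick sanity check, the RMSE for any model can also be computed directly from its test predictions; a minimal sketch for the MARS model:

# Root mean squared error computed by hand; should match marsAccuracy["RMSE"]
sqrt(mean((as.vector(marsPred) - testData$y)^2))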
As in the previous example, we will train four models and compare their accuracy metrics to see which performs best.
library(AppliedPredictiveModeling)
## Warning: package 'AppliedPredictiveModeling' was built under R version 3.6.3
data(ChemicalManufacturingProcess)
set.seed(100)
pdata <- preProcess(ChemicalManufacturingProcess[,-1], method = c("center", "scale", "knnImpute", "corr", "nzv"))
chemdata <- predict(pdata, ChemicalManufacturingProcess[,-1])
dp <- createDataPartition(ChemicalManufacturingProcess$Yield, p=0.8, list=FALSE)
xtrain <- chemdata[dp, ]
ytrain <- ChemicalManufacturingProcess$Yield[dp]
xtest <- chemdata[-dp, ]
ytest <- ChemicalManufacturingProcess$Yield[-dp]
set.seed(1)
knn <- train(x = xtrain,
y = ytrain,
method = "knn",
preProc = c("center", "scale", "knnImpute"),
tuneLength = 10)
knn
## k-Nearest Neighbors
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56), nearest neighbor imputation (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.482540 0.4100795 1.183748
## 7 1.485428 0.4049431 1.189266
## 9 1.473237 0.4184013 1.177743
## 11 1.467948 0.4248495 1.180289
## 13 1.479784 0.4165194 1.193682
## 15 1.485334 0.4192007 1.201442
## 17 1.496547 0.4160323 1.211116
## 19 1.510702 0.4102281 1.224902
## 21 1.521165 0.4035792 1.231536
## 23 1.525144 0.4097973 1.235632
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 11.
knnpred <- predict(knn, newdata = xtest)
knnAccuracy <- postResample(pred = knnpred, obs = ytest)
nnetGrid <- expand.grid(.decay=c(0, 0.01, 0.1),
.size=c(1, 5, 10),
.bag=FALSE)
set.seed(1)
nnet <- train(x = xtrain,
y = ytrain,
method = "avNNet",
tuneGrid = nnetGrid,
preProc = c("center", "scale", "knnImpute"),
trace=FALSE,
linout=TRUE,
maxit=500)
nnet
## Model Averaged Neural Network
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56), nearest neighbor imputation (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 1.670154 0.29877873 1.350984
## 0.00 5 2.605996 0.20831484 2.035429
## 0.00 10 10.978989 0.06047725 6.867954
## 0.01 1 1.789480 0.32795087 1.410891
## 0.01 5 2.072122 0.28544188 1.543500
## 0.01 10 2.432073 0.28266858 1.879018
## 0.10 1 2.000773 0.32085344 1.486073
## 0.10 5 2.151048 0.28253172 1.528974
## 0.10 10 1.908922 0.32919978 1.469870
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1, decay = 0 and bag = FALSE.
nnetPred <- predict(nnet, newdata = xtest)
nnetAccuracy <- postResample(pred = nnetPred, obs = ytest)
marsGrid <- expand.grid(.degree=1:2,
.nprune=2:10)
set.seed(1)
mars <- train(x = xtrain,
y = ytrain,
method = "earth",
tuneGrid = marsGrid,
preProc = c("center", "scale", "knnImpute"))
mars
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56), nearest neighbor imputation (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.494264 0.4136331 1.159887
## 1 3 1.354276 0.5142937 1.071760
## 1 4 1.324060 0.5374051 1.060051
## 1 5 1.346800 0.5289868 1.081678
## 1 6 1.361229 0.5230820 1.090879
## 1 7 1.426639 0.5056107 1.103635
## 1 8 1.482692 0.4867903 1.144496
## 1 9 1.505364 0.4871104 1.146406
## 1 10 1.495891 0.4928657 1.142761
## 2 2 1.494264 0.4136331 1.159887
## 2 3 1.425216 0.4812740 1.111889
## 2 4 1.375637 0.5187868 1.084589
## 2 5 7.066235 0.5047428 1.860955
## 2 6 1.419019 0.5133965 1.102925
## 2 7 6.739013 0.4769475 1.859735
## 2 8 6.157886 0.4572140 1.803299
## 2 9 5.928832 0.4537672 1.793980
## 2 10 5.678943 0.4475402 1.764904
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
marsPred <- predict(mars, newdata = xtest)
marsAccuracy <- postResample(pred = marsPred, obs = ytest)
set.seed(1)
svm <- train(x = xtrain,
y = ytrain,
method = "svmRadial",
tuneLength=10,
preProc = c("center", "scale", "knnImpute"))
svm
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56), nearest neighbor imputation (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.511314 0.4351025 1.215913
## 0.50 1.412499 0.4826241 1.137893
## 1.00 1.340844 0.5233655 1.080180
## 2.00 1.300915 0.5490888 1.045006
## 4.00 1.285525 0.5570409 1.026532
## 8.00 1.277907 0.5612855 1.019036
## 16.00 1.277799 0.5613581 1.018883
## 32.00 1.277799 0.5613581 1.018883
## 64.00 1.277799 0.5613581 1.018883
## 128.00 1.277799 0.5613581 1.018883
##
## Tuning parameter 'sigma' was held constant at a value of 0.01375209
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01375209 and C = 16.
svmPred <- predict(svm, newdata = xtest)
svmAccuracy <- postResample(pred = svmPred, obs = ytest)
Examining the model summary above, the SVM model achieves a resampled RMSE of 1.277799 and an Rsquared of 0.5613581, the lowest RMSE among the four models' resampling results. The table below displays the accuracy of all four models on the test set. SVM shows the highest Rsquared but also the highest RMSE; we select the SVM model on the strength of its Rsquared.
accuracies <- rbind(marsAccuracy, svmAccuracy, knnAccuracy, nnetAccuracy)
rownames(accuracies) <- c("MARS", "SVM", "KNN", "NeuralNet")
accuracies %>%
  kable() %>%
  kable_styling()
|           |     RMSE |  Rsquared |       MAE |
|-----------|---------:|----------:|----------:|
| MARS      | 1.165701 | 0.5172382 | 0.9320261 |
| SVM       | 2.086465 | 0.8236735 | 1.5854649 |
| KNN       | 1.243071 | 0.4442773 | 1.0406534 |
| NeuralNet | 1.401899 | 0.2742625 | 1.2220281 |
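Because the table above reflects a single held-out test set, it is also worth comparing the bootstrap resampling distributions of the four models directly. A minimal sketch using caret's resamples() helper (the comparison is only strictly apples-to-apples when the models were fit on identical resampling indices, e.g. via a shared trainControl or, as appears to be the case here, the same seed set before each call to train):

# Collect the 25 bootstrap resamples of each tuned model and summarise them side by side
modelResamples <- resamples(list(KNN = knn, NeuralNet = nnet, MARS = mars, SVM = svm))
summary(modelResamples)   # RMSE, Rsquared, and MAE distributions per model
bwplot(modelResamples)    # box-and-whisker comparison (lattice)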
Based on the variable importance output below, ManufacturingProcess32 and ManufacturingProcess13 dominate the list. Beyond those first two manufacturing process variables, neither variable type (manufacturing process or biological material) clearly dominates the top ten.
Top ten variables from the previous assignment:

ManufacturingProcess32  100.00
ManufacturingProcess13   97.84
BiologicalMaterial06     82.22
ManufacturingProcess17   77.27
BiologicalMaterial03     76.21
ManufacturingProcess36   70.77
BiologicalMaterial02     68.79
ManufacturingProcess09   67.86
BiologicalMaterial12     63.36
ManufacturingProcess06   55.15
Comparing the list above with the output below, we see that the linear and nonlinear models produced the same top ten predictors.
varImp(svm)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 97.84
## BiologicalMaterial06 82.22
## ManufacturingProcess17 77.27
## BiologicalMaterial03 76.21
## ManufacturingProcess36 70.77
## BiologicalMaterial02 68.79
## ManufacturingProcess09 67.86
## BiologicalMaterial12 63.36
## ManufacturingProcess06 55.15
## BiologicalMaterial04 54.31
## ManufacturingProcess33 49.26
## ManufacturingProcess31 47.73
## ManufacturingProcess11 45.72
## BiologicalMaterial11 42.44
## BiologicalMaterial08 41.89
## ManufacturingProcess29 41.28
## BiologicalMaterial01 41.19
## BiologicalMaterial09 39.70
## ManufacturingProcess02 36.69
plot(varImp(svm))
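To make the overlap with the previous assignment explicit, the top ten predictor names from both models could also be compared programmatically. A minimal sketch, assuming the linear model from the previous assignment is still available as an object named plsModel (a hypothetical name):

# Extract the ten highest-ranked predictor names from a caret train object
top10 <- function(model) {
  imp <- varImp(model)$importance
  rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:10]
}
intersect(top10(svm), top10(plsModel))   # plsModel: hypothetical model object from the prior assignment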
Based on the plot below, we see that ManufacturingProcess32 and yield are positively correlated, with a correlation coefficient of 0.6083.
plot(ChemicalManufacturingProcess$ManufacturingProcess32, ChemicalManufacturingProcess$Yield)
abline(lm(ChemicalManufacturingProcess$Yield~ChemicalManufacturingProcess$ManufacturingProcess32),col="red",lwd=1.5)
cor(ChemicalManufacturingProcess$ManufacturingProcess32, ChemicalManufacturingProcess$Yield)
## [1] 0.6083321
Based on the plot below, we see that ManufacturingProcess13 and yield are negatively correlated, with a correlation coefficient of -0.5036797.
plot(ChemicalManufacturingProcess$ManufacturingProcess13, ChemicalManufacturingProcess$Yield)
abline(lm(ChemicalManufacturingProcess$Yield~ChemicalManufacturingProcess$ManufacturingProcess13),col="red",lwd=1.5)
cor(ChemicalManufacturingProcess$ManufacturingProcess13, ChemicalManufacturingProcess$Yield)
## [1] -0.5036797
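The same pairwise check can be extended to several of the top predictors at once; a minimal sketch (use = "complete.obs" is included because the raw predictor columns contain missing values):

topVars <- c("ManufacturingProcess32", "ManufacturingProcess13", "BiologicalMaterial06",
             "ManufacturingProcess17", "BiologicalMaterial03")
# Correlation of each top predictor with Yield, ignoring rows with missing values
sapply(ChemicalManufacturingProcess[, topVars],
       function(v) cor(v, ChemicalManufacturingProcess$Yield, use = "complete.obs"))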