library(mlbench)
library(caret)  # featurePlot(), train(), postResample(), varImp()
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## This creates a list with a vector 'y' and a matrix of predictors 'x'.
## We convert the 'x' data from a matrix to a data frame;
## one reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## Also simulate a large test set to estimate the true error rate
## with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
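Most of the models below are tuned with caret's train(), which by default uses 25 bootstrap resamples. As a sketch (the fits below simply rely on that default), a shared trainControl object passed as trControl = ctrl would guarantee identical resamples across models:
ctrl <- trainControl(method = "boot", number = 25)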
Tune several models on these data. For example:
KNN
knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set performance values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
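The tuning profile can also be inspected graphically; plot() on a train object shows the resampled RMSE against k:
plot(knnModel)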
SVM
svmModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "svmRadial",
                  preProcess = c("center", "scale"),
                  tuneLength = 20)
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.545335 0.7804647 2.015121
## 0.50 2.319786 0.7965148 1.830009
## 1.00 2.188349 0.8119636 1.726027
## 2.00 2.103655 0.8241314 1.655842
## 4.00 2.066879 0.8294322 1.631051
## 8.00 2.052681 0.8313929 1.623550
## 16.00 2.049867 0.8318312 1.621820
## 32.00 2.049867 0.8318312 1.621820
## 64.00 2.049867 0.8318312 1.621820
## 128.00 2.049867 0.8318312 1.621820
## 256.00 2.049867 0.8318312 1.621820
## 512.00 2.049867 0.8318312 1.621820
## 1024.00 2.049867 0.8318312 1.621820
## 2048.00 2.049867 0.8318312 1.621820
## 4096.00 2.049867 0.8318312 1.621820
## 8192.00 2.049867 0.8318312 1.621820
## 16384.00 2.049867 0.8318312 1.621820
## 32768.00 2.049867 0.8318312 1.621820
## 65536.00 2.049867 0.8318312 1.621820
## 131072.00 2.049867 0.8318312 1.621820
##
## Tuning parameter 'sigma' was held constant at a value of 0.06802164
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06802164 and C = 16.
svmPred <- predict(svmModel, newdata = testData$x)
svmpr <- postResample(pred = svmPred, obs = testData$y)
svmpr
## RMSE Rsquared MAE
## 2.0864652 0.8236735 1.5854649
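Note that sigma was held constant (caret estimates it with kernlab's sigest() and tunes only the cost). To tune sigma as well, a two-dimensional grid could be supplied; the grid values here are illustrative assumptions, not what was run above:
svmGrid <- expand.grid(sigma = c(0.01, 0.05, 0.1),
                       C = 2^seq(-2, 10))
svmModel2 <- train(x = trainingData$x, y = trainingData$y,
                   method = "svmRadial",
                   preProcess = c("center", "scale"),
                   tuneGrid = svmGrid)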
Neural Network
library(nnet)
## Single hidden layer with 5 units; MaxNWts caps the weight count at
## 5 * (p + 1) hidden-layer weights plus 5 + 1 output-layer weights.
nnetModel <- nnet(trainingData$x, trainingData$y,
                  size = 5,
                  decay = 0.01,
                  linout = TRUE,   # linear output for regression
                  trace = FALSE,
                  maxit = 500,
                  MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)
nnetPred <- predict(nnetModel, testData$x)
postResample(pred = nnetPred, obs = testData$y)
## RMSE Rsquared MAE
## 2.5842889 0.7416461 1.9705956
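A single nnet fit depends on the random starting weights. Kuhn and Johnson also consider averaging several networks; a hedged sketch using caret's avNNet method (the grid values are illustrative, and this was not run here):
nnetGrid <- expand.grid(size = 1:10,
                        decay = c(0, 0.01, 0.1),
                        bag = FALSE)
avNNetModel <- train(x = trainingData$x, y = trainingData$y,
                     method = "avNNet",
                     tuneGrid = nnetGrid,
                     preProc = c("center", "scale"),
                     linout = TRUE, trace = FALSE, maxit = 500,
                     MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1)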
MARS: Multivariate Adaptive Regression Splines
marsGrid <- expand.grid(.degree = 1:2,
                        .nprune = 2:20)
marsModel <- train(x = trainingData$x,
                   y = trainingData$y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   preProc = c("center", "scale"))
marsModel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.441521 0.2189670 3.657091
## 1 3 3.735661 0.4433125 3.009521
## 1 4 2.866002 0.6689285 2.293544
## 1 5 2.515705 0.7424904 2.010746
## 1 6 2.383991 0.7709143 1.894179
## 1 7 1.974017 0.8423134 1.553429
## 1 8 1.857331 0.8604192 1.463521
## 1 9 1.824625 0.8657419 1.419261
## 1 10 1.808885 0.8686792 1.401253
## 1 11 1.820357 0.8673487 1.397892
## 1 12 1.858969 0.8617098 1.430222
## 1 13 1.855403 0.8621244 1.432273
## 1 14 1.869544 0.8596946 1.446095
## 1 15 1.885361 0.8575057 1.462116
## 1 16 1.881835 0.8578251 1.458653
## 1 17 1.884816 0.8575150 1.462251
## 1 18 1.887079 0.8571628 1.464440
## 1 19 1.887079 0.8571628 1.464440
## 1 20 1.887079 0.8571628 1.464440
## 2 2 4.444235 0.2179564 3.660101
## 2 3 3.736840 0.4423882 3.010383
## 2 4 2.865763 0.6682012 2.287784
## 2 5 2.480514 0.7500061 1.977090
## 2 6 2.340100 0.7804955 1.840455
## 2 7 1.987852 0.8405024 1.558922
## 2 8 1.870327 0.8596864 1.466339
## 2 9 1.758141 0.8753405 1.376451
## 2 10 1.627200 0.8928529 1.266051
## 2 11 1.580031 0.8989829 1.238208
## 2 12 1.514494 0.9071887 1.181274
## 2 13 1.510248 0.9078432 1.181635
## 2 14 1.492638 0.9093650 1.168696
## 2 15 1.513457 0.9072019 1.186319
## 2 16 1.515728 0.9069050 1.186683
## 2 17 1.524082 0.9059094 1.195224
## 2 18 1.534602 0.9048036 1.203880
## 2 19 1.536397 0.9046605 1.205418
## 2 20 1.537450 0.9045338 1.206899
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
marsPred <- predict(marsModel, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)
## RMSE Rsquared MAE
## 1.2779993 0.9338365 1.0147070
rbind(
  "mars" = postResample(pred = marsPred, obs = testData$y),
  "svm"  = postResample(pred = svmPred,  obs = testData$y),
  "net"  = postResample(pred = nnetPred, obs = testData$y),
  "knn"  = postResample(pred = knnPred,  obs = testData$y)
)
## RMSE Rsquared MAE
## mars 1.277999 0.9338365 1.014707
## svm 2.086465 0.8236735 1.585465
## net 2.584289 0.7416461 1.970596
## knn 3.204059 0.6819919 2.568346
The MARS model is the most favorable, with the lowest test set RMSE. Below, the variable importance scores for the MARS model are calculated:
varImp(marsModel)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.40
## X2 49.00
## X5 15.72
## X3 0.00
The MARS model does select the informative predictors: only X1-X5 appear, and the noise variables X6-X10 are not used at all.
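To see which hinge functions the final model actually uses, the underlying earth fit can be summarized:
summary(marsModel$finalModel)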
Data Pre-Processing
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
(cmpImpute <- preProcess(ChemicalManufacturingProcess[, -1], method = "bagImpute"))
## Created from 152 samples and 57 variables
##
## Pre-processing:
## - bagged tree imputation (57)
## - ignored (0)
df <- predict(cmpImpute, ChemicalManufacturingProcess[, -1])
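A quick check that the bagged-tree imputation left no missing values (this should print 0):
sum(is.na(df))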
Nonlinear Regression Models
set.seed(999)
trainRow <- createDataPartition(ChemicalManufacturingProcess$Yield, p = 0.8, list = FALSE)
trainx <- df[trainRow, ]
trainy <- ChemicalManufacturingProcess$Yield[trainRow]
testx  <- df[-trainRow, ]
testy  <- ChemicalManufacturingProcess$Yield[-trainRow]
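BiologicalMaterial07 is essentially constant, which triggers repeated zero-variance warnings during resampling below. If desired, such columns could be dropped up front with caret's nearZeroVar (a sketch; the models below were fit without this step, so the warnings appear):
zv <- nearZeroVar(trainx)  # may flag BiologicalMaterial07
## trainx <- trainx[, -zv]; testx <- testx[, -zv]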
## KNN
knnModel <- train(x = trainx,
                  y = trainy,
                  method = "knn",
                  preProcess = c("center", "scale"),
                  tuneLength = 10)
## Warning in preProcess.default(...): These variables have zero variances:
## BiologicalMaterial07
## (warning repeated once per bootstrap resample; shown once here)
knnPred <- predict(knnModel, newdata = testx)
## NN
library(nnet)
nnetModel <- nnet(x = trainx,
                  y = trainy,
                  size = 5,
                  decay = 0.01,
                  linout = TRUE,   # linear output for regression
                  trace = FALSE,
                  maxit = 500,
                  MaxNWts = 5 * (ncol(trainx) + 1) + 5 + 1)
nnetPred <- predict(nnetModel, testx)
postResample(pred = nnetPred, obs = testy)
## RMSE Rsquared MAE
## 2.0553609 0.4133609 1.6090328
## MARS
marsGrid <- expand.grid(.degree = 1:2,
                        .nprune = 2:10)
marsModel <- train(trainx, trainy,
                   method = "earth",
                   tuneGrid = marsGrid,
                   preProc = c("center", "scale"))
## Warning in preProcess.default(...): These variables have zero variances:
## BiologicalMaterial07
## (repeated; shown once here)
marsPred <- predict(marsModel, newdata = testx)
## SVM
svmModel <- train(trainx, trainy,
                  method = "svmRadial",
                  tuneLength = 10,
                  preProc = c("center", "scale"))
## Warning in preProcess.default(...): These variables have zero variances:
## BiologicalMaterial07
## Warning in .local(x, ...): Variable(s) `' constant. Cannot scale data.
## (both warnings repeated once per resample; shown once here)
svmPred <- predict(svmModel, newdata = testx)
## Training (resubstitution) performance; predict() without newdata
## returns predictions for the training data:
rbind(
  "mars" = postResample(pred = predict(marsModel), obs = trainy),
  "svm"  = postResample(pred = predict(svmModel),  obs = trainy),
  "net"  = postResample(pred = predict(nnetModel), obs = trainy),
  "knn"  = postResample(pred = predict(knnModel),  obs = trainy)
)
## RMSE Rsquared MAE
## mars 1.1756076 0.5863735 0.9399376
## svm 0.1732936 0.9932868 0.1686219
## net 0.7171163 0.8463978 0.4765185
## knn 1.3100524 0.5393043 1.0472135
## Test set performance:
rbind(
  "mars" = postResample(pred = marsPred, obs = testy),
  "svm"  = postResample(pred = svmPred,  obs = testy),
  "net"  = postResample(pred = nnetPred, obs = testy),
  "knn"  = postResample(pred = knnPred,  obs = testy)
)
## RMSE Rsquared MAE
## mars 1.0222880 0.7229700 0.8109983
## svm 0.9539216 0.7567363 0.7860088
## net 2.0553609 0.4133609 1.6090328
## knn 1.3240022 0.6163930 1.1042604
Comparing the training and test set performance above, the SVM has the lowest test set RMSE and is the preferred model. Its near-perfect training fit (RMSE 0.17, R-squared 0.99) also shows that it fits the training data far more closely than it generalizes.
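Because three of the models were fit with train(), their bootstrap resampling distributions can also be compared directly with caret's resamples() (a sketch; the nnet model was fit outside of train(), so it is excluded):
res <- resamples(list(MARS = marsModel, SVM = svmModel, KNN = knnModel))
summary(res)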
varImp(svmModel)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 88.14
## ManufacturingProcess36 80.40
## ManufacturingProcess13 76.90
## BiologicalMaterial03 75.15
## BiologicalMaterial02 64.46
## BiologicalMaterial12 63.47
## ManufacturingProcess17 57.78
## ManufacturingProcess09 55.72
## ManufacturingProcess31 53.54
## ManufacturingProcess33 52.96
## BiologicalMaterial04 49.38
## ManufacturingProcess29 47.25
## BiologicalMaterial11 44.88
## BiologicalMaterial01 43.98
## BiologicalMaterial08 41.02
## ManufacturingProcess06 38.80
## ManufacturingProcess02 31.54
## ManufacturingProcess11 30.83
## ManufacturingProcess18 23.57
In the SVM importance ranking, process variables outnumber biological ones 12 to 8 in the top 20.
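That count can be computed directly from the importance scores (a small sketch):
imp <- varImp(svmModel)$importance
top20 <- rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:20]
table(substr(top20, 1, 3))  # "Bio" vs "Man" counts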
varImp(marsModel)
## earth variable importance
##
## Overall
## ManufacturingProcess32 100
## ManufacturingProcess09 0
varImp(nnetModel)  # predictors are listed by position (X1-X57), not by name
## Overall
## X1 1.21254012
## X2 1.14134501
## X3 0.58379587
## X4 0.50716621
## X5 1.80570109
## X6 1.69893291
## X7 0.68814032
## X8 1.26935423
## X9 1.33005086
## X10 0.66427342
## X11 1.02469605
## X12 0.88458482
## X13 0.25238399
## X14 0.31177291
## X15 2.68219709
## X16 1.43565067
## X17 1.55226875
## X18 1.52591138
## X19 1.14069528
## X20 1.42743433
## X21 0.28296990
## X22 1.01866726
## X23 0.17436537
## X24 1.62462250
## X25 0.99443216
## X26 7.19032309
## X27 8.79450087
## X28 6.60120731
## X29 1.07978100
## X30 7.26186498
## X31 9.18558331
## X32 6.92734822
## X33 0.88763455
## X34 0.15567304
## X35 0.97000843
## X36 0.31073899
## X37 2.06673324
## X38 2.78043903
## X39 1.55108235
## X40 1.00162006
## X41 0.76588130
## X42 1.87976394
## X43 0.46038099
## X44 1.64218825
## X45 0.26381346
## X46 0.44678987
## X47 1.02879464
## X48 0.08198565
## X49 0.85026082
## X50 1.16539795
## X51 0.21705652
## X52 0.50210232
## X53 1.00108410
## X54 0.39914825
## X55 1.61829428
## X56 2.34392542
## X57 1.33464124
varImp(knnModel)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 88.14
## ManufacturingProcess36 80.40
## ManufacturingProcess13 76.90
## BiologicalMaterial03 75.15
## BiologicalMaterial02 64.46
## BiologicalMaterial12 63.47
## ManufacturingProcess17 57.78
## ManufacturingProcess09 55.72
## ManufacturingProcess31 53.54
## ManufacturingProcess33 52.96
## BiologicalMaterial04 49.38
## ManufacturingProcess29 47.25
## BiologicalMaterial11 44.88
## BiologicalMaterial01 43.98
## BiologicalMaterial08 41.02
## ManufacturingProcess06 38.80
## ManufacturingProcess02 31.54
## ManufacturingProcess11 30.83
## ManufacturingProcess18 23.57
Across models, the ManufacturingProcess predictors dominate the importance rankings, although each model places different process variables in its top ten.
yield <- ChemicalManufacturingProcess$Yield
ggplot(df, aes(ManufacturingProcess32, yield)) +
  geom_point()
ggplot(df, aes(ManufacturingProcess13, yield)) +
  geom_point()
ggplot(df, aes(BiologicalMaterial06, yield)) +
  geom_point()
ggplot(df, aes(BiologicalMaterial03, yield)) +
  geom_point()
The plots above show the relationship between four of the most important predictors and the response. They suggest that the SVM's top predictors have a roughly linear relationship with yield.
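As a numeric check on that visual impression, the correlations between these predictors and yield can be computed (a sketch):
top <- c("ManufacturingProcess32", "ManufacturingProcess13",
         "BiologicalMaterial06", "BiologicalMaterial03")
cor(df[, top], yield)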