library(ggplot2)
library(AppliedPredictiveModeling)
library(magrittr)
library(caret)
library(tidyr)
library(dplyr)
library(knitr)
library(kableExtra)
library(mlbench)
library(kernlab)
library(lattice)
library(earth)
library(doParallel)
registerDoParallel(cores=4)
transparentTheme(trans = .4)
Nonlinear Regression Models
Friedman (1991) introduced several benchmark data sets create by simulation. One of these simulations used the following nonlinear equation to create data:
\[y=10sin(\pi { x }_{ 1 }{ x }_{ 2 })+20{ ({ x }_{ 3 }-0.5) }^{ 2 }+10{ x }_{ 4 }+5{ x }_{ 5 }+N{ (0,{ \sigma }^{ 2 }) }\]
where the x values are random variables uniformly distributed between [0,1] (there are also 5 other non-informative variables also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
set.seed(200)
tr_data <- mlbench.friedman1(200, sd = 1)
Organize data and view plot
tr_data$x <- data.frame(tr_data$x)
featurePlot(x = tr_data$x, y = tr_data$y, plot = "scatter",
type = c("p", "smooth"), span = .5,
layout = c(3, 1))
Simulate test set to estimate performance
tst_data <- mlbench.friedman1(5000, sd = 1)
tst_data$x <- data.frame(tst_data$x)
The preProcess class can be used for many operations on predictors, including centering and scaling. The function preProcess estimates the required parameters for each operation and predict.preProcess is used to apply them to specific data sets. This function can also be interfaces when calling the train function.
knnModel <- train(x = tr_data$x,
y = tr_data$y,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
The function postResample can be used to estimate the root mean squared error (RMSE), simple R2, and the mean absolute error (MAE) for numeric outcomes.
knnPred <- predict(knnModel, newdata = tst_data$x)
postResample(pred = knnPred, obs = tst_data$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
Which models appear to give the best performance? Does MARS select the informative predictors (those named X1-X5)?
## The predictors cutoff is 0.75
findCorrelation(cor(tr_data$x), cutoff = .75)
## integer(0)
nnetGrid <- expand.grid(size = c(1:10),
decay = c(0, 0.01, 0.1),
bag = FALSE)
ctrl <- trainControl(method = "cv")
nnetTune <- train(tr_data$x, tr_data$y,
method = "avNNet",
tuneGrid = nnetGrid,
trControl = ctrl,
preProcess = c("center", "scale"),
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(tr_data$x) + 1) + 10 + 1,
maxit = 500
)
nnetTune
## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 1 0.00 2.434845 0.7683498 1.921367
## 1 0.01 2.437343 0.7689596 1.935100
## 1 0.10 2.450747 0.7652079 1.941971
## 2 0.00 2.564619 0.7437707 2.049075
## 2 0.01 2.493438 0.7576380 2.004657
## 2 0.10 2.470469 0.7623416 1.942559
## 3 0.00 2.107356 0.8283417 1.681631
## 3 0.01 2.118485 0.8252286 1.695232
## 3 0.10 2.113338 0.8296311 1.726697
## 4 0.00 1.993740 0.8454879 1.559778
## 4 0.01 2.055123 0.8382896 1.618923
## 4 0.10 2.111877 0.8360611 1.687825
## 5 0.00 2.173066 0.8230241 1.721098
## 5 0.01 2.117938 0.8262887 1.708309
## 5 0.10 2.126142 0.8313323 1.668135
## 6 0.00 3.011411 0.7133146 2.156020
## 6 0.01 2.245598 0.8059848 1.812856
## 6 0.10 2.200281 0.8204366 1.778666
## 7 0.00 6.423508 0.4711110 3.500348
## 7 0.01 2.369153 0.8044959 1.896450
## 7 0.10 2.277288 0.8082229 1.826526
## 8 0.00 5.161402 0.5296825 3.078540
## 8 0.01 2.375962 0.7935227 1.874543
## 8 0.10 2.355936 0.7886816 1.877549
## 9 0.00 7.052266 0.4257688 4.006578
## 9 0.01 2.443129 0.7865069 1.912185
## 9 0.10 2.242392 0.8071338 1.780744
## 10 0.00 2.906893 0.7036006 2.190651
## 10 0.01 2.516951 0.7569720 2.016432
## 10 0.10 2.238560 0.8113030 1.787851
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 4, decay = 0 and bag = FALSE.
nnetPred <- predict(nnetTune, newdata = tst_data$x)
postResample(pred = nnetPred, obs = tst_data$y)
## RMSE Rsquared MAE
## 2.496722 0.784618 1.685182
Multivariate Adaptive Regression Splines
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
marsTuned <- train(tr_data$x, tr_data$y,
method = "earth",
tuneGrid = marsGrid,
trControl = ctrl)
marsTuned
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.413446 0.2478941 3.6399239
## 1 3 3.574483 0.5013467 2.8729864
## 1 4 2.617745 0.7237716 2.0980328
## 1 5 2.320707 0.7886105 1.8754620
## 1 6 2.260119 0.7883800 1.7778789
## 1 7 1.762424 0.8755876 1.4062920
## 1 8 1.714411 0.8837484 1.3496555
## 1 9 1.719717 0.8824787 1.3339340
## 1 10 1.702872 0.8849333 1.3225238
## 1 11 1.751669 0.8809872 1.3531874
## 1 12 1.711261 0.8872353 1.3368464
## 1 13 1.715594 0.8876740 1.3356978
## 1 14 1.705170 0.8894903 1.3253158
## 1 15 1.708649 0.8891224 1.3270375
## 1 16 1.708649 0.8891224 1.3270375
## 1 17 1.708649 0.8891224 1.3270375
## 1 18 1.708649 0.8891224 1.3270375
## 1 19 1.708649 0.8891224 1.3270375
## 1 20 1.708649 0.8891224 1.3270375
## 1 21 1.708649 0.8891224 1.3270375
## 1 22 1.708649 0.8891224 1.3270375
## 1 23 1.708649 0.8891224 1.3270375
## 1 24 1.708649 0.8891224 1.3270375
## 1 25 1.708649 0.8891224 1.3270375
## 1 26 1.708649 0.8891224 1.3270375
## 1 27 1.708649 0.8891224 1.3270375
## 1 28 1.708649 0.8891224 1.3270375
## 1 29 1.708649 0.8891224 1.3270375
## 1 30 1.708649 0.8891224 1.3270375
## 1 31 1.708649 0.8891224 1.3270375
## 1 32 1.708649 0.8891224 1.3270375
## 1 33 1.708649 0.8891224 1.3270375
## 1 34 1.708649 0.8891224 1.3270375
## 1 35 1.708649 0.8891224 1.3270375
## 1 36 1.708649 0.8891224 1.3270375
## 1 37 1.708649 0.8891224 1.3270375
## 1 38 1.708649 0.8891224 1.3270375
## 2 2 4.446426 0.2463165 3.6801871
## 2 3 3.693557 0.4680639 2.9539729
## 2 4 2.604103 0.7280379 2.0888994
## 2 5 2.302083 0.7913823 1.8518115
## 2 6 2.274632 0.7898184 1.7702669
## 2 7 1.755874 0.8759812 1.4073734
## 2 8 1.708271 0.8844751 1.3477172
## 2 9 1.463317 0.9139567 1.1744075
## 2 10 1.387041 0.9257526 1.1002772
## 2 11 1.328055 0.9326865 1.0485877
## 2 12 1.279350 0.9362082 1.0111897
## 2 13 1.259643 0.9393238 0.9954768
## 2 14 1.271698 0.9383447 1.0111163
## 2 15 1.272656 0.9384563 1.0099987
## 2 16 1.280985 0.9373221 1.0229847
## 2 17 1.280985 0.9373221 1.0229847
## 2 18 1.280985 0.9373221 1.0229847
## 2 19 1.280985 0.9373221 1.0229847
## 2 20 1.280985 0.9373221 1.0229847
## 2 21 1.280985 0.9373221 1.0229847
## 2 22 1.280985 0.9373221 1.0229847
## 2 23 1.280985 0.9373221 1.0229847
## 2 24 1.280985 0.9373221 1.0229847
## 2 25 1.280985 0.9373221 1.0229847
## 2 26 1.280985 0.9373221 1.0229847
## 2 27 1.280985 0.9373221 1.0229847
## 2 28 1.280985 0.9373221 1.0229847
## 2 29 1.280985 0.9373221 1.0229847
## 2 30 1.280985 0.9373221 1.0229847
## 2 31 1.280985 0.9373221 1.0229847
## 2 32 1.280985 0.9373221 1.0229847
## 2 33 1.280985 0.9373221 1.0229847
## 2 34 1.280985 0.9373221 1.0229847
## 2 35 1.280985 0.9373221 1.0229847
## 2 36 1.280985 0.9373221 1.0229847
## 2 37 1.280985 0.9373221 1.0229847
## 2 38 1.280985 0.9373221 1.0229847
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 13 and degree = 2.
varImp(marsTuned)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.33
## X2 48.88
## X5 15.63
## X3 0.00
marsPred <- predict(marsTuned, newdata = tst_data$x)
postResample(pred = marsPred, obs = tst_data$y)
## RMSE Rsquared MAE
## 1.2803060 0.9335241 1.0168673
Support Vector Machines
svmRTuned <- train(tr_data$x, tr_data$y,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 15,
trControl = ctrl)
svmRTuned
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.506432 0.8074352 1.991203
## 0.50 2.245243 0.8248294 1.772495
## 1.00 2.055802 0.8483389 1.613707
## 2.00 1.945652 0.8601214 1.506706
## 4.00 1.858992 0.8677698 1.456227
## 8.00 1.841591 0.8687158 1.470754
## 16.00 1.850739 0.8672508 1.485498
## 32.00 1.850714 0.8672494 1.485458
## 64.00 1.850714 0.8672494 1.485458
## 128.00 1.850714 0.8672494 1.485458
## 256.00 1.850714 0.8672494 1.485458
## 512.00 1.850714 0.8672494 1.485458
## 1024.00 1.850714 0.8672494 1.485458
## 2048.00 1.850714 0.8672494 1.485458
## 4096.00 1.850714 0.8672494 1.485458
##
## Tuning parameter 'sigma' was held constant at a value of 0.0623166
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0623166 and C = 8.
svmRPred <- predict(svmRTuned, newdata = tst_data$x)
postResample(pred = svmRPred, obs = tst_data$y)
## RMSE Rsquared MAE
## 2.0514462 0.8294636 1.5563873
MARS model appears to give the best performance as evident from RMSE value in the table below.
rbind(
"mars" = postResample(pred = marsPred, obs = tst_data$y),
"svm" = postResample(pred = svmRPred, obs = tst_data$y),
"net" = postResample(pred = nnetPred, obs = tst_data$y),
"knn" = postResample(pred = knnPred, obs = tst_data$y)
) %>%
kable() %>%
kable_styling()
RMSE | Rsquared | MAE | |
---|---|---|---|
mars | 1.280306 | 0.9335241 | 1.016867 |
svm | 2.051446 | 0.8294636 | 1.556387 |
net | 2.496722 | 0.7846180 | 1.685182 |
knn | 3.204060 | 0.6819919 | 2.568346 |
The MARS model does select the informative predictors (those named X1-X5) as shown by the variable importance table (varImp) below.
varImp(marsTuned)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.33
## X2 48.88
## X5 15.63
## X3 0.00
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
data("ChemicalManufacturingProcess")
preP <- preProcess(ChemicalManufacturingProcess,
method = c("BoxCox", "knnImpute", "center", "scale"))
df <- predict(preP, ChemicalManufacturingProcess)
## Restore to original
df$Yield = ChemicalManufacturingProcess$Yield
## Split the data
trainRows <- createDataPartition(df$Yield, p = .80, list = FALSE)
df.train <- df[trainRows, ]
df.test <- df[-trainRows, ]
Nonlinear Regression Models Training
colYield <- which(colnames(df) == "Yield")
trainX <- df.train[, -colYield]
trainY <- df.train$Yield
testX <- df.test[, -colYield]
testY <- df.test$Yield
KNN Model
knnModel <- train(x = trainX,
y = trainY,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10)
knnPred <- predict(knnModel, newdata = testX)
Neural Networks Model
tooHigh <- findCorrelation(cor(trainX), cutoff = .75)
trainXnnet <- trainX[, -tooHigh]
testXnnet <- testX[, -tooHigh]
nnetGrid <- expand.grid(size = c(1:10),
decay = c(0, 0.01, 0.1),
bag = FALSE)
ctrl <- trainControl(method = "cv")
nnetTune <- train(trainXnnet, trainY,
method = "avNNet",
tuneGrid = nnetGrid,
trControl = ctrl,
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(tr_data$x) + 1) + 10 + 1,
maxit = 500
)
nnetPred <- predict(nnetTune, newdata = testXnnet)
MARS Model
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
marsTuned <- train(trainX, trainY,
method = "earth",
tuneGrid = marsGrid,
trControl = ctrl)
marsPred <- predict(marsTuned, newdata = testX)
SVM Model
svmRTuned <- train(trainX, trainY,
method = "svmRadial",
tuneLength = 15,
trControl = ctrl)
svmRPred <- predict(svmRTuned, newdata = testX)
Train set performance
rbind(
"mars" = postResample(pred = predict(marsTuned), obs = trainY),
"svm" = postResample(pred = predict(svmRTuned), obs = trainY),
"net" = postResample(pred = predict(nnetTune), obs = trainY),
"knn" = postResample(pred = predict(knnModel), obs = trainY)
) %>%
kable() %>%
kable_styling()
RMSE | Rsquared | MAE | |
---|---|---|---|
mars | 0.8141725 | 0.8162069 | 0.6495019 |
svm | 0.3102481 | 0.9759565 | 0.2271820 |
net | 0.5294216 | 0.9269025 | 0.3558515 |
knn | 1.1561029 | 0.6596899 | 0.8897816 |
Test set performance
rbind(
"mars" = postResample(pred = marsPred, obs = testY),
"svm" = postResample(pred = svmRPred, obs = testY),
"net" = postResample(pred = nnetPred, obs = testY),
"knn" = postResample(pred = knnPred, obs = testY)
) %>%
kable() %>%
kable_styling()
RMSE | Rsquared | MAE | |
---|---|---|---|
mars | 1.321779 | 0.4528653 | 1.137929 |
svm | 1.169016 | 0.4698346 | 0.887459 |
net | 1.336501 | 0.3965722 | 1.051339 |
knn | 1.250764 | 0.3593780 | 1.116389 |
The RMSE value for SVM was the lowest i.e Train: 0.3102481 and Test: 1.169016. Looks like SVM is optimal
varImp(svmRTuned)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess13 100.00
## ManufacturingProcess32 95.71
## ManufacturingProcess17 88.31
## BiologicalMaterial06 81.77
## ManufacturingProcess36 75.41
## ManufacturingProcess09 70.71
## BiologicalMaterial12 70.31
## BiologicalMaterial03 70.30
## ManufacturingProcess31 57.99
## ManufacturingProcess06 57.52
## BiologicalMaterial02 54.56
## BiologicalMaterial11 47.81
## ManufacturingProcess33 46.19
## ManufacturingProcess11 45.22
## BiologicalMaterial04 44.54
## ManufacturingProcess30 42.56
## BiologicalMaterial08 41.76
## ManufacturingProcess12 39.68
## BiologicalMaterial01 38.16
## ManufacturingProcess02 37.51
The above list shows most important predictors at the top, for the optimal model (SVM). There are slightly more process variables dominating the list rather than biological ones.
The below listings of top ten important predictors from the less optimal models against the SVM model, confirm that process variables dominate the list as being the most important. However, different models selected different process variables in the top ten list of importance.
varImp(marsTuned)
## earth variable importance
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess25 52.90
## ManufacturingProcess17 35.66
## BiologicalMaterial02 34.99
## ManufacturingProcess28 34.99
## BiologicalMaterial03 34.99
## ManufacturingProcess06 27.66
## ManufacturingProcess33 15.89
## BiologicalMaterial09 0.00
varImp(nnetTune)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 37)
##
## Overall
## ManufacturingProcess17 100.00
## ManufacturingProcess36 85.40
## BiologicalMaterial03 79.61
## ManufacturingProcess31 65.67
## ManufacturingProcess06 65.14
## ManufacturingProcess33 52.30
## ManufacturingProcess11 51.21
## ManufacturingProcess12 44.94
## ManufacturingProcess02 42.47
## BiologicalMaterial09 42.09
## ManufacturingProcess01 33.81
## ManufacturingProcess04 32.17
## ManufacturingProcess27 28.35
## ManufacturingProcess35 23.09
## ManufacturingProcess20 22.43
## BiologicalMaterial10 21.69
## ManufacturingProcess16 19.95
## ManufacturingProcess24 19.63
## ManufacturingProcess21 16.20
## ManufacturingProcess28 14.26
varImp(knnModel)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess13 100.00
## ManufacturingProcess32 95.71
## ManufacturingProcess17 88.31
## BiologicalMaterial06 81.77
## ManufacturingProcess36 75.41
## ManufacturingProcess09 70.71
## BiologicalMaterial12 70.31
## BiologicalMaterial03 70.30
## ManufacturingProcess31 57.99
## ManufacturingProcess06 57.52
## BiologicalMaterial02 54.56
## BiologicalMaterial11 47.81
## ManufacturingProcess33 46.19
## ManufacturingProcess11 45.22
## BiologicalMaterial04 44.54
## ManufacturingProcess30 42.56
## BiologicalMaterial08 41.76
## ManufacturingProcess12 39.68
## BiologicalMaterial01 38.16
## ManufacturingProcess02 37.51
dotPlot(varImp(knnModel), top=20)
vip <- varImp(svmRTuned)$importance
top10Vars <- head(rownames(vip)[order(-vip$Overall)], 10)
as.data.frame(top10Vars)
## top10Vars
## 1 ManufacturingProcess13
## 2 ManufacturingProcess32
## 3 ManufacturingProcess17
## 4 BiologicalMaterial06
## 5 ManufacturingProcess36
## 6 ManufacturingProcess09
## 7 BiologicalMaterial12
## 8 BiologicalMaterial03
## 9 ManufacturingProcess31
## 10 ManufacturingProcess06
plotX <- df[,top10Vars]
plotY <- df[,colYield]
colnames(plotX) <- gsub("(Process|Material)", "", colnames(plotX))
featurePlot(x = plotX, y = plotY, plot = "scatter",
type = c("p", "smooth"), span = .5,
layout = c(3, 1))
This shows the relationship between the topn 10 predictors and the response. SVM is the optimal model. The top predictors appears to have linear relationship with the response. As per our findings ManufacturingProcess13 has the most positive correlation. Some of the Manufacturing Process have negative impact (negative correlation) on Yield, while most of the Biological Maeterial maintain a positive and balanced correlation.