\(y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)\)
where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates five additional non-informative predictors). The mlbench package contains a function called mlbench.friedman1 that simulates these data:
library(mlbench)
library(caret)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
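For reference, the generating equation can be coded directly; a minimal sketch (the helper name friedman1_mean is ours, not part of mlbench):
## Deterministic part of the Friedman (1991) benchmark; mlbench adds
## N(0, sd) noise on top, and columns X6-X10 never enter the formula.
friedman1_mean <- function(x) {
  10 * sin(pi * x[, 1] * x[, 2]) + 20 * (x[, 3] - 0.5)^2 +
    10 * x[, 4] + 5 * x[, 5]
}
## The residual noise should have standard deviation close to the sd = 1
## used in mlbench.friedman1:
sd(trainingData$y - friedman1_mean(trainingData$x))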
Several models are now tuned on these data; for example, k-nearest neighbors:
knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
k-Nearest Neighbors
200 samples
10 predictor
Pre-processing: centered (10), scaled (10)
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
Resampling results across tuning parameters:
k RMSE Rsquared MAE
5 3.466085 0.5121775 2.816838
7 3.349428 0.5452823 2.727410
9 3.264276 0.5785990 2.660026
11 3.214216 0.6024244 2.603767
13 3.196510 0.6176570 2.591935
15 3.184173 0.6305506 2.577482
17 3.183130 0.6425367 2.567787
19 3.198752 0.6483184 2.592683
21 3.188993 0.6611428 2.588787
23 3.200458 0.6638353 2.604529
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was k = 17.
postResample(predict(knnModel, newdata = testData$x), testData$y)
RMSE Rsquared MAE
3.2040595 0.6819919 2.5683461
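The resampling profile behind the choice of k can be inspected with caret's plot method for train objects:
## Bootstrap RMSE as a function of the number of neighbors
plot(knnModel)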
Before fitting a neural network, check for highly correlated predictors; none exceed the 0.75 cutoff:
findCorrelation(cor(trainingData$x), cutoff = .75)
integer(0)
Next, tune a model-averaged neural network over weight decay and the number of hidden units:
nnetGrid <- expand.grid(.decay = c(0, 0.01, .1),
                        .size = c(1:10),
                        ## The next option is to use bagging (see the
                        ## next chapter) instead of different random
                        ## seeds.
                        .bag = FALSE)
nnetTune <- train(trainingData$x, trainingData$y,
                  method = "avNNet",
                  tuneGrid = nnetGrid,
                  trControl = trainControl(method = "cv"),
                  preProcess = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
                  maxit = 500)
nnetTune
Model Averaged Neural Network
200 samples
10 predictor
Pre-processing: centered (10), scaled (10)
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
Resampling results across tuning parameters:
decay size RMSE Rsquared MAE
0.00 1 2.434845 0.7683498 1.921367
0.00 2 2.497822 0.7558233 1.993325
0.00 3 2.037880 0.8419798 1.609407
0.00 4 1.900438 0.8584417 1.530058
0.00 5 2.172494 0.8105644 1.625860
0.00 6 2.739018 0.7277206 1.989002
0.00 7 3.541548 0.6412605 2.535998
0.00 8 4.050682 0.5925266 2.760923
0.00 9 4.183739 0.5149146 2.787096
0.00 10 4.682237 0.5847214 2.818050
0.01 1 2.437231 0.7689665 1.934978
0.01 2 2.510986 0.7596191 1.988260
0.01 3 1.999943 0.8419567 1.555751
0.01 4 2.003357 0.8445288 1.549723
0.01 5 2.104801 0.8296459 1.664982
0.01 6 2.314704 0.7997307 1.857949
0.01 7 2.341042 0.8072423 1.872533
0.01 8 2.216736 0.8147526 1.760416
0.01 9 2.263142 0.8145917 1.776603
0.01 10 2.453311 0.7709663 1.981977
0.10 1 2.450897 0.7652309 1.942945
0.10 2 2.489399 0.7606443 1.997060
0.10 3 2.200693 0.8155496 1.786599
0.10 4 2.059322 0.8432340 1.651716
0.10 5 2.189025 0.8133603 1.729453
0.10 6 2.215091 0.8128993 1.757966
0.10 7 2.209519 0.8196476 1.786771
0.10 8 2.316952 0.8010619 1.826695
0.10 9 2.286711 0.7928430 1.849003
0.10 10 2.240376 0.8113316 1.786771
Tuning parameter 'bag' was held constant at a value of FALSE
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were size = 4, decay = 0 and bag = FALSE.
postResample(predict(nnetTune, newdata = testData$x), testData$y)
RMSE Rsquared MAE
2.496722 0.784618 1.685182
Next, tune a MARS model over the product degree and the number of retained terms.
## Load the library
library(earth)
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
marsTuned <- train(trainingData$x, trainingData$y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
marsTuned
Multivariate Adaptive Regression Spline
200 samples
10 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
Resampling results across tuning parameters:
degree nprune RMSE Rsquared MAE
1 2 4.334325 0.2599883 3.607719
1 3 3.599334 0.4805557 2.888987
1 4 2.637145 0.7290848 2.087677
1 5 2.283872 0.7939684 1.817343
1 6 2.125875 0.8183677 1.647491
1 7 1.766013 0.8733619 1.410328
1 8 1.671282 0.8842102 1.324258
1 9 1.645406 0.8867947 1.322041
1 10 1.597968 0.8926582 1.297518
1 11 1.540109 0.8996361 1.237949
1 12 1.545349 0.8992979 1.243771
1 13 1.535169 0.9010122 1.233571
1 14 1.529405 0.9018457 1.223874
1 15 1.529405 0.9018457 1.223874
1 16 1.529405 0.9018457 1.223874
1 17 1.529405 0.9018457 1.223874
1 18 1.529405 0.9018457 1.223874
1 19 1.529405 0.9018457 1.223874
1 20 1.529405 0.9018457 1.223874
1 21 1.529405 0.9018457 1.223874
1 22 1.529405 0.9018457 1.223874
1 23 1.529405 0.9018457 1.223874
1 24 1.529405 0.9018457 1.223874
1 25 1.529405 0.9018457 1.223874
1 26 1.529405 0.9018457 1.223874
1 27 1.529405 0.9018457 1.223874
1 28 1.529405 0.9018457 1.223874
1 29 1.529405 0.9018457 1.223874
1 30 1.529405 0.9018457 1.223874
1 31 1.529405 0.9018457 1.223874
1 32 1.529405 0.9018457 1.223874
1 33 1.529405 0.9018457 1.223874
1 34 1.529405 0.9018457 1.223874
1 35 1.529405 0.9018457 1.223874
1 36 1.529405 0.9018457 1.223874
1 37 1.529405 0.9018457 1.223874
1 38 1.529405 0.9018457 1.223874
2 2 4.334325 0.2599883 3.607719
2 3 3.599334 0.4805557 2.888987
2 4 2.637145 0.7290848 2.087677
2 5 2.271844 0.7927888 1.823675
2 6 2.114868 0.8200184 1.659485
2 7 1.780140 0.8733216 1.429346
2 8 1.663164 0.8891928 1.294968
2 9 1.460976 0.9122520 1.180387
2 10 1.399692 0.9175376 1.122526
2 11 1.380002 0.9216251 1.110556
2 12 1.312883 0.9284253 1.063321
2 13 1.285612 0.9343029 1.014216
2 14 1.328520 0.9286650 1.052185
2 15 1.322954 0.9298515 1.045527
2 16 1.341454 0.9283961 1.053190
2 17 1.344590 0.9280972 1.054209
2 18 1.340821 0.9285264 1.050274
2 19 1.340821 0.9285264 1.050274
2 20 1.340821 0.9285264 1.050274
2 21 1.340821 0.9285264 1.050274
2 22 1.340821 0.9285264 1.050274
2 23 1.340821 0.9285264 1.050274
2 24 1.340821 0.9285264 1.050274
2 25 1.340821 0.9285264 1.050274
2 26 1.340821 0.9285264 1.050274
2 27 1.340821 0.9285264 1.050274
2 28 1.340821 0.9285264 1.050274
2 29 1.340821 0.9285264 1.050274
2 30 1.340821 0.9285264 1.050274
2 31 1.340821 0.9285264 1.050274
2 32 1.340821 0.9285264 1.050274
2 33 1.340821 0.9285264 1.050274
2 34 1.340821 0.9285264 1.050274
2 35 1.340821 0.9285264 1.050274
2 36 1.340821 0.9285264 1.050274
2 37 1.340821 0.9285264 1.050274
2 38 1.340821 0.9285264 1.050274
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were nprune = 13 and degree = 2.
postResample(predict(marsTuned, newdata = testData$x), testData$y)
RMSE Rsquared MAE
1.2803060 0.9335241 1.0168673
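To check that MARS latched onto the informative predictors (X1 through X5), caret's varImp can be applied to the tuned object:
## The informative predictors should carry essentially all of the importance
varImp(marsTuned)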
Finally, tune a support vector machine with a radial basis function kernel over the cost parameter:
svmRTuned <- train(trainingData$x, trainingData$y,
                   method = "svmRadial",
                   preProcess = c("center", "scale"),
                   tuneLength = 15,
                   trControl = trainControl(method = "cv"))
svmRTuned
Support Vector Machines with Radial Basis Function Kernel
200 samples
10 predictor
Pre-processing: centered (10), scaled (10)
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
Resampling results across tuning parameters:
C RMSE Rsquared MAE
0.25 2.504105 0.7940789 1.987142
0.50 2.219946 0.8148914 1.750249
1.00 2.028115 0.8388693 1.590383
2.00 1.899331 0.8561464 1.486326
4.00 1.815632 0.8669708 1.424246
8.00 1.798299 0.8702910 1.427678
16.00 1.797165 0.8702715 1.431259
32.00 1.795246 0.8705225 1.429235
64.00 1.795246 0.8705225 1.429235
128.00 1.795246 0.8705225 1.429235
256.00 1.795246 0.8705225 1.429235
512.00 1.795246 0.8705225 1.429235
1024.00 1.795246 0.8705225 1.429235
2048.00 1.795246 0.8705225 1.429235
4096.00 1.795246 0.8705225 1.429235
Tuning parameter 'sigma' was held constant at a value of 0.06104815
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were sigma = 0.06104815 and C = 32.
postResample(predict(svmRTuned, newdata = testData$x), testData$y)
RMSE Rsquared MAE
2.0693488 0.8263553 1.5718972
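Stacking the four test-set results into one table makes the comparison explicit; a minimal sketch using the objects tuned above:
## Test-set performance, one row per model, sorted by RMSE (lower is better)
results <- rbind(
  kNN  = postResample(predict(knnModel,  newdata = testData$x), testData$y),
  NNet = postResample(predict(nnetTune,  newdata = testData$x), testData$y),
  MARS = postResample(predict(marsTuned, newdata = testData$x), testData$y),
  SVM  = postResample(predict(svmRTuned, newdata = testData$x), testData$y))
results[order(results[, "RMSE"]), ]
On this simulation, MARS gives the lowest test-set RMSE.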
Now turn to the chemical manufacturing data. First impute the missing values with k-nearest neighbors:
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
library(RANN)
impute <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
chem_data <- predict(impute, ChemicalManufacturingProcess)
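A quick sanity check that the imputation filled every missing value:
## NA counts before and after knnImpute; the second should be zero
sum(is.na(ChemicalManufacturingProcess))
sum(is.na(chem_data))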
library(dplyr)
## Drop the near-zero-variance predictors flagged by caret
chem_data <- chem_data %>% select(!nearZeroVar(.))
train_index_chem <- createDataPartition(chem_data$Yield, p = .8, list = FALSE)
train_chem <- chem_data[train_index_chem, ]
test_chem <- chem_data[-train_index_chem, ]
Fit the same four model types to these data, starting with k-nearest neighbors:
knnModel2 <- train(Yield ~ .,
                   data = train_chem,
                   method = "knn",
                   preProc = c("center", "scale"),
                   tuneLength = 10)
postResample(predict(knnModel2, test_chem), test_chem$Yield)
RMSE Rsquared MAE
0.5808940 0.7487493 0.4562489
Because neural networks are sensitive to highly correlated predictors, drop those above the 0.75 cutoff before tuning:
cutoff <- findCorrelation(cor(train_chem), cutoff = .75)
train_Chemnnet <- train_chem[, -cutoff]
test_Chemnnet <- test_chem[, -cutoff]
nnetTune2 <- train(Yield ~ .,
                   data = train_Chemnnet,
                   method = "avNNet",
                   tuneGrid = nnetGrid,
                   trControl = trainControl(method = "cv"),
                   linout = TRUE,
                   trace = FALSE,
                   MaxNWts = 10 * (ncol(train_Chemnnet) + 1) + 10 + 1,
                   maxit = 500)
postResample(predict(nnetTune2, test_Chemnnet), test_Chemnnet$Yield)
RMSE Rsquared MAE
0.004108857 0.999986655 0.003110546
Next, MARS, reusing the tuning grid from the simulation:
marsTuned2 <- train(Yield ~ .,
                    data = train_chem,
                    method = "earth",
                    tuneGrid = marsGrid,
                    trControl = trainControl(method = "cv"))
postResample(predict(marsTuned2, test_chem), test_chem$Yield)
RMSE Rsquared MAE
0.5618249 0.6708873 0.4473569
And the radial basis function SVM:
svmRTuned2 <- train(Yield ~ .,
                    data = train_chem,
                    method = "svmRadial",
                    tuneLength = 15,
                    trControl = trainControl(method = "cv"))
postResample(predict(svmRTuned2, test_chem), test_chem$Yield)
RMSE Rsquared MAE
0.4537185 0.7928029 0.3543973
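The same stacking pattern summarizes the chemical-data results; a minimal sketch using the objects above:
## Test-set performance on the chemical data, one row per model
chem_results <- rbind(
  kNN  = postResample(predict(knnModel2, test_chem), test_chem$Yield),
  NNet = postResample(predict(nnetTune2, test_Chemnnet), test_Chemnnet$Yield),
  MARS = postResample(predict(marsTuned2, test_chem), test_chem$Yield),
  SVM  = postResample(predict(svmRTuned2, test_chem), test_chem$Yield))
chem_results[order(chem_results[, "RMSE"]), ]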
Based on the test-set results above, the averaged neural network fit the chemical data best, outperforming the other models by a wide margin.
The ten most important predictors in that model:
plot(varImp(nnetTune2), top = 10)
Finally, explore how the top predictors correlate with Yield and with one another:
library(DataExplorer)
cutoffChem <- chem_data[, -cutoff]
cutoffChem %>%
  select(Yield, ends_with(c("03", "36", "17", "06", "11", "33", "09", "30")) & !c("ManufacturingProcess03")) %>%
  plot_correlation()