7.2. Friedman (1991) introduced several benchmark data sets create by simulation. One of these simulations used the following nonlinear equation to create data:
where the x value are random variables uniformly distributed between [0, 1] (there are also 5 other non-informative variables also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(mlbench)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune several models on these data. For example:
knnModel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.565620 0.4887976 2.886629
## 7 3.422420 0.5300524 2.752964
## 9 3.368072 0.5536927 2.715310
## 11 3.323010 0.5779056 2.669375
## 13 3.275835 0.6030846 2.628663
## 15 3.261864 0.6163510 2.621192
## 17 3.261973 0.6267032 2.616956
## 19 3.286299 0.6281075 2.640585
## 21 3.280950 0.6390386 2.643807
## 23 3.292397 0.6440392 2.656080
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.1750657 0.6785946 2.5443169
Which models appear to give the best performance? Does MARS select the informative predictors (those named X1 - X5)?
set.seed(66)
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
marsTuned <- train(x = trainingData$x,
y = trainingData$y,
method = "earth",
tuneGrid = marsGrid,
trControl = trainControl(method = "cv"))
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
marsTuned
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.205230 0.2960210 3.4709970
## 1 3 3.501801 0.4970850 2.8191364
## 1 4 2.588739 0.7260805 2.0659267
## 1 5 2.330593 0.7830410 1.8966573
## 1 6 2.310999 0.7804358 1.8505313
## 1 7 1.851929 0.8640780 1.4394269
## 1 8 1.799326 0.8724573 1.4315502
## 1 9 1.714155 0.8854505 1.3172544
## 1 10 1.700495 0.8884705 1.3190100
## 1 11 1.667026 0.8906987 1.3031268
## 1 12 1.613710 0.8958219 1.2632119
## 1 13 1.601422 0.8977722 1.2338402
## 1 14 1.601422 0.8977722 1.2338402
## 1 15 1.601422 0.8977722 1.2338402
## 1 16 1.601422 0.8977722 1.2338402
## 1 17 1.601422 0.8977722 1.2338402
## 1 18 1.601422 0.8977722 1.2338402
## 1 19 1.601422 0.8977722 1.2338402
## 1 20 1.601422 0.8977722 1.2338402
## 1 21 1.601422 0.8977722 1.2338402
## 1 22 1.601422 0.8977722 1.2338402
## 1 23 1.601422 0.8977722 1.2338402
## 1 24 1.601422 0.8977722 1.2338402
## 1 25 1.601422 0.8977722 1.2338402
## 1 26 1.601422 0.8977722 1.2338402
## 1 27 1.601422 0.8977722 1.2338402
## 1 28 1.601422 0.8977722 1.2338402
## 1 29 1.601422 0.8977722 1.2338402
## 1 30 1.601422 0.8977722 1.2338402
## 1 31 1.601422 0.8977722 1.2338402
## 1 32 1.601422 0.8977722 1.2338402
## 1 33 1.601422 0.8977722 1.2338402
## 1 34 1.601422 0.8977722 1.2338402
## 1 35 1.601422 0.8977722 1.2338402
## 1 36 1.601422 0.8977722 1.2338402
## 1 37 1.601422 0.8977722 1.2338402
## 1 38 1.601422 0.8977722 1.2338402
## 2 2 4.205230 0.2960210 3.4709970
## 2 3 3.501801 0.4970850 2.8191364
## 2 4 2.588739 0.7260805 2.0659267
## 2 5 2.329308 0.7819707 1.8831832
## 2 6 2.337793 0.7760424 1.8666206
## 2 7 1.778304 0.8762167 1.3839444
## 2 8 1.578691 0.9031528 1.2315201
## 2 9 1.408917 0.9209837 1.1428608
## 2 10 1.324213 0.9311816 1.0844596
## 2 11 1.302358 0.9310355 1.0582753
## 2 12 1.279700 0.9354886 1.0265207
## 2 13 1.242464 0.9409399 0.9996987
## 2 14 1.255764 0.9403760 1.0150534
## 2 15 1.243018 0.9420856 1.0032233
## 2 16 1.251833 0.9405813 1.0019765
## 2 17 1.248408 0.9414134 1.0102409
## 2 18 1.247797 0.9418122 1.0016946
## 2 19 1.247797 0.9418122 1.0016946
## 2 20 1.247797 0.9418122 1.0016946
## 2 21 1.247797 0.9418122 1.0016946
## 2 22 1.247797 0.9418122 1.0016946
## 2 23 1.247797 0.9418122 1.0016946
## 2 24 1.247797 0.9418122 1.0016946
## 2 25 1.247797 0.9418122 1.0016946
## 2 26 1.247797 0.9418122 1.0016946
## 2 27 1.247797 0.9418122 1.0016946
## 2 28 1.247797 0.9418122 1.0016946
## 2 29 1.247797 0.9418122 1.0016946
## 2 30 1.247797 0.9418122 1.0016946
## 2 31 1.247797 0.9418122 1.0016946
## 2 32 1.247797 0.9418122 1.0016946
## 2 33 1.247797 0.9418122 1.0016946
## 2 34 1.247797 0.9418122 1.0016946
## 2 35 1.247797 0.9418122 1.0016946
## 2 36 1.247797 0.9418122 1.0016946
## 2 37 1.247797 0.9418122 1.0016946
## 2 38 1.247797 0.9418122 1.0016946
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 13 and degree = 2.
marsPred <- predict(marsTuned, newdata = testData$x)
marsPred_df <- postResample(pred = marsPred, obs = testData$y)
marsPred_df$model = "MARSTuned"
## Warning in marsPred_df$model = "MARSTuned": Coercing LHS to a list
library(nnet)
set.seed(66)
nnetFit <- nnet(x = trainingData$x,
y = trainingData$y,
size = 5,
decay = 0.01,
linout = TRUE,
trace = FALSE,
maxit = 500,
MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1
)
nnetFit
## a 10-5-1 network with 61 weights
## options were - linear output units decay=0.01
nnetTest <- predict(nnetFit, newdata = testData$x)
nnetTest_df <- postResample(pred = nnetTest, obs = testData$y)
nnetTest_df$model = "NNetFit"
## Warning in nnetTest_df$model = "NNetFit": Coercing LHS to a list
## to mimic the earlier approach of choosing the number of hidden units and the amount of weight decay via resample
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
registerDoParallel(cores=2)
nnetGrid <- expand.grid(.decay = c(0, .01, .1),
.size = c(1:10),
.bag = FALSE
)
set.seed(66)
nnetTuned <- train(x = trainingData$x,
y = trainingData$y,
method = "avNNet",
tuneGrid = nnetGrid,
preProcess = c("center", "scale"),
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
maxit = 500
)
nnetTuned
## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 2.521186 0.7417834 1.982490
## 0.00 2 2.520414 0.7419825 1.993329
## 0.00 3 2.362144 0.7706206 1.843838
## 0.00 4 3.023347 0.6739413 2.150650
## 0.00 5 3.840806 0.5656757 2.613405
## 0.00 6 4.158499 0.5420653 2.845487
## 0.00 7 5.295389 0.4196468 3.532638
## 0.00 8 4.701203 0.5059307 3.102302
## 0.00 9 3.426894 0.6071652 2.418269
## 0.00 10 2.977890 0.6738519 2.299839
## 0.01 1 2.460918 0.7508904 1.918080
## 0.01 2 2.485528 0.7490986 1.946753
## 0.01 3 2.277091 0.7875200 1.777048
## 0.01 4 2.410785 0.7639231 1.901185
## 0.01 5 2.577487 0.7374959 2.043790
## 0.01 6 2.662942 0.7241475 2.108571
## 0.01 7 2.800171 0.7055512 2.215772
## 0.01 8 2.722243 0.7162556 2.145649
## 0.01 9 2.632721 0.7256076 2.080347
## 0.01 10 2.738151 0.7095037 2.172516
## 0.10 1 2.466176 0.7493734 1.920796
## 0.10 2 2.477581 0.7500751 1.933210
## 0.10 3 2.320396 0.7786873 1.817580
## 0.10 4 2.321836 0.7816417 1.830630
## 0.10 5 2.428977 0.7658994 1.914278
## 0.10 6 2.492374 0.7549731 1.980510
## 0.10 7 2.525205 0.7462239 1.992585
## 0.10 8 2.594834 0.7321745 2.058944
## 0.10 9 2.524062 0.7467976 1.984428
## 0.10 10 2.461175 0.7598873 1.954640
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 3, decay = 0.01 and bag
## = FALSE.
nnetPred <- predict(nnetTuned, newdata = testData$x)
nnetPred_df <- postResample(pred = nnetPred, obs = testData$y)
nnetPred_df$model = "NNetTuned"
## Warning in nnetPred_df$model = "NNetTuned": Coercing LHS to a list
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
## svmRadial
set.seed(66)
svmRTuned <- train(x = trainingData$x,
y = trainingData$y,
method = 'svmRadial',
preProcess = c("center", "scale"),
tuneLength = 14,
trControl = trainControl(method = "cv")
)
svmRTuned
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.483765 0.8078604 2.003953
## 0.50 2.257091 0.8204252 1.799294
## 1.00 2.114199 0.8368631 1.677329
## 2.00 2.042744 0.8479294 1.615030
## 4.00 1.979289 0.8549778 1.572385
## 8.00 1.965298 0.8551536 1.573027
## 16.00 1.960283 0.8555684 1.570603
## 32.00 1.960283 0.8555684 1.570603
## 64.00 1.960283 0.8555684 1.570603
## 128.00 1.960283 0.8555684 1.570603
## 256.00 1.960283 0.8555684 1.570603
## 512.00 1.960283 0.8555684 1.570603
## 1024.00 1.960283 0.8555684 1.570603
## 2048.00 1.960283 0.8555684 1.570603
##
## Tuning parameter 'sigma' was held constant at a value of 0.06398114
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06398114 and C = 16.
svmPred <- predict(svmRTuned, newdata = testData$x)
svmPred_df <- postResample(pred = svmPred, obs = testData$y)
svmPred_df$model <- "SVMTuned"
## Warning in svmPred_df$model <- "SVMTuned": Coercing LHS to a list
df <- rbind(as.data.frame(marsPred_df),
as.data.frame(nnetTest_df),
as.data.frame(nnetPred_df),
as.data.frame(svmPred_df)
)
df <- df[order(df[, 1]),]
par(mfrow=c(2,1))
plot(df$model, df$RMSE,
xlab = "Model",
ylab = "RMSE")
plot(df$model, df$RMSE,
xlab = "Model",
ylab = "Rsquared")
## importance of variables
varImp(marsTuned)
## earth variable importance
##
## Overall
## X1 100.00
## X4 85.05
## X2 69.03
## X5 48.88
## X3 39.40
## X6 0.00
## X7 0.00
## X10 0.00
## X8 0.00
## X9 0.00
library(AppliedPredictiveModeling)
library(caret)
data(ChemicalManufacturingProcess)
#remove near zero variables
rev_col <- nearZeroVar(ChemicalManufacturingProcess)
CMP <- ChemicalManufacturingProcess[, -rev_col]
# impute missing values and transform variables.
CMP_Pre <- preProcess(CMP,method=c("BoxCox","center","scale","knnImpute"))
CMP_df <- predict(CMP_Pre, CMP)
# x <- subset(CMP_df,select= -Yield)
# y <- subset(CMP_df,select="Yield")
set.seed(366)
splitIndex <- createDataPartition(CMP_df$Yield,
p = 0.7,
list = FALSE)
CMP_train <- CMP_df[splitIndex,]
#y_train <- CMP_df[splitIndex,]
CMP_test <- CMP_df[-splitIndex,]
#y_test <- y[-splitIndex,]
set.seed(366)
knnTuned <- train(x = CMP_train[, -1],
y = CMP_train$Yield,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "cv"))
knnTuned
## k-Nearest Neighbors
##
## 124 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 111, 110, 112, 111, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.6733437 0.5809810 0.5506681
## 7 0.7006137 0.5529617 0.5760410
## 9 0.7127940 0.5329361 0.5902799
## 11 0.7315038 0.5012297 0.6061655
## 13 0.7277822 0.5385565 0.5930791
## 15 0.7485073 0.5029019 0.6052283
## 17 0.7605215 0.4856939 0.6148293
## 19 0.7609769 0.4952105 0.6121936
## 21 0.7755243 0.4700540 0.6303403
## 23 0.7744201 0.4851380 0.6346615
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
knnPred <- predict(knnTuned, newdata = CMP_test[, -1])
knnPred_df <- postResample(pred = knnPred, obs = CMP_test$Yield)
knnPred_df$model <- "KnnTuned"
## Warning in knnPred_df$model <- "KnnTuned": Coercing LHS to a list
set.seed(66)
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
marsTuned <- train( x = CMP_train[, -1],
y = CMP_train$Yield,
method = "earth",
tuneGrid = marsGrid,
trControl = trainControl(method = "cv"))
marsTuned
## Multivariate Adaptive Regression Spline
##
## 124 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 111, 112, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 0.8337924 0.3271845 0.6462113
## 1 3 0.5960717 0.6417534 0.4769708
## 1 4 0.5999913 0.6344236 0.4880780
## 1 5 0.5954319 0.6390917 0.4852993
## 1 6 0.5874221 0.6448273 0.4840795
## 1 7 0.5825868 0.6468110 0.4833863
## 1 8 0.5913735 0.6367361 0.4811005
## 1 9 0.5855489 0.6406308 0.4797753
## 1 10 0.6016089 0.6269914 0.4962357
## 1 11 0.6234883 0.6194539 0.5155225
## 1 12 0.6328606 0.6201854 0.5084636
## 1 13 0.6571091 0.6066147 0.5275781
## 1 14 0.6497580 0.6120058 0.5241639
## 1 15 0.6643395 0.6026346 0.5336185
## 1 16 0.6666142 0.6004533 0.5336402
## 1 17 0.6576347 0.6090091 0.5270253
## 1 18 0.6579111 0.6092852 0.5284017
## 1 19 0.6567678 0.6103253 0.5260416
## 1 20 0.6567678 0.6103253 0.5260416
## 1 21 0.6567678 0.6103253 0.5260416
## 1 22 0.6567678 0.6103253 0.5260416
## 1 23 0.6567678 0.6103253 0.5260416
## 1 24 0.6567678 0.6103253 0.5260416
## 1 25 0.6567678 0.6103253 0.5260416
## 1 26 0.6567678 0.6103253 0.5260416
## 1 27 0.6567678 0.6103253 0.5260416
## 1 28 0.6567678 0.6103253 0.5260416
## 1 29 0.6567678 0.6103253 0.5260416
## 1 30 0.6567678 0.6103253 0.5260416
## 1 31 0.6567678 0.6103253 0.5260416
## 1 32 0.6567678 0.6103253 0.5260416
## 1 33 0.6567678 0.6103253 0.5260416
## 1 34 0.6567678 0.6103253 0.5260416
## 1 35 0.6567678 0.6103253 0.5260416
## 1 36 0.6567678 0.6103253 0.5260416
## 1 37 0.6567678 0.6103253 0.5260416
## 1 38 0.6567678 0.6103253 0.5260416
## 2 2 0.8337924 0.3271845 0.6462113
## 2 3 0.5960717 0.6417534 0.4769708
## 2 4 0.5936357 0.6340390 0.4816456
## 2 5 0.6823289 0.5754081 0.5310619
## 2 6 0.6610439 0.5970577 0.5173687
## 2 7 0.6318837 0.6227272 0.5037244
## 2 8 0.6402254 0.6135770 0.5147265
## 2 9 0.6649527 0.5787768 0.5249500
## 2 10 0.6844138 0.5648677 0.5480256
## 2 11 0.6970470 0.5575744 0.5694692
## 2 12 0.7429641 0.5661879 0.6075400
## 2 13 0.7529619 0.5555610 0.6097132
## 2 14 0.7596289 0.5667127 0.6144450
## 2 15 0.7618092 0.5697383 0.6141019
## 2 16 0.7854361 0.5663618 0.6435582
## 2 17 0.7876070 0.5730045 0.6359478
## 2 18 0.7826650 0.5676472 0.6355409
## 2 19 0.7820779 0.5681732 0.6305044
## 2 20 0.7877219 0.5655005 0.6345747
## 2 21 0.7877219 0.5655005 0.6345747
## 2 22 0.7839783 0.5685907 0.6305939
## 2 23 0.7808715 0.5696769 0.6329854
## 2 24 0.7808715 0.5696769 0.6329854
## 2 25 0.7808715 0.5696769 0.6329854
## 2 26 0.7808715 0.5696769 0.6329854
## 2 27 0.7808715 0.5696769 0.6329854
## 2 28 0.7808715 0.5696769 0.6329854
## 2 29 0.7808715 0.5696769 0.6329854
## 2 30 0.7808715 0.5696769 0.6329854
## 2 31 0.7808715 0.5696769 0.6329854
## 2 32 0.7808715 0.5696769 0.6329854
## 2 33 0.7808715 0.5696769 0.6329854
## 2 34 0.7808715 0.5696769 0.6329854
## 2 35 0.7808715 0.5696769 0.6329854
## 2 36 0.7808715 0.5696769 0.6329854
## 2 37 0.7808715 0.5696769 0.6329854
## 2 38 0.7808715 0.5696769 0.6329854
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 7 and degree = 1.
marsPred <- predict(marsTuned, newdata = CMP_test[, -1])
marsPred_df <- postResample(pred = marsPred, obs = CMP_test$Yield)
marsPred_df$model = "MARSTuned"
## Warning in marsPred_df$model = "MARSTuned": Coercing LHS to a list
set.seed(66)
nnetFit <- nnet(x = CMP_train[, -1],
y = CMP_train$Yield,
size = 5,
decay = 0.01,
linout = TRUE,
trace = FALSE,
maxit = 500,
MaxNWts = 5 * (ncol(CMP_train[, -1]) + 1) + 5 + 1,
trControl = trainControl(method = "cv")
)
nnetFit
## a 56-5-1 network with 291 weights
## options were - linear output units decay=0.01
nnetTest <- predict(nnetFit, newdata = CMP_test[, -1])
nnetTest_df <- postResample(pred = nnetTest, obs = CMP_test$Yield)
nnetTest_df$model = "NNetFit"
## Warning in nnetTest_df$model = "NNetFit": Coercing LHS to a list
## to mimic the earlier approach of choosing the number of hidden units and the amount of weight decay via resample
nnetGrid <- expand.grid(.decay = c(0, .01, .1),
.size = c(1:10),
.bag = FALSE
)
set.seed(66)
nnetTuned <- train(x = CMP_train[, -1],
y = CMP_train$Yield,
method = "avNNet",
tuneGrid = nnetGrid,
preProcess = c("center", "scale"),
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(CMP_train[, -1]) + 1) + 10 + 1,
maxit = 500,
trControl = trainControl(method = "cv")
)
nnetTuned
## Model Averaged Neural Network
##
## 124 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 111, 112, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 0.9231374 0.3151059 0.7155010
## 0.00 2 0.8708406 0.4554158 0.6356395
## 0.00 3 0.8251656 0.4817613 0.6537838
## 0.00 4 0.7285201 0.5285236 0.5691464
## 0.00 5 0.6821834 0.6133254 0.5371281
## 0.00 6 0.6657124 0.5752629 0.5377642
## 0.00 7 0.6426986 0.5932692 0.4955748
## 0.00 8 0.6155879 0.6167441 0.4741011
## 0.00 9 0.5885947 0.6416963 0.4631838
## 0.00 10 0.6034396 0.6349331 0.4777200
## 0.01 1 0.9567139 0.3553083 0.7493575
## 0.01 2 0.8201451 0.4982107 0.6422164
## 0.01 3 0.6495137 0.6094142 0.5066449
## 0.01 4 0.6377360 0.6203818 0.5262271
## 0.01 5 0.5620845 0.6773011 0.4450136
## 0.01 6 0.6027698 0.6385799 0.4946598
## 0.01 7 0.6061337 0.6282992 0.5002626
## 0.01 8 0.5730110 0.6610089 0.4695588
## 0.01 9 0.6007971 0.6313555 0.4911159
## 0.01 10 0.6164690 0.6296011 0.5058480
## 0.10 1 0.9178380 0.3633944 0.7027481
## 0.10 2 0.6783530 0.6071878 0.5372288
## 0.10 3 0.6311379 0.6133288 0.5093124
## 0.10 4 0.6148085 0.6248657 0.4985254
## 0.10 5 0.6179252 0.6187718 0.5042476
## 0.10 6 0.6361227 0.6122396 0.5080438
## 0.10 7 0.6229851 0.6311885 0.5023797
## 0.10 8 0.6142662 0.6298199 0.5005721
## 0.10 9 0.6177898 0.6297785 0.5034394
## 0.10 10 0.6299718 0.6191815 0.5112328
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 5, decay = 0.01 and bag
## = FALSE.
nnetPred <- predict(nnetTuned, newdata = CMP_test[, -1])
nnetPred_df <- postResample(pred = nnetPred, obs = CMP_test$Yield)
nnetPred_df$model = "NNetTuned"
## Warning in nnetPred_df$model = "NNetTuned": Coercing LHS to a list
set.seed(66)
svmRTuned <- train(x = CMP_train[, -1],
y = CMP_train$Yield,
method = 'svmRadial',
preProcess = c("center", "scale"),
tuneLength = 14,
trControl = trainControl(method = "cv")
)
svmRTuned
## Support Vector Machines with Radial Basis Function Kernel
##
## 124 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 111, 111, 112, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.7435672 0.5070494 0.6005719
## 0.50 0.6767859 0.5626753 0.5437010
## 1.00 0.6168262 0.6238160 0.4979281
## 2.00 0.5753522 0.6628272 0.4640795
## 4.00 0.5623798 0.6681324 0.4584052
## 8.00 0.5701332 0.6542966 0.4654985
## 16.00 0.5716977 0.6548889 0.4663325
## 32.00 0.5716977 0.6548889 0.4663325
## 64.00 0.5716977 0.6548889 0.4663325
## 128.00 0.5716977 0.6548889 0.4663325
## 256.00 0.5716977 0.6548889 0.4663325
## 512.00 0.5716977 0.6548889 0.4663325
## 1024.00 0.5716977 0.6548889 0.4663325
## 2048.00 0.5716977 0.6548889 0.4663325
##
## Tuning parameter 'sigma' was held constant at a value of 0.01179007
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01179007 and C = 4.
svmPred <- predict(svmRTuned, newdata = CMP_test[, -1])
svmPred_df <- postResample(pred = svmPred, obs = CMP_test$Yield)
svmPred_df$model <- "SVMTuned"
## Warning in svmPred_df$model <- "SVMTuned": Coercing LHS to a list
resamps <- resamples(list(MARS = marsTuned,
SVM = svmRTuned,
KNN = knnTuned,
#NNetTest = nnetFit,
NNetTune = nnetTuned
)
)
resamps
##
## Call:
## resamples.default(x = list(MARS = marsTuned, SVM = svmRTuned, KNN
## = knnTuned, NNetTune = nnetTuned))
##
## Models: MARS, SVM, KNN, NNetTune
## Number of resamples: 10
## Performance metrics: MAE, RMSE, Rsquared
## Time estimates for: everything, final model fit
trellis.par.set(caretTheme())
par(mfrow=c(2,1))
dotplot(resamps, metric = "Rsquared")
dotplot(resamps, metric = "RMSE")
df <- rbind(as.data.frame(marsPred_df),
as.data.frame(knnPred_df),
as.data.frame(nnetTest_df),
as.data.frame(nnetPred_df),
as.data.frame(svmPred_df)
)
df <- df[order(df[, 1]),]
plot(df$model, df$Rsquared,
xlab = "Model",
ylab = "Rsquared")
lmImp <- varImp(svmRTuned)
lmImp
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 98.06
## ManufacturingProcess09 89.62
## ManufacturingProcess17 84.99
## ManufacturingProcess06 80.40
## BiologicalMaterial12 72.58
## BiologicalMaterial03 69.64
## BiologicalMaterial06 69.09
## ManufacturingProcess11 63.56
## BiologicalMaterial11 61.28
## ManufacturingProcess30 59.33
## ManufacturingProcess31 56.14
## ManufacturingProcess36 54.97
## BiologicalMaterial02 48.33
## BiologicalMaterial09 47.49
## ManufacturingProcess18 47.20
## BiologicalMaterial08 41.93
## ManufacturingProcess29 41.08
## BiologicalMaterial04 40.61
## ManufacturingProcess25 39.17
viporder <- order(abs(lmImp$importance),decreasing=TRUE)
topVIP = rownames(lmImp$importance)[viporder[c(1:10)]]
## remove the comment predictors in nonlinear and linear models
par(mfrow=c(2,4))
unique_VIP <- topVIP[!topVIP=="ManufacturingProcess32" & !topVIP == "ManufacturingProcess09"]
featurePlot(CMP_train[, unique_VIP],
CMP_train$Yield,
plot = "scatter",
between = list(x = 1, y = 1),
type = c("g", "p", "smooth"),
layout = c(3,1),
labels = rep("", 2))