library(caret)
library(mice)
library(mlbench)
library(AppliedPredictiveModeling)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
str(trainingData)
## List of 2
## $ x: num [1:200, 1:10] 0.534 0.584 0.59 0.691 0.667 ...
## $ y: num [1:200] 18.5 16.1 17.8 13.8 18.4 ...
## We convert the 'x' data from a matrix to a data frame.
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using featurePlot (or other methods):
featurePlot(trainingData$x, trainingData$y)
## mlbench.friedman1 creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
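A quick visual check of the tuning profile (not part of the original output) makes the bias-variance trade-off across k easy to see; caret's plot method for train objects draws the resampled metric against the tuning parameter:
# Sketch: cross-validated RMSE versus the number of neighbors k.
plot(knnModel, metric = "RMSE")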
knnPred <- predict(knnModel, newdata = testData$x)
postResample(pred = knnPred, obs = testData$y)
##      RMSE  Rsquared       MAE
## 3.2040595 0.6819919 2.5683461
varImp(knnModel)
## loess r-squared variable importance
##
## Overall
## X4 100.0000
## X1 95.5047
## X2 89.6186
## X5 45.2170
## X3 29.9330
## X9 6.3299
## X10 5.5182
## X8 3.2527
## X6 0.8884
## X7 0.0000
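k-NN has no model-specific importance measure, so varImp() falls back to a model-free ranking, which is why the header above reads "loess r-squared variable importance". A sketch of how to compute the same ranking directly:
# filterVarImp() with nonpara = TRUE fits a loess smoother per predictor and
# scores each predictor by the resulting R-squared against the outcome.
filterVarImp(x = trainingData$x, y = trainingData$y, nonpara = TRUE)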
marsG <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(100)
marsModel <- train(trainingData$x, trainingData$y,
                   method = "earth",
                   tuneGrid = marsG,
                   trControl = trainControl(method = "cv"))
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
marsModel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.327937 0.2544880 3.600474
## 1 3 3.572450 0.4912720 2.895811
## 1 4 2.596841 0.7183600 2.106341
## 1 5 2.370161 0.7659777 1.918669
## 1 6 2.276141 0.7881481 1.810001
## 1 7 1.766728 0.8751831 1.390215
## 1 8 1.780946 0.8723243 1.401345
## 1 9 1.665091 0.8819775 1.325515
## 1 10 1.663804 0.8821283 1.327657
## 1 11 1.657738 0.8822967 1.331730
## 1 12 1.653784 0.8827903 1.331504
## 1 13 1.648496 0.8823663 1.316407
## 1 14 1.639073 0.8841742 1.312833
## 1 15 1.639073 0.8841742 1.312833
## 1 16 1.639073 0.8841742 1.312833
## 1 17 1.639073 0.8841742 1.312833
## 1 18 1.639073 0.8841742 1.312833
## 1 19 1.639073 0.8841742 1.312833
## 1 20 1.639073 0.8841742 1.312833
## 1 21 1.639073 0.8841742 1.312833
## 1 22 1.639073 0.8841742 1.312833
## 1 23 1.639073 0.8841742 1.312833
## 1 24 1.639073 0.8841742 1.312833
## 1 25 1.639073 0.8841742 1.312833
## 1 26 1.639073 0.8841742 1.312833
## 1 27 1.639073 0.8841742 1.312833
## 1 28 1.639073 0.8841742 1.312833
## 1 29 1.639073 0.8841742 1.312833
## 1 30 1.639073 0.8841742 1.312833
## 1 31 1.639073 0.8841742 1.312833
## 1 32 1.639073 0.8841742 1.312833
## 1 33 1.639073 0.8841742 1.312833
## 1 34 1.639073 0.8841742 1.312833
## 1 35 1.639073 0.8841742 1.312833
## 1 36 1.639073 0.8841742 1.312833
## 1 37 1.639073 0.8841742 1.312833
## 1 38 1.639073 0.8841742 1.312833
## 2 2 4.327937 0.2544880 3.600474
## 2 3 3.572450 0.4912720 2.895811
## 2 4 2.661826 0.7070510 2.173471
## 2 5 2.404015 0.7578971 1.975387
## 2 6 2.243927 0.7914805 1.783072
## 2 7 1.856336 0.8605482 1.435682
## 2 8 1.754607 0.8763186 1.396841
## 2 9 1.603578 0.8938666 1.261361
## 2 10 1.492421 0.9084998 1.168700
## 2 11 1.317350 0.9292504 1.033926
## 2 12 1.304327 0.9320133 1.019108
## 2 13 1.277510 0.9323681 1.002927
## 2 14 1.269626 0.9350024 1.003346
## 2 15 1.266217 0.9359400 1.013893
## 2 16 1.268470 0.9354868 1.011414
## 2 17 1.268470 0.9354868 1.011414
## 2 18 1.268470 0.9354868 1.011414
## 2 19 1.268470 0.9354868 1.011414
## 2 20 1.268470 0.9354868 1.011414
## 2 21 1.268470 0.9354868 1.011414
## 2 22 1.268470 0.9354868 1.011414
## 2 23 1.268470 0.9354868 1.011414
## 2 24 1.268470 0.9354868 1.011414
## 2 25 1.268470 0.9354868 1.011414
## 2 26 1.268470 0.9354868 1.011414
## 2 27 1.268470 0.9354868 1.011414
## 2 28 1.268470 0.9354868 1.011414
## 2 29 1.268470 0.9354868 1.011414
## 2 30 1.268470 0.9354868 1.011414
## 2 31 1.268470 0.9354868 1.011414
## 2 32 1.268470 0.9354868 1.011414
## 2 33 1.268470 0.9354868 1.011414
## 2 34 1.268470 0.9354868 1.011414
## 2 35 1.268470 0.9354868 1.011414
## 2 36 1.268470 0.9354868 1.011414
## 2 37 1.268470 0.9354868 1.011414
## 2 38 1.268470 0.9354868 1.011414
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 15 and degree = 2.
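To see which hinge functions the winning model actually kept (not shown in the original output), one can inspect the underlying earth fit:
# Sketch: print the retained basis functions and their coefficients.
summary(marsModel$finalModel)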
marsPred <- predict(marsModel, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)
##      RMSE  Rsquared       MAE
## 1.1589948 0.9460418 0.9250230
varImp(marsModel)
## earth variable importance
##
## Overall
## X1 100.00
## X4 85.14
## X2 69.24
## X5 49.31
## X3 40.00
## X9 0.00
## X6 0.00
## X8 0.00
## X7 0.00
## X10 0.00
svmModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 14,
                  trControl = trainControl(method = "cv"))
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.536604 0.7865906 2.035796
## 0.50 2.262783 0.8033031 1.800168
## 1.00 2.087501 0.8225671 1.636606
## 2.00 1.973976 0.8359125 1.540666
## 4.00 1.890687 0.8494370 1.489167
## 8.00 1.837229 0.8573247 1.465234
## 16.00 1.830431 0.8587775 1.459156
## 32.00 1.830431 0.8587775 1.459156
## 64.00 1.830431 0.8587775 1.459156
## 128.00 1.830431 0.8587775 1.459156
## 256.00 1.830431 0.8587775 1.459156
## 512.00 1.830431 0.8587775 1.459156
## 1024.00 1.830431 0.8587775 1.459156
## 2048.00 1.830431 0.8587775 1.459156
##
## Tuning parameter 'sigma' was held constant at a value of 0.06450665
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06450665 and C = 16.
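The fixed sigma above is not tuned over a grid; caret derives it from kernlab's sigest() heuristic on the training predictors. A sketch of where that value comes from (approximate, since caret applies its own preprocessing first):
# sigest() returns low/median/high quantile estimates of a reasonable sigma;
# caret holds a value derived from this range constant while tuning C.
kernlab::sigest(as.matrix(trainingData$x))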
svmPred <- predict(svmModel, newdata = testData$x)
postResample(pred = svmPred, obs = testData$y)
##      RMSE  Rsquared       MAE
## 2.0772741 0.8250955 1.5779991
varImp(svmModel)
## loess r-squared variable importance
##
## Overall
## X4 100.0000
## X1 95.5047
## X2 89.6186
## X5 45.2170
## X3 29.9330
## X9 6.3299
## X10 5.5182
## X8 3.2527
## X6 0.8884
## X7 0.0000
nneG <- expand.grid(.decay = c(0, 0.01, 0.1), .size = 1:10, .bag = FALSE)
set.seed(250)
nneModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "avNNet",
                  preProc = c("center", "scale"),
                  tuneGrid = nneG,
                  trControl = trainControl(method = "cv", number = 10),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1,
                  maxit = 500)
nneModel
## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 2.556994 0.7589440 2.032872
## 0.00 2 2.494050 0.7586231 1.981149
## 0.00 3 2.047486 0.8357651 1.598231
## 0.00 4 2.297429 0.8032324 1.748941
## 0.00 5 2.322853 0.8038168 1.801989
## 0.00 6 NaN NaN NaN
## 0.00 7 NaN NaN NaN
## 0.00 8 NaN NaN NaN
## 0.00 9 NaN NaN NaN
## 0.00 10 NaN NaN NaN
## 0.01 1 2.441458 0.7706298 1.901161
## 0.01 2 2.517953 0.7524312 2.001539
## 0.01 3 2.100117 0.8249787 1.655071
## 0.01 4 2.059455 0.8322590 1.619748
## 0.01 5 2.173606 0.8142874 1.688838
## 0.01 6 NaN NaN NaN
## 0.01 7 NaN NaN NaN
## 0.01 8 NaN NaN NaN
## 0.01 9 NaN NaN NaN
## 0.01 10 NaN NaN NaN
## 0.10 1 2.450103 0.7686716 1.912001
## 0.10 2 2.513383 0.7535860 1.998902
## 0.10 3 2.139093 0.8254581 1.697052
## 0.10 4 2.145967 0.8173106 1.697125
## 0.10 5 2.189502 0.8129027 1.703283
## 0.10 6 NaN NaN NaN
## 0.10 7 NaN NaN NaN
## 0.10 8 NaN NaN NaN
## 0.10 9 NaN NaN NaN
## 0.10 10 NaN NaN NaN
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 3, decay = 0 and bag = FALSE.
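The NaN rows are a consequence of the MaxNWts cap rather than a fitting failure: a single-hidden-layer network with p inputs and size hidden units needs size * (p + 1) + size + 1 weights, and the cap of 5 * (10 + 1) + 5 + 1 = 61 only admits size <= 5. A quick check:
# Weight counts for each candidate size with the 10 Friedman predictors.
p <- ncol(trainingData$x)
size <- 1:10
data.frame(size, n_weights = size * (p + 1) + size + 1)
# sizes 6-10 need more than 61 weights, so nnet refuses to fit them (NaN above)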
nnePred <- predict(nneModel, newdata = testData$x)
postResample(pred = nnePred, obs = testData$y)
##      RMSE  Rsquared       MAE
## 1.9277921 0.8515753 1.4709947
varImp(nneModel)
## loess r-squared variable importance
##
## Overall
## X4 100.0000
## X1 95.5047
## X2 89.6186
## X5 45.2170
## X3 29.9330
## X9 6.3299
## X10 5.5182
## X8 3.2527
## X6 0.8884
## X7 0.0000
data.frame(rbind(NNE  = postResample(pred = nnePred,  obs = testData$y),
                 SVM  = postResample(pred = svmPred,  obs = testData$y),
                 MARS = postResample(pred = marsPred, obs = testData$y),
                 KNN  = postResample(pred = knnPred,  obs = testData$y)))
##          RMSE  Rsquared      MAE
## NNE 1.927792 0.8515753 1.470995
## SVM 2.077274 0.8250955 1.577999
## MARS 1.158995 0.9460418 0.925023
## KNN 3.204059 0.6819919 2.568346
Which model appears to give the best performance? Does MARS select the informative predictors (those named X1-X5)?
The MARS model gives the best performance: it has the lowest RMSE and the highest R-squared of the four models on the test set. Its variable importance also confirms that MARS selects the informative predictors X1-X5, while the noise predictors X6-X10 all receive zero importance.
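As a direct confirmation (not part of the original output), caret's predictors() lists the variables used by the final model; for this fit it should return only members of X1-X5:
# Sketch: which predictors ended up in the selected earth model.
predictors(marsModel)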
data("ChemicalManufacturingProcess")
chem <- ChemicalManufacturingProcess  # avoid shadowing base::data()
Amelia::missmap(chem)
temp <- mice(chem, m = 5, maxit = 50, method = 'pmm', seed = 500, printFlag = FALSE)
## Warning: Number of logged events: 6750
imputed.data <- complete(temp)
imputed.data <- imputed.data[,-nearZeroVar(imputed.data)]
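A quick sanity check (not in the original) confirms the imputation left no missing values and shows which near-zero-variance column(s) were dropped:
# After mice + complete(), there should be no NAs left.
sum(is.na(imputed.data))
# Columns removed by the nearZeroVar() filter above.
setdiff(names(chem), names(imputed.data))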
Amelia::missmap(imputed.data)

set.seed(250)
yield_data <- createDataPartition(imputed.data$Yield, p = 0.8, list = FALSE)
train_data <- imputed.data[yield_data, ]
test_data <- imputed.data[-yield_data, ]

knnModel <- train(Yield ~ ., data = train_data,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.391176 0.4202001 1.079618
## 7 1.386731 0.4153368 1.098796
## 9 1.404874 0.4017264 1.126877
## 11 1.433611 0.3722210 1.153598
## 13 1.436205 0.3684589 1.154995
## 15 1.432417 0.3739066 1.150110
## 17 1.437431 0.3718760 1.154604
## 19 1.426967 0.3846216 1.143816
## 21 1.429630 0.3869589 1.146518
## 23 1.434949 0.3868964 1.151420
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 7.
knnPred <- predict(knnModel, newdata = test_data)
postResample(pred = knnPred, obs = test_data$Yield)
##      RMSE  Rsquared       MAE
## 1.2676222 0.6075643 0.9855357
varImp(knnModel)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 81.97
## ManufacturingProcess13 79.67
## ManufacturingProcess17 66.58
## ManufacturingProcess36 63.59
## BiologicalMaterial03 62.11
## BiologicalMaterial12 61.82
## BiologicalMaterial02 60.83
## ManufacturingProcess09 59.46
## ManufacturingProcess31 58.67
## ManufacturingProcess06 56.28
## ManufacturingProcess33 51.26
## BiologicalMaterial04 44.16
## BiologicalMaterial11 42.20
## BiologicalMaterial09 40.17
## BiologicalMaterial08 38.08
## BiologicalMaterial01 36.33
## ManufacturingProcess29 35.18
## ManufacturingProcess12 30.52
## ManufacturingProcess11 28.01
marsG <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(100)
marsModel <- train(Yield ~ ., data = train_data,
                   method = "earth",
                   tuneGrid = marsG,
                   trControl = trainControl(method = "cv"))
marsModel
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 130, 130, 130, 130, 130, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.410098 0.4617915 1.1236571
## 1 3 1.362018 0.4906932 1.1372965
## 1 4 1.215424 0.5822624 1.0021177
## 1 5 1.245327 0.5556927 1.0340471
## 1 6 1.278050 0.5398665 1.0606359
## 1 7 1.269932 0.5487258 1.0449809
## 1 8 1.268580 0.5588443 1.0315129
## 1 9 1.215936 0.5897583 0.9951248
## 1 10 1.168948 0.6142118 0.9544524
## 1 11 1.144261 0.6287768 0.9360284
## 1 12 1.158899 0.6117474 0.9176477
## 1 13 1.205755 0.5923064 0.9505991
## 1 14 1.204665 0.5969457 0.9496959
## 1 15 1.199402 0.6006009 0.9461242
## 1 16 1.196674 0.5999350 0.9436079
## 1 17 1.189780 0.6034355 0.9374538
## 1 18 1.189780 0.6034355 0.9374538
## 1 19 1.189780 0.6034355 0.9374538
## 1 20 1.189780 0.6034355 0.9374538
## 1 21 1.189780 0.6034355 0.9374538
## 1 22 1.189780 0.6034355 0.9374538
## 1 23 1.189780 0.6034355 0.9374538
## 1 24 1.189780 0.6034355 0.9374538
## 1 25 1.189780 0.6034355 0.9374538
## 1 26 1.189780 0.6034355 0.9374538
## 1 27 1.189780 0.6034355 0.9374538
## 1 28 1.189780 0.6034355 0.9374538
## 1 29 1.189780 0.6034355 0.9374538
## 1 30 1.189780 0.6034355 0.9374538
## 1 31 1.189780 0.6034355 0.9374538
## 1 32 1.189780 0.6034355 0.9374538
## 1 33 1.189780 0.6034355 0.9374538
## 1 34 1.189780 0.6034355 0.9374538
## 1 35 1.189780 0.6034355 0.9374538
## 1 36 1.189780 0.6034355 0.9374538
## 1 37 1.189780 0.6034355 0.9374538
## 1 38 1.189780 0.6034355 0.9374538
## 2 2 1.410098 0.4617915 1.1236571
## 2 3 1.321215 0.4982752 1.0546064
## 2 4 1.236545 0.5776655 0.9989182
## 2 5 1.236284 0.5697772 1.0066220
## 2 6 1.267595 0.5606386 1.0278800
## 2 7 1.260217 0.5631212 1.0224957
## 2 8 1.229997 0.5997203 0.9996484
## 2 9 1.280179 0.5838526 1.0279952
## 2 10 1.331328 0.5680024 1.0542226
## 2 11 1.283122 0.5950108 1.0027737
## 2 12 1.273955 0.6105659 0.9885581
## 2 13 1.296905 0.6026524 0.9967485
## 2 14 1.331754 0.5663198 1.0452380
## 2 15 1.315923 0.5732705 1.0352413
## 2 16 1.314638 0.5801461 1.0311172
## 2 17 1.356028 0.5631419 1.0671458
## 2 18 1.363416 0.5635965 1.0685763
## 2 19 1.379744 0.5619564 1.0771696
## 2 20 1.387699 0.5565859 1.0785880
## 2 21 1.365660 0.5657150 1.0583838
## 2 22 1.361409 0.5680962 1.0592572
## 2 23 1.355420 0.5710559 1.0539445
## 2 24 1.344960 0.5760599 1.0493777
## 2 25 1.344960 0.5760599 1.0493777
## 2 26 1.344960 0.5760599 1.0493777
## 2 27 1.344960 0.5760599 1.0493777
## 2 28 1.344960 0.5760599 1.0493777
## 2 29 1.344960 0.5760599 1.0493777
## 2 30 1.344960 0.5760599 1.0493777
## 2 31 1.344960 0.5760599 1.0493777
## 2 32 1.344960 0.5760599 1.0493777
## 2 33 1.344960 0.5760599 1.0493777
## 2 34 1.344960 0.5760599 1.0493777
## 2 35 1.344960 0.5760599 1.0493777
## 2 36 1.344960 0.5760599 1.0493777
## 2 37 1.344960 0.5760599 1.0493777
## 2 38 1.344960 0.5760599 1.0493777
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 11 and degree = 1.
marsPred <- predict(marsModel, newdata = test_data)
postResample(pred = marsPred, obs = test_data$Yield)
##      RMSE  Rsquared       MAE
## 1.1544504 0.6172556 0.8222224
varImp(marsModel)
## earth variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess09 63.08
## ManufacturingProcess13 45.62
## ManufacturingProcess39 30.05
## ManufacturingProcess28 28.37
## BiologicalMaterial12 21.21
## ManufacturingProcess01 10.88
## BiologicalMaterial09 0.00
## ManufacturingProcess25 0.00
## ManufacturingProcess35 0.00
## ManufacturingProcess27 0.00
## ManufacturingProcess45 0.00
## ManufacturingProcess19 0.00
## ManufacturingProcess24 0.00
## ManufacturingProcess18 0.00
## BiologicalMaterial06 0.00
## ManufacturingProcess03 0.00
## ManufacturingProcess12 0.00
## ManufacturingProcess04 0.00
## ManufacturingProcess11 0.00
svmModel <- train(Yield ~ ., data = train_data,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 14,
                  trControl = trainControl(method = "cv"))
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 131, 128, 128, 129, 129, 132, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.392070 0.5111576 1.1357101
## 0.50 1.303644 0.5540321 1.0572934
## 1.00 1.208754 0.6152950 0.9798188
## 2.00 1.142094 0.6543169 0.9245189
## 4.00 1.117510 0.6703391 0.9012980
## 8.00 1.118240 0.6719824 0.8996575
## 16.00 1.113606 0.6742910 0.8961293
## 32.00 1.113606 0.6742910 0.8961293
## 64.00 1.113606 0.6742910 0.8961293
## 128.00 1.113606 0.6742910 0.8961293
## 256.00 1.113606 0.6742910 0.8961293
## 512.00 1.113606 0.6742910 0.8961293
## 1024.00 1.113606 0.6742910 0.8961293
## 2048.00 1.113606 0.6742910 0.8961293
##
## Tuning parameter 'sigma' was held constant at a value of 0.01286917
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01286917 and C = 16.
svmPred <- predict(svmModel, newdata = test_data)
postResample(pred = svmPred, obs = test_data$Yield)
##      RMSE  Rsquared       MAE
## 1.1102162 0.6531059 0.8674914
varImp(svmModel)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 81.97
## ManufacturingProcess13 79.67
## ManufacturingProcess17 66.58
## ManufacturingProcess36 63.59
## BiologicalMaterial03 62.11
## BiologicalMaterial12 61.82
## BiologicalMaterial02 60.83
## ManufacturingProcess09 59.46
## ManufacturingProcess31 58.67
## ManufacturingProcess06 56.28
## ManufacturingProcess33 51.26
## BiologicalMaterial04 44.16
## BiologicalMaterial11 42.20
## BiologicalMaterial09 40.17
## BiologicalMaterial08 38.08
## BiologicalMaterial01 36.33
## ManufacturingProcess29 35.18
## ManufacturingProcess12 30.52
## ManufacturingProcess11 28.01
nneG <- expand.grid(.decay = c(0, 0.01, 0.1), .size = 1:10, .bag = TRUE)
set.seed(250)
nneModel <- train(Yield ~ .,
                  data = train_data,
                  method = "avNNet",
                  preProc = c("center", "scale"),
                  tuneGrid = nneG,
                  trControl = trainControl(method = "cv", number = 10),
                  linout = TRUE,
                  trace = FALSE,
                  ## Note: this limit was carried over from the 10-predictor
                  ## Friedman data; with 56 predictors it only admits size = 1,
                  ## which is why the larger networks below return NaN.
                  MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1,
                  maxit = 500)
nneModel
## Model Averaged Neural Network
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 129, 129, 130, 129, 131, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 1.615132 0.3320404 1.326625
## 0.00 2 NaN NaN NaN
## 0.00 3 NaN NaN NaN
## 0.00 4 NaN NaN NaN
## 0.00 5 NaN NaN NaN
## 0.00 6 NaN NaN NaN
## 0.00 7 NaN NaN NaN
## 0.00 8 NaN NaN NaN
## 0.00 9 NaN NaN NaN
## 0.00 10 NaN NaN NaN
## 0.01 1 1.460104 0.4572877 1.158534
## 0.01 2 NaN NaN NaN
## 0.01 3 NaN NaN NaN
## 0.01 4 NaN NaN NaN
## 0.01 5 NaN NaN NaN
## 0.01 6 NaN NaN NaN
## 0.01 7 NaN NaN NaN
## 0.01 8 NaN NaN NaN
## 0.01 9 NaN NaN NaN
## 0.01 10 NaN NaN NaN
## 0.10 1 1.546839 0.4768993 1.254676
## 0.10 2 NaN NaN NaN
## 0.10 3 NaN NaN NaN
## 0.10 4 NaN NaN NaN
## 0.10 5 NaN NaN NaN
## 0.10 6 NaN NaN NaN
## 0.10 7 NaN NaN NaN
## 0.10 8 NaN NaN NaN
## 0.10 9 NaN NaN NaN
## 0.10 10 NaN NaN NaN
##
## Tuning parameter 'bag' was held constant at a value of TRUE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1, decay = 0.01 and bag = TRUE.
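The same weight arithmetic explains why only size = 1 trains here: with 56 predictors, a size-1 network needs 1 * (56 + 1) + 1 + 1 = 59 weights, just under the carried-over cap of 61, while size = 2 already needs 117. A corrected cap would use this data set's own predictor count, for example:
# Sketch: a MaxNWts suited to the chemical data (56 predictors, size up to 10).
p <- ncol(train_data) - 1
10 * (p + 1) + 10 + 1
# refitting with this cap would let the size > 1 networks train instead of NaN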
nnePred <- predict(nneModel, newdata = test_data)
postResample(pred = nnePred, obs = test_data$Yield)
##      RMSE  Rsquared       MAE
## 1.0921315 0.6627739 0.8790418
varImp(nneModel)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 81.97
## ManufacturingProcess13 79.67
## ManufacturingProcess17 66.58
## ManufacturingProcess36 63.59
## BiologicalMaterial03 62.11
## BiologicalMaterial12 61.82
## BiologicalMaterial02 60.83
## ManufacturingProcess09 59.46
## ManufacturingProcess31 58.67
## ManufacturingProcess06 56.28
## ManufacturingProcess33 51.26
## BiologicalMaterial04 44.16
## BiologicalMaterial11 42.20
## BiologicalMaterial09 40.17
## BiologicalMaterial08 38.08
## BiologicalMaterial01 36.33
## ManufacturingProcess29 35.18
## ManufacturingProcess12 30.52
## ManufacturingProcess11 28.01
data.frame(rbind(NNE  = postResample(pred = nnePred,  obs = test_data$Yield),
                 SVM  = postResample(pred = svmPred,  obs = test_data$Yield),
                 MARS = postResample(pred = marsPred, obs = test_data$Yield),
                 KNN  = postResample(pred = knnPred,  obs = test_data$Yield)))
##          RMSE  Rsquared       MAE
## NNE 1.092132 0.6627739 0.8790418
## SVM 1.110216 0.6531059 0.8674914
## MARS 1.154450 0.6172556 0.8222224
## KNN 1.267622 0.6075643 0.9855357
On the test set the averaged neural network narrowly edges out the SVM (RMSE 1.09 vs. 1.11), but its cross-validated RMSE (1.46) was far worse, largely because the carried-over MaxNWts cap restricted it to a single hidden unit. Judged on resampling performance, the SVM is the best model: it has the lowest cross-validated RMSE and the highest R-squared, and it holds up well on the test set.
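A more formal comparison (not in the original) could use caret's resamples() on the cross-validated fits; the knn model used bootstrap resampling, so it is left out, and strictly all models should share the same fold indices for a paired comparison:
# Sketch: summarize cross-validated RMSE/R-squared/MAE across models.
res <- resamples(list(MARS = marsModel, SVM = svmModel, NNE = nneModel))
summary(res)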
## PLS
plsModel <- train(Yield ~ ., data = train_data,
                  method = "pls",
                  trControl = trainControl("cv", number = 10),
                  scale = TRUE,
                  tuneLength = 20)
plsModel
## Partial Least Squares
##
## 144 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 128, 129, 131, 131, 130, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.389468 0.4538082 1.1264244
## 2 1.276724 0.5448672 1.0542224
## 3 1.207833 0.6164957 0.9953997
## 4 1.226972 0.5931409 1.0017668
## 5 1.319500 0.5749664 1.0379418
## 6 1.378568 0.5515128 1.0861774
## 7 1.519504 0.5337391 1.1445597
## 8 1.638908 0.5282018 1.1971488
## 9 1.722537 0.5274967 1.2203502
## 10 1.807979 0.5203816 1.2498122
## 11 1.911960 0.5113783 1.2855025
## 12 2.012447 0.4996369 1.3203884
## 13 2.152254 0.4963789 1.3682073
## 14 2.210534 0.4926021 1.3916001
## 15 2.301751 0.4867255 1.4249798
## 16 2.347668 0.4929740 1.4263511
## 17 2.398101 0.4946707 1.4387352
## 18 2.444954 0.4982246 1.4517782
## 19 2.458790 0.5020647 1.4540572
## 20 2.478503 0.5059654 1.4571032
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 3.
plot(plsModel)

plsPred <- predict(plsModel, newdata = test_data)
postResample(pred = plsPred, obs = test_data$Yield)
##      RMSE  Rsquared       MAE
## 1.6880579 0.3808667 1.1458658
varImp(plsModel)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
## pls variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess36 76.62
## ManufacturingProcess13 71.23
## ManufacturingProcess09 69.26
## ManufacturingProcess17 66.09
## ManufacturingProcess06 61.12
## ManufacturingProcess33 61.00
## BiologicalMaterial02 56.78
## BiologicalMaterial06 54.14
## BiologicalMaterial08 51.87
## BiologicalMaterial12 48.38
## BiologicalMaterial11 48.19
## BiologicalMaterial03 48.05
## ManufacturingProcess12 48.02
## BiologicalMaterial01 47.92
## ManufacturingProcess29 47.55
## BiologicalMaterial04 45.74
## ManufacturingProcess04 41.77
## ManufacturingProcess28 38.63
## ManufacturingProcess37 38.16
## SVM
svmModel <- train(Yield ~ ., data = train_data,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 14,
                  trControl = trainControl(method = "cv"))
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 130, 129, 131, 128, 131, 130, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.413796 0.4905337 1.1389073
## 0.50 1.306345 0.5375649 1.0477407
## 1.00 1.215587 0.6008978 0.9742044
## 2.00 1.129670 0.6585113 0.9066379
## 4.00 1.102913 0.6720668 0.8882549
## 8.00 1.092266 0.6742279 0.8812234
## 16.00 1.090904 0.6750304 0.8798718
## 32.00 1.090904 0.6750304 0.8798718
## 64.00 1.090904 0.6750304 0.8798718
## 128.00 1.090904 0.6750304 0.8798718
## 256.00 1.090904 0.6750304 0.8798718
## 512.00 1.090904 0.6750304 0.8798718
## 1024.00 1.090904 0.6750304 0.8798718
## 2048.00 1.090904 0.6750304 0.8798718
##
## Tuning parameter 'sigma' was held constant at a value of 0.01524453
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01524453 and C = 16.
plot(svmModel)

svmPred <- predict(svmModel, newdata = test_data)
postResample(pred = svmPred, obs = test_data$Yield)
##      RMSE  Rsquared       MAE
## 1.1342679 0.6414604 0.8770061
varImp(svmModel)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 81.97
## ManufacturingProcess13 79.67
## ManufacturingProcess17 66.58
## ManufacturingProcess36 63.59
## BiologicalMaterial03 62.11
## BiologicalMaterial12 61.82
## BiologicalMaterial02 60.83
## ManufacturingProcess09 59.46
## ManufacturingProcess31 58.67
## ManufacturingProcess06 56.28
## ManufacturingProcess33 51.26
## BiologicalMaterial04 44.16
## BiologicalMaterial11 42.20
## BiologicalMaterial09 40.17
## BiologicalMaterial08 38.08
## BiologicalMaterial01 36.33
## ManufacturingProcess29 35.18
## ManufacturingProcess12 30.52
## ManufacturingProcess11 28.01
data.frame(rbind(PLS = postResample(pred = plsPred, obs = test_data$Yield),
                 SVM = postResample(pred = svmPred, obs = test_data$Yield)))
##         RMSE  Rsquared       MAE
## PLS 1.688058 0.3808667 1.1458658
## SVM 1.134268 0.6414604 0.8770061
The SVM performed substantially better than PLS on the test set (RMSE 1.13 vs. 1.69), and the two models share most of the same top-10 predictors. Some predictors matter less for the SVM than for PLS (ManufacturingProcess33, for example, drops in rank), while others such as BiologicalMaterial12 move up. Both models rank ManufacturingProcess32 first, but below that the ordering of the important variables changes. In both lists, the ManufacturingProcess variables dominate the top positions.
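To make that comparison concrete, the two importance rankings can be merged side by side (a sketch, assuming plsModel and svmModel are the fits above):
# Merge the PLS and SVM importance scores on predictor name.
pls_imp <- varImp(plsModel)$importance
svm_imp <- varImp(svmModel)$importance
comp <- merge(pls_imp, svm_imp, by = "row.names", suffixes = c(".pls", ".svm"))
head(comp[order(-comp$Overall.pls), ], 10)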
plot(test_data$Yield, test_data$ManufacturingProcess32)
cor(test_data$Yield, test_data$ManufacturingProcess32)
## [1] 0.5298126
cor(test_data$Yield, test_data$ManufacturingProcess13)
## [1] -0.6961826
plot(test_data$Yield, test_data$ManufacturingProcess13)
cor(test_data$Yield, test_data$BiologicalMaterial06)
## [1] 0.5788852
plot(test_data$Yield, test_data$BiologicalMaterial06)
Looking at the scatterplots and correlations between the top predictors and the response: ManufacturingProcess32 is positively correlated with Yield (r = 0.53), so increasing it is associated with higher yield; ManufacturingProcess13 is negatively correlated (r = -0.70); and BiologicalMaterial06, the top-ranked biological predictor, is positively correlated (r = 0.58). This suggests which process settings could be adjusted to improve Yield in future manufacturing runs.
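The correlations above use the test split only; as a follow-up sketch, the same quantities on the full imputed data set give a less noisy estimate:
# Correlation of the three top predictors with Yield over all imputed rows.
top_vars <- c("ManufacturingProcess32", "ManufacturingProcess13",
              "BiologicalMaterial06")
sapply(imputed.data[top_vars], function(v) cor(v, imputed.data$Yield))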