library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(AppliedPredictiveModeling)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.3
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune a KNN model
library(caret)
knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
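Optionally, we can visualize the tuning profile; caret's plot method for train objects shows how the bootstrapped RMSE changes with k:
plot(knnModel)  # RMSE vs. number of neighbors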
Calculate the test error for KNN
knn_predictions <- predict(knnModel, newdata = testData$x)
postResample(pred = knn_predictions, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
We get a test set R squared of 0.68 and an RMSE of 3.20.
Tune a Neural Network on a grid with decay = 0, 0.01, and 0.1, and 1 to 10 hidden units
nnetGrid <- expand.grid(.decay = c(0, 0.01, .1),
                        .size = c(1:10),
                        .bag = FALSE)
set.seed(100)
nnetTune <- train(trainingData$x, trainingData$y,
                  method = "avNNet",
                  tuneGrid = nnetGrid,
                  trControl = trainControl(method = "cv", number = 10),
                  ## Automatically standardize data prior to modeling
                  ## and prediction
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
                  maxit = 500)
## Warning: executing %dopar% sequentially: no parallel backend registered
nnetTune
## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 2.377561 0.7652984 1.871760
## 0.00 2 2.424860 0.7537317 1.923262
## 0.00 3 2.045746 0.8221663 1.633843
## 0.00 4 3.346114 0.7590866 2.036976
## 0.00 5 2.457188 0.7657704 1.817001
## 0.00 6 2.866640 0.7278212 2.073852
## 0.00 7 3.669284 0.6314007 2.555045
## 0.00 8 6.277662 0.4931125 3.460387
## 0.00 9 4.492067 0.5727668 2.844588
## 0.00 10 4.121561 0.5477281 2.642319
## 0.01 1 2.385448 0.7602875 1.887828
## 0.01 2 2.417233 0.7524406 1.930443
## 0.01 3 2.151191 0.8016038 1.701919
## 0.01 4 2.091927 0.8154378 1.676655
## 0.01 5 2.181253 0.7975402 1.743285
## 0.01 6 2.241167 0.8062092 1.798545
## 0.01 7 2.410191 0.7692383 1.921161
## 0.01 8 2.454173 0.7723310 1.937375
## 0.01 9 2.459049 0.7609472 1.977813
## 0.01 10 2.503769 0.7375237 1.995359
## 0.10 1 2.393974 0.7596411 1.894198
## 0.10 2 2.421890 0.7536066 1.922984
## 0.10 3 2.169915 0.7982379 1.726855
## 0.10 4 2.059079 0.8224159 1.648608
## 0.10 5 1.986915 0.8368428 1.595313
## 0.10 6 2.187117 0.8048747 1.723156
## 0.10 7 2.155724 0.8153579 1.690736
## 0.10 8 2.239891 0.7985597 1.786384
## 0.10 9 2.339440 0.7768716 1.801583
## 0.10 10 2.351005 0.7652809 1.866334
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 5, decay = 0.1 and bag = FALSE.
We can see that the best neural network we generated has 5 hidden units and decay 0.1; this model gave a cross-validated RMSE of 1.99 and an R squared of 0.84.
Now let's check the test RMSE and R squared.
nn_predictions <- predict(nnetTune, newdata = testData$x)
postResample(pred = nn_predictions, obs = testData$y)
## RMSE Rsquared MAE
## 1.9857101 0.8450826 1.4871632
With the tuned neural network we get a test set R squared of 0.85 and an RMSE of 1.99; so far this is much better than the KNN model that we tuned.
Now let's try a MARS model.
# Define the candidate models to test
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
# Fix the seed so that the results can be reproduced
set.seed(100)
marsTuned <- train(trainingData$x, trainingData$y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
## Loading required package: earth
## Warning: package 'earth' was built under R version 4.3.3
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 4.3.3
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 4.3.3
marsTuned
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.327937 0.2544880 3.600474
## 1 3 3.572450 0.4912720 2.895811
## 1 4 2.596841 0.7183600 2.106341
## 1 5 2.370161 0.7659777 1.918669
## 1 6 2.276141 0.7881481 1.810001
## 1 7 1.766728 0.8751831 1.390215
## 1 8 1.780946 0.8723243 1.401345
## 1 9 1.665091 0.8819775 1.325515
## 1 10 1.663804 0.8821283 1.327657
## 1 11 1.657738 0.8822967 1.331730
## 1 12 1.653784 0.8827903 1.331504
## 1 13 1.648496 0.8823663 1.316407
## 1 14 1.639073 0.8841742 1.312833
## 1 15 1.639073 0.8841742 1.312833
## 1 16 1.639073 0.8841742 1.312833
## 1 17 1.639073 0.8841742 1.312833
## 1 18 1.639073 0.8841742 1.312833
## 1 19 1.639073 0.8841742 1.312833
## 1 20 1.639073 0.8841742 1.312833
## 1 21 1.639073 0.8841742 1.312833
## 1 22 1.639073 0.8841742 1.312833
## 1 23 1.639073 0.8841742 1.312833
## 1 24 1.639073 0.8841742 1.312833
## 1 25 1.639073 0.8841742 1.312833
## 1 26 1.639073 0.8841742 1.312833
## 1 27 1.639073 0.8841742 1.312833
## 1 28 1.639073 0.8841742 1.312833
## 1 29 1.639073 0.8841742 1.312833
## 1 30 1.639073 0.8841742 1.312833
## 1 31 1.639073 0.8841742 1.312833
## 1 32 1.639073 0.8841742 1.312833
## 1 33 1.639073 0.8841742 1.312833
## 1 34 1.639073 0.8841742 1.312833
## 1 35 1.639073 0.8841742 1.312833
## 1 36 1.639073 0.8841742 1.312833
## 1 37 1.639073 0.8841742 1.312833
## 1 38 1.639073 0.8841742 1.312833
## 2 2 4.327937 0.2544880 3.600474
## 2 3 3.572450 0.4912720 2.895811
## 2 4 2.661826 0.7070510 2.173471
## 2 5 2.404015 0.7578971 1.975387
## 2 6 2.243927 0.7914805 1.783072
## 2 7 1.856336 0.8605482 1.435682
## 2 8 1.754607 0.8763186 1.396841
## 2 9 1.603578 0.8938666 1.261361
## 2 10 1.492421 0.9084998 1.168700
## 2 11 1.317350 0.9292504 1.033926
## 2 12 1.304327 0.9320133 1.019108
## 2 13 1.277510 0.9323681 1.002927
## 2 14 1.269626 0.9350024 1.003346
## 2 15 1.266217 0.9359400 1.013893
## 2 16 1.268470 0.9354868 1.011414
## 2 17 1.268470 0.9354868 1.011414
## 2 18 1.268470 0.9354868 1.011414
## 2 19 1.268470 0.9354868 1.011414
## 2 20 1.268470 0.9354868 1.011414
## 2 21 1.268470 0.9354868 1.011414
## 2 22 1.268470 0.9354868 1.011414
## 2 23 1.268470 0.9354868 1.011414
## 2 24 1.268470 0.9354868 1.011414
## 2 25 1.268470 0.9354868 1.011414
## 2 26 1.268470 0.9354868 1.011414
## 2 27 1.268470 0.9354868 1.011414
## 2 28 1.268470 0.9354868 1.011414
## 2 29 1.268470 0.9354868 1.011414
## 2 30 1.268470 0.9354868 1.011414
## 2 31 1.268470 0.9354868 1.011414
## 2 32 1.268470 0.9354868 1.011414
## 2 33 1.268470 0.9354868 1.011414
## 2 34 1.268470 0.9354868 1.011414
## 2 35 1.268470 0.9354868 1.011414
## 2 36 1.268470 0.9354868 1.011414
## 2 37 1.268470 0.9354868 1.011414
## 2 38 1.268470 0.9354868 1.011414
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 15 and degree = 2.
For the tuned MARS model, the best settings were found to be nprune = 15 and degree = 2, giving a cross-validated RMSE of 1.27 and an R squared of 0.94.
Now let's test the MARS model on the test set.
mars_predictions <- predict(marsTuned, newdata = testData$x)
postResample(pred = mars_predictions, obs = testData$y)
## RMSE Rsquared MAE
## 1.1589948 0.9460418 0.9250230
For the MARS model on the test data we get an RMSE of 1.16 and a test R squared of 0.95; so far this looks like the best model for our data.
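Since earth loads plotmo, we could also visualize the fitted MARS surfaces (a quick sketch; plotmo draws one panel per predictor used, holding the others fixed):
plotmo(marsTuned$finalModel)  # response vs. each selected predictor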
Now let's try an SVM model on the data.
svmRTuned <- train(trainingData$x, trainingData$y,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))
svmRTuned
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.490737 0.8009120 1.982118
## 0.50 2.246868 0.8153042 1.774454
## 1.00 2.051872 0.8400992 1.614368
## 2.00 1.949675 0.8534647 1.524185
## 4.00 1.886125 0.8610205 1.465373
## 8.00 1.849224 0.8654728 1.436621
## 16.00 1.834674 0.8673468 1.429839
## 32.00 1.833292 0.8675584 1.428719
## 64.00 1.833292 0.8675584 1.428719
## 128.00 1.833292 0.8675584 1.428719
## 256.00 1.833292 0.8675584 1.428719
## 512.00 1.833292 0.8675584 1.428719
## 1024.00 1.833292 0.8675584 1.428719
## 2048.00 1.833292 0.8675584 1.428719
##
## Tuning parameter 'sigma' was held constant at a value of 0.06315483
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06315483 and C = 32.
The best SVM has sigma = 0.0632 and C = 32; this model gives a cross-validated RMSE of 1.83 and an R squared of 0.87.
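The resampling profile flattens once C reaches about 32; plotting the tuning results with the cost axis on a log scale makes this easy to see:
plot(svmRTuned, scales = list(x = list(log = 2)))  # RMSE vs. cost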
Now let's test the model on the test set.
svm_predictions <- predict(svmRTuned, newdata = testData$x)
postResample(pred = svm_predictions, obs = testData$y)
## RMSE Rsquared MAE
## 2.0741633 0.8255819 1.5755127
The tuned SVM gives a test set RMSE of 2.07 and a test R squared of 0.83.
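Before declaring a winner, we can also compare the resampling distributions of the three models that were tuned with 10-fold cross-validation (a sketch; resamples() assumes the models used comparable resampling schemes):
# Collect the 10-fold CV results from each train object
resamps <- resamples(list(NNet = nnetTune, MARS = marsTuned, SVM = svmRTuned))
summary(resamps)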
The best model that I found was the MARS model with nprune = 15 and degree = 2.
Now let's see whether MARS selects the informative predictors.
summary(marsTuned$finalModel)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=2,
## nprune=15)
##
## coefficients
## (Intercept) 21.218097
## h(0.621722-X1) -15.691434
## h(X1-0.621722) 9.066076
## h(0.601063-X2) -18.220627
## h(X2-0.601063) 10.428191
## h(X3-0.281766) 3.419830
## h(0.447442-X3) 12.201226
## h(X3-0.606015) 7.875117
## h(0.734892-X4) -10.033343
## h(X4-0.734892) 9.894681
## h(0.850094-X5) -5.385087
## h(0.218266-X1) * h(X2-0.601063) -58.628609
## h(X1-0.218266) * h(X2-0.601063) -29.226178
## h(X1-0.621722) * h(X2-0.295997) -25.768101
## h(0.649253-X1) * h(0.601063-X2) 26.940533
##
## Selected 15 of 18 terms, and 5 of 10 predictors (nprune=15)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6-unused, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 10 4
## GCV 1.618197 RSS 217.6151 GRSq 0.9343005 RSq 0.9553786
Looking at the output above, we can see that the MARS model selected the informative predictors X1-X5 and ignored the noise predictors X6-X10.
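We could corroborate this with caret's variable importance scores for the tuned MARS model:
varImp(marsTuned)  # importance should be concentrated on X1-X5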
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
Which nonlinear regression model gives the optimal resampling and test set performance?
Apply the same data imputation and pre-processing as before.
data(ChemicalManufacturingProcess)
# Create a pre-processing object to apply KNN imputation
preProc <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
# Apply the imputation to the data
imputed_data <- predict(preProc, ChemicalManufacturingProcess)
sum(is.na(imputed_data))
## [1] 0
head(imputed_data)
## Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 1 -1.1792673 -0.2261036 -1.5140979 -2.68303622
## 2 1.2263678 2.2391498 1.3089960 -0.05623504
## 3 1.0042258 2.2391498 1.3089960 -0.05623504
## 4 0.6737219 2.2391498 1.3089960 -0.05623504
## 5 1.2534583 1.4827653 1.8939391 1.13594780
## 6 1.8386128 -0.4081962 0.6620886 -0.59859075
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 1 0.2201765 0.4941942 -1.3828880
## 2 1.2964386 0.4128555 1.1290767
## 3 1.2964386 0.4128555 1.1290767
## 4 1.2964386 0.4128555 1.1290767
## 5 0.9414412 -0.3734185 1.5348350
## 6 1.5894524 1.7305423 0.6192092
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 1 -0.1313107 -1.233131 -3.3962895
## 2 -0.1313107 2.282619 -0.7227225
## 3 -0.1313107 2.282619 -0.7227225
## 4 -0.1313107 2.282619 -0.7227225
## 5 -0.1313107 1.071310 -0.1205678
## 6 -0.1313107 1.189487 -1.7343424
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 1 1.1005296 -1.838655 -1.7709224
## 2 1.1005296 1.393395 1.0989855
## 3 1.1005296 1.393395 1.0989855
## 4 1.1005296 1.393395 1.0989855
## 5 0.4162193 0.136256 1.0989855
## 6 1.6346255 1.022062 0.7240877
## ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## 1 0.2154105 0.5662872 0.3765810
## 2 -6.1497028 -1.9692525 0.1979962
## 3 -6.1497028 -1.9692525 0.1087038
## 4 -6.1497028 -1.9692525 0.4658734
## 5 -0.2784345 -1.9692525 0.1087038
## 6 0.4348971 -1.9692525 0.5551658
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## 1 0.5655598 -0.44593467 -0.5414997
## 2 -2.3669726 0.99933318 0.9625383
## 3 -3.1638563 0.06246417 -0.1117745
## 4 -3.3232331 0.42279841 2.1850322
## 5 -2.2075958 0.84537219 -0.6304083
## 6 -1.2513352 0.49486525 0.5550403
## ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## 1 -0.1596700 -0.3095182 -1.7201524
## 2 -0.9580199 0.8941637 0.5883746
## 3 1.0378549 0.8941637 -0.3815947
## 4 -0.9580199 -1.1119728 -0.4785917
## 5 1.0378549 0.8941637 -0.4527258
## 6 1.0378549 0.8941637 -0.2199332
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## 1 -0.07700901 -0.09157342 -0.4806937
## 2 0.52297397 1.08204765 -0.4806937
## 3 0.31428424 0.55112383 -0.4806937
## 4 -0.02483658 0.80261406 -0.4806937
## 5 -0.39004361 0.10403009 -0.4806937
## 6 0.28819802 1.41736795 -0.4806937
## ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## 1 0.97711512 0.8093999 1.1846438
## 2 -0.50030980 0.2775205 0.9617071
## 3 0.28765016 0.4425865 0.8245152
## 4 0.28765016 0.7910592 1.0817499
## 5 0.09066017 2.5334227 3.3282665
## 6 -0.50030980 2.4050380 3.1396277
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 1 0.3303945 0.9263296 0.1505348
## 2 0.1455765 -0.2753953 0.1559773
## 3 0.1455765 0.3655246 0.1831898
## 4 0.1967569 0.3655246 0.1695836
## 5 0.4754056 -0.3555103 0.2076811
## 6 0.6261033 -0.7560852 0.1423710
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 1 0.4563798 0.3109942 0.2109804
## 2 1.5095063 0.1849230 0.2109804
## 3 1.0926437 0.1849230 0.2109804
## 4 0.9829430 0.1562704 0.2109804
## 5 1.6192070 0.2938027 -0.6884239
## 6 1.9044287 0.3998171 -0.5599376
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 1 0.05833309 0.8317688 0.8907291
## 2 -0.72230090 -1.8147683 -1.0060115
## 3 -0.42205706 -1.2132826 -0.8335805
## 4 -0.12181322 -0.6117969 -0.6611496
## 5 0.77891831 0.5911745 1.5804530
## 6 1.07916216 -1.2132826 -1.3508734
## ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 1 0.1200183 0.1256347 0.3460352
## 2 0.1093082 0.1966227 0.1906613
## 3 0.1842786 0.2159831 0.2104362
## 4 0.1708910 0.2052273 0.1906613
## 5 0.2726365 0.2912733 0.3432102
## 6 0.1146633 0.2417969 0.3516852
## ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 1 0.7826636 0.5943242 0.7566948
## 2 0.8779201 0.8347250 0.7566948
## 3 0.8588688 0.7746248 0.2444430
## 4 0.8588688 0.7746248 0.2444430
## 5 0.8969714 0.9549255 -0.1653585
## 6 0.9160227 1.0150257 0.9615956
## ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 1 -0.1952552 -0.4568829 0.9890307
## 2 -0.2672523 1.9517531 0.9890307
## 3 -0.1592567 2.6928719 0.9890307
## 4 -0.1592567 2.3223125 1.7943843
## 5 -0.1412574 2.3223125 2.5997378
## 6 -0.3572486 2.6928719 2.5997378
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 1 -1.7202722 -0.88694718 -0.6557774
## 2 1.9568096 1.14638329 -0.6557774
## 3 1.9568096 1.23880740 -1.8000420
## 4 0.1182687 0.03729394 -1.8000420
## 5 0.1182687 -2.55058120 -2.9443066
## 6 0.1182687 -0.51725073 -1.8000420
## ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 1 -1.1540243 0.7174727 0.2317270
## 2 2.2161351 -0.8224687 0.2317270
## 3 -0.7046697 -0.8224687 0.2317270
## 4 0.4187168 -0.8224687 0.2317270
## 5 -1.8280562 -0.8224687 0.2981503
## 6 -1.3787016 -0.8224687 0.2317270
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## 1 0.05969714 -0.06900773 0.20279570
## 2 2.14909691 2.34626280 -0.05472265
## 3 -0.46265281 -0.44058781 0.40881037
## 4 -0.46265281 -0.44058781 -0.31224099
## 5 -0.46265281 -0.44058781 -0.10622632
## 6 -0.46265281 -0.44058781 0.15129203
## ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## 1 2.40564734 -0.01588055 0.64371849
## 2 -0.01374656 0.29467248 0.15220242
## 3 0.10146268 -0.01588055 0.39796046
## 4 0.21667191 -0.01588055 -0.09355562
## 5 0.21667191 -0.32643359 -0.09355562
## 6 1.48397347 -0.01588055 -0.33931365
Now we have a cleaned data set with no missing values.
Now split into a train and test set
library(dplyr)
set.seed(123)
train_index <- createDataPartition(imputed_data$Yield, p = 0.7, list = FALSE)
X_train <- imputed_data[train_index, ] %>%
  select(-Yield)
y_train <- imputed_data$Yield[train_index]
X_test <- imputed_data[-train_index, ] %>%
  select(-Yield)
y_test <- imputed_data$Yield[-train_index]
head(X_test)
## BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 1 -0.2261036 -1.514098 -2.68303622
## 2 2.2391498 1.308996 -0.05623504
## 3 2.2391498 1.308996 -0.05623504
## 4 2.2391498 1.308996 -0.05623504
## 5 1.4827653 1.893939 1.13594780
## 10 0.7403878 1.960861 1.08846043
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 1 0.2201765 0.4941942 -1.382888
## 2 1.2964386 0.4128555 1.129077
## 3 1.2964386 0.4128555 1.129077
## 4 1.2964386 0.4128555 1.129077
## 5 0.9414412 -0.3734185 1.534835
## 10 1.8881010 0.4453910 1.550852
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 1 -0.1313107 -1.233131 -3.3962895
## 2 -0.1313107 2.282619 -0.7227225
## 3 -0.1313107 2.282619 -0.7227225
## 4 -0.1313107 2.282619 -0.7227225
## 5 -0.1313107 1.071310 -0.1205678
## 10 -0.1313107 2.001950 0.6742764
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 1 1.1005296 -1.838655 -1.770922
## 2 1.1005296 1.393395 1.098986
## 3 1.1005296 1.393395 1.098986
## 4 1.1005296 1.393395 1.098986
## 5 0.4162193 0.136256 1.098986
## 10 1.7514590 1.503343 1.616086
## ManufacturingProcess01 ManufacturingProcess02 ManufacturingProcess03
## 1 0.2154105 0.5662872 0.3765810
## 2 -6.1497028 -1.9692525 0.1979962
## 3 -6.1497028 -1.9692525 0.1087038
## 4 -6.1497028 -1.9692525 0.4658734
## 5 -0.2784345 -1.9692525 0.1087038
## 10 0.4348971 -1.9692525 0.4658734
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess06
## 1 0.5655598 -0.44593467 -0.5414997
## 2 -2.3669726 0.99933318 0.9625383
## 3 -3.1638563 0.06246417 -0.1117745
## 4 -3.3232331 0.42279841 2.1850322
## 5 -2.2075958 0.84537219 -0.6304083
## 10 0.9799394 0.06901570 0.8884478
## ManufacturingProcess07 ManufacturingProcess08 ManufacturingProcess09
## 1 -0.1596700 -0.3095182 -1.7201524
## 2 -0.9580199 0.8941637 0.5883746
## 3 1.0378549 0.8941637 -0.3815947
## 4 -0.9580199 -1.1119728 -0.4785917
## 5 1.0378549 0.8941637 -0.4527258
## 10 -0.9580199 -1.1119728 0.9375635
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess12
## 1 -0.07700901 -0.09157342 -0.4806937
## 2 0.52297397 1.08204765 -0.4806937
## 3 0.31428424 0.55112383 -0.4806937
## 4 -0.02483658 0.80261406 -0.4806937
## 5 -0.39004361 0.10403009 -0.4806937
## 10 1.20121560 1.13793436 -0.4806937
## ManufacturingProcess13 ManufacturingProcess14 ManufacturingProcess15
## 1 0.97711512 0.8093999 1.1846438
## 2 -0.50030980 0.2775205 0.9617071
## 3 0.28765016 0.4425865 0.8245152
## 4 0.28765016 0.7910592 1.0817499
## 5 0.09066017 2.5334227 3.3282665
## 10 -0.20482482 -0.1443149 0.6530254
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 1 0.3303945 0.9263296 0.15053478
## 2 0.1455765 -0.2753953 0.15597729
## 3 0.1455765 0.3655246 0.18318982
## 4 0.1967569 0.3655246 0.16958356
## 5 0.4754056 -0.3555103 0.20768110
## 10 0.1370464 0.7660996 0.08250345
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 1 0.4563798 0.3109942 0.2109804
## 2 1.5095063 0.1849230 0.2109804
## 3 1.0926437 0.1849230 0.2109804
## 4 0.9829430 0.1562704 0.2109804
## 5 1.6192070 0.2938027 -0.6884239
## 10 1.3778655 0.1648662 1.4958436
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 1 0.05833309 0.8317688 0.8907291
## 2 -0.72230090 -1.8147683 -1.0060115
## 3 -0.42205706 -1.2132826 -0.8335805
## 4 -0.12181322 -0.6117969 -0.6611496
## 5 0.77891831 0.5911745 1.5804530
## 10 -0.42205706 -1.2132826 -0.8335805
## ManufacturingProcess25 ManufacturingProcess26 ManufacturingProcess27
## 1 0.1200183 0.1256347 0.3460352
## 2 0.1093082 0.1966227 0.1906613
## 3 0.1842786 0.2159831 0.2104362
## 4 0.1708910 0.2052273 0.1906613
## 5 0.2726365 0.2912733 0.3432102
## 10 0.1735685 0.2568549 0.2471609
## ManufacturingProcess28 ManufacturingProcess29 ManufacturingProcess30
## 1 0.7826636 0.5943242 0.7566948
## 2 0.8779201 0.8347250 0.7566948
## 3 0.8588688 0.7746248 0.2444430
## 4 0.8588688 0.7746248 0.2444430
## 5 0.8969714 0.9549255 -0.1653585
## 10 0.9160227 1.0150257 0.6542445
## ManufacturingProcess31 ManufacturingProcess32 ManufacturingProcess33
## 1 -0.1952552 -0.4568829 0.9890307
## 2 -0.2672523 1.9517531 0.9890307
## 3 -0.1592567 2.6928719 0.9890307
## 4 -0.1592567 2.3223125 1.7943843
## 5 -0.1412574 2.3223125 2.5997378
## 10 -0.3032508 1.0253547 0.9890307
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 1 -1.7202722 -0.88694718 -0.6557774
## 2 1.9568096 1.14638329 -0.6557774
## 3 1.9568096 1.23880740 -1.8000420
## 4 0.1182687 0.03729394 -1.8000420
## 5 0.1182687 -2.55058120 -2.9443066
## 10 0.1182687 -0.70209896 -0.6557774
## ManufacturingProcess37 ManufacturingProcess38 ManufacturingProcess39
## 1 -1.1540243 0.7174727 0.2317270
## 2 2.2161351 -0.8224687 0.2317270
## 3 -0.7046697 -0.8224687 0.2317270
## 4 0.4187168 -0.8224687 0.2317270
## 5 -1.8280562 -0.8224687 0.2981503
## 10 1.7667805 0.7174727 0.1653036
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess42
## 1 0.05969714 -0.06900773 0.20279570
## 2 2.14909691 2.34626280 -0.05472265
## 3 -0.46265281 -0.44058781 0.40881037
## 4 -0.46265281 -0.44058781 -0.31224099
## 5 -0.46265281 -0.44058781 -0.10622632
## 10 -0.46265281 -0.44058781 0.04828469
## ManufacturingProcess43 ManufacturingProcess44 ManufacturingProcess45
## 1 2.40564734 -0.01588055 0.64371849
## 2 -0.01374656 0.29467248 0.15220242
## 3 0.10146268 -0.01588055 0.39796046
## 4 0.21667191 -0.01588055 -0.09355562
## 5 0.21667191 -0.32643359 -0.09355562
## 10 -0.12895579 0.29467248 0.64371849
head(y_test)
## [1] -1.1792673 1.2263678 1.0042258 0.6737219 1.2534583 1.2317859
First, let's try a KNN model.
man_knn_tuned <- train(x = X_train,
                       y = y_train,
                       method = "knn",
                       preProc = c("center", "scale"),
                       tuneLength = 10)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
man_knn_tuned
## k-Nearest Neighbors
##
## 124 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 124, 124, 124, 124, 124, 124, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.8395593 0.3567826 0.6631784
## 7 0.8260996 0.3786101 0.6523597
## 9 0.8259430 0.3792625 0.6543182
## 11 0.8333005 0.3650900 0.6602237
## 13 0.8359877 0.3606504 0.6612686
## 15 0.8361310 0.3653082 0.6613372
## 17 0.8374222 0.3633723 0.6606113
## 19 0.8382826 0.3651721 0.6607740
## 21 0.8375441 0.3726192 0.6607616
## 23 0.8341735 0.3825055 0.6583335
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.
Let's remove the predictor flagged with near-zero variance and try again.
nzv <- nearZeroVar(imputed_data)
filtered_data <- imputed_data[, -nzv]
Now let's redo the train/test split.
set.seed(123)
train_index <- createDataPartition(filtered_data$Yield, p = 0.7, list = FALSE)
X_train <- filtered_data[train_index, ] %>%
  select(-Yield)
y_train <- filtered_data$Yield[train_index]
X_test <- filtered_data[-train_index, ] %>%
  select(-Yield)
y_test <- filtered_data$Yield[-train_index]
head(X_test)
## BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 1 -0.2261036 -1.514098 -2.68303622
## 2 2.2391498 1.308996 -0.05623504
## 3 2.2391498 1.308996 -0.05623504
## 4 2.2391498 1.308996 -0.05623504
## 5 1.4827653 1.893939 1.13594780
## 10 0.7403878 1.960861 1.08846043
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 1 0.2201765 0.4941942 -1.382888
## 2 1.2964386 0.4128555 1.129077
## 3 1.2964386 0.4128555 1.129077
## 4 1.2964386 0.4128555 1.129077
## 5 0.9414412 -0.3734185 1.534835
## 10 1.8881010 0.4453910 1.550852
## BiologicalMaterial08 BiologicalMaterial09 BiologicalMaterial10
## 1 -1.233131 -3.3962895 1.1005296
## 2 2.282619 -0.7227225 1.1005296
## 3 2.282619 -0.7227225 1.1005296
## 4 2.282619 -0.7227225 1.1005296
## 5 1.071310 -0.1205678 0.4162193
## 10 2.001950 0.6742764 1.7514590
## BiologicalMaterial11 BiologicalMaterial12 ManufacturingProcess01
## 1 -1.838655 -1.770922 0.2154105
## 2 1.393395 1.098986 -6.1497028
## 3 1.393395 1.098986 -6.1497028
## 4 1.393395 1.098986 -6.1497028
## 5 0.136256 1.098986 -0.2784345
## 10 1.503343 1.616086 0.4348971
## ManufacturingProcess02 ManufacturingProcess03 ManufacturingProcess04
## 1 0.5662872 0.3765810 0.5655598
## 2 -1.9692525 0.1979962 -2.3669726
## 3 -1.9692525 0.1087038 -3.1638563
## 4 -1.9692525 0.4658734 -3.3232331
## 5 -1.9692525 0.1087038 -2.2075958
## 10 -1.9692525 0.4658734 0.9799394
## ManufacturingProcess05 ManufacturingProcess06 ManufacturingProcess07
## 1 -0.44593467 -0.5414997 -0.1596700
## 2 0.99933318 0.9625383 -0.9580199
## 3 0.06246417 -0.1117745 1.0378549
## 4 0.42279841 2.1850322 -0.9580199
## 5 0.84537219 -0.6304083 1.0378549
## 10 0.06901570 0.8884478 -0.9580199
## ManufacturingProcess08 ManufacturingProcess09 ManufacturingProcess10
## 1 -0.3095182 -1.7201524 -0.07700901
## 2 0.8941637 0.5883746 0.52297397
## 3 0.8941637 -0.3815947 0.31428424
## 4 -1.1119728 -0.4785917 -0.02483658
## 5 0.8941637 -0.4527258 -0.39004361
## 10 -1.1119728 0.9375635 1.20121560
## ManufacturingProcess11 ManufacturingProcess12 ManufacturingProcess13
## 1 -0.09157342 -0.4806937 0.97711512
## 2 1.08204765 -0.4806937 -0.50030980
## 3 0.55112383 -0.4806937 0.28765016
## 4 0.80261406 -0.4806937 0.28765016
## 5 0.10403009 -0.4806937 0.09066017
## 10 1.13793436 -0.4806937 -0.20482482
## ManufacturingProcess14 ManufacturingProcess15 ManufacturingProcess16
## 1 0.8093999 1.1846438 0.3303945
## 2 0.2775205 0.9617071 0.1455765
## 3 0.4425865 0.8245152 0.1455765
## 4 0.7910592 1.0817499 0.1967569
## 5 2.5334227 3.3282665 0.4754056
## 10 -0.1443149 0.6530254 0.1370464
## ManufacturingProcess17 ManufacturingProcess18 ManufacturingProcess19
## 1 0.9263296 0.15053478 0.4563798
## 2 -0.2753953 0.15597729 1.5095063
## 3 0.3655246 0.18318982 1.0926437
## 4 0.3655246 0.16958356 0.9829430
## 5 -0.3555103 0.20768110 1.6192070
## 10 0.7660996 0.08250345 1.3778655
## ManufacturingProcess20 ManufacturingProcess21 ManufacturingProcess22
## 1 0.3109942 0.2109804 0.05833309
## 2 0.1849230 0.2109804 -0.72230090
## 3 0.1849230 0.2109804 -0.42205706
## 4 0.1562704 0.2109804 -0.12181322
## 5 0.2938027 -0.6884239 0.77891831
## 10 0.1648662 1.4958436 -0.42205706
## ManufacturingProcess23 ManufacturingProcess24 ManufacturingProcess25
## 1 0.8317688 0.8907291 0.1200183
## 2 -1.8147683 -1.0060115 0.1093082
## 3 -1.2132826 -0.8335805 0.1842786
## 4 -0.6117969 -0.6611496 0.1708910
## 5 0.5911745 1.5804530 0.2726365
## 10 -1.2132826 -0.8335805 0.1735685
## ManufacturingProcess26 ManufacturingProcess27 ManufacturingProcess28
## 1 0.1256347 0.3460352 0.7826636
## 2 0.1966227 0.1906613 0.8779201
## 3 0.2159831 0.2104362 0.8588688
## 4 0.2052273 0.1906613 0.8588688
## 5 0.2912733 0.3432102 0.8969714
## 10 0.2568549 0.2471609 0.9160227
## ManufacturingProcess29 ManufacturingProcess30 ManufacturingProcess31
## 1 0.5943242 0.7566948 -0.1952552
## 2 0.8347250 0.7566948 -0.2672523
## 3 0.7746248 0.2444430 -0.1592567
## 4 0.7746248 0.2444430 -0.1592567
## 5 0.9549255 -0.1653585 -0.1412574
## 10 1.0150257 0.6542445 -0.3032508
## ManufacturingProcess32 ManufacturingProcess33 ManufacturingProcess34
## 1 -0.4568829 0.9890307 -1.7202722
## 2 1.9517531 0.9890307 1.9568096
## 3 2.6928719 0.9890307 1.9568096
## 4 2.3223125 1.7943843 0.1182687
## 5 2.3223125 2.5997378 0.1182687
## 10 1.0253547 0.9890307 0.1182687
## ManufacturingProcess35 ManufacturingProcess36 ManufacturingProcess37
## 1 -0.88694718 -0.6557774 -1.1540243
## 2 1.14638329 -0.6557774 2.2161351
## 3 1.23880740 -1.8000420 -0.7046697
## 4 0.03729394 -1.8000420 0.4187168
## 5 -2.55058120 -2.9443066 -1.8280562
## 10 -0.70209896 -0.6557774 1.7667805
## ManufacturingProcess38 ManufacturingProcess39 ManufacturingProcess40
## 1 0.7174727 0.2317270 0.05969714
## 2 -0.8224687 0.2317270 2.14909691
## 3 -0.8224687 0.2317270 -0.46265281
## 4 -0.8224687 0.2317270 -0.46265281
## 5 -0.8224687 0.2981503 -0.46265281
## 10 0.7174727 0.1653036 -0.46265281
## ManufacturingProcess41 ManufacturingProcess42 ManufacturingProcess43
## 1 -0.06900773 0.20279570 2.40564734
## 2 2.34626280 -0.05472265 -0.01374656
## 3 -0.44058781 0.40881037 0.10146268
## 4 -0.44058781 -0.31224099 0.21667191
## 5 -0.44058781 -0.10622632 0.21667191
## 10 -0.44058781 0.04828469 -0.12895579
## ManufacturingProcess44 ManufacturingProcess45
## 1 -0.01588055 0.64371849
## 2 0.29467248 0.15220242
## 3 -0.01588055 0.39796046
## 4 -0.01588055 -0.09355562
## 5 -0.32643359 -0.09355562
## 10 0.29467248 0.64371849
head(y_test)
## [1] -1.1792673 1.2263678 1.0042258 0.6737219 1.2534583 1.2317859
Now let's try the KNN model again.
man_knn_tuned <- train(x = X_train,
                       y = y_train,
                       method = "knn",
                       preProc = c("center", "scale"),
                       tuneLength = 10)
man_knn_tuned
## k-Nearest Neighbors
##
## 124 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 124, 124, 124, 124, 124, 124, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.8490399 0.3434324 0.6711031
## 7 0.8401711 0.3574148 0.6659776
## 9 0.8349372 0.3643461 0.6624490
## 11 0.8406163 0.3538120 0.6685099
## 13 0.8435341 0.3482929 0.6683856
## 15 0.8414838 0.3555783 0.6662481
## 17 0.8432075 0.3533221 0.6664618
## 19 0.8435626 0.3552517 0.6664848
## 21 0.8423974 0.3634102 0.6662091
## 23 0.8382169 0.3752666 0.6612879
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.
The results are pretty much the same: the best KNN model has k = 9, giving a resampled RMSE of 0.83 and an R squared of 0.36, which is not good at all.
Let's calculate the test performance.
knn_man_predictions <- predict(man_knn_tuned, newdata = X_test)
postResample(pred = knn_man_predictions, obs = y_test)
## RMSE Rsquared MAE
## 0.5824118 0.6004892 0.4900947
The tuned KNN model gives a test set RMSE of 0.58 and a test R squared of 0.60, which is not particularly good, as it explains only 60% of the variance.
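A quick diagnostic plot of predicted versus observed yield on the test set makes the underfit visible (a sketch using base graphics):
# Predicted vs. observed on the test set; points should hug the dashed line
plot(y_test, knn_man_predictions,
     xlab = "Observed yield (scaled)", ylab = "Predicted yield")
abline(0, 1, lty = 2)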
Now let's try a neural network.
man_nnetTune <- train(X_train, y_train,
                      method = "avNNet",
                      tuneGrid = nnetGrid,
                      trControl = trainControl(method = "cv", number = 10),
                      preProc = c("center", "scale"),
                      linout = TRUE,
                      trace = FALSE,
                      MaxNWts = 10 * (ncol(X_train) + 1) + 10 + 1,
                      maxit = 500)
man_nnetTune
## Model Averaged Neural Network
##
## 124 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 112, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 0.8098224 0.4542788 0.6462931
## 0.00 2 0.7900891 0.5532403 0.6295673
## 0.00 3 0.8288836 0.5208346 0.6497438
## 0.00 4 0.8493773 0.4610422 0.6667221
## 0.00 5 0.7853867 0.5318186 0.6490855
## 0.00 6 0.7890533 0.5063759 0.6514889
## 0.00 7 0.7427854 0.5674823 0.5981937
## 0.00 8 0.7367960 0.5476032 0.5930822
## 0.00 9 0.7307412 0.5549023 0.6070514
## 0.00 10 0.6765033 0.6122430 0.5464638
## 0.01 1 0.8613248 0.5087751 0.6792396
## 0.01 2 0.7622895 0.5334613 0.6112235
## 0.01 3 0.7931876 0.4915810 0.6396652
## 0.01 4 0.7763135 0.5158764 0.6225116
## 0.01 5 0.7135941 0.5584813 0.5798837
## 0.01 6 0.7198643 0.5833966 0.5808882
## 0.01 7 0.6853676 0.5945207 0.5661585
## 0.01 8 0.6684905 0.6110804 0.5334603
## 0.01 9 0.6885512 0.5863170 0.5494777
## 0.01 10 0.6786262 0.6030410 0.5667354
## 0.10 1 0.8320330 0.5217196 0.6613924
## 0.10 2 0.7706355 0.5506160 0.6118057
## 0.10 3 0.7174592 0.5686208 0.5764041
## 0.10 4 0.6896948 0.5845229 0.5683484
## 0.10 5 0.7158334 0.5632265 0.5948507
## 0.10 6 0.6695403 0.6211213 0.5304735
## 0.10 7 0.6800336 0.6132060 0.5577864
## 0.10 8 0.6853114 0.5930475 0.5567327
## 0.10 9 0.6799443 0.6073574 0.5499570
## 0.10 10 0.6779700 0.6179189 0.5494188
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 8, decay = 0.01 and bag = FALSE.
We can see that the optimally tuned neural network has 8 hidden units and decay set to 0.01.
This model gives a cross-validated RMSE of 0.67 and an R squared of 0.61; not great.
Let's get the performance on the test set.
nn_man_predictions <- predict(man_nnetTune, newdata = X_test)
postResample(pred = nn_man_predictions, obs = y_test)
## RMSE Rsquared MAE
## 0.6129824 0.5905380 0.5137248
On the test set we get an RMSE of 0.61 and an R squared of 0.59; again, not great.
Now let's try a MARS model.
# Define the candidate models to test
marsGrid <- expand.grid(.degree = 1:3, .nprune = 2:38)
# Fix the seed so that the results can be reproduced
set.seed(100)
man_marsTuned <- train(X_train, y_train,
                       method = "earth",
                       tuneGrid = marsGrid,
                       trControl = trainControl(method = "cv"))
man_marsTuned
## Multivariate Adaptive Regression Spline
##
## 124 samples
## 56 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 111, 112, 112, 112, 111, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 0.7674952 0.4800369 0.6272878
## 1 3 0.6976920 0.5715392 0.5833544
## 1 4 0.7317991 0.5192876 0.6035316
## 1 5 0.7465267 0.4887583 0.6176625
## 1 6 0.7338009 0.5016466 0.6024888
## 1 7 0.7282048 0.5056478 0.5995776
## 1 8 0.7307994 0.5154864 0.6017473
## 1 9 0.7429866 0.5092006 0.6179497
## 1 10 0.7480816 0.5095078 0.6219228
## 1 11 0.8158135 0.4814037 0.6452220
## 1 12 0.8155965 0.4777260 0.6454296
## 1 13 0.8285474 0.4629733 0.6495360
## 1 14 0.8547603 0.4446908 0.6603505
## 1 15 0.8547603 0.4446908 0.6603505
## 1 16 0.8547603 0.4446908 0.6603505
## 1 17 0.8547603 0.4446908 0.6603505
## 1 18 0.8547603 0.4446908 0.6603505
## 1 19 0.8547603 0.4446908 0.6603505
## 1 20 0.8547603 0.4446908 0.6603505
## 1 21 0.8547603 0.4446908 0.6603505
## 1 22 0.8547603 0.4446908 0.6603505
## 1 23 0.8547603 0.4446908 0.6603505
## 1 24 0.8547603 0.4446908 0.6603505
## 1 25 0.8547603 0.4446908 0.6603505
## 1 26 0.8547603 0.4446908 0.6603505
## 1 27 0.8547603 0.4446908 0.6603505
## 1 28 0.8547603 0.4446908 0.6603505
## 1 29 0.8547603 0.4446908 0.6603505
## 1 30 0.8547603 0.4446908 0.6603505
## 1 31 0.8547603 0.4446908 0.6603505
## 1 32 0.8547603 0.4446908 0.6603505
## 1 33 0.8547603 0.4446908 0.6603505
## 1 34 0.8547603 0.4446908 0.6603505
## 1 35 0.8547603 0.4446908 0.6603505
## 1 36 0.8547603 0.4446908 0.6603505
## 1 37 0.8547603 0.4446908 0.6603505
## 1 38 0.8547603 0.4446908 0.6603505
## 2 2 0.7674952 0.4800369 0.6272878
## 2 3 0.6786747 0.5803151 0.5709900
## 2 4 0.7361509 0.5234287 0.5993991
## 2 5 0.7496205 0.5270731 0.6049434
## 2 6 0.7316270 0.5265711 0.6012456
## 2 7 0.7336035 0.5247535 0.6041361
## 2 8 0.7033784 0.5621176 0.5733790
## 2 9 0.7103251 0.5728297 0.5737909
## 2 10 0.7413993 0.5385157 0.5910901
## 2 11 0.7484723 0.5368504 0.5988287
## 2 12 0.7804540 0.5235389 0.6138092
## 2 13 0.8547394 0.4774635 0.6632880
## 2 14 0.8555676 0.4783253 0.6629350
## 2 15 0.8940728 0.4543232 0.6915826
## 2 16 0.8415194 0.4878612 0.6482062
## 2 17 0.8375286 0.5045403 0.6358476
## 2 18 0.8444921 0.4982050 0.6412836
## 2 19 0.8498865 0.5032521 0.6389198
## 2 20 0.8617150 0.5043054 0.6431879
## 2 21 0.8618351 0.5086037 0.6389328
## 2 22 0.8625272 0.5099750 0.6363763
## 2 23 0.8630888 0.5135993 0.6327429
## 2 24 0.8486417 0.5220780 0.6288280
## 2 25 0.8486417 0.5220780 0.6288280
## 2 26 0.8486417 0.5220780 0.6288280
## 2 27 0.8486417 0.5220780 0.6288280
## 2 28 0.8486417 0.5220780 0.6288280
## 2 29 0.8486417 0.5220780 0.6288280
## 2 30 0.8486417 0.5220780 0.6288280
## 2 31 0.8486417 0.5220780 0.6288280
## 2 32 0.8486417 0.5220780 0.6288280
## 2 33 0.8486417 0.5220780 0.6288280
## 2 34 0.8486417 0.5220780 0.6288280
## 2 35 0.8486417 0.5220780 0.6288280
## 2 36 0.8486417 0.5220780 0.6288280
## 2 37 0.8486417 0.5220780 0.6288280
## 2 38 0.8486417 0.5220780 0.6288280
## 3 2 0.7674952 0.4800369 0.6272878
## 3 3 0.7277091 0.5299248 0.6051291
## 3 4 0.7539826 0.5116158 0.6252526
## 3 5 0.7725015 0.4962910 0.6318883
## 3 6 0.7816498 0.4786462 0.6416305
## 3 7 0.7799816 0.4777417 0.6415142
## 3 8 0.7794913 0.4864705 0.6313861
## 3 9 0.7606996 0.5119277 0.6104401
## 3 10 0.7865397 0.4945068 0.6296108
## 3 11 0.7791711 0.5044319 0.6259885
## 3 12 0.7488631 0.5386841 0.5928220
## 3 13 0.8296997 0.5150707 0.6130779
## 3 14 0.8523660 0.5105600 0.6293687
## 3 15 0.8418807 0.5204353 0.6148023
## 3 16 0.8650592 0.4960215 0.6288537
## 3 17 0.9105796 0.4701465 0.6760545
## 3 18 0.9206066 0.4636366 0.6877968
## 3 19 0.9354144 0.4487058 0.6894071
## 3 20 0.9696134 0.4386796 0.7089251
## 3 21 1.6109902 0.4171441 0.9220531
## 3 22 1.6571769 0.4151449 0.9287368
## 3 23 1.7371309 0.4205724 0.9602106
## 3 24 1.7246757 0.4324557 0.9496026
## 3 25 1.7132728 0.4437542 0.9387098
## 3 26 1.7227777 0.4313095 0.9363891
## 3 27 1.7227777 0.4313095 0.9363891
## 3 28 1.7227777 0.4313095 0.9363891
## 3 29 1.7227777 0.4313095 0.9363891
## 3 30 1.7227777 0.4313095 0.9363891
## 3 31 1.7227777 0.4313095 0.9363891
## 3 32 1.7227777 0.4313095 0.9363891
## 3 33 1.7227777 0.4313095 0.9363891
## 3 34 1.7227777 0.4313095 0.9363891
## 3 35 1.7227777 0.4313095 0.9363891
## 3 36 1.7227777 0.4313095 0.9363891
## 3 37 1.7227777 0.4313095 0.9363891
## 3 38 1.7227777 0.4313095 0.9363891
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 3 and degree = 2.
The tuned MARS model has nprune = 3 and degree = 2, giving a cross-validated RMSE of 0.679 and an R squared of 0.580; again not a great fit, as it explains only about 58% of the variance.
Let's try it on the test set.
mars_man_predictions <- predict(man_marsTuned, newdata = X_test)
postResample(pred = mars_man_predictions, obs = y_test)
## RMSE Rsquared MAE
## 0.6361360 0.5738335 0.5137480
The test set RMSE is 0.636 with an R squared of 0.574.
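Given how small this MARS model is (nprune = 3), we could peek at which terms it actually kept:
summary(man_marsTuned$finalModel)  # shows the selected hinge terms and predictors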
Lastly, let's try an SVM model.
man_svmRTuned <- train(X_train, y_train,
                       method = "svmRadial",
                       preProc = c("center", "scale"),
                       tuneLength = 14,
                       trControl = trainControl(method = "cv"))
man_svmRTuned
## Support Vector Machines with Radial Basis Function Kernel
##
## 124 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 112, 111, 111, 112, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.8008609 0.5240639 0.6289807
## 0.50 0.7322632 0.5632135 0.5787165
## 1.00 0.6733677 0.5945949 0.5354776
## 2.00 0.6474025 0.6037812 0.5104871
## 4.00 0.6400909 0.6050680 0.5043839
## 8.00 0.6353187 0.6118439 0.5036042
## 16.00 0.6350858 0.6121514 0.5034302
## 32.00 0.6350858 0.6121514 0.5034302
## 64.00 0.6350858 0.6121514 0.5034302
## 128.00 0.6350858 0.6121514 0.5034302
## 256.00 0.6350858 0.6121514 0.5034302
## 512.00 0.6350858 0.6121514 0.5034302
## 1024.00 0.6350858 0.6121514 0.5034302
## 2048.00 0.6350858 0.6121514 0.5034302
##
## Tuning parameter 'sigma' was held constant at a value of 0.01543335
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01543335 and C = 16.
The optimal SVM has sigma = 0.0154 and C = 16; it gives a cross-validated RMSE of 0.635 and an R squared of 0.612.
Now let's try it on the test set.
svm_man_predictions <- predict(man_svmRTuned, newdata = X_test)
postResample(pred = svm_man_predictions, obs = y_test)
## RMSE Rsquared MAE
## 0.5498424 0.6408523 0.4533761
The SVM gives a test set RMSE of 0.55 and an R squared of 0.64; this is the best test-set performance of all the models tried. It is worth noting that even this "best" model explains only 64% of the variance in the test set, which leaves substantial unexplained variation. Still, it is the best model based on the performance metrics chosen.
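To summarize, the test-set metrics computed above can be collected into one table (a sketch reusing the prediction objects already created):
# Gather test-set performance for all four models and sort by RMSE
results <- rbind(
  KNN  = postResample(knn_man_predictions,  y_test),
  NNet = postResample(nn_man_predictions,   y_test),
  MARS = postResample(mars_man_predictions, y_test),
  SVM  = postResample(svm_man_predictions,  y_test))
results[order(results[, "RMSE"]), ]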
Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
varImp(man_svmRTuned)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess13 100.00
## ManufacturingProcess17 98.38
## ManufacturingProcess32 96.84
## ManufacturingProcess09 85.36
## BiologicalMaterial06 81.53
## BiologicalMaterial03 81.06
## ManufacturingProcess36 76.83
## ManufacturingProcess06 62.60
## BiologicalMaterial12 62.56
## BiologicalMaterial02 61.98
## ManufacturingProcess31 61.96
## BiologicalMaterial11 56.76
## ManufacturingProcess33 50.60
## ManufacturingProcess11 47.54
## ManufacturingProcess30 45.92
## BiologicalMaterial01 45.78
## ManufacturingProcess29 45.69
## ManufacturingProcess02 41.86
## BiologicalMaterial08 40.09
## BiologicalMaterial04 39.84
Here are the results from the linear regression model in the previous chapter:
ManufacturingProcess32 100.000000
ManufacturingProcess17 59.374258
ManufacturingProcess09 58.764503
ManufacturingProcess06 38.084333
ManufacturingProcess37 34.287693
BiologicalMaterial06 33.909670
ManufacturingProcess34 30.905978
ManufacturingProcess39 27.144722
ManufacturingProcess36 27.115346
ManufacturingProcess13 25.148453
We can see that for the nonlinear model the top ten contains 6 manufacturing processes and 4 biological materials, so the manufacturing variables have an edge, but not a large one.
The top ten important predictors are similar to those from the linear model, but there are some differences. First, ManufacturingProcess13 is the most important predictor for the nonlinear model, while ManufacturingProcess32 is the most important for the linear model; in fact, ManufacturingProcess13 has an importance of only about 25 in the linear model compared to 100 in the nonlinear one. Also, BiologicalMaterial03, BiologicalMaterial12, and BiologicalMaterial02 are important to the nonlinear model but do not appear in the linear model's top ten. Lastly, the manufacturing processes dominate the linear model's top ten, with only one biological material, while the nonlinear model has a more even split of 6 manufacturing processes to 4 biological materials.
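For a side-by-side view, the importance scores can also be plotted directly:
plot(varImp(man_svmRTuned), top = 10)  # top ten predictors by loess R-squared importance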
Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
imp <- varImp(man_svmRTuned)$importance
top_vars <- rownames(imp)[order(-imp$Overall)][1:10]
top_vars
## [1] "ManufacturingProcess13" "ManufacturingProcess17" "ManufacturingProcess32"
## [4] "ManufacturingProcess09" "BiologicalMaterial06" "BiologicalMaterial03"
## [7] "ManufacturingProcess36" "ManufacturingProcess06" "BiologicalMaterial12"
## [10] "BiologicalMaterial02"
library(ggplot2)
# Plot each top variable against yield
for (var in top_vars) {
  print(
    ggplot(imputed_data, aes(x = .data[[var]], y = Yield)) +
      geom_point() +
      labs(title = paste("Yield vs", var))
  )
}
Looking at the plots above, which show the relationships between the top ten predictors and the response, we can see that for the most part there is a positive correlation between yield and all of the biological predictors: as a biological material measurement increases, so does the yield. This is fairly consistent; the only variability is in how steeply it increases. For the manufacturing processes this does not hold. Processes like ManufacturingProcess17 have a strong negative correlation with the response, while ManufacturingProcess32 shows a strong positive correlation. So the biological predictors all appear positively correlated with the response, while the manufacturing processes are more process dependent, with some positively and some negatively correlated with yield.
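To put numbers behind these visual impressions, we could compute the linear correlation of each top predictor with Yield (a rough summary only, since the relationships need not be linear):
# Correlation of each top-ten predictor with the response
round(cor(imputed_data[, top_vars], imputed_data$Yield), 2)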