library(zoo)
library(forecast)
library(readxl)
library(tidyverse)
library(imputeTS)
library(openxlsx)
library(fpp3)
library(lubridate)
library(caret)
library(pls)
library(RANN)
library(glmnet)
library(earth)
library(kernlab)
library(mlbench)
library(AppliedPredictiveModeling)
Do problems 7.2 and 7.5 in Kuhn and Johnson. There are only two but they have many parts. Please submit both a link to your Rpubs and the .rmd file.
Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data: \[y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)\] where the x values are random variables uniformly distributed on [0, 1] (the simulation also creates five other non-informative variables). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune several models on these data. For example:
knnModel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
marsFit <- train(x = trainingData$x,
y = trainingData$y,
method = "earth",
preProc = c("center", "scale"),
tuneLength = 10
)
marsFit
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 4.383438 0.2405683 3.597961
## 3 3.645469 0.4745962 2.930453
## 4 2.727602 0.7035031 2.184240
## 6 2.331605 0.7835496 1.833420
## 7 1.976830 0.8421599 1.562591
## 9 1.804342 0.8683110 1.410395
## 10 1.787676 0.8711960 1.386944
## 12 1.821005 0.8670619 1.419893
## 13 1.858688 0.8617344 1.445459
## 15 1.871033 0.8607099 1.457618
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 1.
svmFit <- train(x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10
)
svmFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.525979 0.7804630 2.016014
## 0.50 2.293423 0.7960080 1.808878
## 1.00 2.156969 0.8112034 1.697751
## 2.00 2.081486 0.8226986 1.631756
## 4.00 2.050864 0.8270475 1.605584
## 8.00 2.046714 0.8280409 1.602156
## 16.00 2.046390 0.8281073 1.601597
## 32.00 2.046390 0.8281073 1.601597
## 64.00 2.046390 0.8281073 1.601597
## 128.00 2.046390 0.8281073 1.601597
##
## Tuning parameter 'sigma' was held constant at a value of 0.06529705
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06529705 and C = 16.
nnetFit <- train(x = trainingData$x,
y = trainingData$y,
method = "nnet",
preProc = c("center", "scale"),
tuneLength = 10,
linout = TRUE,
trace = FALSE,
maxit = 10 # iterations capped to keep knitting fast
)
nnetFit
## Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 1 0.0000000000 3.669085 0.5026600 2.962695
## 1 0.0001000000 3.580243 0.5260213 2.856563
## 1 0.0002371374 3.827235 0.4637974 3.083739
## 1 0.0005623413 3.621082 0.5041675 2.929060
## 1 0.0013335214 3.912175 0.4528222 3.169521
## 1 0.0031622777 3.725976 0.4999713 3.011478
## 1 0.0074989421 3.575409 0.4982169 2.875282
## 1 0.0177827941 3.693488 0.4743709 2.968929
## 1 0.0421696503 3.543805 0.5284859 2.845716
## 1 0.1000000000 3.809719 0.4759053 3.066616
## 3 0.0000000000 3.153652 0.6243579 2.546814
## 3 0.0001000000 3.223399 0.6195679 2.561911
## 3 0.0002371374 3.161069 0.6191615 2.535542
## 3 0.0005623413 3.381896 0.5791995 2.679957
## 3 0.0013335214 3.201552 0.6118308 2.548003
## 3 0.0031622777 3.253041 0.6073632 2.592403
## 3 0.0074989421 3.141864 0.6253101 2.510387
## 3 0.0177827941 3.147634 0.6271615 2.505901
## 3 0.0421696503 3.193673 0.6152537 2.560054
## 3 0.1000000000 3.341340 0.5821374 2.676539
## 5 0.0000000000 3.724077 0.4992019 2.959872
## 5 0.0001000000 3.777463 0.4799448 3.047419
## 5 0.0002371374 3.943768 0.4281486 3.080059
## 5 0.0005623413 3.716441 0.4772959 2.960046
## 5 0.0013335214 3.702787 0.4972962 2.955030
## 5 0.0031622777 3.457817 0.5406212 2.795459
## 5 0.0074989421 3.583220 0.5315041 2.861171
## 5 0.0177827941 3.736508 0.5169706 2.895150
## 5 0.0421696503 3.489377 0.5449761 2.802471
## 5 0.1000000000 3.719248 0.4940477 2.978162
## 7 0.0000000000 3.719177 0.5255128 2.922185
## 7 0.0001000000 3.958418 0.4786624 3.116452
## 7 0.0002371374 3.745110 0.4963678 2.963913
## 7 0.0005623413 3.986759 0.4982656 3.055672
## 7 0.0013335214 3.417824 0.5699462 2.742171
## 7 0.0031622777 3.856741 0.4825306 3.001068
## 7 0.0074989421 3.496858 0.5464957 2.776602
## 7 0.0177827941 3.845661 0.4836673 3.093194
## 7 0.0421696503 3.715837 0.5043021 2.959027
## 7 0.1000000000 3.744657 0.5273706 2.982406
## 9 0.0000000000 2.983453 0.6692751 2.345400
## 9 0.0001000000 3.242880 0.6062101 2.561498
## 9 0.0002371374 3.110143 0.6364875 2.469341
## 9 0.0005623413 2.948899 0.6716607 2.350752
## 9 0.0013335214 3.199709 0.5935839 2.553400
## 9 0.0031622777 3.053898 0.6437308 2.416168
## 9 0.0074989421 2.964069 0.6587931 2.355454
## 9 0.0177827941 2.906249 0.6766938 2.291491
## 9 0.0421696503 3.168778 0.6396169 2.498332
## 9 0.1000000000 2.866518 0.6877163 2.258521
## 11 0.0000000000 2.706917 0.7168649 2.140460
## 11 0.0001000000 2.649296 0.7208730 2.071452
## 11 0.0002371374 2.765109 0.7014155 2.179473
## 11 0.0005623413 2.718893 0.7111188 2.133525
## 11 0.0013335214 2.672008 0.7179390 2.094946
## 11 0.0031622777 2.766844 0.7006958 2.172767
## 11 0.0074989421 2.677288 0.7210141 2.098494
## 11 0.0177827941 2.643463 0.7258268 2.067625
## 11 0.0421696503 2.663246 0.7218661 2.088087
## 11 0.1000000000 2.726789 0.7029219 2.152613
## 13 0.0000000000 2.666065 0.7220600 2.095424
## 13 0.0001000000 2.724668 0.7059805 2.148564
## 13 0.0002371374 2.726602 0.7124426 2.141144
## 13 0.0005623413 2.704133 0.7149529 2.126859
## 13 0.0013335214 2.683710 0.7205373 2.120532
## 13 0.0031622777 2.616175 0.7259037 2.049593
## 13 0.0074989421 2.701043 0.7109228 2.126110
## 13 0.0177827941 2.636059 0.7238066 2.075185
## 13 0.0421696503 2.683429 0.7162118 2.121199
## 13 0.1000000000 2.704978 0.7180350 2.121065
## 15 0.0000000000 2.739306 0.7050684 2.171579
## 15 0.0001000000 2.766320 0.7048223 2.195702
## 15 0.0002371374 2.717148 0.7125183 2.148348
## 15 0.0005623413 2.725493 0.7115152 2.136367
## 15 0.0013335214 2.646336 0.7225710 2.097964
## 15 0.0031622777 2.595707 0.7324874 2.068968
## 15 0.0074989421 2.707178 0.7112363 2.127116
## 15 0.0177827941 2.716671 0.7172348 2.156263
## 15 0.0421696503 2.671954 0.7200690 2.109868
## 15 0.1000000000 2.733187 0.7081575 2.160507
## 17 0.0000000000 2.716154 0.7127638 2.153020
## 17 0.0001000000 2.731718 0.7132792 2.157075
## 17 0.0002371374 2.775934 0.7013180 2.188150
## 17 0.0005623413 2.741843 0.7158456 2.152959
## 17 0.0013335214 2.794495 0.7016332 2.198228
## 17 0.0031622777 2.723093 0.7167752 2.139788
## 17 0.0074989421 2.739145 0.7068115 2.157548
## 17 0.0177827941 2.721110 0.7122858 2.166796
## 17 0.0421696503 2.722485 0.7114488 2.140162
## 17 0.1000000000 2.760590 0.7058398 2.172915
## 19 0.0000000000 2.765093 0.7128154 2.180816
## 19 0.0001000000 2.730961 0.7168577 2.161501
## 19 0.0002371374 2.654476 0.7205933 2.090232
## 19 0.0005623413 2.727393 0.7137025 2.148831
## 19 0.0013335214 2.712209 0.7189890 2.118617
## 19 0.0031622777 2.714838 0.7147228 2.141693
## 19 0.0074989421 2.679775 0.7206830 2.128871
## 19 0.0177827941 2.648365 0.7280312 2.097090
## 19 0.0421696503 2.751194 0.7027446 2.197992
## 19 0.1000000000 2.706611 0.7150081 2.126365
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 15 and decay = 0.003162278.
Next, estimate the test-set performance of each tuned model:
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
marsPred <- predict(marsFit, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)
## RMSE Rsquared MAE
## 1.776575 0.872700 1.358367
svmPred <- predict(svmFit, newdata = testData$x)
postResample(pred = svmPred, obs = testData$y)
## RMSE Rsquared MAE
## 2.0792960 0.8247794 1.5796158
nnetPred <- predict(nnetFit, newdata = testData$x)
postResample(pred = nnetPred, obs = testData$y)
## RMSE Rsquared MAE
## 2.7775652 0.6983048 2.1600409
After tuning KNN, neural network, SVM, and MARS models, the MARS model gives the best test-set performance, with the lowest RMSE and MAE and the highest R-squared.
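For a side-by-side view, the test-set metrics can be stacked into one table (a minimal sketch reusing the predictions computed above):
results <- rbind(
KNN = postResample(pred = knnPred, obs = testData$y),
MARS = postResample(pred = marsPred, obs = testData$y),
SVM = postResample(pred = svmPred, obs = testData$y),
NNet = postResample(pred = nnetPred, obs = testData$y)
)
results[order(results[, "RMSE"]), ] # lowest RMSE (MARS) first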
summary(marsFit$finalModel)
## Call: earth(x=data.frame[200,10], y=c(18.46,16.1,17...), keepxy=TRUE, degree=1,
## nprune=10)
##
## coefficients
## (Intercept) 20.3958041
## h(0.507267-X1) -3.0209971
## h(0.325504-X2) -2.8963069
## h(X3- -0.804171) 1.1187319
## h(-0.216741-X3) 3.4950111
## h(X3-0.453446) 2.1548596
## h(0.953812-X4) -2.7559239
## h(X4-0.953812) 2.8600536
## h(1.17878-X5) -1.5056208
## h(X6- -0.47556) -0.5025995
##
## Selected 10 of 18 terms, and 6 of 10 predictors (nprune=10)
## Termination condition: Reached nk 21
## Importance: X1, X4, X2, X5, X3, X6, X7-unused, X8-unused, X9-unused, ...
## Number of terms at each degree of interaction: 1 9 (additive model)
## GCV 2.731203 RSS 447.3848 GRSq 0.889112 RSq 0.9082649
In the coefficient table, each h(.) term is a hinge function, h(u) = max(0, u), so a term such as h(0.507267 - X1) is active only when (scaled) X1 falls below 0.507267. The MARS model selects all five informative predictors (X1-X5), plus X6; the remaining four predictors (X7-X10) are unused.
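As a cross-check, caret's varImp can rank the predictors kept by the final MARS fit (a quick sketch; output omitted):
varImp(marsFit) # expected to rank X1, X4, X2, X5, X3 (and X6) above the noise predictors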
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
Load the Data:
data(ChemicalManufacturingProcess)
set.seed(63)
KNN imputation was used to fill in these missing values:
# note: preProcess with method = "knnImpute" also centers and scales each
# column, so the imputed data (including Yield) are on the standardized scale
ChemicalManufacturingProcess_preProc <-
preProcess(ChemicalManufacturingProcess,
method = "knnImpute")
transformed_ChemMan <-
predict(ChemicalManufacturingProcess_preProc,
newdata = ChemicalManufacturingProcess)
df <- data.frame(Yield = transformed_ChemMan$Yield) # single-column data frame holding the standardized response
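A quick sanity check (a minimal sketch) confirms that the imputation left no missing values:
sum(is.na(transformed_ChemMan)) # expect 0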
Split the data into a training and a test set, pre-process the data:
smp_size <- floor(0.80 * nrow(ChemicalManufacturingProcess))
trainingDataindex <-
sample(seq_len(nrow(df)),
size = smp_size)
trainY <- df[trainingDataindex,]
testY <- df[-trainingDataindex,]
trainX <-
transformed_ChemMan[trainingDataindex,] %>%
select(-Yield, -BiologicalMaterial07) # drop the response and BiologicalMaterial07 (flagged as near-zero variance)
testX <-
transformed_ChemMan[-trainingDataindex,] %>%
select(-Yield, -BiologicalMaterial07)
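An equivalent split could be drawn with caret's createDataPartition, which stratifies on the response. This sketch (with hypothetical inTrain, trainY2, and testY2 objects) is for reference only and is not the split used below:
inTrain <- createDataPartition(df$Yield, p = 0.80, list = FALSE)
trainY2 <- df$Yield[inTrain]
testY2 <- df$Yield[-inTrain]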
Train several nonlinear regression models:
knnModel <- train(x = trainX,
y = trainY,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 140 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 140, 140, 140, 140, 140, 140, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.8210597 0.3680027 0.6443853
## 7 0.7970288 0.4082682 0.6266218
## 9 0.7911356 0.4209653 0.6267681
## 11 0.7796157 0.4486290 0.6199528
## 13 0.7725634 0.4676830 0.6161589
## 15 0.7729705 0.4755742 0.6170175
## 17 0.7774953 0.4751335 0.6211100
## 19 0.7820071 0.4724227 0.6243845
## 21 0.7842379 0.4741401 0.6249302
## 23 0.7935614 0.4661117 0.6309000
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
marsFit <- train(x = trainX,
y = trainY,
method = "earth",
preProc = c("center", "scale"),
tuneLength = 10
)
marsFit
## Multivariate Adaptive Regression Spline
##
## 140 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 140, 140, 140, 140, 140, 140, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 0.8067763 0.3836168 0.6287981
## 3 0.7693529 0.5305143 0.5590409
## 5 1.6573659 0.5106048 0.6905175
## 7 1.8379534 0.4934748 0.7323306
## 9 2.0696569 0.4540472 0.7798165
## 10 2.2400202 0.4225292 0.8220717
## 12 2.3458344 0.3869790 0.8566331
## 14 2.7970783 0.3713898 0.9291264
## 16 2.8707279 0.3602541 0.9862669
## 18 2.8254260 0.3528206 0.9855120
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 3 and degree = 1.
svmFit <- train(x = trainX,
y = trainY,
method = "svmRadial",
preProc = c("center", "scale"),
tuneLength = 10
)
svmFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 140 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 140, 140, 140, 140, 140, 140, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 0.8036324 0.4437882 0.6490731
## 0.50 0.7600531 0.4786698 0.6139241
## 1.00 0.7279352 0.5091851 0.5875679
## 2.00 0.7138129 0.5215663 0.5758778
## 4.00 0.7050396 0.5305260 0.5698711
## 8.00 0.7012890 0.5342665 0.5668837
## 16.00 0.7010781 0.5345333 0.5667706
## 32.00 0.7010781 0.5345333 0.5667706
## 64.00 0.7010781 0.5345333 0.5667706
## 128.00 0.7010781 0.5345333 0.5667706
##
## Tuning parameter 'sigma' was held constant at a value of 0.01425631
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01425631 and C = 16.
nnetFit <- train(x = trainX,
y = trainY,
method = "nnet",
preProc = c("center", "scale"),
tuneLength = 10,
linout = TRUE,
trace = FALSE,
maxit = 10 # in earlier testing no maxit was set; since the neural
# network was not the best-performing model, maxit is capped
# at 10 here to speed up knitting
)
nnetFit
## Neural Network
##
## 140 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 140, 140, 140, 140, 140, 140, ...
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 1 0.0000000000 0.9248433 0.3170729 0.7361273
## 1 0.0001000000 0.9151336 0.3377074 0.7333352
## 1 0.0002371374 0.9091448 0.3500610 0.7305424
## 1 0.0005623413 0.8606086 0.4027753 0.6800653
## 1 0.0013335214 0.8837661 0.3612925 0.7079176
## 1 0.0031622777 0.8612205 0.3695226 0.6805560
## 1 0.0074989421 0.9404846 0.2955631 0.7470113
## 1 0.0177827941 0.8971605 0.3684224 0.7135671
## 1 0.0421696503 0.9364533 0.3563117 0.7536157
## 1 0.1000000000 0.8560045 0.3961420 0.6785662
## 3 0.0000000000 0.8813206 0.4023446 0.6989767
## 3 0.0001000000 0.8647770 0.3971410 0.6867880
## 3 0.0002371374 0.8637482 0.4203207 0.6871346
## 3 0.0005623413 0.8675670 0.4183571 0.6958280
## 3 0.0013335214 0.8755747 0.3935424 0.6916964
## 3 0.0031622777 0.8310568 0.4480759 0.6596587
## 3 0.0074989421 0.8889056 0.4092101 0.7050905
## 3 0.0177827941 0.8434174 0.4215434 0.6640704
## 3 0.0421696503 0.8557974 0.4214406 0.6806607
## 3 0.1000000000 0.8445459 0.4251979 0.6739437
## 5 0.0000000000 0.8682579 0.3995772 0.6944602
## 5 0.0001000000 0.8213763 0.4356490 0.6515672
## 5 0.0002371374 0.8360422 0.4244960 0.6716993
## 5 0.0005623413 0.8282360 0.4491192 0.6686778
## 5 0.0013335214 0.8395676 0.4251960 0.6687905
## 5 0.0031622777 0.8344181 0.4378102 0.6606327
## 5 0.0074989421 0.8533184 0.4233793 0.6704455
## 5 0.0177827941 0.8307746 0.4394509 0.6657433
## 5 0.0421696503 0.8769075 0.4317627 0.7015333
## 5 0.1000000000 0.8209428 0.4455306 0.6643759
## 7 0.0000000000 0.8363869 0.4487137 0.6639338
## 7 0.0001000000 0.7972963 0.4567982 0.6381109
## 7 0.0002371374 0.8591522 0.4121276 0.6834172
## 7 0.0005623413 0.8310501 0.4294286 0.6634292
## 7 0.0013335214 0.8152954 0.4436168 0.6491718
## 7 0.0031622777 0.8301427 0.4279378 0.6652817
## 7 0.0074989421 0.8107277 0.4513242 0.6349494
## 7 0.0177827941 0.8259849 0.4412193 0.6589818
## 7 0.0421696503 0.8453644 0.4379954 0.6807763
## 7 0.1000000000 0.8463009 0.4296334 0.6805771
## 9 0.0000000000 0.8508239 0.4127260 0.6742341
## 9 0.0001000000 0.8153449 0.4604929 0.6532740
## 9 0.0002371374 0.8606621 0.4307920 0.6811905
## 9 0.0005623413 0.8411619 0.4545901 0.6681682
## 9 0.0013335214 0.8078752 0.4514411 0.6416487
## 9 0.0031622777 0.8247292 0.4439247 0.6624658
## 9 0.0074989421 0.8189458 0.4342796 0.6526431
## 9 0.0177827941 0.8397415 0.4292906 0.6709879
## 9 0.0421696503 0.8173018 0.4592087 0.6568513
## 9 0.1000000000 0.7874982 0.4811680 0.6306090
## 11 0.0000000000 0.8138174 0.4577666 0.6441461
## 11 0.0001000000 0.8370619 0.4405921 0.6650695
## 11 0.0002371374 0.8203274 0.4474813 0.6620224
## 11 0.0005623413 0.8279790 0.4594757 0.6573980
## 11 0.0013335214 0.8305866 0.4405883 0.6694777
## 11 0.0031622777 0.8932409 0.4048862 0.7079578
## 11 0.0074989421 0.8764419 0.4067397 0.6954893
## 11 0.0177827941 0.8191023 0.4427937 0.6580685
## 11 0.0421696503 0.8547854 0.4228129 0.6853764
## 11 0.1000000000 0.8133771 0.4523781 0.6473431
## 13 0.0000000000 0.8432105 0.4279083 0.6697757
## 13 0.0001000000 0.8188832 0.4584262 0.6534613
## 13 0.0002371374 0.9177379 0.3832088 0.7186223
## 13 0.0005623413 0.8333777 0.4339363 0.6602885
## 13 0.0013335214 0.8816114 0.4101488 0.7035126
## 13 0.0031622777 0.8396244 0.4376282 0.6643309
## 13 0.0074989421 0.8936750 0.4099712 0.7178524
## 13 0.0177827941 0.8616597 0.4170124 0.6849857
## 13 0.0421696503 0.8655112 0.4126582 0.6944487
## 13 0.1000000000 0.8588354 0.4090038 0.6880499
## 15 0.0000000000 0.8348542 0.4431961 0.6633181
## 15 0.0001000000 0.8517400 0.4128355 0.6882730
## 15 0.0002371374 0.8459665 0.4267448 0.6795971
## 15 0.0005623413 0.8189906 0.4506501 0.6517980
## 15 0.0013335214 0.8065190 0.4631345 0.6497759
## 15 0.0031622777 0.8567555 0.4233014 0.6895464
## 15 0.0074989421 0.8417320 0.4401707 0.6732156
## 15 0.0177827941 0.8382900 0.4339149 0.6709162
## 15 0.0421696503 0.8288385 0.4414617 0.6630455
## 15 0.1000000000 0.8456879 0.4175965 0.6751060
## 17 0.0000000000 0.8545258 0.4159990 0.6781441
## 17 0.0001000000 0.8227302 0.4380607 0.6614969
## 17 0.0002371374 0.8550329 0.4131430 0.6904461
## 17 0.0005623413 0.8485439 0.4295521 0.6872790
## 17 0.0013335214 0.8378640 0.4354744 0.6778011
## 17 0.0031622777 0.8243470 0.4637972 0.6549740
## 17 0.0074989421 0.8443815 0.4206452 0.6799852
## 17 0.0177827941 0.8567341 0.4269800 0.6886460
## 17 0.0421696503 0.8530659 0.4377044 0.6833435
## 17 0.1000000000 0.8175059 0.4402002 0.6600822
## 19 0.0000000000 NaN NaN NaN
## 19 0.0001000000 NaN NaN NaN
## 19 0.0002371374 NaN NaN NaN
## 19 0.0005623413 NaN NaN NaN
## 19 0.0013335214 NaN NaN NaN
## 19 0.0031622777 NaN NaN NaN
## 19 0.0074989421 NaN NaN NaN
## 19 0.0177827941 NaN NaN NaN
## 19 0.0421696503 NaN NaN NaN
## 19 0.1000000000 NaN NaN NaN
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 9 and decay = 0.1.
Based on the resampled RMSE, the SVM model gives the best performance, with the lowest errors and the highest R-squared; the test-set results below confirm this. (The NaN rows for size = 19 occur because a network that large has more weights than nnet's default MaxNWts limit allows, so those fits fail.)
knnPred <- predict(knnModel, newdata = testX)
postResample(pred = knnPred, obs = testY)
## RMSE Rsquared MAE
## 0.7253213 0.3550456 0.6271034
marsPred <- predict(marsFit, newdata = testX)
postResample(pred = marsPred, obs = testY)
## RMSE Rsquared MAE
## 0.6607892 0.4712673 0.5243056
svmPred <- predict(svmFit, newdata = testX)
postResample(pred = svmPred, obs = testY)
## RMSE Rsquared MAE
## 0.5495584 0.6230445 0.4216271
nnetPred <- predict(nnetFit, newdata = testX)
postResample(pred = nnetPred, obs = testY)
## RMSE Rsquared MAE
## 0.6361633 0.5400505 0.5114185
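Stacking the chemical-data test metrics the same way (a minimal sketch reusing the predictions above):
chem_results <- rbind(
KNN = postResample(pred = knnPred, obs = testY),
MARS = postResample(pred = marsPred, obs = testY),
SVM = postResample(pred = svmPred, obs = testY),
NNet = postResample(pred = nnetPred, obs = testY)
)
chem_results[order(chem_results[, "RMSE"]), ] # SVM has the lowest RMSE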
From the variable-importance ranking below, the most important predictors are ManufacturingProcess32, ManufacturingProcess13, ManufacturingProcess17, BiologicalMaterial06, ManufacturingProcess09, BiologicalMaterial12, ManufacturingProcess36, BiologicalMaterial03, BiologicalMaterial02, and ManufacturingProcess31. Seven of these overlap with the top ten predictors of the optimal linear model; the exceptions are BiologicalMaterial12, BiologicalMaterial02, and ManufacturingProcess31. The process predictors dominate the list.
print(varImp(svmFit))
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 99.79
## ManufacturingProcess17 89.14
## BiologicalMaterial06 88.19
## ManufacturingProcess09 87.03
## BiologicalMaterial12 80.65
## ManufacturingProcess36 77.72
## BiologicalMaterial03 77.60
## BiologicalMaterial02 66.59
## ManufacturingProcess31 65.20
## ManufacturingProcess06 59.96
## ManufacturingProcess11 59.14
## ManufacturingProcess33 56.29
## ManufacturingProcess02 53.39
## BiologicalMaterial11 51.10
## BiologicalMaterial08 48.68
## BiologicalMaterial04 48.62
## ManufacturingProcess29 45.68
## ManufacturingProcess30 44.71
## ManufacturingProcess12 40.34
For comparison with the linear approach, refit a PLS model (as in Exercise 6.3) and examine its variable importance:
ctrl <- trainControl(method = "cv", number = 10)
plsTune <- train(trainX, trainY,
method = "pls",
tuneLength = 3,
trControl = ctrl,
preProcess = c("center","scale"))
print(varImp(plsTune))
## pls variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess09 91.67
## ManufacturingProcess13 88.81
## ManufacturingProcess36 88.11
## BiologicalMaterial02 80.05
## BiologicalMaterial06 79.08
## ManufacturingProcess17 76.50
## ManufacturingProcess33 74.93
## BiologicalMaterial03 74.13
## BiologicalMaterial08 68.16
## ManufacturingProcess11 63.71
## BiologicalMaterial12 63.46
## ManufacturingProcess12 63.37
## ManufacturingProcess06 62.08
## BiologicalMaterial04 61.89
## BiologicalMaterial01 60.68
## BiologicalMaterial11 59.06
## ManufacturingProcess04 46.60
## ManufacturingProcess28 42.14
## ManufacturingProcess30 39.22
plsPred <- predict(plsTune, newdata = testX)
postResample(pred = plsPred, obs = testY)
## RMSE Rsquared MAE
## 0.7405603 0.3649402 0.6161330
The three predictors unique to the nonlinear model's top ten are BiologicalMaterial12, BiologicalMaterial02, and ManufacturingProcess31. Because the strongest predictors are shared with the linear model, and these three rank lower in importance, they offer limited additional intuition about the process. In particular, ManufacturingProcess31 has a remarkably low R-squared against Yield, indicating essentially no predictive value, and the R-squared values for BiologicalMaterial12 and BiologicalMaterial02 are also well below any common-sense threshold for a meaningful relationship, as the simple linear fits below show.
data <- cbind(trainX, Yield = trainY)
BiologicalMaterial12lm <-
lm(Yield ~ BiologicalMaterial12, data = data)
summary(BiologicalMaterial12lm)
##
## Call:
## lm(formula = Yield ~ BiologicalMaterial12, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.03107 -0.70782 -0.03244 0.64796 2.72309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.05114 0.08030 0.637 0.525
## BiologicalMaterial12 0.38687 0.07924 4.882 2.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9492 on 138 degrees of freedom
## Multiple R-squared: 0.1473, Adjusted R-squared: 0.1411
## F-statistic: 23.84 on 1 and 138 DF, p-value: 2.857e-06
ggplot(data,
aes(x = BiologicalMaterial12, y = Yield)) +
geom_point() +
labs(x = "BiologicalMaterial12",
y = "Yield") +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal()
BiologicalMaterial02lm <-
lm(Yield ~ BiologicalMaterial02, data = data)
summary(BiologicalMaterial02lm)
##
## Call:
## lm(formula = Yield ~ BiologicalMaterial02, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.52394 -0.54232 -0.05247 0.57589 2.34771
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.02531 0.07606 0.333 0.74
## BiologicalMaterial02 0.48177 0.07425 6.488 1.44e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8998 on 138 degrees of freedom
## Multiple R-squared: 0.2337, Adjusted R-squared: 0.2282
## F-statistic: 42.1 on 1 and 138 DF, p-value: 1.442e-09
ggplot(data,
aes(x = BiologicalMaterial02, y = Yield)) +
geom_point() +
labs(x = "BiologicalMaterial02",
y = "Yield") +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal()
ManufacturingProcess31lm <-
lm(Yield ~ ManufacturingProcess31, data = data)
summary(ManufacturingProcess31lm)
##
## Call:
## lm(formula = Yield ~ ManufacturingProcess31, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6841 -0.8112 -0.1131 0.7519 3.3011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03317 0.08670 0.383 0.703
## ManufacturingProcess31 -0.05947 0.07887 -0.754 0.452
##
## Residual standard error: 1.026 on 138 degrees of freedom
## Multiple R-squared: 0.004103, Adjusted R-squared: -0.003113
## F-statistic: 0.5686 on 1 and 138 DF, p-value: 0.4521
ggplot(data,
aes(x = ManufacturingProcess31, y = Yield)) +
geom_point() +
labs(x = "ManufacturingProcess31",
y = "Yield") +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal()