library(caret)
library(tidyverse)
library(earth)
library(kableExtra)
library(nnet)
library(kernlab)
library(corrplot)
library(mlbench)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
# of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune several models on these data. For example:
KNN Model
library(caret)
set.seed(669)
knnModel <- train(x = trainingData$x,
y = trainingData$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.566994 0.4871062 2.908466
## 7 3.452048 0.5163553 2.832638
## 9 3.349224 0.5536141 2.744137
## 11 3.291738 0.5806179 2.683628
## 13 3.284603 0.5951534 2.680057
## 15 3.291283 0.6019249 2.677029
## 17 3.251163 0.6240631 2.632062
## 19 3.235118 0.6400615 2.623492
## 21 3.245644 0.6461305 2.630637
## 23 3.263783 0.6505076 2.653432
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 19.
ggplot(knnModel)
set.seed(669)
knnPred <- predict(knnModel, newdata = testData$x)
KNN_Model <- postResample(pred = knnPred, obs = testData$y)
KNN_Model
## RMSE Rsquared MAE
## 3.2286834 0.6871735 2.5939727
Support Vector Machines
set.seed(669)
svmModel <- train(x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "repeatedcv",
repeats = 5))
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.505683 0.7984124 2.011150
## 0.50 2.259026 0.8136497 1.801419
## 1.00 2.096017 0.8338866 1.657465
## 2.00 1.957438 0.8540832 1.541841
## 4.00 1.865441 0.8665348 1.457057
## 8.00 1.830489 0.8707624 1.437982
## 16.00 1.806548 0.8740805 1.438666
## 32.00 1.809575 0.8737777 1.441658
## 64.00 1.809575 0.8737777 1.441658
## 128.00 1.809575 0.8737777 1.441658
##
## Tuning parameter 'sigma' was held constant at a value of 0.05253932
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.05253932 and C = 16.
ggplot(svmModel)
set.seed(669)
svmPred <- predict(svmModel, newdata = testData$x)
SVM_Model <- postResample(pred = svmPred, obs = testData$y)
SVM_Model
## RMSE Rsquared MAE
## 2.0378439 0.8316464 1.5515653
Multivariate Adaptive Regression Splines (MARS)
set.seed(669)
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:15)
marsModel <- train(trainingData$x, trainingData$y,
method = "earth",
tuneGrid = marsGrid,
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "repeatedcv",
repeats = 5))
marsModel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.391642 0.2352467 3.6389397
## 1 3 3.595964 0.4838697 2.9026814
## 1 4 2.666028 0.7275166 2.1469656
## 1 5 2.403640 0.7793935 1.9175110
## 1 6 2.278405 0.8032290 1.7776556
## 1 7 1.836919 0.8706176 1.4514388
## 1 8 1.703805 0.8870479 1.3404682
## 1 9 1.670006 0.8904408 1.3103221
## 1 10 1.663810 0.8923000 1.3090172
## 1 11 1.660675 0.8932802 1.3098248
## 1 12 1.647751 0.8939846 1.2981747
## 1 13 1.648724 0.8942421 1.2888127
## 1 14 1.653220 0.8941717 1.2931866
## 1 15 1.656168 0.8938271 1.2932214
## 2 2 4.437576 0.2208350 3.6690016
## 2 3 3.627734 0.4781926 2.9330588
## 2 4 2.768683 0.7027988 2.2394254
## 2 5 2.475277 0.7625887 1.9670835
## 2 6 2.366015 0.7844811 1.8529785
## 2 7 1.912687 0.8561997 1.5073175
## 2 8 1.759633 0.8799887 1.3699165
## 2 9 1.579606 0.9028442 1.2398586
## 2 10 1.494016 0.9126908 1.1803704
## 2 11 1.392971 0.9252088 1.1010217
## 2 12 1.317019 0.9331791 1.0353131
## 2 13 1.265877 0.9382709 0.9972193
## 2 14 1.230100 0.9417081 0.9714028
## 2 15 1.232010 0.9419678 0.9788213
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
ggplot(marsModel)
set.seed(669)
marsPred <- predict(marsModel, newdata = testData$x)
MARS_Model <- postResample(pred = marsPred, obs = testData$y)
MARS_Model
## RMSE Rsquared MAE
## 1.2779993 0.9338365 1.0147070
Neural Networks
nnetGrid <- expand.grid(.decay = c(0, 0.01, .1),
.size = c(1:10))
set.seed(669)
nnetModel <- train(trainingData$x, trainingData$y,
method = "nnet",
tuneGrid = nnetGrid,
trControl = trainControl(method = "repeatedcv",
repeats = 5),
preProc = c("center", "scale"),
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
maxit = 500)
nnetModel
## Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 2.645934 0.7168734 2.096024
## 0.00 2 2.694271 0.7212110 2.129835
## 0.00 3 2.421265 0.7696578 1.923676
## 0.00 4 2.219044 0.8064061 1.766752
## 0.00 5 2.596583 0.7443744 2.035372
## 0.00 6 6.150621 0.5993008 3.257671
## 0.00 7 5.313863 0.5634306 3.120356
## 0.00 8 7.168552 0.4343077 3.833502
## 0.00 9 9.070953 0.5167586 4.464567
## 0.00 10 6.871293 0.5036091 4.139846
## 0.01 1 2.425908 0.7723423 1.896850
## 0.01 2 2.559585 0.7455838 1.992423
## 0.01 3 2.362555 0.7804042 1.865448
## 0.01 4 2.347416 0.7903146 1.875824
## 0.01 5 2.719208 0.7349609 2.121972
## 0.01 6 2.765381 0.7258617 2.180966
## 0.01 7 3.044882 0.6804758 2.404104
## 0.01 8 3.304617 0.6396799 2.632737
## 0.01 9 3.423304 0.6159683 2.689722
## 0.01 10 3.506547 0.5988202 2.775247
## 0.10 1 2.436429 0.7699589 1.901416
## 0.10 2 2.610332 0.7360692 2.038060
## 0.10 3 2.399736 0.7752067 1.906838
## 0.10 4 2.394090 0.7754421 1.896449
## 0.10 5 2.547360 0.7528304 1.996978
## 0.10 6 2.651011 0.7377691 2.091993
## 0.10 7 2.723797 0.7315738 2.193733
## 0.10 8 3.011389 0.6746603 2.401119
## 0.10 9 3.133136 0.6749866 2.476583
## 0.10 10 3.091120 0.6756543 2.510404
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 4 and decay = 0.
ggplot(nnetModel)
set.seed(669)
nnetPred <- predict(nnetModel, newdata = testData$x)
NNET_Model <- postResample(pred = nnetPred, obs = testData$y)
NNET_Model
## RMSE Rsquared MAE
## 2.5520664 0.7449566 1.9470147
Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
models_summary <- rbind(KNN_Model, SVM_Model, MARS_Model, NNET_Model)
kbl(models_summary) |>
kable_styling()
| RMSE | Rsquared | MAE | |
|---|---|---|---|
| KNN_Model | 3.228683 | 0.6871735 | 2.593973 |
| SVM_Model | 2.037844 | 0.8316464 | 1.551565 |
| MARS_Model | 1.277999 | 0.9338365 | 1.014707 |
| NNET_Model | 2.552066 | 0.7449566 | 1.947015 |
The MARS model performed the best, with the highest R-square value of 0.9338365 and the lowest RMSE and MAE values among all the models.
mars_var_imp <- varImp(marsModel)
mars_var_imp
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.40
## X2 49.00
## X5 15.72
## X3 0.00
set.seed(123)
mars_model_importance <- varImp(marsModel)$importance |>
as.data.frame() |>
rownames_to_column("Variable") |>
arrange(desc(Overall)) |>
mutate(importance = row_number())
set.seed(123)
varImp(marsModel) %>%
plot(., top = max(mars_model_importance$importance), main = "Informative Predictors of MARS Model")
MARS doesn’t select all of the informative predictors. All of the predictors registered an importance rating except for X3, which had a 0 importance rating. Despite this, however, the model still performed best among all the models used. A subsequent MARS model should be re-ran without the X3 predictor.
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
set.seed(123)
bag_model <- preProcess(ChemicalManufacturingProcess, "bagImpute")
bag_df_model <- predict(bag_model, ChemicalManufacturingProcess)
set.seed(123)
bag_df_model2 <- bag_df_model[,-nearZeroVar(bag_df_model)]
dim(bag_df_model2)
## [1] 176 57
set.seed(123)
train_data <- createDataPartition(bag_df_model2$Yield, times = 1, p = 0.8, list = FALSE)
train_x <- bag_df_model2[train_data, -1]
train_y <- bag_df_model2[train_data, 1]
test_x <- bag_df_model2[-train_data, -1]
test_y <- bag_df_model2[-train_data, 1]
K-Nearest Neighbors
set.seed(123)
ctrl <- trainControl(
method = "repeatedcv",
repeats = 3
)
set.seed(123)
knnModel2 <- train(
train_x, train_y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 20,
trControl = ctrl
)
knnModel2
## k-Nearest Neighbors
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 128, 129, 129, 130, 128, 131, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.250895 0.5647683 0.9928731
## 7 1.301198 0.5320462 1.0493202
## 9 1.300841 0.5335003 1.0506822
## 11 1.316066 0.5218269 1.0669085
## 13 1.328039 0.5106194 1.0790389
## 15 1.337448 0.5038661 1.0916004
## 17 1.351170 0.4974015 1.1001895
## 19 1.366037 0.4888427 1.1152138
## 21 1.382910 0.4798006 1.1257250
## 23 1.389013 0.4829600 1.1277972
## 25 1.399717 0.4761087 1.1360785
## 27 1.417799 0.4662447 1.1485230
## 29 1.430490 0.4565867 1.1586513
## 31 1.442781 0.4503745 1.1668414
## 33 1.454807 0.4426267 1.1778395
## 35 1.464186 0.4396721 1.1848487
## 37 1.475419 0.4315570 1.1929832
## 39 1.486659 0.4268052 1.2021331
## 41 1.497175 0.4235994 1.2093816
## 43 1.501424 0.4230930 1.2127603
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.
ggplot(knnModel2)
set.seed(669)
knn_pred2<- predict(knnModel2, newdata = test_x)
KNN_Model2 <- postResample(pred = knn_pred2, obs = test_y)
KNN_Model2
## RMSE Rsquared MAE
## 1.4414456 0.3896228 1.2278750
knn_var_imp <- varImp(knnModel2)
knn_var_imp
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 94.06
## BiologicalMaterial03 81.27
## ManufacturingProcess13 80.63
## ManufacturingProcess36 80.31
## BiologicalMaterial02 76.04
## ManufacturingProcess17 75.92
## ManufacturingProcess31 74.68
## ManufacturingProcess09 73.04
## BiologicalMaterial12 69.48
## ManufacturingProcess06 66.33
## BiologicalMaterial11 59.72
## ManufacturingProcess33 58.04
## ManufacturingProcess29 54.55
## BiologicalMaterial04 53.93
## BiologicalMaterial01 45.62
## BiologicalMaterial08 44.93
## ManufacturingProcess11 44.87
## BiologicalMaterial09 40.88
## ManufacturingProcess30 38.31
set.seed(123)
knn_model_importance <- varImp(knnModel2)$importance |>
as.data.frame() |>
rownames_to_column("Variable") |>
top_n(10) |>
arrange(desc(Overall)) |>
mutate(importance = row_number())
set.seed(123)
varImp(knnModel2) %>%
plot(., top = max(knn_model_importance$importance), main = "Top Ten Informative Predictors of KNN Model")
Neural Networks
tooHigh <- findCorrelation(cor(train_x), cutoff = .75)
train_x2 <- train_x[, -tooHigh]
test_x2 <- test_x[, -tooHigh]
nnetGrid <- expand.grid(.decay = c(0, 0.01, .1),
.size = c(1:10))
set.seed(669)
nnetModel2 <- train(train_x2, train_y,
method = "nnet",
tuneGrid = nnetGrid,
trControl = trainControl(method = "repeatedcv",
repeats = 5),
preProc = c("center", "scale"),
linout = TRUE,
trace = FALSE,
MaxNWts = 10 * (ncol(train_x2) + 1) + 10 + 1,
maxit = 500)
nnetModel2
## Neural Network
##
## 144 samples
## 35 predictor
##
## Pre-processing: centered (35), scaled (35)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 128, 128, 129, 129, 131, 131, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 1.674648 0.2277418 1.364539
## 0.00 2 1.773325 0.2444448 1.391076
## 0.00 3 3.795548 0.2214851 2.789047
## 0.00 4 3.893895 0.1321894 3.055801
## 0.00 5 3.503325 0.1666450 2.760050
## 0.00 6 3.886601 0.1440405 3.022761
## 0.00 7 4.347152 0.1715446 3.049963
## 0.00 8 5.943145 0.2045977 4.149726
## 0.00 9 6.528801 0.1799045 4.435690
## 0.00 10 8.319876 0.1268489 5.268021
## 0.01 1 1.523820 0.4189965 1.220261
## 0.01 2 1.954774 0.3397195 1.503600
## 0.01 3 3.173412 0.2402570 2.402173
## 0.01 4 3.126744 0.2065022 2.385414
## 0.01 5 2.834392 0.2042247 2.200724
## 0.01 6 2.763188 0.2072512 2.078035
## 0.01 7 2.593202 0.2505570 2.047790
## 0.01 8 2.524518 0.2654474 1.992773
## 0.01 9 2.696863 0.2679921 2.083592
## 0.01 10 3.286269 0.2209813 2.427671
## 0.10 1 1.367214 0.5323299 1.093011
## 0.10 2 2.176182 0.3255796 1.591378
## 0.10 3 2.855655 0.2148824 2.042474
## 0.10 4 2.639663 0.2578748 2.047180
## 0.10 5 2.622146 0.2172052 2.011185
## 0.10 6 2.704562 0.2513017 1.976432
## 0.10 7 2.433528 0.2681299 1.846671
## 0.10 8 2.434743 0.2543910 1.865328
## 0.10 9 2.414183 0.3014347 1.786016
## 0.10 10 2.373279 0.2829206 1.812822
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 0.1.
ggplot(nnetModel2)
set.seed(669)
nnetPred2 <- predict(nnetModel2, newdata = test_x2)
NNET_Model2 <- postResample(pred = nnetPred2, obs = test_y)
NNET_Model2
## RMSE Rsquared MAE
## 1.6413986 0.2609459 1.3563031
nnet_var_imp <- varImp(nnetModel2)
nnet_var_imp
## nnet variable importance
##
## only 20 most important variables shown (out of 35)
##
## Overall
## ManufacturingProcess20 100.000
## ManufacturingProcess17 36.613
## ManufacturingProcess43 25.503
## BiologicalMaterial03 22.405
## ManufacturingProcess02 21.518
## ManufacturingProcess21 17.391
## BiologicalMaterial05 16.074
## ManufacturingProcess30 15.363
## BiologicalMaterial11 14.875
## ManufacturingProcess27 13.461
## ManufacturingProcess04 13.202
## ManufacturingProcess19 12.477
## ManufacturingProcess37 12.453
## ManufacturingProcess36 9.875
## ManufacturingProcess06 8.904
## BiologicalMaterial09 8.812
## ManufacturingProcess44 8.675
## ManufacturingProcess34 8.410
## ManufacturingProcess11 8.369
## ManufacturingProcess28 7.470
set.seed(123)
nnet_model_importance <- varImp(nnetModel2)$importance |>
as.data.frame() |>
rownames_to_column("Variable") |>
top_n(10) |>
arrange(desc(Overall)) |>
mutate(importance = row_number())
set.seed(123)
varImp(nnetModel2) %>%
plot(., top = max(nnet_model_importance$importance), main = "Top Ten Informative Predictors of Neural Networks Model")
Multivariate Adaptive Regression Splines (MARS)
set.seed(669)
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:15)
marsModel2 <- train(train_x, train_y,
method = "earth",
tuneGrid = marsGrid,
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "repeatedcv",
repeats = 5))
marsModel2
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 128, 128, 129, 129, 131, 131, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.409018 0.4576942 1.1169488
## 1 3 1.237232 0.5683528 0.9933988
## 1 4 1.690011 0.5445486 1.1410357
## 1 5 1.627422 0.5450424 1.1205388
## 1 6 1.695787 0.5345547 1.1512436
## 1 7 1.716881 0.5339539 1.1617209
## 1 8 1.705811 0.5454112 1.1551347
## 1 9 1.768709 0.5367055 1.1737097
## 1 10 1.744318 0.5264892 1.1732639
## 1 11 1.735958 0.5285960 1.1643584
## 1 12 1.751344 0.5193017 1.1736825
## 1 13 1.774201 0.5091363 1.1863299
## 1 14 1.780298 0.5089429 1.1874428
## 1 15 1.782731 0.5098715 1.1868117
## 2 2 1.403400 0.4512591 1.1179170
## 2 3 1.286627 0.5289070 1.0375263
## 2 4 1.255857 0.5451410 0.9986199
## 2 5 1.254515 0.5542336 0.9972450
## 2 6 1.411216 0.5371858 1.0558050
## 2 7 1.451516 0.5388871 1.0763556
## 2 8 1.418642 0.5629594 1.0598353
## 2 9 1.430846 0.5618721 1.0508746
## 2 10 1.420267 0.5702010 1.0299538
## 2 11 1.437566 0.5585835 1.0479050
## 2 12 1.557576 0.5304418 1.1072788
## 2 13 1.555677 0.5161191 1.1026097
## 2 14 1.575808 0.5169944 1.1095229
## 2 15 1.660752 0.5028044 1.1408227
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 3 and degree = 1.
ggplot(marsModel2)
set.seed(669)
marsPred2 <- predict(marsModel2, newdata = test_x)
MARS_Model2 <- postResample(pred = marsPred2, obs = test_y)
MARS_Model2
## RMSE Rsquared MAE
## 1.2280303 0.5472378 0.9821450
mars_var_imp2 <- varImp(marsModel2)
mars_var_imp2
## earth variable importance
##
## Overall
## ManufacturingProcess32 100
## ManufacturingProcess09 0
set.seed(123)
mars_model_importance <- varImp(marsModel2)$importance |>
as.data.frame() |>
rownames_to_column("Variable") |>
#top_n(10) |>
arrange(desc(Overall)) |>
mutate(importance = row_number())
set.seed(123)
varImp(nnetModel2) %>%
plot(., top = max(mars_model_importance$importance), main = "Informative Predictors of MARS Model")
Support Vector Machines (SVM)
set.seed(669)
svmModel2 <- train(train_x, train_y,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "repeatedcv",
repeats = 5))
svmModel2
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 128, 128, 129, 129, 131, 131, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.382048 0.5213295 1.1142298
## 0.50 1.273575 0.5700082 1.0321389
## 1.00 1.181972 0.6230363 0.9484286
## 2.00 1.145390 0.6432492 0.9167154
## 4.00 1.124105 0.6547611 0.8985218
## 8.00 1.115948 0.6600861 0.8953382
## 16.00 1.115948 0.6600861 0.8953382
## 32.00 1.115948 0.6600861 0.8953382
## 64.00 1.115948 0.6600861 0.8953382
## 128.00 1.115948 0.6600861 0.8953382
##
## Tuning parameter 'sigma' was held constant at a value of 0.01844985
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01844985 and C = 8.
ggplot(svmModel2)
set.seed(669)
svmPred <- predict(svmModel2, newdata = test_x)
SVM_Model2 <- postResample(pred = svmPred, obs = test_y)
SVM_Model2
## RMSE Rsquared MAE
## 1.2381170 0.5494293 1.0461235
svm_var_imp <- varImp(svmModel2)
svm_var_imp
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 94.06
## BiologicalMaterial03 81.27
## ManufacturingProcess13 80.63
## ManufacturingProcess36 80.31
## BiologicalMaterial02 76.04
## ManufacturingProcess17 75.92
## ManufacturingProcess31 74.68
## ManufacturingProcess09 73.04
## BiologicalMaterial12 69.48
## ManufacturingProcess06 66.33
## BiologicalMaterial11 59.72
## ManufacturingProcess33 58.04
## ManufacturingProcess29 54.55
## BiologicalMaterial04 53.93
## BiologicalMaterial01 45.62
## BiologicalMaterial08 44.93
## ManufacturingProcess11 44.87
## BiologicalMaterial09 40.88
## ManufacturingProcess30 38.31
set.seed(123)
svm_model_importance <- varImp(svmModel2)$importance |>
as.data.frame() |>
rownames_to_column("Variable") |>
top_n(10) |>
arrange(desc(Overall)) |>
mutate(importance = row_number())
set.seed(123)
varImp(svmModel2) %>%
plot(., top = max(svm_model_importance$importance), main = "Top Ten Informative Predictors of SVM Model")
models_summary2 <- rbind(KNN_Model2, SVM_Model2, MARS_Model2, NNET_Model2)
kbl(models_summary2) |>
kable_styling()
| RMSE | Rsquared | MAE | |
|---|---|---|---|
| KNN_Model2 | 1.441446 | 0.3896228 | 1.227875 |
| SVM_Model2 | 1.238117 | 0.5494293 | 1.046123 |
| MARS_Model2 | 1.228030 | 0.5472378 | 0.982145 |
| NNET_Model2 | 1.641399 | 0.2609459 | 1.356303 |
The Support Vector Machine model performed the best, having the highest R-squared value of 0.549423, slightly better than the MARS Model, which has an R-squared value of 0.5472378. The MARS model had the lowest RMSE and MAE values, slightly less than the SVM model.
varImp(svmModel2)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 94.06
## BiologicalMaterial03 81.27
## ManufacturingProcess13 80.63
## ManufacturingProcess36 80.31
## BiologicalMaterial02 76.04
## ManufacturingProcess17 75.92
## ManufacturingProcess31 74.68
## ManufacturingProcess09 73.04
## BiologicalMaterial12 69.48
## ManufacturingProcess06 66.33
## BiologicalMaterial11 59.72
## ManufacturingProcess33 58.04
## ManufacturingProcess29 54.55
## BiologicalMaterial04 53.93
## BiologicalMaterial01 45.62
## BiologicalMaterial08 44.93
## ManufacturingProcess11 44.87
## BiologicalMaterial09 40.88
## ManufacturingProcess30 38.31
set.seed(123)
svm_model_importance <- varImp(svmModel2)$importance |>
as.data.frame() |>
rownames_to_column("Variable") |>
top_n(10) |>
arrange(desc(Overall)) |>
mutate(importance = row_number())
set.seed(123)
varImp(svmModel2) %>%
plot(., top = max(svm_model_importance$importance), main = "Top Ten Informative Predictors of SVM Model")
set.seed(123)
svm_model_importance |>
mutate(Variable = str_replace_all(Variable, "[0-9]+", "")) |>
group_by(Variable) |>
count() |>
arrange(desc(n)) |>
kbl() |>
kable_styling(latex_options="scale_down", c("striped", "hover", "condensed", full_width=F))
| Variable | n |
|---|---|
| ManufacturingProcess | 6 |
| BiologicalMaterial | 4 |
Among the top ten predictors, there were more process variables than biological variables.
Linear Model
set.seed(669)
larsModel <- train(train_x, train_y,
method = "lars",
metric = "Rsquared",
tuneLength = 20,
trControl = trainControl(method = "repeatedcv",
repeats = 5),
preProc = c("center", "scale"))
larsModel
## Least Angle Regression
##
## 144 samples
## 56 predictor
##
## Pre-processing: centered (56), scaled (56)
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 128, 128, 129, 129, 131, 131, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.05 1.234299 0.6064379 1.0013866
## 0.10 1.241295 0.6195410 0.9530008
## 0.15 1.438065 0.5760713 1.0316814
## 0.20 1.777137 0.5472013 1.1548211
## 0.25 2.028345 0.5222114 1.2407399
## 0.30 2.238594 0.4903087 1.3121701
## 0.35 2.302114 0.4875519 1.3341280
## 0.40 2.364650 0.4913298 1.3551078
## 0.45 2.484505 0.4843489 1.3968820
## 0.50 2.689362 0.4745008 1.4581656
## 0.55 2.923208 0.4637673 1.5363775
## 0.60 3.187954 0.4593655 1.6272443
## 0.65 3.493454 0.4552685 1.7214667
## 0.70 3.903853 0.4507750 1.8398080
## 0.75 4.306866 0.4455242 1.9590340
## 0.80 4.681995 0.4367231 2.0695571
## 0.85 5.071754 0.4286621 2.1856062
## 0.90 5.464563 0.4209429 2.3022083
## 0.95 5.925157 0.4146680 2.4349859
## 1.00 6.393133 0.4086651 2.5688622
##
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was fraction = 0.1.
set.seed(669)
larsPred <- predict(larsModel, newdata = test_x)
LARS_Model <- postResample(pred = larsPred, obs = test_y)
LARS_Model
## RMSE Rsquared MAE
## 1.2124888 0.5708666 1.0185659
varImp(larsModel)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 94.06
## BiologicalMaterial03 81.27
## ManufacturingProcess13 80.63
## ManufacturingProcess36 80.31
## BiologicalMaterial02 76.04
## ManufacturingProcess17 75.92
## ManufacturingProcess31 74.68
## ManufacturingProcess09 73.04
## BiologicalMaterial12 69.48
## ManufacturingProcess06 66.33
## BiologicalMaterial11 59.72
## ManufacturingProcess33 58.04
## ManufacturingProcess29 54.55
## BiologicalMaterial04 53.93
## BiologicalMaterial01 45.62
## BiologicalMaterial08 44.93
## ManufacturingProcess11 44.87
## BiologicalMaterial09 40.88
## ManufacturingProcess30 38.31
set.seed(123)
lars_model_importance <- varImp(larsModel)$importance |>
as.data.frame() |>
rownames_to_column("Variable") |>
top_n(10) |>
arrange(desc(Overall)) |>
mutate(importance = row_number())
set.seed(123)
varImp(larsModel) %>%
plot(., top = max(lars_model_importance$importance), main = "Top Ten Informative Predictors of LARS Model")
set.seed(123)
varImp(svmModel2) %>%
plot(., top = max(svm_model_importance$importance), main = "Top Ten Informative Predictors of SVM Model")
set.seed(123)
varImp(larsModel) %>%
plot(., top = max(lars_model_importance$importance), main = "Top Ten Informative Predictors of LARS Model")
The top ten predictors are the same in each model.
set.seed(123)
top_10_pred <- varImp(svmModel2)$importance |>
arrange(desc(Overall)) |>
head(10)
bag_df_model2 |>
select(c("Yield", row.names(top_10_pred))) |>
cor() |>
corrplot(method="square", tl.cex = 0.9, number.cex = 0.6, addCoef.col="black")
train_x %>%
select(row.names(top_10_pred)) %>%
featurePlot(., train_y)
ManufacturingProcess32 variable had the highest correlation with
Yield at 0.61, while ManufacturingProcess36
had the lowest correlation with Yield at -0.53.
ManufacturingProcess09 was the only other process variable
that had a positive correlation with Yield at 0.50, with
the four other process variables in the overall top ten important
predictor variables having a negative correlation with
Yield. Each of the four biological variables had positive
correlations with Yield, with
BiologicalMaterial06 and BiologicalMaterial02
having the highest correlation values among biological variables at
0.48. Overall, while there were more process variables deemed important
than biological variables, the correlation between Yield
and process variable predictors varied, while the relationship between
Yield and each of the biological variables were positive,
with correlations between 0.37 and 0.50.