library(caret)
library(tidyverse)
library(mlbench)
library(AppliedPredictiveModeling)
library(kernlab)
library(mice)
library(kableExtra)
library(gridExtra)
set.seed(7)
# Helper: show the ten best tuning results, sorted by cross-validated RMSE
show_results <- function(model){
  rslts <- model$results %>%
    arrange(RMSE)
  head(rslts, 10) %>%
    kable() %>%
    kable_styling(bootstrap_options = "striped", full_width = FALSE)
}
set.seed(7)
train <- mlbench.friedman1(300)
train_x <- data.frame(train$x)
train_y <- train$y
test <- mlbench.friedman1(5000)
test_x <- data.frame(test$x)
test_y <- test$y
featurePlot(train_x, train_y)
Even though there is a considerable amount of noise in Y, the nonlinearity in the informative predictors is apparent: X1 and X2 enter through a sine term and X3 through a quadratic term, while X4 and X5 enter only linearly.
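For reference, mlbench.friedman1 simulates ten uniform predictors of which only the first five drive the response: y = 10*sin(pi*x1*x2) + 20*(x3 - 0.5)^2 + 10*x4 + 5*x5 + N(0, 1). As a quick sanity check (not part of the original analysis), the noiseless signal reconstructed from the training predictors should correlate strongly with y:
# Deterministic part of the Friedman 1 benchmark, computed from the training predictors
signal <- with(train_x, 10 * sin(pi * X1 * X2) + 20 * (X3 - 0.5)^2 + 10 * X4 + 5 * X5)
cor(signal, train_y)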
mod_knn <- train(x = train_x, y = train_y,
method = 'knn',
preProcess = c('center', 'scale'),
tuneLength = 10,
trControl = trainControl(method = 'repeatedcv', number = 10, repeats = 5)
)
show_results(mod_knn)
| k | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|
| 13 | 3.001097 | 0.6795750 | 2.410964 | 0.4300529 | 0.0889390 | 0.3158929 |
| 7 | 3.009670 | 0.6460425 | 2.412720 | 0.4152269 | 0.0929110 | 0.3226745 |
| 15 | 3.013180 | 0.6889185 | 2.439678 | 0.4164372 | 0.0901538 | 0.3087055 |
| 11 | 3.023346 | 0.6646248 | 2.423129 | 0.4287180 | 0.0932933 | 0.3203807 |
| 17 | 3.031139 | 0.6952162 | 2.467955 | 0.3911978 | 0.0888003 | 0.2981635 |
| 5 | 3.033048 | 0.6225208 | 2.434137 | 0.4091910 | 0.0998405 | 0.3306086 |
| 9 | 3.033165 | 0.6529799 | 2.437921 | 0.4129775 | 0.0986194 | 0.3145123 |
| 19 | 3.039146 | 0.7027672 | 2.465933 | 0.3922361 | 0.0831007 | 0.2972282 |
| 21 | 3.066993 | 0.7017279 | 2.470518 | 0.3875756 | 0.0861499 | 0.2944325 |
| 23 | 3.077325 | 0.7084360 | 2.482019 | 0.3827649 | 0.0825224 | 0.2979787 |
With so few predictors, KNN runs very quickly, but its predictions are not as strong as those of the other models. KNN doesn't capture the functional form of the relationship at all, so estimating a well-defined function like this one is not its best use case.
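The full resampling profile (a small addition, not in the original output) makes the flat RMSE-versus-k curve easy to see:
# Cross-validated RMSE as a function of the number of neighbors
plot(mod_knn)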
SVM RBF kernel
param_grid <- expand.grid(.C = c(.5,1,2,4,8,16,32,64,128), .sigma = c(.005,.01,.5, .1))
mod_svm_r <- train(x = train_x, y = train_y,
method = 'svmRadial',
preProcess = c('center', 'scale'),
tuneGrid = param_grid,
trControl = trainControl(method = 'cv', number = 10)
)
show_results(mod_svm_r)
| C | sigma | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|---|
| 128 | 0.010 | 1.708332 | 0.8777565 | 1.371462 | 0.1950010 | 0.0276485 | 0.1583031 |
| 64 | 0.010 | 1.725761 | 0.8742143 | 1.383713 | 0.1888674 | 0.0273023 | 0.1600504 |
| 128 | 0.005 | 1.766406 | 0.8681882 | 1.407739 | 0.2107299 | 0.0282787 | 0.1454472 |
| 32 | 0.010 | 1.794172 | 0.8638531 | 1.433778 | 0.2066925 | 0.0291249 | 0.1445553 |
| 64 | 0.005 | 1.893831 | 0.8504169 | 1.497996 | 0.2257277 | 0.0311215 | 0.1438626 |
| 16 | 0.010 | 1.926905 | 0.8453897 | 1.524838 | 0.2170545 | 0.0297165 | 0.1395086 |
| 32 | 0.005 | 2.086270 | 0.8201298 | 1.645569 | 0.2695046 | 0.0361150 | 0.1971588 |
| 2 | 0.100 | 2.087119 | 0.8187178 | 1.628495 | 0.2738806 | 0.0449340 | 0.2019053 |
| 8 | 0.010 | 2.097632 | 0.8179384 | 1.654107 | 0.2815435 | 0.0376155 | 0.2057749 |
| 4 | 0.100 | 2.123468 | 0.8120636 | 1.661160 | 0.2786155 | 0.0416932 | 0.2012360 |
SVM with an RBF kernel predicts very well and is fairly quick to run.
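If we only need the winning combination rather than the whole grid, caret stores it in the fitted object (a small addition, not in the original output):
# Best (C, sigma) pair and its cross-validated performance
mod_svm_r$bestTune
getTrainPerf(mod_svm_r)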
SVM Polynomial Kernel
We’ll search over powers of 10 for the cost parameter to cut computation time.
param_grid <- expand.grid(.C = c(.1,1,10), .scale = c(1), .degree = c(2))
mod_svm_p <- train(x = train_x, y = train_y,
method = 'svmPoly',
preProcess = c('center', 'scale'),
tuneGrid = param_grid,
trControl = trainControl(method = 'cv', number = 10)
)
show_results(mod_svm_p)
| C | scale | degree | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|---|---|
| 0.1 | 1 | 2 | 1.796139 | 0.8626748 | 1.419792 | 0.3310039 | 0.0514060 | 0.2850445 |
| 10.0 | 1 | 2 | 1.807317 | 0.8626788 | 1.440982 | 0.3301214 | 0.0492389 | 0.2745047 |
| 1.0 | 1 | 2 | 1.807627 | 0.8621314 | 1.438866 | 0.3363793 | 0.0506426 | 0.2800093 |
The polynomial kernel performs somewhat worse than the RBF. We somewhat expected the polynomial kernel to perform well, given that we are modeling a well-defined function; perhaps it wasn't able to capture the sine term.
MARS
param_grid <- expand.grid(.degree = c(1,2), .nprune = seq(16,36,4))
mod_mars <- train(x = train_x, y = train_y,
method = 'bagEarth',
preProcess = c('center', 'scale'),
tuneGrid = param_grid
)
show_results(mod_mars)
| degree | nprune | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|---|
| 2 | 20 | 1.231351 | 0.9353949 | 0.9826278 | 0.0702229 | 0.0122382 | 0.0560528 |
| 2 | 16 | 1.235147 | 0.9351820 | 0.9867141 | 0.0685772 | 0.0121106 | 0.0575861 |
| 2 | 28 | 1.237517 | 0.9350038 | 0.9877129 | 0.0693530 | 0.0119051 | 0.0570614 |
| 2 | 32 | 1.237861 | 0.9348210 | 0.9885445 | 0.0699554 | 0.0124532 | 0.0583643 |
| 2 | 36 | 1.239191 | 0.9346370 | 0.9881851 | 0.0761451 | 0.0127060 | 0.0619733 |
| 2 | 24 | 1.242644 | 0.9345067 | 0.9934011 | 0.0660992 | 0.0116837 | 0.0593212 |
| 1 | 16 | 1.693920 | 0.8798857 | 1.3358672 | 0.1290282 | 0.0231158 | 0.0922088 |
| 1 | 20 | 1.694202 | 0.8799025 | 1.3338994 | 0.1323576 | 0.0226894 | 0.0951689 |
| 1 | 32 | 1.695048 | 0.8796467 | 1.3338995 | 0.1299482 | 0.0230678 | 0.0938339 |
| 1 | 28 | 1.695706 | 0.8798876 | 1.3329156 | 0.1298199 | 0.0224517 | 0.0906570 |
The bagged MARS model with second-degree interactions has an RMSE more than 50% lower than our baseline KNN. Hinge functions plus interactions apparently work very well for modeling the sinusoidal predictors.
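To see what those interactions and splits look like, one option (a sketch, not part of the original analysis) is to fit a single unbagged MARS model with the earth package, which bagEarth wraps, and read off the retained hinge terms:
library(earth)
# A single second-degree MARS fit; summary() lists the hinge functions and
# interaction terms kept after pruning
single_mars <- earth(x = train_x, y = train_y, degree = 2, nprune = 20)
summary(single_mars)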
Neural Network
param_grid <- expand.grid(.layer1 = c(16,32), .layer2 = c(16,32), .layer3 = c(16,32)) %>%
filter(.layer1 == .layer2)
mod_nn <- train(x = train_x, y = train_y,
method = 'neuralnet',
preProcess = c('center', 'scale'),
tuneGrid = param_grid
)
show_results(mod_nn)
| layer1 | layer2 | layer3 | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|---|---|
| 32 | 32 | 32 | 2.846767 | 0.6831832 | 2.270754 | 0.2207414 | 0.0596445 | 0.1828671 |
| 32 | 32 | 16 | 2.911645 | 0.6736284 | 2.326143 | 0.2677705 | 0.0558915 | 0.2268127 |
| 16 | 16 | 32 | 3.306401 | 0.6002617 | 2.629107 | 0.3408202 | 0.0799026 | 0.2574477 |
| 16 | 16 | 16 | 3.355533 | 0.5869976 | 2.665121 | 0.2901971 | 0.0627828 | 0.2626130 |
This fairly shallow neural network doesn't work as well as the other models.
SVM
predict(mod_svm_r, test_x) %>%
postResample(test_y)
## RMSE Rsquared MAE
## 1.8874516 0.8582859 1.4488003
KNN
predict(mod_knn, test_x) %>%
postResample(test_y)
## RMSE Rsquared MAE
## 3.1441029 0.6740406 2.5062258
MARS
predict(mod_mars, test_x) %>%
postResample(test_y)
## RMSE Rsquared MAE
## 1.1316009 0.9485732 0.9041606
Neural Net
predict(mod_nn, test_x) %>%
postResample(test_y)
## RMSE Rsquared MAE
## 2.4893510 0.7599552 1.9543257
Not much changes between CV and predicting on the test set. The MARS model still has far and away the best performance.
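A compact way to line the four test-set results up side by side (a small sketch, not in the original output):
# Collect test-set metrics for each fitted model into one matrix
models <- list(KNN = mod_knn, SVM = mod_svm_r, MARS = mod_mars, NeuralNet = mod_nn)
t(sapply(models, function(m) postResample(predict(m, test_x), test_y)))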
p1 <- varImp(mod_nn) %>%
plot(top = 10, main = 'Neural Network')
p2 <- varImp(mod_mars) %>%
plot(top = 10, main = 'MARS')
p3 <- varImp(mod_knn) %>%
plot(top = 10, main = 'KNN')
p4 <- varImp(mod_svm_r) %>%
plot(top = 10, main = 'SVM')
grid.arrange(p1,p2,p3,p4, nrow = 2, ncol = 2)
The MARS model correctly identifies the five predictors that contribute to the response and nearly eliminates the other five. The SVM does similarly well, though it assigns X10 a non-trivial importance.
For this section, we'll compare against the linear models fit to the same data, found here: http://rpubs.com/thefexexpress/484449. It should be noted that we won't be using repeated CV here because of computation time.
data(ChemicalManufacturingProcess)
zero_vars <- nearZeroVar(ChemicalManufacturingProcess)
ChemicalManufacturingProcess_new <- ChemicalManufacturingProcess[, -zero_vars]
# Drop ManufacturingProcess36 and impute the remaining missing values with
# random-forest-based imputation from mice
pred <- dplyr::select(ChemicalManufacturingProcess_new, -ManufacturingProcess36) %>%
mice(m = 3, method = 'rf', maxit = 2)
imputed <- complete(pred)
# Run a second imputation pass on the completed data
pred <- mice(data = imputed, m = 3, method = 'rf', maxit = 2)
chem_imputed <- complete(pred)
yield <- chem_imputed$Yield
all_x <- select(chem_imputed, -Yield)
samples <- createDataPartition(yield, p = .8, list = FALSE)
cm_train <- all_x[samples, ]
cm_test <- all_x[-samples, ]
yield_train <- yield[samples]
yield_test <- yield[-samples]
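A quick sanity check (not in the original output) that the imputation left no missing values and that the split is roughly 80/20:
sum(is.na(chem_imputed))       # should be 0 after imputation
dim(cm_train); dim(cm_test)    # training and test dimensions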
SVM
param_grid <- expand.grid(.C = c(.5,2,4,8,16,32,64,128), .sigma = c(.005,.01,.5, .1))
chem_svm <- train(x = cm_train, y = yield_train,
method = 'svmRadial',
preProcess = c('center', 'scale'),
tuneGrid = param_grid,
trControl = trainControl(method = 'cv', number = 10)
)
show_results(chem_svm)
| C | sigma | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|---|
| 32 | 0.010 | 1.067228 | 0.6847249 | 0.8536647 | 0.1427610 | 0.1474394 | 0.1235762 |
| 64 | 0.010 | 1.067228 | 0.6847249 | 0.8536647 | 0.1427610 | 0.1474394 | 0.1235762 |
| 128 | 0.010 | 1.067228 | 0.6847249 | 0.8536647 | 0.1427610 | 0.1474394 | 0.1235762 |
| 16 | 0.010 | 1.069430 | 0.6840327 | 0.8556142 | 0.1435646 | 0.1473355 | 0.1247638 |
| 8 | 0.010 | 1.076440 | 0.6789728 | 0.8499823 | 0.1403218 | 0.1404362 | 0.1395588 |
| 16 | 0.005 | 1.093476 | 0.6726388 | 0.8640497 | 0.1789805 | 0.1496923 | 0.1573426 |
| 4 | 0.010 | 1.097094 | 0.6661796 | 0.8678201 | 0.1750485 | 0.1207483 | 0.1574331 |
| 128 | 0.005 | 1.099990 | 0.6719026 | 0.8928322 | 0.1890672 | 0.1555262 | 0.1507244 |
| 64 | 0.005 | 1.101467 | 0.6712564 | 0.8941310 | 0.1897122 | 0.1557127 | 0.1516341 |
| 32 | 0.005 | 1.107501 | 0.6677389 | 0.8891615 | 0.1962192 | 0.1617484 | 0.1612946 |
Test Set Performance:
predict(chem_svm, cm_test) %>%
postResample(yield_test)
## RMSE Rsquared MAE
## 1.2622659 0.6241278 0.9762287
The SVM model shows better performance than any of the linear models and rivals that of gradient boosting. The test set performance is somewhat worse, but that could be the result of variance.
MARS
param_grid <- expand.grid(.degree = c(1,2), .nprune = seq(16,36,4))
chem_mars <- train(x = cm_train, y = yield_train,
method = 'bagEarth',
preProcess = c('center', 'scale'),
tuneGrid = param_grid,
trControl = trainControl(method = 'cv', number = 10)
)
show_results(chem_mars)
| degree | nprune | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|---|
| 1 | 32 | 1.215839 | 0.6062244 | 0.929198 | 0.2829779 | 0.1909598 | 0.1687226 |
| 1 | 24 | 1.335481 | 0.5731309 | 1.010341 | 0.4442277 | 0.2211819 | 0.2695660 |
| 2 | 24 | 1.351582 | 0.5699718 | 0.931936 | 0.5134587 | 0.2115701 | 0.2482426 |
| 1 | 36 | 1.429714 | 0.5778045 | 1.012777 | 0.8085275 | 0.2505505 | 0.4002633 |
| 2 | 16 | 1.504529 | 0.5850945 | 1.001581 | 1.0963222 | 0.2527159 | 0.4089059 |
| 1 | 16 | 1.545734 | 0.6047687 | 1.086161 | 1.3593115 | 0.2000048 | 0.6343778 |
| 1 | 28 | 1.550494 | 0.5861900 | 1.078940 | 1.2678535 | 0.2170202 | 0.6153022 |
| 1 | 20 | 1.900973 | 0.5907893 | 1.140004 | 2.4364172 | 0.2575956 | 0.8746617 |
| 2 | 28 | 2.040373 | 0.5916518 | 1.181305 | 2.7714510 | 0.2714860 | 1.0036015 |
| 2 | 32 | 2.918015 | 0.6116758 | 1.516599 | 5.6886492 | 0.2478328 | 2.1095668 |
Test Set Performance:
predict(chem_mars, cm_test) %>%
postResample(yield_test)
## RMSE Rsquared MAE
## 1.2395722 0.5271407 0.9936905
MARS has similar performance to the SVM and does slightly better on the test set. Because there are only 32 samples in the test set, I would put more stock in the CV results; the holdout set mostly tells us that the CV estimates aren't wildly off. Also, one of the tuning results shows a very high RMSE but a much lower MAE, indicating the occasional outlandish prediction we sometimes see with MARS.
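The fold-level spread behind those summary statistics can be inspected directly. By default caret keeps per-fold metrics only for the winning tune (returnResamp = 'final' in trainControl); keeping every candidate requires returnResamp = 'all'. A small sketch, not in the original output:
# Per-fold metrics for the selected degree/nprune combination, worst folds first
chem_mars$resample %>%
arrange(desc(RMSE))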
Neural Network
We use PCA in the preprocessing because neural networks are more sensitive to correlated features than the other models.
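As a quick illustration of that correlation (not in the original output), caret's findCorrelation flags predictors whose pairwise correlations exceed a cutoff:
# Count predictors that would be flagged for removal at |r| > 0.90
high_corr <- findCorrelation(cor(cm_train), cutoff = 0.90)
length(high_corr)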
param_grid <- expand.grid(.layer1 = c(32,64,128), .layer2 = c(32,64,128), .layer3 = c(32,64)) %>%
filter(.layer1 == .layer2)
chem_nn <- train(x = cm_train, y = yield_train,
method = 'neuralnet',
preProcess = c('center', 'scale', 'pca'),
tuneGrid = param_grid,
trControl = trainControl(method = 'cv', number = 10)
)
show_results(chem_nn)
| layer1 | layer2 | layer3 | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|---|---|
| 64 | 64 | 64 | 1.278812 | 0.5784451 | 1.037895 | 0.1936558 | 0.1218101 | 0.2420806 |
| 64 | 64 | 32 | 1.316725 | 0.5383254 | 1.070730 | 0.2436892 | 0.1658866 | 0.1943149 |
| 32 | 32 | 32 | 1.343587 | 0.5409205 | 1.078531 | 0.2738988 | 0.1519649 | 0.2302645 |
| 128 | 128 | 64 | 1.409200 | 0.5196136 | 1.186725 | 0.2969081 | 0.1581716 | 0.2616603 |
| 128 | 128 | 32 | 1.426549 | 0.4880566 | 1.180520 | 0.3282327 | 0.2064646 | 0.2766038 |
| 32 | 32 | 64 | 1.465996 | 0.5293076 | 1.202548 | 0.2903678 | 0.1794960 | 0.2465189 |
Test Set Performance:
predict(chem_nn, cm_test) %>%
postResample(yield_test)
## RMSE Rsquared MAE
## 1.6241411 0.4014445 1.1918325
The neural network doesn't perform as well as the other models. That could be a result of the smaller dataset and the large number of parameters a neural network needs to tune. It's also possible that a package offering more hyperparameter tuning options would improve performance.
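Within caret itself, one option along those lines (a hedged sketch, not something we ran here) is the avNNet model-averaging wrapper around nnet, which exposes weight decay as a tuning parameter:
nn_grid <- expand.grid(.size = c(5, 10), .decay = c(0, 0.01, 0.1), .bag = FALSE)
chem_avnn <- train(x = cm_train, y = yield_train,
method = 'avNNet',
preProcess = c('center', 'scale', 'pca'),
tuneGrid = nn_grid,
linout = TRUE, trace = FALSE, maxit = 500,
trControl = trainControl(method = 'cv', number = 10)
)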
KNN
chem_knn <- train(x = cm_train, y = yield_train,
method = 'knn',
preProcess = c('center', 'scale'),
tuneLength = 10,
trControl = trainControl(method = 'cv', number = 10)
)
show_results(chem_knn)
| k | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|
| 9 | 1.327509 | 0.5042799 | 1.100254 | 0.3026464 | 0.1993257 | 0.2279866 |
| 11 | 1.337388 | 0.5046912 | 1.105989 | 0.3296195 | 0.1927967 | 0.2451748 |
| 7 | 1.340938 | 0.4746893 | 1.098453 | 0.3169420 | 0.2255832 | 0.2568329 |
| 15 | 1.343439 | 0.5221975 | 1.106846 | 0.3295209 | 0.1610972 | 0.2397657 |
| 13 | 1.346927 | 0.5089752 | 1.112831 | 0.3266240 | 0.1799696 | 0.2366578 |
| 5 | 1.353789 | 0.4772133 | 1.119660 | 0.2802934 | 0.2287437 | 0.2061147 |
| 17 | 1.373406 | 0.4962092 | 1.130801 | 0.3403087 | 0.1703863 | 0.2458484 |
| 19 | 1.399537 | 0.4814767 | 1.152627 | 0.3208540 | 0.1851976 | 0.2293186 |
| 21 | 1.409293 | 0.4875075 | 1.168884 | 0.3125128 | 0.1786570 | 0.2182196 |
| 23 | 1.430462 | 0.4797166 | 1.182530 | 0.3094426 | 0.1916269 | 0.2144417 |
Test Set Performance:
predict(chem_knn, cm_test) %>%
postResample(yield_test)
## RMSE Rsquared MAE
## 1.2322312 0.5527103 0.9899965
The KNN model performs better on the test set, giving more credence to the variance theory. If every model had performed worse on the test set, we would be less comfortable chalking the SVM's test performance up to variance.
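For a resampling-level comparison of the four chemical-yield models, caret's resamples() collects the fold results in one object (a sketch; for a strictly fair comparison the models should share fold indices, e.g. by passing the same index list to trainControl):
chem_resamps <- resamples(list(SVM = chem_svm, MARS = chem_mars, NeuralNet = chem_nn, KNN = chem_knn))
summary(chem_resamps)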
p1 <- varImp(chem_nn) %>%
plot(top = 10, main = 'Neural Network')
p2 <- varImp(chem_svm) %>%
plot(top = 10, main = 'SVM')
p3 <- varImp(chem_mars) %>%
plot(top = 10, main = 'MARS')
p4 <- varImp(chem_knn) %>%
plot(top = 10, main = 'KNN')
grid.arrange(p1,p2,p3,p4, nrow = 2, ncol = 2)
VS Each Other
VS Linear Models
We focus on the variables in the nonlinear model's top 10 that were not in the linear model's top 10, along with the nonlinear model's top feature overall. This should show us the nonlinear relationships that the SVM was able to detect and that our linear model could not.
imp <- varImp(chem_svm)
importants <- imp$importance %>%
rownames_to_column() %>%
filter(rowname %in% c('ManufacturingProcess32', 'ManufacturingProcess31', 'BiologicalMaterial11'))
only_importants <- chem_imputed[,importants$rowname]
only_importants$y <- yield
only_importants <- filter(only_importants, ManufacturingProcess31 != 0)
library(gridExtra)
# Scatter plot of a single predictor against yield, with a smoothed trend line
plot_it <- function(col){
ggplot(only_importants) +
geom_point(aes_string(x = col, y = 'y'), color = 'black') +
geom_smooth(aes_string(x = col, y = 'y'))
}
# Plot every column except the response y (the last column)
plots <- lapply(colnames(only_importants)[1:(length(only_importants) - 1)], plot_it)
grid.arrange(grobs = plots, ncol = 2, nrow = 2)
Observations
Intuition
In both of these plots the response seems to reach a maximum or minimum and then level off. Using domain knowledge, a modeler could probably find an intuitive explanation for this and explain it to business stakeholders.