library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
set.seed(284781)
training <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
training$x <- data.frame(training$x)
## Look at the data using
featurePlot(training$x, training$y)
test <- mlbench.friedman1(5000, sd = 1)
test$x <- data.frame(test$x)
knn_model <- train(x = training$x,
y = training$y,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knn_model
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.837233 0.4460483 3.082491
## 7 3.795854 0.4634447 3.070342
## 9 3.748734 0.4853179 3.058327
## 11 3.749681 0.4936896 3.072591
## 13 3.755816 0.5003674 3.077159
## 15 3.744031 0.5179209 3.061309
## 17 3.739528 0.5283752 3.054106
## 19 3.762090 0.5278787 3.077658
## 21 3.754473 0.5422230 3.060255
## 23 3.761631 0.5475558 3.062064
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knn_pred <- predict(knn_model, newdata = test$x)
postResample(pred = knn_pred, obs = test$y)
## RMSE Rsquared MAE
## 3.2180029 0.6902624 2.5830817
The KNN RMS and R-squared value can be a sort of baseline versus the other, more complex model types.
mars_grid <- expand.grid(.degree = 1:2, .nprune = 2:15)
mars_model <- train(x = training$x,
y = training$y,
method = "earth",
tuneGrid = mars_grid,
preProcess = c("center", "scale"),
tuneLength = 10)
mars_model
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.420760 0.2659490 3.734609
## 1 3 3.866596 0.4362895 3.117283
## 1 4 3.332638 0.5795485 2.659700
## 1 5 2.814336 0.7018928 2.207811
## 1 6 2.639572 0.7402360 2.053217
## 1 7 2.363674 0.7881426 1.831327
## 1 8 2.079630 0.8367420 1.616083
## 1 9 1.956392 0.8561370 1.534864
## 1 10 1.893344 0.8652014 1.497770
## 1 11 1.879812 0.8669273 1.486811
## 1 12 1.878365 0.8675629 1.487472
## 1 13 1.912799 0.8625434 1.514213
## 1 14 1.929153 0.8604977 1.525622
## 1 15 1.941622 0.8585217 1.530798
## 2 2 4.431376 0.2633254 3.758500
## 2 3 3.850895 0.4389288 3.113149
## 2 4 3.305752 0.5898894 2.649576
## 2 5 2.872114 0.6896624 2.264406
## 2 6 2.675087 0.7302338 2.080708
## 2 7 2.326861 0.7989202 1.833539
## 2 8 2.090856 0.8342304 1.666143
## 2 9 1.785856 0.8787872 1.410562
## 2 10 1.588257 0.9049035 1.258295
## 2 11 1.508781 0.9153083 1.203985
## 2 12 1.448210 0.9213881 1.153985
## 2 13 1.402000 0.9265610 1.116501
## 2 14 1.397053 0.9267936 1.113223
## 2 15 1.406628 0.9267778 1.116179
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
mars_predictions <- predict(mars_model, newdata = test$x)
postResample(pred = mars_predictions, obs = test$y)
## RMSE Rsquared MAE
## 1.3256520 0.9286603 1.0196659
MARS has a much better RMSE than KNN. R-squared is comparable.
varImp(mars_model)
## earth variable importance
##
## Overall
## X4 100.00
## X1 82.21
## X2 67.65
## X5 55.06
## X3 43.03
## X8 0.00
The MARS model has, in fact, selected the informative predictors.
svm_model <- train(x = training$x,
y = training$y,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "cv"))
svm_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 3.082187 0.6983577 2.446508
## 0.50 2.765976 0.7373152 2.158110
## 1.00 2.565931 0.7673541 1.987885
## 2.00 2.339705 0.8060278 1.836148
## 4.00 2.239696 0.8219310 1.742172
## 8.00 2.209119 0.8288286 1.752601
## 16.00 2.242573 0.8242354 1.785742
## 32.00 2.245518 0.8238255 1.787816
## 64.00 2.245518 0.8238255 1.787816
## 128.00 2.245518 0.8238255 1.787816
##
## Tuning parameter 'sigma' was held constant at a value of 0.06452199
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06452199 and C = 8.
svm_predictions <- predict(svm_model, newdata = test$x)
postResample(pred = svm_predictions, obs = test$y)
## RMSE Rsquared MAE
## 2.0925447 0.8239415 1.6451044
The SVM model has a worse RMSE than the MARS and a worse R-squared than both. So far, MARS seems to be the top performing model.
nnet_grid <- expand.grid(.decay=c(0, 0.01, 0.1, 0.5, 0.9),
.size=c(1, 10, 15, 20),
.bag=FALSE)
nnet_model <- train(x = training$x,
y = training$y,
method = "avNNet",
tuneGrid = nnet_grid,
preProc = c("center", "scale"),
trace=FALSE,
linout=TRUE,
maxit=500)
## Warning: executing %dopar% sequentially: no parallel backend registered
nnet_model
## Model Averaged Neural Network
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 3.213027 0.6009585 2.503680
## 0.00 10 3.952337 0.5279348 2.877521
## 0.00 15 3.179907 0.6193990 2.503094
## 0.00 20 2.995475 0.6537507 2.342278
## 0.01 1 3.193652 0.6085512 2.468448
## 0.01 10 3.184978 0.6217268 2.507903
## 0.01 15 2.898531 0.6759028 2.292387
## 0.01 20 2.594368 0.7349180 2.028543
## 0.10 1 3.145068 0.6183100 2.406646
## 0.10 10 3.062719 0.6416988 2.396324
## 0.10 15 2.666857 0.7195360 2.095962
## 0.10 20 2.464999 0.7594462 1.902376
## 0.50 1 3.157788 0.6150242 2.422322
## 0.50 10 2.744122 0.7056143 2.134140
## 0.50 15 2.449158 0.7629989 1.910419
## 0.50 20 2.424859 0.7675369 1.881082
## 0.90 1 3.172773 0.6109898 2.443381
## 0.90 10 2.630401 0.7276318 2.043081
## 0.90 15 2.435435 0.7655483 1.894194
## 0.90 20 2.387742 0.7744201 1.848050
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 20, decay = 0.9 and bag = FALSE.
nnet_pred <- predict(nnet_model, newdata = test$x)
postResample(pred = nnet_pred, obs = test$y)
## RMSE Rsquared MAE
## 1.7659827 0.8737405 1.3968758
The neural network had the second best RMSE but it looks like MARS performed the best, based on RMSE and R-squared, for this particular data sample.
We will load the same data as exercise 6.3 and pre-process it the same way.
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(AppliedPredictiveModeling)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data("ChemicalManufacturingProcess")
md.pattern(ChemicalManufacturingProcess)
## Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 152 1 1 1 1
## 6 1 1 1 1
## 1 1 1 1 1
## 7 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess09 ManufacturingProcess13 ManufacturingProcess15
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess32 ManufacturingProcess37 ManufacturingProcess38
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess39 ManufacturingProcess42 ManufacturingProcess43
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess44 ManufacturingProcess45 ManufacturingProcess01
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 0
## 0 0 1
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess07
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 0 0
## 1 1 1
## ManufacturingProcess08 ManufacturingProcess12 ManufacturingProcess14
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 0
## 1 0 0 1
## 1 1 1
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 0 0
## 1 1 1
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess06
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 0
## 1 1 1 1
## 1 0 0 0
## 1 1 2
## ManufacturingProcess02 ManufacturingProcess25 ManufacturingProcess26
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 0 0
## 2 0 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 1 1
## 3 5 5
## ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess33
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess03
## 152 1 1 1 0
## 6 1 1 0 1
## 1 1 0 1 1
## 7 0 0 0 3
## 5 1 1 1 11
## 2 1 1 1 1
## 1 1 1 1 1
## 1 0 0 0 4
## 1 0 0 0 16
## 9 10 15 106
cmp_df <- kNN(ChemicalManufacturingProcess,imp_var=FALSE)
zeroVar <- nearZeroVar(cmp_df)
cmp_df_final <- cmp_df[,-zeroVar]
part <- ChemicalManufacturingProcess$Yield %>%
createDataPartition(p=0.8,list=FALSE,times=1)
x_train <- cmp_df_final[part,]
x_test <- cmp_df_final[-part,]
y_train <- ChemicalManufacturingProcess$Yield[part]
y_test <- ChemicalManufacturingProcess$Yield[-part]
Now we will experiment with the same models as the previous question, starting with KNN.
knn_model <- train(x = x_train,
y = y_train,
method = "knn",
preProc = c("center", "scale"),
tuneLength = 10)
knn_model
## k-Nearest Neighbors
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.265975 0.5452400 0.9902709
## 7 1.265480 0.5547005 1.0085555
## 9 1.272697 0.5591807 1.0135258
## 11 1.272022 0.5704851 1.0187692
## 13 1.275673 0.5717302 1.0199876
## 15 1.284043 0.5708050 1.0268790
## 17 1.293110 0.5666238 1.0331613
## 19 1.306244 0.5615143 1.0432799
## 21 1.323831 0.5510852 1.0594225
## 23 1.334774 0.5485627 1.0683099
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 7.
knn_pred <- predict(knn_model, newdata = x_test)
postResample(pred = knn_pred, obs = y_test)
## RMSE Rsquared MAE
## 1.1013208 0.6555528 0.8938393
Again we have KNN as a baseline. R-squared looks like it could be improved significantly.
mars_grid <- expand.grid(.degree=1:2,
.nprune=2:10)
mars_model <- train(x = x_train,
y = y_train,
method = "earth",
tuneGrid = mars_grid,
preProc = c("center", "scale"))
mars_model
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.463033e-14 1 1.44855e-14
## 1 3 1.463033e-14 1 1.44855e-14
## 1 4 1.463033e-14 1 1.44855e-14
## 1 5 1.463033e-14 1 1.44855e-14
## 1 6 1.463033e-14 1 1.44855e-14
## 1 7 1.463033e-14 1 1.44855e-14
## 1 8 1.463033e-14 1 1.44855e-14
## 1 9 1.463033e-14 1 1.44855e-14
## 1 10 1.463033e-14 1 1.44855e-14
## 2 2 1.463033e-14 1 1.44855e-14
## 2 3 1.463033e-14 1 1.44855e-14
## 2 4 1.463033e-14 1 1.44855e-14
## 2 5 1.463033e-14 1 1.44855e-14
## 2 6 1.463033e-14 1 1.44855e-14
## 2 7 1.463033e-14 1 1.44855e-14
## 2 8 1.463033e-14 1 1.44855e-14
## 2 9 1.463033e-14 1 1.44855e-14
## 2 10 1.463033e-14 1 1.44855e-14
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 2 and degree = 1.
mars_predictions <- predict(mars_model, newdata = x_test)
postResample(pred = mars_predictions, obs = y_test)
## RMSE Rsquared MAE
## 1.432145e-14 1.000000e+00 1.421085e-14
MARS appears not to be the methodology for these data – worse performance than KNN by RMSE and a strange 100% R-squared.
svm_model <- train(x = x_train,
y = y_train,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "cv"))
svm_model
## Support Vector Machines with Radial Basis Function Kernel
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 131, 132, 129, 129, 130, 128, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.1697520 0.7125480 0.9361767
## 0.50 0.9465636 0.8071920 0.7394489
## 1.00 0.7653627 0.8640640 0.5905697
## 2.00 0.6868656 0.8852735 0.5231221
## 4.00 0.6776011 0.8876721 0.5181494
## 8.00 0.6776011 0.8876721 0.5181494
## 16.00 0.6776011 0.8876721 0.5181494
## 32.00 0.6776011 0.8876721 0.5181494
## 64.00 0.6776011 0.8876721 0.5181494
## 128.00 0.6776011 0.8876721 0.5181494
##
## Tuning parameter 'sigma' was held constant at a value of 0.0147849
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0147849 and C = 4.
svm_predictions <- predict(svm_model, newdata = x_test)
postResample(pred = svm_predictions, obs = y_test)
## RMSE Rsquared MAE
## 0.5392078 0.9264314 0.4392968
SVM has the lowest RMSE so far and a high R-squared – approximately 97%.
nnet_grid <- expand.grid(.decay=c(0, 0.01, 0.1),
.size=c(1, 5, 10),
.bag=FALSE)
nnet_model <- train(x = x_train,
y = y_train,
method = "avNNet",
tuneGrid = nnet_grid,
preProc = c("center", "scale"),
trace=FALSE,
linout=TRUE,
maxit=500)
nnet_model
## Model Averaged Neural Network
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 1.5034445 0.4121909 1.2083307
## 0.00 5 2.1846413 0.4065370 1.6802527
## 0.00 10 9.1835472 0.1217374 5.9462823
## 0.01 1 0.5351704 0.8971591 0.2181828
## 0.01 5 1.4501439 0.5532420 1.0262482
## 0.01 10 2.1020990 0.4737367 1.5980706
## 0.10 1 0.6731660 0.8690445 0.3851554
## 0.10 5 1.7664417 0.5057023 0.9525642
## 0.10 10 1.3177992 0.5934042 0.9650665
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1, decay = 0.01 and bag = FALSE.
nnet_pred <- predict(nnet_model, newdata = x_test)
postResample(pred = nnet_pred, obs = y_test)
## RMSE Rsquared MAE
## 1.0578760 0.7809455 0.2817886
Neural network has a decently good RMSE but not as good as SVM. R-squared is one of the worst.
SVM will be the selected model for these data.
important_vars <- varImp(svm_model)
important_vars
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## Yield 100.00
## ManufacturingProcess13 37.43
## ManufacturingProcess32 34.40
## ManufacturingProcess17 31.60
## BiologicalMaterial06 29.07
## ManufacturingProcess09 28.43
## BiologicalMaterial12 28.20
## ManufacturingProcess36 27.51
## ManufacturingProcess31 25.39
## BiologicalMaterial03 24.71
## ManufacturingProcess06 24.58
## BiologicalMaterial02 22.74
## ManufacturingProcess11 21.10
## BiologicalMaterial11 18.96
## ManufacturingProcess33 18.58
## BiologicalMaterial08 16.84
## BiologicalMaterial04 15.86
## ManufacturingProcess30 15.20
## ManufacturingProcess12 15.16
## BiologicalMaterial09 14.32
Manufacturing process variables appear to be the most important, with 13, 32, and 17 having the highest values. This was the same in the previous homework.
ggplot(cmp_df_final, aes(ManufacturingProcess13, Yield)) +
geom_point()
ggplot(cmp_df_final, aes(ManufacturingProcess32, Yield)) +
geom_point()
ggplot(cmp_df_final, aes(ManufacturingProcess17, Yield)) +
geom_point()
Manufacturing Processes 13 and 17 appear to have an inverse relationship with yield, while MP32 has a direct positive relationship.