library(fpp3)
library(caret)
library(mlbench)
library(ggplot2)
library(ggcorrplot)
Simulate the Friedman 1 benchmark data used for training and testing:
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)
featurePlot(trainingData$x, trainingData$y)
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
knnModel <- train(x = trainingData$x,
                  y = trainingData$y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(200)
marsTuned <- train(trainingData$x, trainingData$y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
marsTuned
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.188280 0.3042527 3.460689
## 1 3 3.551182 0.4999832 2.837116
## 1 4 2.653143 0.7167280 2.128222
## 1 5 2.405769 0.7562160 1.948161
## 1 6 2.295006 0.7754603 1.853199
## 1 7 1.771950 0.8611767 1.391357
## 1 8 1.647182 0.8774867 1.299564
## 1 9 1.609816 0.8837307 1.299705
## 1 10 1.635035 0.8798236 1.309436
## 1 11 1.571915 0.8896147 1.260711
## 1 12 1.571561 0.8898750 1.253077
## 1 13 1.567577 0.8906927 1.250795
## 1 14 1.571673 0.8909652 1.245508
## 1 15 1.571673 0.8909652 1.245508
## 1 16 1.571673 0.8909652 1.245508
## 1 17 1.571673 0.8909652 1.245508
## 1 18 1.571673 0.8909652 1.245508
## 1 19 1.571673 0.8909652 1.245508
## 1 20 1.571673 0.8909652 1.245508
## 1 21 1.571673 0.8909652 1.245508
## 1 22 1.571673 0.8909652 1.245508
## 1 23 1.571673 0.8909652 1.245508
## 1 24 1.571673 0.8909652 1.245508
## 1 25 1.571673 0.8909652 1.245508
## 1 26 1.571673 0.8909652 1.245508
## 1 27 1.571673 0.8909652 1.245508
## 1 28 1.571673 0.8909652 1.245508
## 1 29 1.571673 0.8909652 1.245508
## 1 30 1.571673 0.8909652 1.245508
## 1 31 1.571673 0.8909652 1.245508
## 1 32 1.571673 0.8909652 1.245508
## 1 33 1.571673 0.8909652 1.245508
## 1 34 1.571673 0.8909652 1.245508
## 1 35 1.571673 0.8909652 1.245508
## 1 36 1.571673 0.8909652 1.245508
## 1 37 1.571673 0.8909652 1.245508
## 1 38 1.571673 0.8909652 1.245508
## 2 2 4.188280 0.3042527 3.460689
## 2 3 3.551182 0.4999832 2.837116
## 2 4 2.615256 0.7216809 2.128763
## 2 5 2.344223 0.7683855 1.890080
## 2 6 2.275048 0.7762472 1.807779
## 2 7 1.841464 0.8418935 1.457945
## 2 8 1.641647 0.8839822 1.288520
## 2 9 1.535119 0.9002991 1.214772
## 2 10 1.473254 0.9101555 1.158761
## 2 11 1.379476 0.9207735 1.080991
## 2 12 1.285380 0.9283193 1.033426
## 2 13 1.267261 0.9328905 1.014726
## 2 14 1.261797 0.9327541 1.009821
## 2 15 1.266663 0.9320714 1.005751
## 2 16 1.270858 0.9322465 1.009757
## 2 17 1.263778 0.9327687 1.007653
## 2 18 1.263778 0.9327687 1.007653
## 2 19 1.263778 0.9327687 1.007653
## 2 20 1.263778 0.9327687 1.007653
## 2 21 1.263778 0.9327687 1.007653
## 2 22 1.263778 0.9327687 1.007653
## 2 23 1.263778 0.9327687 1.007653
## 2 24 1.263778 0.9327687 1.007653
## 2 25 1.263778 0.9327687 1.007653
## 2 26 1.263778 0.9327687 1.007653
## 2 27 1.263778 0.9327687 1.007653
## 2 28 1.263778 0.9327687 1.007653
## 2 29 1.263778 0.9327687 1.007653
## 2 30 1.263778 0.9327687 1.007653
## 2 31 1.263778 0.9327687 1.007653
## 2 32 1.263778 0.9327687 1.007653
## 2 33 1.263778 0.9327687 1.007653
## 2 34 1.263778 0.9327687 1.007653
## 2 35 1.263778 0.9327687 1.007653
## 2 36 1.263778 0.9327687 1.007653
## 2 37 1.263778 0.9327687 1.007653
## 2 38 1.263778 0.9327687 1.007653
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
marsPred <- predict(marsTuned, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)
## RMSE Rsquared MAE
## 1.1722635 0.9448890 0.9324923
svmTuned <- train(trainingData$x, trainingData$y,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 14,
                  trControl = trainControl(method = "cv"))
svmTuned$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 8
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0594129416262883
##
## Number of Support Vectors : 152
##
## Objective Function Value : -78.5712
## Training error : 0.009515
svmPred <- predict(svmTuned, newdata = testData$x)
postResample(pred = svmPred, obs = testData$y)
## RMSE Rsquared MAE
## 2.0405933 0.8312032 1.5474565
## Check whether any predictors are correlated above a given threshold
high_cor <- findCorrelation(cor(trainingData$x), cutoff = .75)
high_cor
## integer(0)
None of the predictors are correlated above the 0.75 threshold.
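Since the ten predictors in the Friedman 1 simulation are drawn independently, this is expected. As a quick visual check, the correlation matrix can be plotted with the ggcorrplot package loaded above (a minimal sketch):
# Heat map of pairwise correlations among the simulated predictors
ggcorrplot(cor(trainingData$x), method = "square")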
nnetgrid <- expand.grid(.decay = c(0, 0.01, 0.1),
                        .size = 1:10,
                        .bag = FALSE)
set.seed(200)
nnettune <- train(trainingData$x, trainingData$y,
                  method = "avNNet",
                  tuneGrid = nnetgrid,
                  trControl = trainControl(method = "cv"),
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
                  maxit = 500)
nnPred <- predict(nnettune, newdata = testData$x)
postResample(pred = nnPred, obs = testData$y)
## RMSE Rsquared MAE
## 2.0603901 0.8320669 1.5289876
The MARS model performs best on this data, with the highest R-squared (0.945) and the lowest RMSE (1.17) on the test set.
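For a side-by-side view, the test-set metrics computed above can be gathered into a single table. This is a minimal sketch: it assumes knnPred, marsPred, svmPred, and nnPred from the preceding chunks are still in the workspace, and the object name friedman_acc is introduced here only for illustration.
# Collect the test-set performance of each model into one table
friedman_acc <- rbind(
  knn  = postResample(pred = knnPred,  obs = testData$y),
  mars = postResample(pred = marsPred, obs = testData$y),
  svm  = postResample(pred = svmPred,  obs = testData$y),
  nnet = postResample(pred = nnPred,   obs = testData$y)
)
friedman_acc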
varImp(marsTuned)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.24
## X2 48.74
## X5 15.53
## X3 0.00
MARS selects all of the informative predictors except X3, although X5 carries relatively little importance in the model.
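This matches how the data were simulated: mlbench.friedman1 builds the response from X1 through X5 only (X3 enters via a quadratic term that is harder to detect) and leaves X6 through X10 as pure noise. A quick plot of the importance scores (a minimal sketch) shows the same ranking graphically:
# Plot the importance scores of the predictors retained by the MARS model
plot(varImp(marsTuned))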
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
We will use bagged-tree imputation (bagImpute) to fill in the missing values.
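Before imputing, it is worth checking how much is actually missing. A minimal sketch that counts the NA values per column and shows only the columns with at least one:
# Count missing values per column before imputation
na_counts <- colSums(is.na(ChemicalManufacturingProcess))
na_counts[na_counts > 0]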
set.seed(1443)
process <- preProcess(ChemicalManufacturingProcess, method = "bagImpute")
imputed_chemical <- predict(process,ChemicalManufacturingProcess)
Now the data can be split into training and test sets.
split <- createDataPartition(imputed_chemical$Yield, p = 0.8, list = FALSE)
y <- imputed_chemical[,1]
x <- imputed_chemical[,-1]
train_x <- x[split,]
train_y <- y[split]
test_x <- x[-split,]
test_y <- y[-split]
marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(1443)
marsTuned <- train(train_x, train_y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
marsTuned
## Multivariate Adaptive Regression Spline
##
## 144 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 129, 130, 128, 131, 130, 131, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.421482 0.4679176 1.1104162
## 1 3 1.248109 0.5967817 1.0027972
## 1 4 1.207163 0.6157003 0.9786943
## 1 5 1.194636 0.6304960 0.9599176
## 1 6 1.229769 0.6085103 0.9653690
## 1 7 1.212047 0.6317633 0.9565507
## 1 8 1.183587 0.6389006 0.9457047
## 1 9 1.254062 0.5995039 0.9904849
## 1 10 1.244370 0.6152832 0.9835944
## 1 11 1.227960 0.6266194 0.9792802
## 1 12 1.267643 0.6104668 0.9921221
## 1 13 1.275278 0.6133854 0.9794999
## 1 14 1.284729 0.6113546 0.9862456
## 1 15 1.305466 0.6138931 1.0046183
## 1 16 1.316288 0.6107436 1.0108728
## 1 17 1.349402 0.5976483 1.0419861
## 1 18 1.353143 0.5954621 1.0456895
## 1 19 1.348765 0.5964580 1.0376609
## 1 20 1.353248 0.5940745 1.0443913
## 1 21 1.353248 0.5940745 1.0443913
## 1 22 1.353248 0.5940745 1.0443913
## 1 23 1.353248 0.5940745 1.0443913
## 1 24 1.353248 0.5940745 1.0443913
## 1 25 1.353248 0.5940745 1.0443913
## 1 26 1.353248 0.5940745 1.0443913
## 1 27 1.353248 0.5940745 1.0443913
## 1 28 1.353248 0.5940745 1.0443913
## 1 29 1.353248 0.5940745 1.0443913
## 1 30 1.353248 0.5940745 1.0443913
## 1 31 1.353248 0.5940745 1.0443913
## 1 32 1.353248 0.5940745 1.0443913
## 1 33 1.353248 0.5940745 1.0443913
## 1 34 1.353248 0.5940745 1.0443913
## 1 35 1.353248 0.5940745 1.0443913
## 1 36 1.353248 0.5940745 1.0443913
## 1 37 1.353248 0.5940745 1.0443913
## 1 38 1.353248 0.5940745 1.0443913
## 2 2 1.421482 0.4679176 1.1104162
## 2 3 1.329601 0.5342963 1.0313547
## 2 4 1.219195 0.6154011 0.9598474
## 2 5 1.262747 0.5827485 0.9900176
## 2 6 1.279963 0.5709927 0.9988537
## 2 7 1.238261 0.6234642 0.9891882
## 2 8 1.340012 0.6025986 1.0368049
## 2 9 1.270893 0.6256407 0.9972661
## 2 10 1.235299 0.6512391 0.9779152
## 2 11 1.265105 0.6452478 0.9887539
## 2 12 1.232196 0.6586458 0.9803787
## 2 13 1.217495 0.6629845 0.9430122
## 2 14 1.228971 0.6581058 0.9515741
## 2 15 1.237148 0.6599354 0.9606890
## 2 16 1.277041 0.6400424 0.9751521
## 2 17 1.288075 0.6380934 0.9750828
## 2 18 1.297636 0.6316674 0.9816944
## 2 19 1.298170 0.6362933 0.9841653
## 2 20 1.296204 0.6381375 0.9857380
## 2 21 1.295169 0.6384784 0.9825247
## 2 22 1.293966 0.6432376 0.9723086
## 2 23 1.293966 0.6432376 0.9723086
## 2 24 1.926249 0.6212280 1.1680111
## 2 25 1.938489 0.6212937 1.1736045
## 2 26 2.000929 0.6212288 1.1878331
## 2 27 2.085514 0.6205812 1.2133717
## 2 28 2.308993 0.6194794 1.2753805
## 2 29 2.308993 0.6194794 1.2753805
## 2 30 2.308993 0.6194794 1.2753805
## 2 31 2.328632 0.6195214 1.2755721
## 2 32 2.397736 0.6193375 1.2948054
## 2 33 2.452103 0.6192322 1.3076040
## 2 34 2.334805 0.6195231 1.2795109
## 2 35 2.334805 0.6195231 1.2795109
## 2 36 2.334805 0.6195231 1.2795109
## 2 37 2.334805 0.6195231 1.2795109
## 2 38 2.334805 0.6195231 1.2795109
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 8 and degree = 1.
marsPred <- predict(marsTuned, newdata = test_x)
mars_acc <- postResample(pred = marsPred, obs = test_y)
knnModel <- train(x = train_x,
                  y = train_y,
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 144 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.524775 0.3656683 1.184893
## 7 1.507903 0.3752920 1.186188
## 9 1.493735 0.3902160 1.184275
## 11 1.473488 0.4149109 1.176280
## 13 1.469845 0.4235443 1.176752
## 15 1.468520 0.4279935 1.177127
## 17 1.479525 0.4252627 1.186608
## 19 1.485196 0.4237991 1.195323
## 21 1.500216 0.4109271 1.205779
## 23 1.500431 0.4144962 1.204616
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred <- predict(knnModel, newdata = test_x)
knn_acc <- postResample(pred = knnPred, obs = test_y)
svmTuned <- train(train_x, train_y,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 14,
                  trControl = trainControl(method = "cv"))
svmTuned$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 16
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.014149870944168
##
## Number of Support Vectors : 127
##
## Objective Function Value : -86.3478
## Training error : 0.009262
svmPred <- predict(svmTuned, newdata = test_x)
svm_acc <- postResample(pred = svmPred, obs = test_y)
Remove the highly correlated predictors before fitting the neural network:
high_cor <- findCorrelation(cor(train_x), cutoff = .75)
high_cor
## [1] 2 6 1 8 4 12 44 27 41 21 25 26 54 57 38 39 43 30 52
## Make new data frames for the neural network with the highly correlated
## predictors dropped
train_x_nn <- train_x[, -high_cor]
test_x_nn <- test_x[, -high_cor]
train_y_nn <- train_y
test_y_nn <- test_y
nnetgrid <- expand.grid(.decay = c(0, 0.01, 0.1),
                        .size = 1:10,
                        .bag = FALSE)
set.seed(200)
nnettune <- train(train_x_nn, train_y_nn,
                  method = "avNNet",
                  tuneGrid = nnetgrid,
                  trControl = trainControl(method = "cv"),
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(train_x_nn) + 1) + 10 + 1,
                  maxit = 500)
nnPred <- predict(nnettune, newdata = test_x_nn)
nn_acc <- postResample(pred = nnPred, obs = test_y_nn)
model_acc <- rbind(nnet = nn_acc, svm = svm_acc, knn = knn_acc, mars = mars_acc)
model_acc
## RMSE Rsquared MAE
## nnet 1.429413 0.4623302 1.1464508
## svm 1.212403 0.4992352 0.9962611
## knn 1.258253 0.4692207 1.0442500
## mars 1.235178 0.4821087 1.0543239
The support vector machine has the highest R-squared and the lowest RMSE on the test set, so it is the model we would choose.
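As a quick sanity check on the chosen model, the test-set predictions can be plotted against the observed yields; points near the dashed identity line indicate accurate predictions. A minimal sketch using ggplot2, loaded above:
# Predicted vs. observed yield for the SVM model on the held-out test set
ggplot(data.frame(observed = test_y, predicted = as.numeric(svmPred)),
       aes(x = observed, y = predicted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed")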
varImp(svmTuned)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 97.00
## BiologicalMaterial06 83.58
## ManufacturingProcess36 79.24
## BiologicalMaterial03 79.14
## ManufacturingProcess17 76.19
## ManufacturingProcess09 74.99
## BiologicalMaterial12 74.51
## BiologicalMaterial02 64.70
## ManufacturingProcess31 57.91
## ManufacturingProcess06 52.48
## BiologicalMaterial11 50.85
## ManufacturingProcess33 49.63
## ManufacturingProcess11 45.68
## BiologicalMaterial04 45.55
## ManufacturingProcess29 44.62
## BiologicalMaterial09 42.31
## BiologicalMaterial08 41.31
## BiologicalMaterial01 36.76
## ManufacturingProcess12 36.70
The manufacturing process variables dominate the SVM model's importance rankings, although several biological material variables also appear in the top 20.
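A plot of the top 20 scores (a minimal sketch) makes the split between process and biological predictors easier to see:
# Plot the 20 most important predictors according to the SVM model
plot(varImp(svmTuned), top = 20)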
set.seed(1443)
pls_train <- train(train_x, train_y,
                   method = "pls",
                   tuneLength = 25,
                   trControl = trainControl(method = "cv", number = 10),
                   preProc = c("center", "scale"))
Variable importance for the linear (PLS) model:
varImp(pls_train)
## pls variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 85.33
## ManufacturingProcess36 83.91
## ManufacturingProcess09 83.52
## ManufacturingProcess17 80.64
## ManufacturingProcess33 61.26
## BiologicalMaterial03 60.70
## BiologicalMaterial02 60.14
## BiologicalMaterial06 59.39
## ManufacturingProcess12 58.12
## BiologicalMaterial08 55.71
## ManufacturingProcess11 54.51
## ManufacturingProcess06 54.38
## BiologicalMaterial12 53.12
## BiologicalMaterial11 51.76
## BiologicalMaterial01 50.74
## BiologicalMaterial04 49.94
## ManufacturingProcess28 44.28
## ManufacturingProcess34 42.21
## ManufacturingProcess04 39.58
In the linear (PLS) model, manufacturing process variables dominate the importance rankings even more than in the nonlinear model: all five of the top predictors are process variables. Even so, many of the top ten predictors are shared between the two models.
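The overlap can be checked directly from the two importance objects; this is a minimal sketch, and the names svm_imp, pls_imp, top10_svm, and top10_pls are introduced here only for illustration:
# Extract the 10 highest-ranked predictors from each model's importance table
svm_imp <- varImp(svmTuned)$importance
pls_imp <- varImp(pls_train)$importance
top10_svm <- rownames(svm_imp)[order(svm_imp$Overall, decreasing = TRUE)][1:10]
top10_pls <- rownames(pls_imp)[order(pls_imp$Overall, decreasing = TRUE)][1:10]
intersect(top10_svm, top10_pls)  # predictors that appear in both top-10 lists
setdiff(top10_svm, top10_pls)    # predictors unique to the SVM top 10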
unique_predictors <- c("BiologicalMaterial12", "ManufacturingProcess31",
                       "ManufacturingProcess33", "ManufacturingProcess29",
                       "BiologicalMaterial09", "BiologicalMaterial08", "Yield")
imputed_chemical %>%
  select(all_of(unique_predictors)) %>%
  cor() %>%
  ggcorrplot(method = "square", lab = TRUE, lab_size = 4)
There is high correlation between BiologicalMaterial08 and BiologicalMaterial12, and between ManufacturingProcess29 and ManufacturingProcess31. None of the unique predictors are strongly correlated with the yield. Although the plot shows that these predictors are related to one another, it says nothing about why they are related or how they drive the yield; further investigation into the underlying processes would be needed to draw any conclusions.