Homework 8

Question 7.2

library(fpp3)
library(caret)
library(mlbench)
library(ggplot2)
library(ggcorrplot)

Simulate the Friedman 1 benchmark data:

set.seed(200)
## Friedman 1 benchmark: 200 training observations, 10 predictors
trainingData <- mlbench.friedman1(200, sd = 1)
trainingData$x <- data.frame(trainingData$x)

featurePlot(trainingData$x, trainingData$y)

## A large test set gives a precise estimate of out-of-sample performance
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

KNN Model:

knnModel <- train(x = trainingData$x,
                  y = trainingData$y, 
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength  = 10)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
knnPred <- predict(knnModel, newdata = testData$x)
postResample(pred = knnPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461

MARS Model:

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(200)
marsTuned <- train(trainingData$x, trainingData$y, method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
marsTuned
## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE     
##   1        2      4.188280  0.3042527  3.460689
##   1        3      3.551182  0.4999832  2.837116
##   1        4      2.653143  0.7167280  2.128222
##   1        5      2.405769  0.7562160  1.948161
##   1        6      2.295006  0.7754603  1.853199
##   1        7      1.771950  0.8611767  1.391357
##   1        8      1.647182  0.8774867  1.299564
##   1        9      1.609816  0.8837307  1.299705
##   1       10      1.635035  0.8798236  1.309436
##   1       11      1.571915  0.8896147  1.260711
##   1       12      1.571561  0.8898750  1.253077
##   1       13      1.567577  0.8906927  1.250795
##   1       14      1.571673  0.8909652  1.245508
##   1       15      1.571673  0.8909652  1.245508
##   1       16      1.571673  0.8909652  1.245508
##   1       17      1.571673  0.8909652  1.245508
##   1       18      1.571673  0.8909652  1.245508
##   1       19      1.571673  0.8909652  1.245508
##   1       20      1.571673  0.8909652  1.245508
##   1       21      1.571673  0.8909652  1.245508
##   1       22      1.571673  0.8909652  1.245508
##   1       23      1.571673  0.8909652  1.245508
##   1       24      1.571673  0.8909652  1.245508
##   1       25      1.571673  0.8909652  1.245508
##   1       26      1.571673  0.8909652  1.245508
##   1       27      1.571673  0.8909652  1.245508
##   1       28      1.571673  0.8909652  1.245508
##   1       29      1.571673  0.8909652  1.245508
##   1       30      1.571673  0.8909652  1.245508
##   1       31      1.571673  0.8909652  1.245508
##   1       32      1.571673  0.8909652  1.245508
##   1       33      1.571673  0.8909652  1.245508
##   1       34      1.571673  0.8909652  1.245508
##   1       35      1.571673  0.8909652  1.245508
##   1       36      1.571673  0.8909652  1.245508
##   1       37      1.571673  0.8909652  1.245508
##   1       38      1.571673  0.8909652  1.245508
##   2        2      4.188280  0.3042527  3.460689
##   2        3      3.551182  0.4999832  2.837116
##   2        4      2.615256  0.7216809  2.128763
##   2        5      2.344223  0.7683855  1.890080
##   2        6      2.275048  0.7762472  1.807779
##   2        7      1.841464  0.8418935  1.457945
##   2        8      1.641647  0.8839822  1.288520
##   2        9      1.535119  0.9002991  1.214772
##   2       10      1.473254  0.9101555  1.158761
##   2       11      1.379476  0.9207735  1.080991
##   2       12      1.285380  0.9283193  1.033426
##   2       13      1.267261  0.9328905  1.014726
##   2       14      1.261797  0.9327541  1.009821
##   2       15      1.266663  0.9320714  1.005751
##   2       16      1.270858  0.9322465  1.009757
##   2       17      1.263778  0.9327687  1.007653
##   2       18      1.263778  0.9327687  1.007653
##   2       19      1.263778  0.9327687  1.007653
##   2       20      1.263778  0.9327687  1.007653
##   2       21      1.263778  0.9327687  1.007653
##   2       22      1.263778  0.9327687  1.007653
##   2       23      1.263778  0.9327687  1.007653
##   2       24      1.263778  0.9327687  1.007653
##   2       25      1.263778  0.9327687  1.007653
##   2       26      1.263778  0.9327687  1.007653
##   2       27      1.263778  0.9327687  1.007653
##   2       28      1.263778  0.9327687  1.007653
##   2       29      1.263778  0.9327687  1.007653
##   2       30      1.263778  0.9327687  1.007653
##   2       31      1.263778  0.9327687  1.007653
##   2       32      1.263778  0.9327687  1.007653
##   2       33      1.263778  0.9327687  1.007653
##   2       34      1.263778  0.9327687  1.007653
##   2       35      1.263778  0.9327687  1.007653
##   2       36      1.263778  0.9327687  1.007653
##   2       37      1.263778  0.9327687  1.007653
##   2       38      1.263778  0.9327687  1.007653
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
marsPred <- predict(marsTuned, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 1.1722635 0.9448890 0.9324923

SVM Model:

svmTuned <- train(trainingData$x, trainingData$y,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 14,
                  trControl = trainControl(method = "cv"))
svmTuned$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 8 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.0594129416262883 
## 
## Number of Support Vectors : 152 
## 
## Objective Function Value : -78.5712 
## Training error : 0.009515
svmPred <- predict(svmTuned, newdata = testData$x)
postResample(pred = svmPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 2.0405933 0.8312032 1.5474565

Neural Network Model:

## Determine if there are predictors that contain correlation above a certain threshold
high_cor <- findCorrelation(cor(trainingData$x), cutoff = .75)
high_cor
## integer(0)

None of the predictors have a pairwise correlation above the 0.75 threshold, so all ten can be used in the neural network.

nnetgrid <- expand.grid(.decay = c(0, 0.01, 0.1),
                        .size = 1:10,
                        .bag = FALSE)
set.seed(200)
nnettune <- train(trainingData$x, trainingData$y,
                  method = "avNNet",
                  tuneGrid = nnetgrid,
                  trControl = trainControl(method = "cv"),
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  ## Weight cap for up to 10 hidden units: 10 * (inputs + 1) + 10 + 1
                  MaxNWts = 10 * (ncol(trainingData$x) + 1) + 10 + 1,
                  maxit = 500)
nnPred <- predict(nnettune, newdata = testData$x)
postResample(pred = nnPred, obs = testData$y)
##      RMSE  Rsquared       MAE 
## 2.0603901 0.8320669 1.5289876

Conclusion:

The model with the best test-set performance is the MARS model: it has the highest Rsquared (0.945) and the lowest RMSE (1.17).
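
For a side-by-side view, the test-set metrics for the four models can be collected into one table. This is a small sketch that reuses the knnPred, marsPred, svmPred, and nnPred objects computed above:

## Test-set performance of all four models in one table
rbind(knn  = postResample(pred = knnPred,  obs = testData$y),
      mars = postResample(pred = marsPred, obs = testData$y),
      svm  = postResample(pred = svmPred,  obs = testData$y),
      nnet = postResample(pred = nnPred,   obs = testData$y))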

varImp(marsTuned)
## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.24
## X2   48.74
## X5   15.53
## X3    0.00

MARS picks out the informative predictors X1, X2, X4, and X5 and ignores the uninformative ones (X6-X10); X3, although part of the true Friedman function, receives zero importance, and X5 contributes relatively little.
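
For a quick visual check of the same ranking, caret's plot method for variable importance can be used (a small sketch based on the marsTuned object above):

## Plot the MARS variable importance scores
plot(varImp(marsTuned))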

Question 7.5

Part A

library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")

We will use bagged-tree imputation (method = "bagImpute") to fill in the missing values:

set.seed(1443)
process <- preProcess(ChemicalManufacturingProcess, method = "bagImpute")
imputed_chemical <- predict(process,ChemicalManufacturingProcess)

Now the data can be split into training (80%) and test (20%) sets:

split <- createDataPartition(imputed_chemical$Yield, p = 0.8, list = FALSE)
y <- imputed_chemical[,1]
x <- imputed_chemical[,-1]
train_x <- x[split,]
train_y <- y[split]
test_x <- x[-split,]
test_y <- y[-split]

MARS Model:

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
set.seed(1443)
marsTuned <- train(train_x, train_y, method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
marsTuned
## Multivariate Adaptive Regression Spline 
## 
## 144 samples
##  57 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 129, 130, 128, 131, 130, 131, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE      
##   1        2      1.421482  0.4679176  1.1104162
##   1        3      1.248109  0.5967817  1.0027972
##   1        4      1.207163  0.6157003  0.9786943
##   1        5      1.194636  0.6304960  0.9599176
##   1        6      1.229769  0.6085103  0.9653690
##   1        7      1.212047  0.6317633  0.9565507
##   1        8      1.183587  0.6389006  0.9457047
##   1        9      1.254062  0.5995039  0.9904849
##   1       10      1.244370  0.6152832  0.9835944
##   1       11      1.227960  0.6266194  0.9792802
##   1       12      1.267643  0.6104668  0.9921221
##   1       13      1.275278  0.6133854  0.9794999
##   1       14      1.284729  0.6113546  0.9862456
##   1       15      1.305466  0.6138931  1.0046183
##   1       16      1.316288  0.6107436  1.0108728
##   1       17      1.349402  0.5976483  1.0419861
##   1       18      1.353143  0.5954621  1.0456895
##   1       19      1.348765  0.5964580  1.0376609
##   1       20      1.353248  0.5940745  1.0443913
##   1       21      1.353248  0.5940745  1.0443913
##   1       22      1.353248  0.5940745  1.0443913
##   1       23      1.353248  0.5940745  1.0443913
##   1       24      1.353248  0.5940745  1.0443913
##   1       25      1.353248  0.5940745  1.0443913
##   1       26      1.353248  0.5940745  1.0443913
##   1       27      1.353248  0.5940745  1.0443913
##   1       28      1.353248  0.5940745  1.0443913
##   1       29      1.353248  0.5940745  1.0443913
##   1       30      1.353248  0.5940745  1.0443913
##   1       31      1.353248  0.5940745  1.0443913
##   1       32      1.353248  0.5940745  1.0443913
##   1       33      1.353248  0.5940745  1.0443913
##   1       34      1.353248  0.5940745  1.0443913
##   1       35      1.353248  0.5940745  1.0443913
##   1       36      1.353248  0.5940745  1.0443913
##   1       37      1.353248  0.5940745  1.0443913
##   1       38      1.353248  0.5940745  1.0443913
##   2        2      1.421482  0.4679176  1.1104162
##   2        3      1.329601  0.5342963  1.0313547
##   2        4      1.219195  0.6154011  0.9598474
##   2        5      1.262747  0.5827485  0.9900176
##   2        6      1.279963  0.5709927  0.9988537
##   2        7      1.238261  0.6234642  0.9891882
##   2        8      1.340012  0.6025986  1.0368049
##   2        9      1.270893  0.6256407  0.9972661
##   2       10      1.235299  0.6512391  0.9779152
##   2       11      1.265105  0.6452478  0.9887539
##   2       12      1.232196  0.6586458  0.9803787
##   2       13      1.217495  0.6629845  0.9430122
##   2       14      1.228971  0.6581058  0.9515741
##   2       15      1.237148  0.6599354  0.9606890
##   2       16      1.277041  0.6400424  0.9751521
##   2       17      1.288075  0.6380934  0.9750828
##   2       18      1.297636  0.6316674  0.9816944
##   2       19      1.298170  0.6362933  0.9841653
##   2       20      1.296204  0.6381375  0.9857380
##   2       21      1.295169  0.6384784  0.9825247
##   2       22      1.293966  0.6432376  0.9723086
##   2       23      1.293966  0.6432376  0.9723086
##   2       24      1.926249  0.6212280  1.1680111
##   2       25      1.938489  0.6212937  1.1736045
##   2       26      2.000929  0.6212288  1.1878331
##   2       27      2.085514  0.6205812  1.2133717
##   2       28      2.308993  0.6194794  1.2753805
##   2       29      2.308993  0.6194794  1.2753805
##   2       30      2.308993  0.6194794  1.2753805
##   2       31      2.328632  0.6195214  1.2755721
##   2       32      2.397736  0.6193375  1.2948054
##   2       33      2.452103  0.6192322  1.3076040
##   2       34      2.334805  0.6195231  1.2795109
##   2       35      2.334805  0.6195231  1.2795109
##   2       36      2.334805  0.6195231  1.2795109
##   2       37      2.334805  0.6195231  1.2795109
##   2       38      2.334805  0.6195231  1.2795109
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 8 and degree = 1.
marsPred <- predict(marsTuned, newdata = test_x)
mars_acc <- postResample(pred = marsPred, obs = test_y)

KNN Model:

knnModel <- train(x = train_x,
                  y = train_y, 
                  method = "knn",
                  preProc = c("center", "scale"),
                  tuneLength  = 10)
knnModel
## k-Nearest Neighbors 
## 
## 144 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 144, 144, 144, 144, 144, 144, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  1.524775  0.3656683  1.184893
##    7  1.507903  0.3752920  1.186188
##    9  1.493735  0.3902160  1.184275
##   11  1.473488  0.4149109  1.176280
##   13  1.469845  0.4235443  1.176752
##   15  1.468520  0.4279935  1.177127
##   17  1.479525  0.4252627  1.186608
##   19  1.485196  0.4237991  1.195323
##   21  1.500216  0.4109271  1.205779
##   23  1.500431  0.4144962  1.204616
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred <- predict(knnModel, newdata = test_x)
knn_acc <- postResample(pred = knnPred, obs = test_y)

SVM Model:

svmTuned <- train(train_x, train_y,
                  method = "svmRadial",
                  preProc = c("center", "scale"),
                  tuneLength = 14,
                  trControl = trainControl(method = "cv"))
svmTuned$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 16 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.014149870944168 
## 
## Number of Support Vectors : 127 
## 
## Objective Function Value : -86.3478 
## Training error : 0.009262
svmPred <- predict(svmTuned, newdata = test_x)
svm_acc <- postResample(pred = svmPred, obs = test_y)

Neural Network Model:

Neural networks can be unstable when predictors are highly correlated, so first identify predictors with pairwise correlations above 0.75:

high_cor <- findCorrelation(cor(train_x), cutoff = .75)
high_cor
##  [1]  2  6  1  8  4 12 44 27 41 21 25 26 54 57 38 39 43 30 52
## Make new data frames for the neural network with the correlated predictors removed

train_x_nn <- train_x[, -high_cor]
test_x_nn  <- test_x[, -high_cor]

train_y_nn <- train_y
test_y_nn  <- test_y
nnetgrid <- expand.grid(.decay = c(0, 0.01, 0.1),
                        .size = 1:10,
                        .bag = FALSE)
set.seed(200)
nnettune <- train(train_x_nn, train_y_nn,
                  method = "avNNet",
                  tuneGrid = nnetgrid,
                  trControl = trainControl(method = "cv"),
                  preProc = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(train_x_nn) + 1) + 10 + 1,
                  maxit = 500)
nnPred <- predict(nnettune, newdata = test_x_nn)
nn_acc <- postResample(pred = nnPred, obs = test_y_nn)
model_acc <- rbind(nnet = nn_acc, svm = svm_acc, knn = knn_acc, mars = mars_acc)
model_acc
##          RMSE  Rsquared       MAE
## nnet 1.429413 0.4623302 1.1464508
## svm  1.212403 0.4992352 0.9962611
## knn  1.258253 0.4692207 1.0442500
## mars 1.235178 0.4821087 1.0543239

The SVM model has the highest Rsquared and the lowest RMSE on the test set, so it is the model I would choose.

Part B

varImp(svmTuned)
## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   97.00
## BiologicalMaterial06     83.58
## ManufacturingProcess36   79.24
## BiologicalMaterial03     79.14
## ManufacturingProcess17   76.19
## ManufacturingProcess09   74.99
## BiologicalMaterial12     74.51
## BiologicalMaterial02     64.70
## ManufacturingProcess31   57.91
## ManufacturingProcess06   52.48
## BiologicalMaterial11     50.85
## ManufacturingProcess33   49.63
## ManufacturingProcess11   45.68
## BiologicalMaterial04     45.55
## ManufacturingProcess29   44.62
## BiologicalMaterial09     42.31
## BiologicalMaterial08     41.31
## BiologicalMaterial01     36.76
## ManufacturingProcess12   36.70

The manufacturing process variables dominate the SVM model's importance ranking, though several biological material variables also appear in the top 20.
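
As a rough check on that claim, we can count how many of the top 20 predictors are process versus biological variables. This is a small sketch that reuses the svmTuned object from above:

## Count process vs. biological variables among the SVM's top 20 predictors
svm_imp <- varImp(svmTuned)$importance
top20 <- rownames(svm_imp)[order(svm_imp$Overall, decreasing = TRUE)][1:20]
table(ifelse(grepl("^Manufacturing", top20), "Process", "Biological"))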

set.seed(1443)
pls_train <- train(train_x, train_y,
                   method = "pls",
                   tuneLength = 25,
                   trControl = trainControl(method = "cv", number = 10),
                   preProc = c("center", "scale"))

Variable importance for the linear (PLS) model:

varImp(pls_train)
## pls variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess13   85.33
## ManufacturingProcess36   83.91
## ManufacturingProcess09   83.52
## ManufacturingProcess17   80.64
## ManufacturingProcess33   61.26
## BiologicalMaterial03     60.70
## BiologicalMaterial02     60.14
## BiologicalMaterial06     59.39
## ManufacturingProcess12   58.12
## BiologicalMaterial08     55.71
## ManufacturingProcess11   54.51
## ManufacturingProcess06   54.38
## BiologicalMaterial12     53.12
## BiologicalMaterial11     51.76
## BiologicalMaterial01     50.74
## BiologicalMaterial04     49.94
## ManufacturingProcess28   44.28
## ManufacturingProcess34   42.21
## ManufacturingProcess04   39.58

In the linear PLS model, the manufacturing process variables dominate the importance ranking even more than in the nonlinear model: all five of the top predictors are process variables. Even so, many of the top ten predictors are shared between the two models.
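
The overlap can be checked directly by intersecting the two top-10 lists. This is a small sketch that reuses the svmTuned and pls_train objects from above:

## Predictors appearing in the top 10 of both the SVM and PLS importance rankings
svm_imp <- varImp(svmTuned)$importance
pls_imp <- varImp(pls_train)$importance
top10_svm <- rownames(svm_imp)[order(svm_imp$Overall, decreasing = TRUE)][1:10]
top10_pls <- rownames(pls_imp)[order(pls_imp$Overall, decreasing = TRUE)][1:10]
intersect(top10_svm, top10_pls)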

Part C

unique_predictors <- c("BiologicalMaterial12", "ManufacturingProcess31",
                       "ManufacturingProcess33", "ManufacturingProcess29",
                       "BiologicalMaterial09", "BiologicalMaterial08", "Yield")

imputed_chemical %>%
  select(all_of(unique_predictors)) %>%
  cor() %>%
  ggcorrplot(method = "square", lab = TRUE, lab_size = 4)

There is high correlation between BiologicalMaterial08 and BiologicalMaterial12, and between ManufacturingProcess29 and ManufacturingProcess31. None of these predictors is strongly correlated with Yield. The correlation plot shows that relationships exist, but it says nothing about why these variables move together or how they affect the yield; further investigation into the underlying manufacturing processes would be needed to draw any conclusions.
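
As a numeric check on the claim about Yield, the correlations of these predictors with Yield can be computed directly (a small sketch using the imputed_chemical data frame and the unique_predictors vector from above):

## Correlation of each of the selected predictors with Yield
cor(imputed_chemical[, setdiff(unique_predictors, "Yield")], imputed_chemical$Yield)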