DATA 624 Homework 8

library(ggplot2)
library(AppliedPredictiveModeling)
library(magrittr)
library(caret)
library(tidyr)
library(dplyr)
library(knitr)
library(kableExtra)
library(mlbench)
library(kernlab)
library(lattice)
library(earth)
library(doParallel)

registerDoParallel(cores=4)
transparentTheme(trans = .4)

Nonlinear Regression Models

Exercise 7.2

Friedman (1991) introduced several benchmark data sets create by simulation. One of these simulations used the following nonlinear equation to create data:

\[y=10sin(\pi { x }_{ 1 }{ x }_{ 2 })+20{ ({ x }_{ 3 }-0.5) }^{ 2 }+10{ x }_{ 4 }+5{ x }_{ 5 }+N{ (0,{ \sigma }^{ 2 }) }\]

where the x values are random variables uniformly distributed between [0,1] (there are also 5 other non-informative variables also created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:

set.seed(200)
tr_data <- mlbench.friedman1(200, sd = 1)

Organize data and view plot

tr_data$x <- data.frame(tr_data$x)

featurePlot(x = tr_data$x, y = tr_data$y, plot = "scatter",
            type = c("p", "smooth"), span = .5,
            layout = c(3, 1))

Simulate test set to estimate performance

tst_data <- mlbench.friedman1(5000, sd = 1)
tst_data$x <- data.frame(tst_data$x)

The preProcess class can be used for many operations on predictors, including centering and scaling. The function preProcess estimates the required parameters for each operation and predict.preProcess is used to apply them to specific data sets. This function can also be interfaces when calling the train function.

knnModel <- train(x = tr_data$x,
                  y = tr_data$y,
                  method = "knn",
                  preProcess = c("center", "scale"),
                  tuneLength = 10)
knnModel

## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.466085  0.5121775  2.816838
##    7  3.349428  0.5452823  2.727410
##    9  3.264276  0.5785990  2.660026
##   11  3.214216  0.6024244  2.603767
##   13  3.196510  0.6176570  2.591935
##   15  3.184173  0.6305506  2.577482
##   17  3.183130  0.6425367  2.567787
##   19  3.198752  0.6483184  2.592683
##   21  3.188993  0.6611428  2.588787
##   23  3.200458  0.6638353  2.604529
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.

The function postResample can be used to estimate the root mean squared error (RMSE), simple R2, and the mean absolute error (MAE) for numeric outcomes.

knnPred <- predict(knnModel, newdata = tst_data$x)
postResample(pred = knnPred, obs = tst_data$y)

##      RMSE  Rsquared       MAE 
## 3.2040595 0.6819919 2.5683461

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1-X5)?

Neural Network

## The predictors cutoff is 0.75
findCorrelation(cor(tr_data$x), cutoff = .75)

## integer(0)

nnetGrid <- expand.grid(size = c(1:10),
                        decay = c(0, 0.01, 0.1),
                        bag = FALSE)
ctrl <- trainControl(method = "cv")
nnetTune <- train(tr_data$x, tr_data$y,
                  method = "avNNet",
                  tuneGrid = nnetGrid,
                  trControl = ctrl,
                  preProcess = c("center", "scale"),
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(tr_data$x) + 1) + 10 + 1,
                  maxit = 500
                  )
nnetTune

## Model Averaged Neural Network 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  RMSE      Rsquared   MAE     
##    1    0.00   2.434845  0.7683498  1.921367
##    1    0.01   2.437343  0.7689596  1.935100
##    1    0.10   2.450747  0.7652079  1.941971
##    2    0.00   2.564619  0.7437707  2.049075
##    2    0.01   2.493438  0.7576380  2.004657
##    2    0.10   2.470469  0.7623416  1.942559
##    3    0.00   2.107356  0.8283417  1.681631
##    3    0.01   2.118485  0.8252286  1.695232
##    3    0.10   2.113338  0.8296311  1.726697
##    4    0.00   1.993740  0.8454879  1.559778
##    4    0.01   2.055123  0.8382896  1.618923
##    4    0.10   2.111877  0.8360611  1.687825
##    5    0.00   2.173066  0.8230241  1.721098
##    5    0.01   2.117938  0.8262887  1.708309
##    5    0.10   2.126142  0.8313323  1.668135
##    6    0.00   3.011411  0.7133146  2.156020
##    6    0.01   2.245598  0.8059848  1.812856
##    6    0.10   2.200281  0.8204366  1.778666
##    7    0.00   6.423508  0.4711110  3.500348
##    7    0.01   2.369153  0.8044959  1.896450
##    7    0.10   2.277288  0.8082229  1.826526
##    8    0.00   5.161402  0.5296825  3.078540
##    8    0.01   2.375962  0.7935227  1.874543
##    8    0.10   2.355936  0.7886816  1.877549
##    9    0.00   7.052266  0.4257688  4.006578
##    9    0.01   2.443129  0.7865069  1.912185
##    9    0.10   2.242392  0.8071338  1.780744
##   10    0.00   2.906893  0.7036006  2.190651
##   10    0.01   2.516951  0.7569720  2.016432
##   10    0.10   2.238560  0.8113030  1.787851
## 
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 4, decay = 0 and bag = FALSE.

nnetPred <- predict(nnetTune, newdata = tst_data$x)
postResample(pred = nnetPred, obs = tst_data$y)

##     RMSE Rsquared      MAE 
## 2.496722 0.784618 1.685182

MARS

Multivariate Adaptive Regression Splines

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
marsTuned <- train(tr_data$x, tr_data$y,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = ctrl)
marsTuned

## Multivariate Adaptive Regression Spline 
## 
## 200 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   degree  nprune  RMSE      Rsquared   MAE      
##   1        2      4.413446  0.2478941  3.6399239
##   1        3      3.574483  0.5013467  2.8729864
##   1        4      2.617745  0.7237716  2.0980328
##   1        5      2.320707  0.7886105  1.8754620
##   1        6      2.260119  0.7883800  1.7778789
##   1        7      1.762424  0.8755876  1.4062920
##   1        8      1.714411  0.8837484  1.3496555
##   1        9      1.719717  0.8824787  1.3339340
##   1       10      1.702872  0.8849333  1.3225238
##   1       11      1.751669  0.8809872  1.3531874
##   1       12      1.711261  0.8872353  1.3368464
##   1       13      1.715594  0.8876740  1.3356978
##   1       14      1.705170  0.8894903  1.3253158
##   1       15      1.708649  0.8891224  1.3270375
##   1       16      1.708649  0.8891224  1.3270375
##   1       17      1.708649  0.8891224  1.3270375
##   1       18      1.708649  0.8891224  1.3270375
##   1       19      1.708649  0.8891224  1.3270375
##   1       20      1.708649  0.8891224  1.3270375
##   1       21      1.708649  0.8891224  1.3270375
##   1       22      1.708649  0.8891224  1.3270375
##   1       23      1.708649  0.8891224  1.3270375
##   1       24      1.708649  0.8891224  1.3270375
##   1       25      1.708649  0.8891224  1.3270375
##   1       26      1.708649  0.8891224  1.3270375
##   1       27      1.708649  0.8891224  1.3270375
##   1       28      1.708649  0.8891224  1.3270375
##   1       29      1.708649  0.8891224  1.3270375
##   1       30      1.708649  0.8891224  1.3270375
##   1       31      1.708649  0.8891224  1.3270375
##   1       32      1.708649  0.8891224  1.3270375
##   1       33      1.708649  0.8891224  1.3270375
##   1       34      1.708649  0.8891224  1.3270375
##   1       35      1.708649  0.8891224  1.3270375
##   1       36      1.708649  0.8891224  1.3270375
##   1       37      1.708649  0.8891224  1.3270375
##   1       38      1.708649  0.8891224  1.3270375
##   2        2      4.446426  0.2463165  3.6801871
##   2        3      3.693557  0.4680639  2.9539729
##   2        4      2.604103  0.7280379  2.0888994
##   2        5      2.302083  0.7913823  1.8518115
##   2        6      2.274632  0.7898184  1.7702669
##   2        7      1.755874  0.8759812  1.4073734
##   2        8      1.708271  0.8844751  1.3477172
##   2        9      1.463317  0.9139567  1.1744075
##   2       10      1.387041  0.9257526  1.1002772
##   2       11      1.328055  0.9326865  1.0485877
##   2       12      1.279350  0.9362082  1.0111897
##   2       13      1.259643  0.9393238  0.9954768
##   2       14      1.271698  0.9383447  1.0111163
##   2       15      1.272656  0.9384563  1.0099987
##   2       16      1.280985  0.9373221  1.0229847
##   2       17      1.280985  0.9373221  1.0229847
##   2       18      1.280985  0.9373221  1.0229847
##   2       19      1.280985  0.9373221  1.0229847
##   2       20      1.280985  0.9373221  1.0229847
##   2       21      1.280985  0.9373221  1.0229847
##   2       22      1.280985  0.9373221  1.0229847
##   2       23      1.280985  0.9373221  1.0229847
##   2       24      1.280985  0.9373221  1.0229847
##   2       25      1.280985  0.9373221  1.0229847
##   2       26      1.280985  0.9373221  1.0229847
##   2       27      1.280985  0.9373221  1.0229847
##   2       28      1.280985  0.9373221  1.0229847
##   2       29      1.280985  0.9373221  1.0229847
##   2       30      1.280985  0.9373221  1.0229847
##   2       31      1.280985  0.9373221  1.0229847
##   2       32      1.280985  0.9373221  1.0229847
##   2       33      1.280985  0.9373221  1.0229847
##   2       34      1.280985  0.9373221  1.0229847
##   2       35      1.280985  0.9373221  1.0229847
##   2       36      1.280985  0.9373221  1.0229847
##   2       37      1.280985  0.9373221  1.0229847
##   2       38      1.280985  0.9373221  1.0229847
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 13 and degree = 2.

varImp(marsTuned)

## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.33
## X2   48.88
## X5   15.63
## X3    0.00

marsPred <- predict(marsTuned, newdata = tst_data$x)
postResample(pred = marsPred, obs = tst_data$y)

##      RMSE  Rsquared       MAE 
## 1.2803060 0.9335241 1.0168673

SVM

Support Vector Machines

svmRTuned <- train(tr_data$x, tr_data$y,
                   method = "svmRadial",
                   preProcess = c("center", "scale"),
                   tuneLength = 15,
                   trControl = ctrl)
svmRTuned

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   C        RMSE      Rsquared   MAE     
##      0.25  2.506432  0.8074352  1.991203
##      0.50  2.245243  0.8248294  1.772495
##      1.00  2.055802  0.8483389  1.613707
##      2.00  1.945652  0.8601214  1.506706
##      4.00  1.858992  0.8677698  1.456227
##      8.00  1.841591  0.8687158  1.470754
##     16.00  1.850739  0.8672508  1.485498
##     32.00  1.850714  0.8672494  1.485458
##     64.00  1.850714  0.8672494  1.485458
##    128.00  1.850714  0.8672494  1.485458
##    256.00  1.850714  0.8672494  1.485458
##    512.00  1.850714  0.8672494  1.485458
##   1024.00  1.850714  0.8672494  1.485458
##   2048.00  1.850714  0.8672494  1.485458
##   4096.00  1.850714  0.8672494  1.485458
## 
## Tuning parameter 'sigma' was held constant at a value of 0.0623166
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.0623166 and C = 8.

svmRPred <- predict(svmRTuned, newdata = tst_data$x)
postResample(pred = svmRPred, obs = tst_data$y)

##      RMSE  Rsquared       MAE 
## 2.0514462 0.8294636 1.5563873

Summary

MARS model appears to give the best performance as evident from RMSE value in the table below.

rbind(
  "mars" = postResample(pred = marsPred, obs = tst_data$y),
  "svm" = postResample(pred = svmRPred, obs = tst_data$y),
  "net" = postResample(pred = nnetPred, obs = tst_data$y),
  "knn" = postResample(pred = knnPred, obs = tst_data$y)
) %>%
  kable() %>%
    kable_styling()

	RMSE	Rsquared	MAE
mars	1.280306	0.9335241	1.016867
svm	2.051446	0.8294636	1.556387
net	2.496722	0.7846180	1.685182
knn	3.204060	0.6819919	2.568346

The MARS model does select the informative predictors (those named X1-X5) as shown by the variable importance table (varImp) below.

varImp(marsTuned)

## earth variable importance
## 
##    Overall
## X1  100.00
## X4   75.33
## X2   48.88
## X5   15.63
## X3    0.00

Exercise 7.5

Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

Pre-Processing

data("ChemicalManufacturingProcess")

preP <- preProcess(ChemicalManufacturingProcess, 
                   method = c("BoxCox", "knnImpute", "center", "scale"))
df <- predict(preP, ChemicalManufacturingProcess)

## Restore to original
df$Yield = ChemicalManufacturingProcess$Yield

## Split the data
trainRows <- createDataPartition(df$Yield, p = .80, list = FALSE)
df.train <- df[trainRows, ]
df.test <- df[-trainRows, ]

Nonlinear Regression Models Training

colYield <- which(colnames(df) == "Yield")
trainX <- df.train[, -colYield]
trainY <- df.train$Yield
testX <- df.test[, -colYield]
testY <- df.test$Yield

KNN Model

knnModel <- train(x = trainX,
                  y = trainY,
                  method = "knn",
                  preProcess = c("center", "scale"),
                  tuneLength = 10)
knnPred <- predict(knnModel, newdata = testX)

Neural Networks Model

tooHigh <- findCorrelation(cor(trainX), cutoff = .75)
trainXnnet <- trainX[, -tooHigh]
testXnnet <- testX[, -tooHigh]
nnetGrid <- expand.grid(size = c(1:10),
                        decay = c(0, 0.01, 0.1),
                        bag = FALSE)
ctrl <- trainControl(method = "cv")
nnetTune <- train(trainXnnet, trainY,
                  method = "avNNet",
                  tuneGrid = nnetGrid,
                  trControl = ctrl,
                  linout = TRUE,
                  trace = FALSE,
                  MaxNWts = 10 * (ncol(tr_data$x) + 1) + 10 + 1,
                  maxit = 500
                  )
nnetPred <- predict(nnetTune, newdata = testXnnet)

MARS Model

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:38)
marsTuned <- train(trainX, trainY,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = ctrl)
marsPred <- predict(marsTuned, newdata = testX)

SVM Model

svmRTuned <- train(trainX, trainY,
                   method = "svmRadial",
                   tuneLength = 15,
                   trControl = ctrl)
svmRPred <- predict(svmRTuned, newdata = testX)

Part A

Train set performance

rbind(
  "mars" = postResample(pred = predict(marsTuned), obs = trainY),
  "svm" = postResample(pred = predict(svmRTuned), obs = trainY),
  "net" = postResample(pred = predict(nnetTune), obs = trainY),
  "knn" = postResample(pred = predict(knnModel), obs = trainY)
) %>%
  kable() %>%
    kable_styling()

	RMSE	Rsquared	MAE
mars	0.8141725	0.8162069	0.6495019
svm	0.3102481	0.9759565	0.2271820
net	0.5294216	0.9269025	0.3558515
knn	1.1561029	0.6596899	0.8897816

Test set performance

rbind(
  "mars" = postResample(pred = marsPred, obs = testY),
  "svm" = postResample(pred = svmRPred, obs = testY),
  "net" = postResample(pred = nnetPred, obs = testY),
  "knn" = postResample(pred = knnPred, obs = testY)
) %>%
  kable() %>%
    kable_styling()

	RMSE	Rsquared	MAE
mars	1.321779	0.4528653	1.137929
svm	1.169016	0.4698346	0.887459
net	1.336501	0.3965722	1.051339
knn	1.250764	0.3593780	1.116389

The RMSE value for SVM was the lowest i.e Train: 0.3102481 and Test: 1.169016. Looks like SVM is optimal

Part B

varImp(svmRTuned)

## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess13  100.00
## ManufacturingProcess32   95.71
## ManufacturingProcess17   88.31
## BiologicalMaterial06     81.77
## ManufacturingProcess36   75.41
## ManufacturingProcess09   70.71
## BiologicalMaterial12     70.31
## BiologicalMaterial03     70.30
## ManufacturingProcess31   57.99
## ManufacturingProcess06   57.52
## BiologicalMaterial02     54.56
## BiologicalMaterial11     47.81
## ManufacturingProcess33   46.19
## ManufacturingProcess11   45.22
## BiologicalMaterial04     44.54
## ManufacturingProcess30   42.56
## BiologicalMaterial08     41.76
## ManufacturingProcess12   39.68
## BiologicalMaterial01     38.16
## ManufacturingProcess02   37.51

The above list shows most important predictors at the top, for the optimal model (SVM). There are slightly more process variables dominating the list rather than biological ones.

The below listings of top ten important predictors from the less optimal models against the SVM model, confirm that process variables dominate the list as being the most important. However, different models selected different process variables in the top ten list of importance.

varImp(marsTuned)

## earth variable importance
## 
##                        Overall
## ManufacturingProcess32  100.00
## ManufacturingProcess25   52.90
## ManufacturingProcess17   35.66
## BiologicalMaterial02     34.99
## ManufacturingProcess28   34.99
## BiologicalMaterial03     34.99
## ManufacturingProcess06   27.66
## ManufacturingProcess33   15.89
## BiologicalMaterial09      0.00

varImp(nnetTune)

## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 37)
## 
##                        Overall
## ManufacturingProcess17  100.00
## ManufacturingProcess36   85.40
## BiologicalMaterial03     79.61
## ManufacturingProcess31   65.67
## ManufacturingProcess06   65.14
## ManufacturingProcess33   52.30
## ManufacturingProcess11   51.21
## ManufacturingProcess12   44.94
## ManufacturingProcess02   42.47
## BiologicalMaterial09     42.09
## ManufacturingProcess01   33.81
## ManufacturingProcess04   32.17
## ManufacturingProcess27   28.35
## ManufacturingProcess35   23.09
## ManufacturingProcess20   22.43
## BiologicalMaterial10     21.69
## ManufacturingProcess16   19.95
## ManufacturingProcess24   19.63
## ManufacturingProcess21   16.20
## ManufacturingProcess28   14.26

varImp(knnModel)

## loess r-squared variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## ManufacturingProcess13  100.00
## ManufacturingProcess32   95.71
## ManufacturingProcess17   88.31
## BiologicalMaterial06     81.77
## ManufacturingProcess36   75.41
## ManufacturingProcess09   70.71
## BiologicalMaterial12     70.31
## BiologicalMaterial03     70.30
## ManufacturingProcess31   57.99
## ManufacturingProcess06   57.52
## BiologicalMaterial02     54.56
## BiologicalMaterial11     47.81
## ManufacturingProcess33   46.19
## ManufacturingProcess11   45.22
## BiologicalMaterial04     44.54
## ManufacturingProcess30   42.56
## BiologicalMaterial08     41.76
## ManufacturingProcess12   39.68
## BiologicalMaterial01     38.16
## ManufacturingProcess02   37.51

dotPlot(varImp(knnModel), top=20)

Part C

vip <- varImp(svmRTuned)$importance
top10Vars <- head(rownames(vip)[order(-vip$Overall)], 10)
as.data.frame(top10Vars)

##                 top10Vars
## 1  ManufacturingProcess13
## 2  ManufacturingProcess32
## 3  ManufacturingProcess17
## 4    BiologicalMaterial06
## 5  ManufacturingProcess36
## 6  ManufacturingProcess09
## 7    BiologicalMaterial12
## 8    BiologicalMaterial03
## 9  ManufacturingProcess31
## 10 ManufacturingProcess06

plotX <- df[,top10Vars]
plotY <- df[,colYield]

colnames(plotX) <- gsub("(Process|Material)", "", colnames(plotX))

featurePlot(x = plotX, y = plotY, plot = "scatter",
            type = c("p", "smooth"), span = .5,
            layout = c(3, 1))

This shows the relationship between the topn 10 predictors and the response. SVM is the optimal model. The top predictors appears to have linear relationship with the response. As per our findings ManufacturingProcess13 has the most positive correlation. Some of the Manufacturing Process have negative impact (negative correlation) on Yield, while most of the Biological Maeterial maintain a positive and balanced correlation.