Data 624 HW8: Non-Linear Regression
library(tidyverse)
library(fpp2)
library(urca)
library(rio)
library(gridExtra)
library(caret)
library(glmnet)
library(mlbench)
library(AppliedPredictiveModeling)
seed <- 2001
Do problems 7.2 and 7.5 in Kuhn and Johnson. There are only two but they have many parts. Please submit both a link to your Rpubs and the .rmd file.
1.1 Ex. 7.2
Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:
\[y = 10 \sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10 x_4 + 5 x_5 + N(0, \sigma^2)\]
where the \(x\) values are random variables uniformly distributed on \([0, 1]\) (the simulation also creates five additional non-informative predictors). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
#library(mlbench)
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune several models on these data.
Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
1.1.1 KNN
set.seed(seed)
knnModel <- train(x = trainingData$x, y = trainingData$y, method = "knn",
preProc = c("center", "scale"), tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.654912 0.4779838 2.958475
## 7 3.529432 0.5118581 2.861742
## 9 3.446330 0.5425096 2.780756
## 11 3.378049 0.5723793 2.719410
## 13 3.332339 0.5953773 2.692863
## 15 3.309235 0.6111389 2.663046
## 17 3.317408 0.6201421 2.678898
## 19 3.311667 0.6333800 2.682098
## 21 3.316340 0.6407537 2.688887
## 23 3.326040 0.6491480 2.705915
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
## loess r-squared variable importance
##
## Overall
## X4 100.0000
## X1 95.5047
## X2 89.6186
## X5 45.2170
## X3 29.9330
## X9 6.3299
## X10 5.5182
## X8 3.2527
## X6 0.8884
## X7 0.0000
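The importance table above is printed without its generating code in this knit; because k-NN has no intrinsic importance measure, caret falls back to a model-free loess \(R^2\) filter, which a call such as the following would produce (an assumption, not necessarily the author's exact chunk):
#k-NN has no built-in importance, so varImp() uses the loess R^2 of each predictor vs. y
varImp(knnModel)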
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set performance values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.1750657 0.6785946 2.5443169
1.1.2 SVM
set.seed(seed)
svmModel <- train(x = trainingData$x, y = trainingData$y, method = "svmRadial",
tuneLength = 14, preProc = c("center", "scale"),
trControl = trainControl(method = "cv"))
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.525164 0.7810576 2.010680
## 0.50 2.270567 0.7944850 1.794902
## 1.00 2.099356 0.8155574 1.659376
## 2.00 2.005858 0.8302852 1.578799
## 4.00 1.934650 0.8435677 1.528373
## 8.00 1.915665 0.8475605 1.528648
## 16.00 1.923914 0.8463074 1.535991
## 32.00 1.923914 0.8463074 1.535991
## 64.00 1.923914 0.8463074 1.535991
## 128.00 1.923914 0.8463074 1.535991
## 256.00 1.923914 0.8463074 1.535991
## 512.00 1.923914 0.8463074 1.535991
## 1024.00 1.923914 0.8463074 1.535991
## 2048.00 1.923914 0.8463074 1.535991
##
## Tuning parameter 'sigma' was held constant at a value of 0.06299324
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06299324 and C = 8.
## loess r-squared variable importance
##
## Overall
## X4 100.0000
## X1 95.5047
## X2 89.6186
## X5 45.2170
## X3 29.9330
## X9 6.3299
## X10 5.5182
## X8 3.2527
## X6 0.8884
## X7 0.0000
## RMSE Rsquared MAE
## 2.0541197 0.8290353 1.5586411
1.1.3 MARS
set.seed(seed)
marsModel <- train(x = trainingData$x, y = trainingData$y, method = "earth",
tuneGrid = expand.grid(.degree=1:2, .nprune=2:38),
trControl = trainControl(method = "cv", number=10))
marsModel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.188280 0.3042527 3.460689
## 1 3 3.551182 0.4999832 2.837116
## 1 4 2.653143 0.7167280 2.128222
## 1 5 2.405769 0.7562160 1.948161
## 1 6 2.295006 0.7754603 1.853199
## 1 7 1.771950 0.8611767 1.391357
## 1 8 1.647182 0.8774867 1.299564
## 1 9 1.609816 0.8837307 1.299705
## 1 10 1.635035 0.8798236 1.309436
## 1 11 1.571915 0.8896147 1.260711
## 1 12 1.571561 0.8898750 1.253077
## 1 13 1.567577 0.8906927 1.250795
## 1 14 1.571673 0.8909652 1.245508
## 1 15 1.571673 0.8909652 1.245508
## 1 16 1.571673 0.8909652 1.245508
## 1 17 1.571673 0.8909652 1.245508
## 1 18 1.571673 0.8909652 1.245508
## 1 19 1.571673 0.8909652 1.245508
## 1 20 1.571673 0.8909652 1.245508
## 1 21 1.571673 0.8909652 1.245508
## 1 22 1.571673 0.8909652 1.245508
## 1 23 1.571673 0.8909652 1.245508
## 1 24 1.571673 0.8909652 1.245508
## 1 25 1.571673 0.8909652 1.245508
## 1 26 1.571673 0.8909652 1.245508
## 1 27 1.571673 0.8909652 1.245508
## 1 28 1.571673 0.8909652 1.245508
## 1 29 1.571673 0.8909652 1.245508
## 1 30 1.571673 0.8909652 1.245508
## 1 31 1.571673 0.8909652 1.245508
## 1 32 1.571673 0.8909652 1.245508
## 1 33 1.571673 0.8909652 1.245508
## 1 34 1.571673 0.8909652 1.245508
## 1 35 1.571673 0.8909652 1.245508
## 1 36 1.571673 0.8909652 1.245508
## 1 37 1.571673 0.8909652 1.245508
## 1 38 1.571673 0.8909652 1.245508
## 2 2 4.188280 0.3042527 3.460689
## 2 3 3.551182 0.4999832 2.837116
## 2 4 2.615256 0.7216809 2.128763
## 2 5 2.344223 0.7683855 1.890080
## 2 6 2.275048 0.7762472 1.807779
## 2 7 1.841464 0.8418935 1.457945
## 2 8 1.641647 0.8839822 1.288520
## 2 9 1.535119 0.9002991 1.214772
## 2 10 1.473254 0.9101555 1.158761
## 2 11 1.379476 0.9207735 1.080991
## 2 12 1.285380 0.9283193 1.033426
## 2 13 1.267261 0.9328905 1.014726
## 2 14 1.261797 0.9327541 1.009821
## 2 15 1.266663 0.9320714 1.005751
## 2 16 1.270858 0.9322465 1.009757
## 2 17 1.263778 0.9327687 1.007653
## 2 18 1.263778 0.9327687 1.007653
## 2 19 1.263778 0.9327687 1.007653
## 2 20 1.263778 0.9327687 1.007653
## 2 21 1.263778 0.9327687 1.007653
## 2 22 1.263778 0.9327687 1.007653
## 2 23 1.263778 0.9327687 1.007653
## 2 24 1.263778 0.9327687 1.007653
## 2 25 1.263778 0.9327687 1.007653
## 2 26 1.263778 0.9327687 1.007653
## 2 27 1.263778 0.9327687 1.007653
## 2 28 1.263778 0.9327687 1.007653
## 2 29 1.263778 0.9327687 1.007653
## 2 30 1.263778 0.9327687 1.007653
## 2 31 1.263778 0.9327687 1.007653
## 2 32 1.263778 0.9327687 1.007653
## 2 33 1.263778 0.9327687 1.007653
## 2 34 1.263778 0.9327687 1.007653
## 2 35 1.263778 0.9327687 1.007653
## 2 36 1.263778 0.9327687 1.007653
## 2 37 1.263778 0.9327687 1.007653
## 2 38 1.263778 0.9327687 1.007653
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 14 and degree = 2.
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.24
## X2 48.74
## X5 15.53
## X3 0.00
marsPred <- predict(marsModel, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)
## RMSE Rsquared MAE
## 1.1722635 0.9448890 0.9324923
1.1.4 Performance
Among the three models, the tuned MARS model has the smallest resampling RMSE (1.262) with an \(R^2\) of about 0.933. It also gives the smallest test set RMSE (1.172) and the largest test set \(R^2\) (0.945). Thus, MARS appears to give the best overall performance.
The MARS model also selects only the informative predictors: its variable importance lists X1, X4, X2, X5, and X3 (the last with a scaled importance of 0), while the non-informative predictors X6–X10 are excluded. So yes, MARS identifies the informative predictors, unlike the model-free importance shown for KNN and SVM, which ranks all ten predictors.
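For a side-by-side view, the three test-set results can be collected into one table. A minimal sketch, assuming the knnModel, svmModel, and marsModel objects fitted above (caret::resamples() is not used because the resampling schemes differ across the models):
#tabulate held-out test-set performance for the three tuned models
rbind(KNN  = postResample(predict(knnModel,  newdata = testData$x), testData$y),
      SVM  = postResample(predict(svmModel,  newdata = testData$x), testData$y),
      MARS = postResample(predict(marsModel, newdata = testData$x), testData$y))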
1.2 Ex. 7.5
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
Which nonlinear regression model gives the optimal resampling and test set performance?
Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
1.2.1 Data Pre-Processing
The data frame ChemicalManufacturingProcess contains the 57 predictors (12 describing the input biological material and 45 describing the process) for the 176 manufacturing runs, plus the variable Yield, the percent yield of each run.
A small percentage of cells in the predictor set contain missing values; a kNN imputation function is used to fill in these missing values.
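The data-loading call itself is not echoed in this report; a minimal sketch of loading the data and checking where the missing values are, assuming the AppliedPredictiveModeling package loaded at the top:
#load the chemical manufacturing data (Yield plus 57 predictors, 176 runs)
data(ChemicalManufacturingProcess)
#count missing cells per column, keeping only columns that have any
na_counts <- colSums(is.na(ChemicalManufacturingProcess))
na_counts[na_counts > 0]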
## Yield BiologicalMaterial01 BiologicalMaterial02
## Min. :35.25 Min. :4.580 Min. :46.87
## 1st Qu.:38.75 1st Qu.:5.978 1st Qu.:52.68
## Median :39.97 Median :6.305 Median :55.09
## Mean :40.18 Mean :6.411 Mean :55.69
## 3rd Qu.:41.48 3rd Qu.:6.870 3rd Qu.:58.74
## Max. :46.34 Max. :8.810 Max. :64.75
##
## BiologicalMaterial03 BiologicalMaterial04 BiologicalMaterial05
## Min. :56.97 Min. : 9.38 Min. :13.24
## 1st Qu.:64.98 1st Qu.:11.24 1st Qu.:17.23
## Median :67.22 Median :12.10 Median :18.49
## Mean :67.70 Mean :12.35 Mean :18.60
## 3rd Qu.:70.43 3rd Qu.:13.22 3rd Qu.:19.90
## Max. :78.25 Max. :23.09 Max. :24.85
##
## BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial08
## Min. :40.60 Min. :100.0 Min. :15.88
## 1st Qu.:46.05 1st Qu.:100.0 1st Qu.:17.06
## Median :48.46 Median :100.0 Median :17.51
## Mean :48.91 Mean :100.0 Mean :17.49
## 3rd Qu.:51.34 3rd Qu.:100.0 3rd Qu.:17.88
## Max. :59.38 Max. :100.8 Max. :19.14
##
## BiologicalMaterial09 BiologicalMaterial10 BiologicalMaterial11
## Min. :11.44 Min. :1.770 Min. :135.8
## 1st Qu.:12.60 1st Qu.:2.460 1st Qu.:143.8
## Median :12.84 Median :2.710 Median :146.1
## Mean :12.85 Mean :2.801 Mean :147.0
## 3rd Qu.:13.13 3rd Qu.:2.990 3rd Qu.:149.6
## Max. :14.08 Max. :6.870 Max. :158.7
##
## BiologicalMaterial12 ManufacturingProcess01 ManufacturingProcess02
## Min. :18.35 Min. : 0.00 Min. : 0.00
## 1st Qu.:19.73 1st Qu.:10.80 1st Qu.:19.30
## Median :20.12 Median :11.40 Median :21.00
## Mean :20.20 Mean :11.21 Mean :16.68
## 3rd Qu.:20.75 3rd Qu.:12.15 3rd Qu.:21.50
## Max. :22.21 Max. :14.10 Max. :22.50
## NA's :1 NA's :3
## ManufacturingProcess03 ManufacturingProcess04 ManufacturingProcess05
## Min. :1.47 Min. :911.0 Min. : 923.0
## 1st Qu.:1.53 1st Qu.:928.0 1st Qu.: 986.8
## Median :1.54 Median :934.0 Median : 999.2
## Mean :1.54 Mean :931.9 Mean :1001.7
## 3rd Qu.:1.55 3rd Qu.:936.0 3rd Qu.:1008.9
## Max. :1.60 Max. :946.0 Max. :1175.3
## NA's :15 NA's :1 NA's :1
## ManufacturingProcess06 ManufacturingProcess07 ManufacturingProcess08
## Min. :203.0 Min. :177.0 Min. :177.0
## 1st Qu.:205.7 1st Qu.:177.0 1st Qu.:177.0
## Median :206.8 Median :177.0 Median :178.0
## Mean :207.4 Mean :177.5 Mean :177.6
## 3rd Qu.:208.7 3rd Qu.:178.0 3rd Qu.:178.0
## Max. :227.4 Max. :178.0 Max. :178.0
## NA's :2 NA's :1 NA's :1
## ManufacturingProcess09 ManufacturingProcess10 ManufacturingProcess11
## Min. :38.89 Min. : 7.500 Min. : 7.500
## 1st Qu.:44.89 1st Qu.: 8.700 1st Qu.: 9.000
## Median :45.73 Median : 9.100 Median : 9.400
## Mean :45.66 Mean : 9.179 Mean : 9.386
## 3rd Qu.:46.52 3rd Qu.: 9.550 3rd Qu.: 9.900
## Max. :49.36 Max. :11.600 Max. :11.500
## NA's :9 NA's :10
## ManufacturingProcess12 ManufacturingProcess13 ManufacturingProcess14
## Min. : 0.0 Min. :32.10 Min. :4701
## 1st Qu.: 0.0 1st Qu.:33.90 1st Qu.:4828
## Median : 0.0 Median :34.60 Median :4856
## Mean : 857.8 Mean :34.51 Mean :4854
## 3rd Qu.: 0.0 3rd Qu.:35.20 3rd Qu.:4882
## Max. :4549.0 Max. :38.60 Max. :5055
## NA's :1 NA's :1
## ManufacturingProcess15 ManufacturingProcess16 ManufacturingProcess17
## Min. :5904 Min. : 0 Min. :31.30
## 1st Qu.:6010 1st Qu.:4561 1st Qu.:33.50
## Median :6032 Median :4588 Median :34.40
## Mean :6039 Mean :4566 Mean :34.34
## 3rd Qu.:6061 3rd Qu.:4619 3rd Qu.:35.10
## Max. :6233 Max. :4852 Max. :40.00
##
## ManufacturingProcess18 ManufacturingProcess19 ManufacturingProcess20
## Min. : 0 Min. :5890 Min. : 0
## 1st Qu.:4813 1st Qu.:6001 1st Qu.:4553
## Median :4835 Median :6022 Median :4582
## Mean :4810 Mean :6028 Mean :4556
## 3rd Qu.:4862 3rd Qu.:6050 3rd Qu.:4610
## Max. :4971 Max. :6146 Max. :4759
##
## ManufacturingProcess21 ManufacturingProcess22 ManufacturingProcess23
## Min. :-1.8000 Min. : 0.000 Min. :0.000
## 1st Qu.:-0.6000 1st Qu.: 3.000 1st Qu.:2.000
## Median :-0.3000 Median : 5.000 Median :3.000
## Mean :-0.1642 Mean : 5.406 Mean :3.017
## 3rd Qu.: 0.0000 3rd Qu.: 8.000 3rd Qu.:4.000
## Max. : 3.6000 Max. :12.000 Max. :6.000
## NA's :1 NA's :1
## ManufacturingProcess24 ManufacturingProcess25 ManufacturingProcess26
## Min. : 0.000 Min. : 0 Min. : 0
## 1st Qu.: 4.000 1st Qu.:4832 1st Qu.:6020
## Median : 8.000 Median :4855 Median :6047
## Mean : 8.834 Mean :4828 Mean :6016
## 3rd Qu.:14.000 3rd Qu.:4877 3rd Qu.:6070
## Max. :23.000 Max. :4990 Max. :6161
## NA's :1 NA's :5 NA's :5
## ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## Min. : 0 Min. : 0.000 Min. : 0.00
## 1st Qu.:4560 1st Qu.: 0.000 1st Qu.:19.70
## Median :4587 Median :10.400 Median :19.90
## Mean :4563 Mean : 6.592 Mean :20.01
## 3rd Qu.:4609 3rd Qu.:10.750 3rd Qu.:20.40
## Max. :4710 Max. :11.500 Max. :22.00
## NA's :5 NA's :5 NA's :5
## ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess32
## Min. : 0.000 Min. : 0.00 Min. :143.0
## 1st Qu.: 8.800 1st Qu.:70.10 1st Qu.:155.0
## Median : 9.100 Median :70.80 Median :158.0
## Mean : 9.161 Mean :70.18 Mean :158.5
## 3rd Qu.: 9.700 3rd Qu.:71.40 3rd Qu.:162.0
## Max. :11.200 Max. :72.50 Max. :173.0
## NA's :5 NA's :5
## ManufacturingProcess33 ManufacturingProcess34 ManufacturingProcess35
## Min. :56.00 Min. :2.300 Min. :463.0
## 1st Qu.:62.00 1st Qu.:2.500 1st Qu.:490.0
## Median :64.00 Median :2.500 Median :495.0
## Mean :63.54 Mean :2.494 Mean :495.6
## 3rd Qu.:65.00 3rd Qu.:2.500 3rd Qu.:501.5
## Max. :70.00 Max. :2.600 Max. :522.0
## NA's :5 NA's :5 NA's :5
## ManufacturingProcess36 ManufacturingProcess37 ManufacturingProcess38
## Min. :0.01700 Min. :0.000 Min. :0.000
## 1st Qu.:0.01900 1st Qu.:0.700 1st Qu.:2.000
## Median :0.02000 Median :1.000 Median :3.000
## Mean :0.01957 Mean :1.014 Mean :2.534
## 3rd Qu.:0.02000 3rd Qu.:1.300 3rd Qu.:3.000
## Max. :0.02200 Max. :2.300 Max. :3.000
## NA's :5
## ManufacturingProcess39 ManufacturingProcess40 ManufacturingProcess41
## Min. :0.000 Min. :0.00000 Min. :0.00000
## 1st Qu.:7.100 1st Qu.:0.00000 1st Qu.:0.00000
## Median :7.200 Median :0.00000 Median :0.00000
## Mean :6.851 Mean :0.01771 Mean :0.02371
## 3rd Qu.:7.300 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :7.500 Max. :0.10000 Max. :0.20000
## NA's :1 NA's :1
## ManufacturingProcess42 ManufacturingProcess43 ManufacturingProcess44
## Min. : 0.00 Min. : 0.0000 Min. :0.000
## 1st Qu.:11.40 1st Qu.: 0.6000 1st Qu.:1.800
## Median :11.60 Median : 0.8000 Median :1.900
## Mean :11.21 Mean : 0.9119 Mean :1.805
## 3rd Qu.:11.70 3rd Qu.: 1.0250 3rd Qu.:1.900
## Max. :12.10 Max. :11.0000 Max. :2.100
##
## ManufacturingProcess45
## Min. :0.000
## 1st Qu.:2.100
## Median :2.200
## Mean :2.138
## 3rd Qu.:2.300
## Max. :2.600
##
predictors <- ChemicalManufacturingProcess[,-c(1)]
#fill in missing values from textbook sec3.8
cmp_pre <- preProcess(predictors, method="knnImpute")
#apply the transformations
cmp_predictors <- predict(cmp_pre, predictors)
Split the data into a training and a test set, pre-process the data, and tune models:
- Pre-process the data with centering and scaling.
cmp_pre <- preProcess(cmp_predictors, method=c("center", "scale"))
cmp_predictors <- predict(cmp_pre, cmp_predictors)
- Train-test split at 70%
set.seed(0)
trainingRows <- createDataPartition(ChemicalManufacturingProcess$Yield,
p=0.70, list=FALSE) #caret, textbook sec4.9
train_X <- cmp_predictors[trainingRows, ]
train_Y <- ChemicalManufacturingProcess$Yield[trainingRows]
test_X <- cmp_predictors[-trainingRows, ]
test_Y <- ChemicalManufacturingProcess$Yield[-trainingRows]
1.2.2 Models
KNN
set.seed(seed)
knnModel <- train(x = train_X, y = train_Y, method = "knn",
preProc = c("center", "scale"), tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 124 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 124, 124, 124, 124, 124, 124, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.539813 0.3750201 1.227161
## 7 1.524679 0.3841183 1.208823
## 9 1.523703 0.3906871 1.216281
## 11 1.537252 0.3837573 1.230434
## 13 1.547875 0.3794296 1.247661
## 15 1.546669 0.3857709 1.240240
## 17 1.553577 0.3852770 1.249162
## 19 1.566202 0.3838446 1.257550
## 21 1.577298 0.3815472 1.266863
## 23 1.587820 0.3753654 1.275620
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 82.63
## BiologicalMaterial03 79.22
## BiologicalMaterial06 73.12
## ManufacturingProcess17 67.01
## ManufacturingProcess09 65.99
## BiologicalMaterial12 56.03
## BiologicalMaterial02 55.46
## ManufacturingProcess36 55.44
## ManufacturingProcess06 52.99
## ManufacturingProcess31 51.42
## ManufacturingProcess11 44.14
## ManufacturingProcess30 40.38
## BiologicalMaterial04 38.92
## ManufacturingProcess20 38.51
## ManufacturingProcess33 38.33
## BiologicalMaterial11 37.23
## BiologicalMaterial01 35.48
## BiologicalMaterial08 32.69
## ManufacturingProcess27 31.73
## RMSE Rsquared MAE
## 1.1623546 0.5270362 0.9460897
SVM
set.seed(seed)
svmModel <- train(x = train_X, y = train_Y, method = "svmRadial",
tuneLength = 14, preProc = c("center", "scale"),
trControl = trainControl(method = "cv"))
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 124 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 111, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.509470 0.5022116 1.2202423
## 0.50 1.360055 0.5763501 1.1127471
## 1.00 1.225229 0.6325938 0.9946640
## 2.00 1.195651 0.6315744 0.9513683
## 4.00 1.165029 0.6506106 0.9351621
## 8.00 1.160765 0.6532519 0.9322085
## 16.00 1.160765 0.6532519 0.9322085
## 32.00 1.160765 0.6532519 0.9322085
## 64.00 1.160765 0.6532519 0.9322085
## 128.00 1.160765 0.6532519 0.9322085
## 256.00 1.160765 0.6532519 0.9322085
## 512.00 1.160765 0.6532519 0.9322085
## 1024.00 1.160765 0.6532519 0.9322085
## 2048.00 1.160765 0.6532519 0.9322085
##
## Tuning parameter 'sigma' was held constant at a value of 0.01678543
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01678543 and C = 8.
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 82.63
## BiologicalMaterial03 79.22
## BiologicalMaterial06 73.12
## ManufacturingProcess17 67.01
## ManufacturingProcess09 65.99
## BiologicalMaterial12 56.03
## BiologicalMaterial02 55.46
## ManufacturingProcess36 55.44
## ManufacturingProcess06 52.99
## ManufacturingProcess31 51.42
## ManufacturingProcess11 44.14
## ManufacturingProcess30 40.38
## BiologicalMaterial04 38.92
## ManufacturingProcess20 38.51
## ManufacturingProcess33 38.33
## BiologicalMaterial11 37.23
## BiologicalMaterial01 35.48
## BiologicalMaterial08 32.69
## ManufacturingProcess27 31.73
## RMSE Rsquared MAE
## 0.9996084 0.6780015 0.7966498
MARS
set.seed(seed)
marsModel <- train(x = train_X, y = train_Y, method = "earth",
tuneGrid = expand.grid(.degree=1:2, .nprune=2:38),
trControl = trainControl(method = "cv", number=10))
marsModel
## Multivariate Adaptive Regression Spline
##
## 124 samples
## 57 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 111, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.464379 0.4278744 1.1434979
## 1 3 1.243938 0.5876275 1.0016038
## 1 4 1.167763 0.6475525 0.9606690
## 1 5 1.195458 0.6213747 0.9880046
## 1 6 1.219402 0.6154096 1.0002067
## 1 7 1.332260 0.5408939 1.0814623
## 1 8 1.335695 0.5495798 1.0753074
## 1 9 1.269230 0.6084046 1.0217191
## 1 10 1.294814 0.5940752 1.0425001
## 1 11 1.318073 0.5882004 1.0837287
## 1 12 1.301646 0.5945737 1.0754887
## 1 13 1.296171 0.6102179 1.0757344
## 1 14 1.285848 0.6178035 1.0609679
## 1 15 1.308401 0.6081256 1.0756533
## 1 16 1.325121 0.6004614 1.0851917
## 1 17 1.326365 0.5990722 1.0910575
## 1 18 1.326365 0.5990722 1.0910575
## 1 19 1.326365 0.5990722 1.0910575
## 1 20 1.326365 0.5990722 1.0910575
## 1 21 1.326365 0.5990722 1.0910575
## 1 22 1.326365 0.5990722 1.0910575
## 1 23 1.326365 0.5990722 1.0910575
## 1 24 1.326365 0.5990722 1.0910575
## 1 25 1.326365 0.5990722 1.0910575
## 1 26 1.326365 0.5990722 1.0910575
## 1 27 1.326365 0.5990722 1.0910575
## 1 28 1.326365 0.5990722 1.0910575
## 1 29 1.326365 0.5990722 1.0910575
## 1 30 1.326365 0.5990722 1.0910575
## 1 31 1.326365 0.5990722 1.0910575
## 1 32 1.326365 0.5990722 1.0910575
## 1 33 1.326365 0.5990722 1.0910575
## 1 34 1.326365 0.5990722 1.0910575
## 1 35 1.326365 0.5990722 1.0910575
## 1 36 1.326365 0.5990722 1.0910575
## 1 37 1.326365 0.5990722 1.0910575
## 1 38 1.326365 0.5990722 1.0910575
## 2 2 1.513972 0.4039588 1.1708820
## 2 3 1.318005 0.5209985 1.0556696
## 2 4 1.289605 0.5461899 1.0414659
## 2 5 1.804329 0.5199665 1.2763757
## 2 6 3.593781 0.5360845 1.7730602
## 2 7 3.484069 0.5828953 1.7331673
## 2 8 3.526391 0.5447415 1.7698416
## 2 9 3.633117 0.5220216 1.7976770
## 2 10 3.869048 0.5159321 1.9071128
## 2 11 3.981311 0.5005362 1.9322848
## 2 12 3.508545 0.4968024 1.8013002
## 2 13 4.171823 0.5098865 1.9908573
## 2 14 3.751825 0.4814136 1.8763742
## 2 15 3.566567 0.4850190 1.8138385
## 2 16 3.669947 0.4928431 1.8672913
## 2 17 3.678524 0.4917893 1.8611460
## 2 18 3.671100 0.4972673 1.8615724
## 2 19 3.776047 0.4496105 1.9290834
## 2 20 3.770828 0.4525992 1.9179708
## 2 21 3.748076 0.4624188 1.8929645
## 2 22 3.748076 0.4624188 1.8929645
## 2 23 3.748076 0.4624188 1.8929645
## 2 24 3.748076 0.4624188 1.8929645
## 2 25 3.748076 0.4624188 1.8929645
## 2 26 3.748076 0.4624188 1.8929645
## 2 27 3.748076 0.4624188 1.8929645
## 2 28 3.748076 0.4624188 1.8929645
## 2 29 3.748076 0.4624188 1.8929645
## 2 30 3.748076 0.4624188 1.8929645
## 2 31 3.748076 0.4624188 1.8929645
## 2 32 3.748076 0.4624188 1.8929645
## 2 33 3.748076 0.4624188 1.8929645
## 2 34 3.748076 0.4624188 1.8929645
## 2 35 3.748076 0.4624188 1.8929645
## 2 36 3.748076 0.4624188 1.8929645
## 2 37 3.748076 0.4624188 1.8929645
## 2 38 3.748076 0.4624188 1.8929645
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
## earth variable importance
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess09 43.86
## ManufacturingProcess13 0.00
## RMSE Rsquared MAE
## 1.0953033 0.5924993 0.9007130
Neural Networks
set.seed(seed)
nnetModel <- train(x = train_X, y = train_Y, method = "avNNet",
tuneGrid = expand.grid(.decay = c(0, 0.01, 0.1), .size = c(1, 5, 10), .bag = FALSE),
trControl = trainControl(method = "cv"), preProcess=c("center", "scale"),
linout = TRUE, trace = FALSE, maxit = 50)
nnetModel
## Model Averaged Neural Network
##
## 124 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 111, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## decay size RMSE Rsquared MAE
## 0.00 1 1.646012 0.3941778 1.355516
## 0.00 5 1.868920 0.3900451 1.481834
## 0.00 10 3.183190 0.2278582 2.397276
## 0.01 1 2.548174 0.3961209 1.632880
## 0.01 5 1.853580 0.3894560 1.530885
## 0.01 10 3.341813 0.2083801 2.485296
## 0.10 1 1.379039 0.5078523 1.109062
## 0.10 5 1.952719 0.3602016 1.493032
## 0.10 10 3.057725 0.3724728 2.385928
##
## Tuning parameter 'bag' was held constant at a value of FALSE
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1, decay = 0.1 and bag
## = FALSE.
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 82.63
## BiologicalMaterial03 79.22
## BiologicalMaterial06 73.12
## ManufacturingProcess17 67.01
## ManufacturingProcess09 65.99
## BiologicalMaterial12 56.03
## BiologicalMaterial02 55.46
## ManufacturingProcess36 55.44
## ManufacturingProcess06 52.99
## ManufacturingProcess31 51.42
## ManufacturingProcess11 44.14
## ManufacturingProcess30 40.38
## BiologicalMaterial04 38.92
## ManufacturingProcess20 38.51
## ManufacturingProcess33 38.33
## BiologicalMaterial11 37.23
## BiologicalMaterial01 35.48
## BiologicalMaterial08 32.69
## ManufacturingProcess27 31.73
## RMSE Rsquared MAE
## 1.1014408 0.5837736 0.8749950
1.2.3 Part a
Which nonlinear regression model gives the optimal resampling and test set performance?
Answer:
- The SVM model gives both the best resampling performance (cross-validated RMSE 1.161, \(R^2\) 0.653) and the best test set performance (RMSE 0.9996, \(R^2\) 0.678), so it is the optimal nonlinear regression model here.
#KNN
## RMSE Rsquared MAE
## 1.1623546 0.5270362 0.9460897
#SVM
## RMSE Rsquared MAE
## 0.9996084 0.6780015 0.7966498
#MARS
marsModel$results[which((marsModel$results$nprune==marsModel$bestTune$nprune) & (marsModel$results$degree==marsModel$bestTune$degree)),]
## RMSE Rsquared MAE
## 1.0953033 0.5924993 0.9007130
#neural networks
nnetModel$results[which((nnetModel$results$size==nnetModel$bestTune$size) & (nnetModel$results$decay==nnetModel$bestTune$decay)),]
## RMSE Rsquared MAE
## 1.1014408 0.5837736 0.8749950
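A compact way to gather the four test-set results into a single table; a sketch, assuming the model objects and the test_X / test_Y split defined above:
#test-set performance of each nonlinear model on the chemical manufacturing data
models <- list(KNN = knnModel, SVM = svmModel, MARS = marsModel, NNet = nnetModel)
t(sapply(models, function(m) postResample(predict(m, newdata = test_X), test_Y)))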
1.2.4 Part b
Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
Answer:
From HW7, the optimal linear regression model was the elastic net model.
- Elastic net: ManufacturingProcess predictors dominate the top ten (6 of the 10 most important predictors).
In HW8, the optimal nonlinear regression model is the SVM model.
- SVM: ManufacturingProcess predictors again dominate the top ten (6 of the 10 most important predictors).
The two importance rankings are in fact identical: caret reports the same model-free (loess \(R^2\)) variable importance for both the SVM and the elastic net fits, even though their performance metrics differ.
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 82.63
## BiologicalMaterial03 79.22
## BiologicalMaterial06 73.12
## ManufacturingProcess17 67.01
## ManufacturingProcess09 65.99
## BiologicalMaterial12 56.03
## BiologicalMaterial02 55.46
## ManufacturingProcess36 55.44
## ManufacturingProcess06 52.99
## ManufacturingProcess31 51.42
## ManufacturingProcess11 44.14
## ManufacturingProcess30 40.38
## BiologicalMaterial04 38.92
## ManufacturingProcess20 38.51
## ManufacturingProcess33 38.33
## BiologicalMaterial11 37.23
## BiologicalMaterial01 35.48
## BiologicalMaterial08 32.69
## ManufacturingProcess27 31.73
set.seed(seed)
elastic <- train(x=train_X, y=train_Y, method="enet",
tuneGrid=data.frame(.lambda = seq(0,0.5,length=50), .fraction=seq(0,0.5,length=50)),
preProcess=c("center", "scale"),
trControl=trainControl(method="cv", number=10))
elastic
## Elasticnet
##
## 124 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 112, 112, 111, 112, 112, 112, ...
## Resampling results across tuning parameters:
##
## lambda fraction RMSE Rsquared MAE
## 0.00000000 0.00000000 1.890434 NaN 1.5425557
## 0.01020408 0.01020408 1.817516 0.4391479 1.4808196
## 0.02040816 0.02040816 1.772642 0.5080879 1.4442389
## 0.03061224 0.03061224 1.733552 0.5359346 1.4129109
## 0.04081633 0.04081633 1.698357 0.5501510 1.3845825
## 0.05102041 0.05102041 1.666203 0.5584089 1.3584355
## 0.06122449 0.06122449 1.634841 0.5648583 1.3332894
## 0.07142857 0.07142857 1.604208 0.5698291 1.3108074
## 0.08163265 0.08163265 1.574908 0.5732344 1.2889191
## 0.09183673 0.09183673 1.546696 0.5756044 1.2672368
## 0.10204082 0.10204082 1.519168 0.5782081 1.2454729
## 0.11224490 0.11224490 1.493024 0.5793936 1.2242187
## 0.12244898 0.12244898 1.468743 0.5791708 1.2044267
## 0.13265306 0.13265306 1.446453 0.5784229 1.1862645
## 0.14285714 0.14285714 1.425721 0.5783395 1.1692938
## 0.15306122 0.15306122 1.406014 0.5782555 1.1519790
## 0.16326531 0.16326531 1.386617 0.5791927 1.1356762
## 0.17346939 0.17346939 1.368024 0.5797880 1.1197073
## 0.18367347 0.18367347 1.350411 0.5802187 1.1046644
## 0.19387755 0.19387755 1.332393 0.5819665 1.0895939
## 0.20408163 0.20408163 1.315042 0.5837706 1.0754094
## 0.21428571 0.21428571 1.299123 0.5851397 1.0621023
## 0.22448980 0.22448980 1.284442 0.5861528 1.0485011
## 0.23469388 0.23469388 1.271627 0.5868455 1.0359997
## 0.24489796 0.24489796 1.260896 0.5873705 1.0239241
## 0.25510204 0.25510204 1.251201 0.5881862 1.0137090
## 0.26530612 0.26530612 1.242843 0.5889471 1.0051619
## 0.27551020 0.27551020 1.235348 0.5898128 0.9965658
## 0.28571429 0.28571429 1.229286 0.5907646 0.9925424
## 0.29591837 0.29591837 1.224125 0.5914817 0.9888079
## 0.30612245 0.30612245 1.219658 0.5924365 0.9853696
## 0.31632653 0.31632653 1.216185 0.5931963 0.9834535
## 0.32653061 0.32653061 1.213927 0.5935066 0.9830463
## 0.33673469 0.33673469 1.212475 0.5936325 0.9826090
## 0.34693878 0.34693878 1.210793 0.5945135 0.9830096
## 0.35714286 0.35714286 1.210064 0.5951952 0.9847912
## 0.36734694 0.36734694 1.210907 0.5953524 0.9869539
## 0.37755102 0.37755102 1.228397 0.5874790 0.9983395
## 0.38775510 0.38775510 1.252857 0.5812224 1.0099827
## 0.39795918 0.39795918 1.275181 0.5782507 1.0190608
## 0.40816327 0.40816327 1.295162 0.5767594 1.0264255
## 0.41836735 0.41836735 1.316649 0.5756028 1.0338327
## 0.42857143 0.42857143 1.339099 0.5747121 1.0416001
## 0.43877551 0.43877551 1.361970 0.5741929 1.0496835
## 0.44897959 0.44897959 1.385508 0.5738073 1.0587947
## 0.45918367 0.45918367 1.409824 0.5733505 1.0683118
## 0.46938776 0.46938776 1.434383 0.5729086 1.0781053
## 0.47959184 0.47959184 1.459054 0.5724823 1.0880083
## 0.48979592 0.48979592 1.483195 0.5719178 1.0982693
## 0.50000000 0.50000000 1.507579 0.5712249 1.1088112
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were fraction = 0.3571429 and lambda
## = 0.3571429.
## RMSE Rsquared MAE
## 1.0609086 0.6062310 0.8502554
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess13 82.63
## BiologicalMaterial03 79.22
## BiologicalMaterial06 73.12
## ManufacturingProcess17 67.01
## ManufacturingProcess09 65.99
## BiologicalMaterial12 56.03
## BiologicalMaterial02 55.46
## ManufacturingProcess36 55.44
## ManufacturingProcess06 52.99
## ManufacturingProcess31 51.42
## ManufacturingProcess11 44.14
## ManufacturingProcess30 40.38
## BiologicalMaterial04 38.92
## ManufacturingProcess20 38.51
## ManufacturingProcess33 38.33
## BiologicalMaterial11 37.23
## BiologicalMaterial01 35.48
## BiologicalMaterial08 32.69
## ManufacturingProcess27 31.73
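As noted above, the SVM and elastic net rankings coincide because caret reports the same model-free loess importance for both fits. A quick check of the overlap between the two top-ten lists, as a sketch assuming the svmModel and elastic objects above:
#extract the ten highest-ranked predictors from a caret fit
top10 <- function(fit) {
  imp <- varImp(fit)$importance
  rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:10]
}
setdiff(top10(svmModel), top10(elastic)) #character(0) when the two lists match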
1.2.5 Part c
Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
Because the importance ranking from the SVM model is identical to that of the optimal linear (elastic net) model, no top predictors are unique to the nonlinear model, so the exploration looks at the top ten predictors overall. According to the correlation plot of these top ten predictors, ManufacturingProcess13, ManufacturingProcess17, and ManufacturingProcess36 are negatively correlated with Yield, while the remaining top predictors are positively correlated with it.
Among the positively correlated predictors, ManufacturingProcess32 is the most strongly related to Yield, with a correlation coefficient of 0.6083321; improving this process step should therefore tend to improve the yield.
rn <- varImp(svmModel)$importance %>% arrange(desc(Overall)) %>% rownames() %>% .[1:10]
m <- cmp_predictors %>% select(all_of(rn)) %>% cbind(Yield = ChemicalManufacturingProcess$Yield)
library(corrplot)
corrplot(cor(m), type="lower")
## [1] -0.5036797
## [1] -0.4258069
## [1] -0.5237389
plot(cmp_predictors$ManufacturingProcess36, ChemicalManufacturingProcess$Yield)
abline(lm(ChemicalManufacturingProcess$Yield~cmp_predictors$ManufacturingProcess36),col="red")
## [1] 0.6083321
plot(cmp_predictors$ManufacturingProcess32, ChemicalManufacturingProcess$Yield)
abline(lm(ChemicalManufacturingProcess$Yield~cmp_predictors$ManufacturingProcess32),col="red")
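The unlabeled correlations printed above have no accompanying code in this knit; they are presumably plain correlations between Yield and individual top predictors. A sketch, assuming the objects above (which printed value maps to which predictor is an assumption):
#correlation of Yield with the negatively related process predictors and with MP32
sapply(c("ManufacturingProcess13", "ManufacturingProcess17",
         "ManufacturingProcess36", "ManufacturingProcess32"),
       function(p) cor(cmp_predictors[[p]], ChemicalManufacturingProcess$Yield))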