alt text here

alt text here

library(caret)   # featurePlot(), train() and postResample() used below come from caret
library(mlbench)
## Warning: package 'mlbench' was built under R version 3.5.3

## Simulate the training set. mlbench.friedman1() creates a list with a
## vector 'y' and a matrix of predictors 'x'.
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)

## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)

## Look at the data using featurePlot() or other methods.
featurePlot(trainingData$x, trainingData$y)

## Also simulate a large test set to estimate the true error rate
## with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)

Tune several models on these data. For example:

## Tune a k-nearest-neighbours model over 10 candidate values of k.
## The predictors are centred and scaled before fitting. The argument is
## spelled out as 'preProcess' rather than relying on partial matching
## of 'preProc'.
knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10
)
knnModel
## k-Nearest Neighbors 
## 
## 200 samples
##  10 predictor
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  3.565620  0.4887976  2.886629
##    7  3.422420  0.5300524  2.752964
##    9  3.368072  0.5536927  2.715310
##   11  3.323010  0.5779056  2.669375
##   13  3.275835  0.6030846  2.628663
##   15  3.261864  0.6163510  2.621192
##   17  3.261973  0.6267032  2.616956
##   19  3.286299  0.6281075  2.640585
##   21  3.280950  0.6390386  2.643807
##   23  3.292397  0.6440392  2.656080
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.

The model with the smallest resampled RMSE (k = 15) is selected.

## Predict the simulated test set with the tuned KNN model.
knnPred <- predict(knnModel, newdata = testData[["x"]])

## postResample() reports the test set performance values
## (RMSE, Rsquared and MAE) for those predictions.
postResample(pred = knnPred, obs = testData[["y"]])
##      RMSE  Rsquared       MAE 
## 3.1750657 0.6785946 2.5443169

Which models appear to give the best performance? Does MARS select the informative predictors (those named X1-X5)?

## Tune a MARS model (caret method "earth") on the same training data,
## then score it on the simulated test set.
marsModel <- train(
  x = trainingData[["x"]],
  y = trainingData[["y"]],
  method = "earth",
  preProcess = c("center", "scale"),
  tuneLength = 10
)
marsPred <- predict(marsModel, newdata = testData[["x"]])
postResample(pred = marsPred, obs = testData[["y"]])
##     RMSE Rsquared      MAE 
## 1.776575 0.872700 1.358367

MARS appears to be better than KNN on the test set: it has a lower RMSE (1.78 vs. 3.18) and a higher R-squared (0.87 vs. 0.68).


7.5

7.5. Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.

library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)

Impute missing data.

# Fit a k-nearest-neighbour imputation recipe on the full data set and print it.
# NOTE(review): `cmp` is not applied with predict() anywhere in the visible
# script; the models below impute via "medianImpute" in `pre_process` instead.
cmp <- preProcess(ChemicalManufacturingProcess, method = "knnImpute")
cmp
## Created from 152 samples and 58 variables
## 
## Pre-processing:
##   - centered (58)
##   - ignored (0)
##   - 5 nearest neighbor imputation (58)
##   - scaled (58)
# Separate the predictors (columns 2-58) from the response in column 1.
# Uses `<-` for assignment instead of `=`.
cmp_predictors <- as.matrix(ChemicalManufacturingProcess[, 2:58])
cmp_yield <- ChemicalManufacturingProcess[, 1]

Split data into training and testing with 75-25 split.

set.seed(100)

# Pick 75% of the rows for training. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
train_select <- createDataPartition(cmp_yield, p = 0.75, list = FALSE)

# Column 1 is the response; every other column is a predictor.
train_x <- ChemicalManufacturingProcess[train_select, -1]
train_y <- ChemicalManufacturingProcess[train_select, 1]
test_x <- ChemicalManufacturingProcess[-train_select, -1]
test_y <- ChemicalManufacturingProcess[-train_select, 1]

# Pre-processing steps handed to every train() call below.
pre_process <- c("nzv", "corr", "center", "scale", "medianImpute")

PLS

# Tune PLS over 10 values of ncomp using 5 repeats of cross-validation.
# The resampling control must be passed as `trControl`; the previous
# `trainControl=` name is not a train() argument, so it was swallowed by `...`
# and train() silently fell back to the default bootstrap.
# NOTE: the printed results that follow were produced with that bootstrap
# (25 reps) and need to be regenerated.
plsModel <- train(
  x = train_x,
  y = train_y,
  method = "pls",
  tuneLength = 10,
  preProcess = pre_process,
  trControl = trainControl(method = "repeatedcv", repeats = 5)
)
plsModel
## Partial Least Squares 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (47), scaled (47), median imputation (47),
##  remove (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE      Rsquared   MAE     
##    1     1.536017  0.4137568  1.165840
##    2     2.017128  0.3693663  1.266490
##    3     1.723892  0.4218530  1.209382
##    4     1.848094  0.3960479  1.256171
##    5     2.049340  0.3634584  1.324814
##    6     2.225879  0.3413780  1.382096
##    7     2.393875  0.3347665  1.421663
##    8     2.534497  0.3343900  1.460528
##    9     2.739731  0.3035248  1.528569
##   10     3.018755  0.2710653  1.598368
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
# Resampling profile across the candidate numbers of components.
plot(plsModel)

# Score the held-out rows with the tuned PLS model and print the metrics.
plsPred <- predict(plsModel, newdata = test_x)
pls_n <- postResample(pred = plsPred, obs = test_y)
pls_n
##     RMSE Rsquared      MAE 
## 1.342746 0.382608 1.151274

Nonlinear regression models

KNN

# Tune KNN over 10 values of k using 5 repeats of cross-validation.
# The resampling control must be passed as `trControl`; the previous
# `trainControl=` name is not a train() argument, so it was swallowed by `...`
# and train() silently fell back to the default bootstrap.
# NOTE: the printed results that follow were produced with that bootstrap
# (25 reps) and need to be regenerated.
knnModel <- train(
  x = train_x,
  y = train_y,
  method = "knn",
  preProcess = pre_process,
  tuneLength = 10,
  trControl = trainControl(method = "repeatedcv", repeats = 5)
)
knnModel
## k-Nearest Neighbors 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (47), scaled (47), median imputation (47),
##  remove (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    5  1.526954  0.3955980  1.213651
##    7  1.510777  0.4022838  1.203949
##    9  1.517393  0.3943632  1.208276
##   11  1.516777  0.3982018  1.210734
##   13  1.506678  0.4094165  1.200842
##   15  1.510852  0.4122061  1.210893
##   17  1.518293  0.4077321  1.213531
##   19  1.524798  0.4049865  1.215259
##   21  1.526019  0.4105295  1.214460
##   23  1.535208  0.4075051  1.222433
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
# Resampling profile across the candidate values of k.
plot(knnModel)

# Score the held-out rows with the tuned KNN model and print the metrics.
knnPred <- predict(knnModel, newdata = test_x)
knn_m <- postResample(pred = knnPred, obs = test_y)
knn_m
##      RMSE  Rsquared       MAE 
## 1.3333407 0.3954126 1.0836713

SVM

# Tune a radial-basis SVM over 10 cost values using 5 repeats of
# cross-validation.
# The resampling control must be passed as `trControl`; the previous
# `trainControl=` name is not a train() argument, so it was swallowed by `...`
# and train() silently fell back to the default bootstrap.
# NOTE: the printed results that follow were produced with that bootstrap
# (25 reps) and need to be regenerated.
svmModel <- train(
  x = train_x,
  y = train_y,
  method = "svmRadial",
  preProcess = pre_process,
  tuneLength = 10,
  trControl = trainControl(method = "repeatedcv", repeats = 5)
)
svmModel
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (47), scaled (47), median imputation (47),
##  remove (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE      Rsquared   MAE     
##     0.25  1.456002  0.4394515  1.172572
##     0.50  1.379816  0.4734729  1.112083
##     1.00  1.334729  0.4973732  1.071040
##     2.00  1.311699  0.5103331  1.050497
##     4.00  1.300388  0.5171384  1.038661
##     8.00  1.298817  0.5173864  1.037378
##    16.00  1.298815  0.5173875  1.037377
##    32.00  1.298815  0.5173875  1.037377
##    64.00  1.298815  0.5173875  1.037377
##   128.00  1.298815  0.5173875  1.037377
## 
## Tuning parameter 'sigma' was held constant at a value of 0.01692517
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01692517 and C = 16.
# Resampling profile across the candidate cost values.
plot(svmModel)

# Score the held-out rows with the tuned SVM and print the metrics.
svmPred <- predict(svmModel, newdata = test_x)
svm_m <- postResample(pred = svmPred, obs = test_y)
svm_m
##      RMSE  Rsquared       MAE 
## 1.0108947 0.6555742 0.8013416

MARS

# Tune MARS (caret method "earth") with the shared pre-processing steps.
# No trControl is supplied here, so train() uses its default resampling.
marsModel <- train(
  x = train_x,
  y = train_y,
  method = "earth",
  preProcess = pre_process,
  tuneLength = 10
)
marsModel
## Multivariate Adaptive Regression Spline 
## 
## 132 samples
##  57 predictor
## 
## Pre-processing: centered (47), scaled (47), median imputation (47),
##  remove (10) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE      Rsquared   MAE     
##    2      1.481242  0.4297057  1.143980
##    3      1.337799  0.5347605  1.047621
##    5      1.569905  0.4995921  1.090125
##    7      1.611222  0.4816629  1.118713
##    9      1.850084  0.4482929  1.178046
##   10      2.286661  0.4146197  1.261148
##   12      3.651053  0.3814192  1.492688
##   14      3.955688  0.3537000  1.558358
##   16      4.068425  0.3423588  1.595166
##   18      4.124978  0.3432530  1.605274
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 3 and degree = 1.
# Score the held-out rows with the tuned MARS model and print the metrics.
marsPred <- predict(marsModel, newdata = test_x)
mars_m <- postResample(pred = marsPred, obs = test_y)
mars_m
##      RMSE  Rsquared       MAE 
## 1.1821166 0.5463396 0.9779644
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:dbplyr':
## 
##     ident, sql
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Stack the test-set metrics of the three nonlinear models, one row per model.
model_scores <- rbind(knn_m, svm_m, mars_m)
comp_mat <- data.frame(model_scores)

# Label the rows in the same order they were stacked, then put the label first.
comp_mat$MODEL <- c("KNN", "SVM", "MARS")
comp_mat <- comp_mat[, c("MODEL", "RMSE", "Rsquared", "MAE")]

# Drop the knn_m / svm_m / mars_m row names so rows print as 1, 2, 3.
rownames(comp_mat) <- NULL
comp_mat
##   MODEL     RMSE  Rsquared       MAE
## 1   KNN 1.333341 0.3954126 1.0836713
## 2   SVM 1.010895 0.6555742 0.8013416
## 3  MARS 1.182117 0.5463396 0.9779644
  1. Which nonlinear regression model gives the optimal resampling and test set performance?

The SVM model appears to perform better than the other models, with the lowest RMSE and the highest R-squared. Similar preprocessing was used in the PLS model.

  1. Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
# Variable-importance scores for the linear benchmark (PLS).
# The package-attach messages below were printed after this call ran.
pls_impvs <- varImp(plsModel)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
## 
##     R2
## The following object is masked from 'package:stats':
## 
##     loadings
# Variable-importance scores for the SVM, the best-performing nonlinear model.
svm_impvs <- varImp(svmModel)

The plots below show that ManufacturingProcess32 is the most important predictor for both models (PLS and the nonlinear SVM). For PLS, the top ten consist of 6 manufacturing predictors and 4 biological predictors. For SVM, there are also 6 manufacturing predictors and 4 biological predictors.

Top 10 variables for PLS

# Show only the ten highest-ranked predictors of the PLS model.
plot(pls_impvs, top = 10)

Top 10 variables for SVM (nonlinear)

# Show only the ten highest-ranked predictors of the SVM model.
plot(svm_impvs, top = 10)

  1. Explore the relationships between the top predictors and the response for the predictors that are unique to the optimal nonlinear regression model. Do these plots reveal intuition about the biological or process predictors and their relationship with yield?
# Keep the five predictors singled out for a closer look.
imp_train <- train_x %>%
  select(
    ManufacturingProcess32, BiologicalMaterial06, ManufacturingProcess36,
    ManufacturingProcess13, BiologicalMaterial03
  )

# With the default use = "everything", every pair involving
# ManufacturingProcess36 came out as NA (presumably because that column has
# missing values in the training rows). Pairwise-complete observations give
# those pairs a value and leave pairs of fully observed columns unchanged.
# NOTE: the matrix printed below predates this change and still shows the NAs.
cor(imp_train, use = "pairwise.complete.obs")
##                        ManufacturingProcess32 BiologicalMaterial06
## ManufacturingProcess32             1.00000000            0.6089977
## BiologicalMaterial06               0.60899774            1.0000000
## ManufacturingProcess36                     NA                   NA
## ManufacturingProcess13            -0.08550178           -0.1197651
## BiologicalMaterial03               0.52839399            0.8759335
##                        ManufacturingProcess36 ManufacturingProcess13
## ManufacturingProcess32                     NA            -0.08550178
## BiologicalMaterial06                       NA            -0.11976510
## ManufacturingProcess36                      1                     NA
## ManufacturingProcess13                     NA             1.00000000
## BiologicalMaterial03                       NA            -0.13796233
##                        BiologicalMaterial03
## ManufacturingProcess32            0.5283940
## BiologicalMaterial06              0.8759335
## ManufacturingProcess36                   NA
## ManufacturingProcess13           -0.1379623
## BiologicalMaterial03              1.0000000
# Correlation of each selected predictor with yield. Pairwise-complete
# observations are used so that ManufacturingProcess36 gets a value instead of
# NA (presumably caused by missing values in that column).
# NOTE: the output printed below predates this change and still shows the NA.
cor(imp_train, train_y, use = "pairwise.complete.obs")
##                              [,1]
## ManufacturingProcess32  0.6459822
## BiologicalMaterial06    0.4960426
## ManufacturingProcess36         NA
## ManufacturingProcess13 -0.4620008
## BiologicalMaterial03    0.4618161

The top 5 predictors were chosen. Correlations were computed among the predictors and between each predictor and the response variable. Based on the correlations with the response, the manufacturing process predictors appear to have the more dominant relationship with yield.