suppressPackageStartupMessages(library(AppliedPredictiveModeling))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(caret))
suppressPackageStartupMessages(library(kernlab))
suppressPackageStartupMessages(library(earth))
suppressPackageStartupMessages(library(mlbench))
Homework 8
Friedman (1991) introduced several benchmark data sets created by simulation. One of these simulations used the following nonlinear equation to create data:
$$y = 10\sin(\pi x_1 x_2) + 20(x_3 - 0.5)^2 + 10x_4 + 5x_5 + N(0, \sigma^2)$$
where the x values are random variables uniformly distributed between [0, 1] (there are also 5 other non-informative variables created in the simulation). The package mlbench contains a function called mlbench.friedman1 that simulates these data:
library(mlbench)

# Fix the RNG seed so the simulated data can be regenerated exactly.
set.seed(200)

# Simulate 200 training observations. The result is a list with a
# response vector 'y' and a matrix of predictors 'x'.
trainingData <- mlbench.friedman1(n = 200, sd = 1)

# Store the predictors as a data frame instead of a matrix; one reason is
# that this gives the columns names.
trainingData[["x"]] <- data.frame(trainingData[["x"]])

# Look at each predictor against the response (other methods work too).
featurePlot(x = trainingData[["x"]], y = trainingData[["y"]])

# Also simulate a large test set, so that the true error rate can be
# estimated with good precision.
testData <- mlbench.friedman1(n = 5000, sd = 1)
testData[["x"]] <- data.frame(testData[["x"]])
Tune several models on these data. For example:
library(caret)

# Tune a k-nearest-neighbours regression on the simulated training set.
# The predictors are centered and scaled first, and tuneLength = 10
# requests ten candidate values of k.
knnModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10
)

# Show the resampling results for each candidate k.
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.466085 0.5121775 2.816838
## 7 3.349428 0.5452823 2.727410
## 9 3.264276 0.5785990 2.660026
## 11 3.214216 0.6024244 2.603767
## 13 3.196510 0.6176570 2.591935
## 15 3.184173 0.6305506 2.577482
## 17 3.183130 0.6425367 2.567787
## 19 3.198752 0.6483184 2.592683
## 21 3.188993 0.6611428 2.588787
## 23 3.200458 0.6638353 2.604529
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
200 samples
 10 predictors

Pre-processing: centered, scaled
Resampling: Bootstrap (25 reps)

Summary of sample sizes: 200, 200, 200, 200, 200, 200, …

Resampling results across tuning parameters:

  k   RMSE  Rsquared  RMSE SD  Rsquared SD
   5  3.51  0.496     0.238    0.0641
   7  3.36  0.536     0.24     0.0617
   9  3.3   0.559     0.251    0.0546
  11  3.24  0.586     0.252    0.0501
  13  3.2   0.61      0.234    0.0465
  15  3.19  0.623     0.264    0.0496
  17  3.19  0.63      0.286    0.0528
  19  3.18  0.643     0.274    0.048
  21  3.2   0.646     0.269    0.0464
  23  3.2   0.652     0.267    0.0465

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was k = 19.
# Predict the simulated test set with the tuned KNN model.
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set
## performance values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.2040595 0.6819919 2.5683461
# Variable importance for the tuned KNN model.
varImp(knnModel)
## loess r-squared variable importance
##
## Overall
## X4 100.0000
## X1 95.5047
## X2 89.6186
## X5 45.2170
## X3 29.9330
## X9 6.3299
## X10 5.5182
## X8 3.2527
## X6 0.8884
## X7 0.0000
# Tune a MARS model (caret method "earth") on the simulated training set,
# using centered and scaled predictors and tuneLength = 10.
marsModel <- train(
  x = trainingData$x,
  y = trainingData$y,
  method = "earth",
  preProcess = c("center", "scale"),
  tuneLength = 10
)

# Score the tuned MARS model on the simulated test set.
marsPred <- predict(marsModel, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)
## RMSE Rsquared MAE
## 1.776575 0.872700 1.358367
# Variable importance for the tuned MARS model.
varImp(marsModel)
## earth variable importance
##
## Overall
## X1 100.00
## X4 82.78
## X2 64.18
## X5 40.21
## X3 28.14
## X6 0.00
Which models appear to give the best performance? Does MARS select the informative predictors (those named X1–X5)?
MARS gives the best performance (lowest RMSE and highest R-squared) and selects the informative predictors (X1–X5).
Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
data(ChemicalManufacturingProcess)

# Following the approach used for Exercise 6.3: split the data, then let
# train() handle the pre-processing.
processPredictors <- as.matrix(ChemicalManufacturingProcess[, 2:58])
yield <- ChemicalManufacturingProcess[, 1]

# Data splitting: 75% of the rows go to the training set.
# Seed the RNG first so the partition (and every model tuned on it) can be
# reproduced; without a seed each run draws a different split.
set.seed(200)
train_r <- createDataPartition(yield, p = 0.75, list = FALSE)
pp_train <- ChemicalManufacturingProcess[train_r, -1]
y_train <- ChemicalManufacturingProcess[train_r, 1]
pp_test <- ChemicalManufacturingProcess[-train_r, -1]
y_test <- ChemicalManufacturingProcess[-train_r, 1]

# Pre-processing steps handed to train() through `preProcess`.
p_pro <- c("nzv", "corr", "center", "scale", "medianImpute")

# Resampling scheme: repeated cross-validation with 5 repeats.
t_ctrl <- trainControl(method = "repeatedcv", repeats = 5)

# PLS model, fitted as the linear baseline for comparison.
# The resampling control must be passed as `trControl`. train() has no
# `trainControl` argument, so that spelling fell into `...` and was ignored,
# which is why the previously recorded results show bootstrap resampling.
plsModel <- train(
  pp_train, y_train,
  method = "pls",
  tuneLength = 10,
  preProcess = p_pro,
  trControl = t_ctrl
)
plsPred <- predict(plsModel, pp_test)
postResample(pred = plsPred, obs = y_test)
## RMSE Rsquared MAE
## 1.3730073 0.4457299 1.1195921
# Display the PLS resampling results and the selected number of components.
plsModel
## Partial Least Squares
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (46), scaled (46), median imputation (46), remove (11)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.611189 0.3319394 1.216911
## 2 2.563353 0.2605888 1.331377
## 3 1.968729 0.3499692 1.234725
## 4 1.890660 0.3636799 1.254565
## 5 2.197519 0.3373813 1.329977
## 6 2.533194 0.3022048 1.421511
## 7 2.906725 0.2824512 1.523001
## 8 3.186946 0.2668625 1.604831
## 9 3.454210 0.2598260 1.674901
## 10 3.752694 0.2399434 1.750975
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
# KNN model
# The resampling control must be passed as `trControl`. train() has no
# `trainControl` argument, so that spelling fell into `...` and was ignored,
# which is why the previously recorded results show bootstrap resampling.
knnModel <- train(
  pp_train, y_train,
  method = "knn",
  preProcess = p_pro,
  tuneLength = 10,
  trControl = t_ctrl
)

# Test-set predictions and their RMSE / R-squared / MAE.
knnPred <- predict(knnModel, newdata = pp_test)
knnRes <- postResample(pred = knnPred, obs = y_test)

# Display the resampling results for each candidate k.
knnModel
## k-Nearest Neighbors
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (46), scaled (46), median imputation (46), remove (11)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.451722 0.3818234 1.155787
## 7 1.443272 0.3885667 1.162571
## 9 1.444162 0.3868560 1.163596
## 11 1.439181 0.3916531 1.161275
## 13 1.433959 0.4003895 1.163261
## 15 1.425182 0.4100981 1.153498
## 17 1.423930 0.4170072 1.154250
## 19 1.430171 0.4140404 1.158675
## 21 1.432339 0.4207953 1.157209
## 23 1.433677 0.4267988 1.161250
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 17.
# SVM model (radial basis function kernel)
# The resampling control must be passed as `trControl`. train() has no
# `trainControl` argument, so that spelling fell into `...` and was ignored,
# which is why the previously recorded results show bootstrap resampling.
svmModel <- train(
  pp_train, y_train,
  method = "svmRadial",
  preProcess = p_pro,
  tuneLength = 10,
  trControl = t_ctrl
)

# Test-set predictions and their RMSE / R-squared / MAE.
svmPred <- predict(svmModel, newdata = pp_test)
svmRes <- postResample(pred = svmPred, obs = y_test)

# Display the resampling results for each candidate cost value.
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (46), scaled (46), median imputation (46), remove (11)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.450449 0.4854829 1.172229
## 0.50 1.366380 0.5144350 1.088796
## 1.00 1.319028 0.5386292 1.044180
## 2.00 1.288442 0.5558028 1.022877
## 4.00 1.278651 0.5595650 1.020169
## 8.00 1.278350 0.5594425 1.021490
## 16.00 1.278632 0.5591447 1.021643
## 32.00 1.278632 0.5591447 1.021643
## 64.00 1.278632 0.5591447 1.021643
## 128.00 1.278632 0.5591447 1.021643
##
## Tuning parameter 'sigma' was held constant at a value of 0.01502779
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01502779 and C = 8.
# MARS model
# No trControl is given here, so train() uses its default resampling
# (the recorded output shows 25 bootstrap repetitions).
marsModel <- train(
  pp_train, y_train,
  method = "earth",
  preProcess = p_pro,
  tuneLength = 10
)

# Test-set predictions and their RMSE / R-squared / MAE.
marsPred <- predict(marsModel, newdata = pp_test)
marsRes <- postResample(pred = marsPred, obs = y_test)

# Display the resampling results for each candidate nprune.
marsModel
## Multivariate Adaptive Regression Spline
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (46), scaled (46), median imputation (46), remove (11)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 1.507254 0.3756706 1.219024
## 3 1.658673 0.4905195 1.126607
## 5 3.515665 0.3848685 1.451150
## 7 4.065829 0.3466363 1.563408
## 9 3.399506 0.3494705 1.501010
## 10 4.835486 0.3119240 1.728915
## 12 5.789640 0.2745042 1.882947
## 14 6.037434 0.2639899 1.990054
## 16 6.817133 0.2667646 2.109404
## 18 6.870728 0.2668009 2.122727
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 2 and degree = 1.
# Neural network model
# Tuning grid: 1 to 10 hidden units crossed with weight decay 0.1 to 0.5.
nnGrid <- expand.grid(
  size = seq(from = 1, to = 10, by = 1),
  decay = seq(from = 0.1, to = 0.5, by = 0.1)
)

# linout = TRUE, trace = FALSE, MaxNWts and maxit are forwarded to the
# underlying fitting routine. MaxNWts is sized for the largest grid entry:
# 10 * (p + 1) + 10 + 1 weights, with p = ncol(pp_train).
nnetModel <- train(
  pp_train, y_train,
  method = "nnet",
  tuneGrid = nnGrid,
  preProcess = p_pro,
  linout = TRUE,
  trace = FALSE,
  MaxNWts = 10 * (ncol(pp_train) + 1) + 10 + 1,
  maxit = 500
)

# Test-set predictions and their RMSE / R-squared / MAE.
nnetPred <- predict(nnetModel, newdata = pp_test)
nnetRes <- postResample(pred = nnetPred, obs = y_test)

# Display the resampling results for every size/decay combination.
nnetModel
## Neural Network
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (46), scaled (46), median imputation (46), remove (11)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## size decay RMSE Rsquared MAE
## 1 0.1 2.022843 0.3025680 1.554950
## 1 0.2 2.493954 0.2379207 1.702322
## 1 0.3 2.218904 0.2694536 1.551550
## 1 0.4 2.117783 0.2753678 1.503102
## 1 0.5 2.490149 0.2229075 1.680638
## 2 0.1 3.013660 0.1779287 2.193216
## 2 0.2 3.248170 0.1911697 2.324572
## 2 0.3 3.197889 0.1422185 2.225092
## 2 0.4 3.227929 0.1700100 2.173862
## 2 0.5 3.257966 0.1542977 2.271527
## 3 0.1 3.185246 0.1396031 2.232946
## 3 0.2 3.248812 0.1686489 2.286286
## 3 0.3 2.994342 0.1848136 2.134024
## 3 0.4 3.076758 0.1580869 2.056969
## 3 0.5 2.902161 0.1915657 1.987340
## 4 0.1 2.926806 0.1889393 2.022008
## 4 0.2 2.862972 0.1858573 2.027766
## 4 0.3 3.036029 0.1749935 2.008362
## 4 0.4 2.615902 0.2071793 1.854142
## 4 0.5 2.718426 0.1997110 1.838165
## 5 0.1 3.077895 0.1407548 2.190088
## 5 0.2 2.675516 0.2128929 1.890449
## 5 0.3 2.606919 0.2105668 1.785290
## 5 0.4 2.490262 0.2311155 1.710876
## 5 0.5 2.472151 0.2343317 1.715912
## 6 0.1 2.735821 0.1739222 1.952460
## 6 0.2 2.503145 0.2046562 1.780678
## 6 0.3 2.484844 0.2334165 1.747501
## 6 0.4 2.628607 0.2131599 1.755044
## 6 0.5 2.509510 0.2258916 1.706066
## 7 0.1 2.973054 0.1341656 2.209017
## 7 0.2 2.658791 0.2102126 1.849399
## 7 0.3 2.443576 0.2380040 1.665392
## 7 0.4 2.428895 0.2471246 1.659418
## 7 0.5 2.453709 0.2428383 1.636789
## 8 0.1 2.850024 0.1500391 2.113219
## 8 0.2 2.487781 0.1980572 1.834097
## 8 0.3 2.684568 0.1886811 1.834414
## 8 0.4 2.548878 0.2060877 1.735909
## 8 0.5 2.463477 0.2141937 1.677509
## 9 0.1 3.019999 0.1219543 2.242907
## 9 0.2 2.831301 0.1482695 1.981620
## 9 0.3 2.589273 0.1860167 1.825011
## 9 0.4 2.526380 0.1895355 1.743219
## 9 0.5 2.507416 0.2023877 1.676486
## 10 0.1 2.914008 0.1492600 2.251556
## 10 0.2 2.778790 0.1622152 1.981490
## 10 0.3 2.754752 0.1847766 2.029941
## 10 0.4 2.648942 0.1937471 1.805968
## 10 0.5 2.541192 0.1959476 1.778534
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were size = 1 and decay = 0.1.
Which nonlinear regression model gives the optimal resampling and test set performance? The SVM model gives the best performance (lowest RMSE and highest R-squared).
Which predictors are most important in the optimal nonlinear regression model? Do either the biological or process variables dominate the list? How do the top ten important predictors compare to the top ten predictors from the optimal linear model?
# Variable importance for the PLS model (the linear baseline).
varImp(plsModel)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
## pls variable importance
##
## only 20 most important variables shown (out of 46)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess36 84.35
## BiologicalMaterial06 81.39
## ManufacturingProcess13 74.78
## BiologicalMaterial03 71.17
## ManufacturingProcess09 67.94
## BiologicalMaterial01 65.46
## ManufacturingProcess33 64.94
## ManufacturingProcess04 62.17
## ManufacturingProcess12 60.11
## BiologicalMaterial08 57.43
## BiologicalMaterial11 55.67
## ManufacturingProcess06 55.14
## ManufacturingProcess17 55.14
## ManufacturingProcess28 48.88
## ManufacturingProcess02 47.63
## ManufacturingProcess34 38.63
## ManufacturingProcess11 38.26
## BiologicalMaterial05 36.59
## BiologicalMaterial10 32.49
# Variable importance for the SVM model, for comparison with the PLS ranking.
varImp(svmModel)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## BiologicalMaterial06 84.47
## ManufacturingProcess13 84.10
## ManufacturingProcess36 77.77
## BiologicalMaterial02 72.96
## BiologicalMaterial03 67.39
## ManufacturingProcess31 62.38
## ManufacturingProcess29 60.75
## BiologicalMaterial12 59.71
## ManufacturingProcess17 57.76
## BiologicalMaterial11 52.44
## ManufacturingProcess04 52.37
## BiologicalMaterial01 50.75
## BiologicalMaterial04 49.98
## ManufacturingProcess09 49.93
## ManufacturingProcess33 48.92
## ManufacturingProcess06 44.58
## ManufacturingProcess11 39.51
## ManufacturingProcess12 39.14
## BiologicalMaterial08 38.36
Process variables dominate both lists. In particular, ManufacturingProcess32, ManufacturingProcess13 and ManufacturingProcess36 rank among the top four predictors for both the PLS and SVM models.
# Pairwise correlations among five of the top-ranked SVM predictors in the
# training set.
imp_train <- pp_train %>%
  select(
    ManufacturingProcess32, BiologicalMaterial06,
    ManufacturingProcess13, ManufacturingProcess36,
    BiologicalMaterial12
  )

# The predictors contain missing values. With the default use = "everything"
# any pair involving such a column comes back as NA (as the previously
# recorded output shows for ManufacturingProcess36), so use the
# pairwise-complete observations instead.
cor(imp_train, use = "pairwise.complete.obs")
## ManufacturingProcess32 BiologicalMaterial06
## ManufacturingProcess32 1.0000000 0.6245656
## BiologicalMaterial06 0.6245656 1.0000000
## ManufacturingProcess13 -0.1471563 -0.1188220
## ManufacturingProcess36 NA NA
## BiologicalMaterial12 0.4204999 0.8324827
## ManufacturingProcess13 ManufacturingProcess36
## ManufacturingProcess32 -0.14715625 NA
## BiologicalMaterial06 -0.11882199 NA
## ManufacturingProcess13 1.00000000 NA
## ManufacturingProcess36 NA 1
## BiologicalMaterial12 0.01697036 NA
## BiologicalMaterial12
## ManufacturingProcess32 0.42049986
## BiologicalMaterial06 0.83248266
## ManufacturingProcess13 0.01697036
## ManufacturingProcess36 NA
## BiologicalMaterial12 1.00000000
# Correlation of each selected predictor with the training-set yield.
# Pairwise-complete observations are used so that a predictor with missing
# values gets an estimate instead of NA.
cor(imp_train, y_train, use = "pairwise.complete.obs")
## [,1]
## ManufacturingProcess32 0.6292438
## BiologicalMaterial06 0.5140042
## ManufacturingProcess13 -0.4730524
## ManufacturingProcess36 NA
## BiologicalMaterial12 0.3474817