library(mlbench)
## Warning: package 'mlbench' was built under R version 3.5.3
set.seed(200)
trainingData <- mlbench.friedman1(200, sd = 1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this will give the columns names.
trainingData$x <- data.frame(trainingData$x)
## Look at the data using
featurePlot(trainingData$x, trainingData$y)
## or other methods.
## This creates a list with a vector 'y' and a matrix
## of predictors 'x'. Also simulate a large test set to
## estimate the true error rate with good precision:
testData <- mlbench.friedman1(5000, sd = 1)
testData$x <- data.frame(testData$x)
Tune several models on these data. For example:
knnModel <- train(x = trainingData$x, y = trainingData$y, method = "knn",
                  preProc = c("center", "scale"), tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.565620 0.4887976 2.886629
## 7 3.422420 0.5300524 2.752964
## 9 3.368072 0.5536927 2.715310
## 11 3.323010 0.5779056 2.669375
## 13 3.275835 0.6030846 2.628663
## 15 3.261864 0.6163510 2.621192
## 17 3.261973 0.6267032 2.616956
## 19 3.286299 0.6281075 2.640585
## 21 3.280950 0.6390386 2.643807
## 23 3.292397 0.6440392 2.656080
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
The model with the smallest resampled RMSE (k = 15) is selected as the final model.
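For a quick check, the selected tuning parameter can also be read directly off the fitted object (it should agree with the k = 15 reported above):
knnModel$bestTune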
knnPred <- predict(knnModel, newdata = testData$x)
## The function 'postResample' can be used to get the test set performance values
postResample(pred = knnPred, obs = testData$y)
## RMSE Rsquared MAE
## 3.1750657 0.6785946 2.5443169
Which models appear to give the best performance? Does MARS select the informative predictors (those named X1-X5)?
marsModel <- train(x = trainingData$x, y = trainingData$y, method = "earth",
                   preProcess = c("center", "scale"), tuneLength = 10)
marsPred <- predict(marsModel, newdata = testData$x)
postResample(pred = marsPred, obs = testData$y)
## RMSE Rsquared MAE
## 1.776575 0.872700 1.358367
MARS appears to outperform KNN, with a lower test-set RMSE and a higher R-squared.
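To answer the second question, the variable importance of the fitted MARS model can be inspected; for this simulation, the nonzero importances should be concentrated on the informative predictors X1-X5 (a quick check; output omitted):
varImp(marsModel)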
7.5. Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several nonlinear regression models.
library(AppliedPredictiveModeling)
data(ChemicalManufacturingProcess)
Impute the missing values using k-nearest neighbors.
(cmp <- preProcess(ChemicalManufacturingProcess, method=c('knnImpute')))
## Created from 152 samples and 58 variables
##
## Pre-processing:
## - centered (58)
## - ignored (0)
## - 5 nearest neighbor imputation (58)
## - scaled (58)
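Note that preProcess only estimates the imputation model; filling in the missing values requires applying it with predict. A minimal sketch (the pipeline below instead imputes inside train via medianImpute):
## Apply the kNN imputation; knnImpute also centers and scales the data
cmp_imputed <- predict(cmp, ChemicalManufacturingProcess)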
cmp_predictors <- as.matrix(ChemicalManufacturingProcess[, 2:58])
cmp_yield <- ChemicalManufacturingProcess[, 1]
Split the data into training and test sets with a 75/25 split.
set.seed(100)
train_select <- createDataPartition(cmp_yield, p=0.75, list=F) #create train set
train_x <- ChemicalManufacturingProcess[train_select,-1]
train_y <- ChemicalManufacturingProcess[train_select,1]
test_x <- ChemicalManufacturingProcess[-train_select,-1]
test_y <- ChemicalManufacturingProcess[-train_select,1]
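As a quick sanity check, the partition should leave roughly 132 rows for training and 44 for testing:
## Confirm the 75/25 partition sizes
nrow(train_x); nrow(test_x)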
pre_process <- c("nzv", "corr", "center","scale", "medianImpute")
# NOTE: caret's argument is 'trControl', not 'trainControl'; the original
# misspelled argument was silently ignored, so the default bootstrap
# resampling (shown below) was used. For repeated CV, pass
# trControl = trainControl(method = "repeatedcv", repeats = 5).
plsModel <- train(train_x, train_y, method = "pls", tuneLength = 10,
                  preProcess = pre_process)
plsModel
## Partial Least Squares
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (47), scaled (47), median imputation (47),
## remove (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## ncomp RMSE Rsquared MAE
## 1 1.536017 0.4137568 1.165840
## 2 2.017128 0.3693663 1.266490
## 3 1.723892 0.4218530 1.209382
## 4 1.848094 0.3960479 1.256171
## 5 2.049340 0.3634584 1.324814
## 6 2.225879 0.3413780 1.382096
## 7 2.393875 0.3347665 1.421663
## 8 2.534497 0.3343900 1.460528
## 9 2.739731 0.3035248 1.528569
## 10 3.018755 0.2710653 1.598368
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 1.
plot(plsModel)
plsPred <- predict(plsModel, test_x)
(pls_n <- postResample(pred = plsPred, obs = test_y))
## RMSE Rsquared MAE
## 1.342746 0.382608 1.151274
Now train several nonlinear regression models: KNN, an SVM with a radial kernel, and MARS.
# As above, 'trainControl=' is not a train() argument; the default bootstrap
# resampling was used for the results below.
knnModel <- train(train_x, train_y, method = "knn",
                  preProcess = pre_process, tuneLength = 10)
knnModel
## k-Nearest Neighbors
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (47), scaled (47), median imputation (47),
## remove (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.526954 0.3955980 1.213651
## 7 1.510777 0.4022838 1.203949
## 9 1.517393 0.3943632 1.208276
## 11 1.516777 0.3982018 1.210734
## 13 1.506678 0.4094165 1.200842
## 15 1.510852 0.4122061 1.210893
## 17 1.518293 0.4077321 1.213531
## 19 1.524798 0.4049865 1.215259
## 21 1.526019 0.4105295 1.214460
## 23 1.535208 0.4075051 1.222433
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
plot(knnModel)
knnPred <- predict(knnModel, newdata=test_x)
(knn_m <- postResample(pred = knnPred, obs = test_y))
## RMSE Rsquared MAE
## 1.3333407 0.3954126 1.0836713
# As above, the misspelled 'trainControl=' argument was dropped; default
# bootstrap resampling applies.
svmModel <- train(train_x, train_y, method = "svmRadial",
                  preProcess = pre_process, tuneLength = 10)
svmModel
## Support Vector Machines with Radial Basis Function Kernel
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (47), scaled (47), median imputation (47),
## remove (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.456002 0.4394515 1.172572
## 0.50 1.379816 0.4734729 1.112083
## 1.00 1.334729 0.4973732 1.071040
## 2.00 1.311699 0.5103331 1.050497
## 4.00 1.300388 0.5171384 1.038661
## 8.00 1.298817 0.5173864 1.037378
## 16.00 1.298815 0.5173875 1.037377
## 32.00 1.298815 0.5173875 1.037377
## 64.00 1.298815 0.5173875 1.037377
## 128.00 1.298815 0.5173875 1.037377
##
## Tuning parameter 'sigma' was held constant at a value of 0.01692517
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01692517 and C = 16.
plot(svmModel)
svmPred <- predict(svmModel, newdata=test_x)
(svm_m <- postResample(pred = svmPred, obs = test_y))
## RMSE Rsquared MAE
## 1.0108947 0.6555742 0.8013416
marsModel <- train(train_x, train_y, method = "earth",
                   preProcess = pre_process, tuneLength = 10)
marsModel
## Multivariate Adaptive Regression Spline
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (47), scaled (47), median imputation (47),
## remove (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## nprune RMSE Rsquared MAE
## 2 1.481242 0.4297057 1.143980
## 3 1.337799 0.5347605 1.047621
## 5 1.569905 0.4995921 1.090125
## 7 1.611222 0.4816629 1.118713
## 9 1.850084 0.4482929 1.178046
## 10 2.286661 0.4146197 1.261148
## 12 3.651053 0.3814192 1.492688
## 14 3.955688 0.3537000 1.558358
## 16 4.068425 0.3423588 1.595166
## 18 4.124978 0.3432530 1.605274
##
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 3 and degree = 1.
marsPred <- predict(marsModel, newdata=test_x)
(mars_m <- postResample(pred = marsPred, obs = test_y))
## RMSE Rsquared MAE
## 1.1821166 0.5463396 0.9779644
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:dbplyr':
##
## ident, sql
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
comp_mat <- data.frame(rbind(knn_m, svm_m, mars_m))
comp_mat$MODEL <- c("KNN", "SVM", "MARS")
comp_mat <- comp_mat %>% select(MODEL, RMSE, Rsquared, MAE)
rownames(comp_mat) <- NULL
comp_mat
## MODEL RMSE Rsquared MAE
## 1 KNN 1.333341 0.3954126 1.0836713
## 2 SVM 1.010895 0.6555742 0.8013416
## 3 MARS 1.182117 0.5463396 0.9779644
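For reference, the linear PLS benchmark computed earlier (pls_n) can be appended to the same table:
## Include the PLS test-set results alongside the nonlinear models
comp_all <- data.frame(rbind(pls_n, knn_m, svm_m, mars_m))
comp_all$MODEL <- c("PLS", "KNN", "SVM", "MARS")
comp_all %>% select(MODEL, RMSE, Rsquared, MAE)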
The SVM model appears to perform best, with the lowest RMSE and the highest R-squared on the test set. The same preprocessing steps were used as in the PLS model.
pls_impvs <- varImp(plsModel)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
svm_impvs <- varImp(svmModel)
The plots below show that ManufacturingProcess32 is the most important predictor for both models (PLS and nonlinear SVM). For PLS, the top ten predictors comprise six manufacturing-process and four biological-material variables; the SVM top ten shows the same six-to-four split.
plot(pls_impvs, top=10)
plot(svm_impvs, top=10)
imp_train <- train_x %>%
  select(ManufacturingProcess32, BiologicalMaterial06, ManufacturingProcess36,
         ManufacturingProcess13, BiologicalMaterial03)
cor(imp_train)
## ManufacturingProcess32 BiologicalMaterial06
## ManufacturingProcess32 1.00000000 0.6089977
## BiologicalMaterial06 0.60899774 1.0000000
## ManufacturingProcess36 NA NA
## ManufacturingProcess13 -0.08550178 -0.1197651
## BiologicalMaterial03 0.52839399 0.8759335
## ManufacturingProcess36 ManufacturingProcess13
## ManufacturingProcess32 NA -0.08550178
## BiologicalMaterial06 NA -0.11976510
## ManufacturingProcess36 1 NA
## ManufacturingProcess13 NA 1.00000000
## BiologicalMaterial03 NA -0.13796233
## BiologicalMaterial03
## ManufacturingProcess32 0.5283940
## BiologicalMaterial06 0.8759335
## ManufacturingProcess36 NA
## ManufacturingProcess13 -0.1379623
## BiologicalMaterial03 1.0000000
cor(imp_train, train_y)
## [,1]
## ManufacturingProcess32 0.6459822
## BiologicalMaterial06 0.4960426
## ManufacturingProcess36 NA
## ManufacturingProcess13 -0.4620008
## BiologicalMaterial03 0.4618161
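The NA entries above appear because ManufacturingProcess36 has missing values in the raw training data; cor() can be restricted to pairwise-complete cases to estimate those correlations (a sketch):
## Use pairwise-complete observations so predictors with missing values
## still receive correlation estimates
cor(imp_train, use = "pairwise.complete.obs")
cor(imp_train, train_y, use = "pairwise.complete.obs")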
The top five predictors by importance were selected, and their pairwise correlations and their correlations with the response were computed. Based on the correlations with the response, the manufacturing-process variables (ManufacturingProcess32 in particular, at 0.65) appear to have the more dominant relationship with yield.