library(AppliedPredictiveModeling)
library(psych)
library(reshape2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x ggplot2::%+%() masks psych::%+%()
## x ggplot2::alpha() masks psych::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(caTools)
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.2
##
## Attaching package: 'lars'
## The following object is masked from 'package:psych':
##
## error.bars
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
library(mlbench)
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
##
## Attaching package: 'plotrix'
## The following object is masked from 'package:psych':
##
## rescale
## Loading required package: TeachingDemos
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
## The following object is masked from 'package:psych':
##
## alpha
library(nnet)
7.2 Friedman (1991) introduced several bendchmark data sets create by simulation. Once of these simulations used the following nonlinear equation to create data:
\[ y=10sin(\pi )x_{1}x_{2} + 20(x_{3}-0.5)^{2} +10x_{4}+5x_{5}+N(0,\sigma ^{^{2}})) \]
where the x values are random variables uniformly distributed between [0,1] (there are also 5 other non-informative variables also created in the simulation). The package mlbench contains a function called mlbench.freidman1 that simulates these data:
library(mlbench)
set.seed(200)
trainingData<-mlbench.friedman1(200,sd=1)
## We convert the 'x' data from a matrix to a data frame
## One reason is that this well give the columns names.
trainingData$x<- data.frame(trainingData$x)
head(trainingData$x)
## X1 X2 X3 X4 X5 X6 X7
## 1 0.5337724 0.6478064 0.85078526 0.18159957 0.92903976 0.36179060 0.8266609
## 2 0.5837650 0.4381528 0.67272659 0.66924914 0.16379784 0.45305931 0.6489601
## 3 0.5895783 0.5879065 0.40967108 0.33812728 0.89409334 0.02681911 0.1785614
## 4 0.6910399 0.2259548 0.03335447 0.06691274 0.63744519 0.52500637 0.5133614
## 5 0.6673315 0.8188985 0.71676079 0.80324287 0.08306864 0.22344157 0.6644906
## 6 0.8392937 0.3862983 0.64618857 0.86105431 0.63038947 0.43703891 0.3360117
## X8 X9 X10
## 1 0.4214081 0.59111440 0.5886216
## 2 0.8446239 0.92819306 0.7584008
## 3 0.3495908 0.01759542 0.4441185
## 4 0.7970260 0.68986918 0.4450716
## 5 0.9038919 0.39696995 0.5500808
## 6 0.6489177 0.53116033 0.9066182
## Look at the data using featurePlot
featurePlot(trainingData$x, trainingData$y)
## This creates a list with a vector 'y' and a matrix of predictors 'x'. Also, simulate a large test set to estimate the ttrue error rate with good precision:
testData<-mlbench.friedman1(5000, sd=1)
testData$x<-data.frame(testData$x)
Tune several models on these data. For example:
library(caret)
set.seed(200)
knnModel<-train(x=trainingData$x,
y=trainingData$y,
method="knn",
preProc = c("center", "scale"),
tuneLength=10)
knnModel
## k-Nearest Neighbors
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 3.654912 0.4779838 2.958475
## 7 3.529432 0.5118581 2.861742
## 9 3.446330 0.5425096 2.780756
## 11 3.378049 0.5723793 2.719410
## 13 3.332339 0.5953773 2.692863
## 15 3.309235 0.6111389 2.663046
## 17 3.317408 0.6201421 2.678898
## 19 3.311667 0.6333800 2.682098
## 21 3.316340 0.6407537 2.688887
## 23 3.326040 0.6491480 2.705915
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 15.
knnPred<-predict(knnModel, newdata=testData$x)
plot(knnModel)
## The function 'portResample' can be used to get the test set performance values
postResample(pred=knnPred, obs=testData$y)
## RMSE Rsquared MAE
## 3.1750657 0.6785946 2.5443169
k=15 rmse=3.309235 on the training knn rmse=3.204 on the test
Which models appear to get the best performance? Does the MARS select the informative predictors?
#MARS
set.seed(200)
Mgrid <- expand.grid(.degree = 1:2, .nprune = 2:15)
MARSmodel <- train(x = trainingData$x,
y = trainingData$y,
method = "earth",
tuneGrid = Mgrid,
preProcess = c("center", "scale"),
tuneLength = 10)
MARSmodel
## Multivariate Adaptive Regression Spline
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 4.447045 0.2249607 3.650128
## 1 3 3.744821 0.4546610 3.019175
## 1 4 2.828643 0.6892908 2.244131
## 1 5 2.524326 0.7516356 2.027435
## 1 6 2.406670 0.7747079 1.906733
## 1 7 2.027113 0.8375721 1.594956
## 1 8 1.874633 0.8618476 1.474219
## 1 9 1.800794 0.8728377 1.411703
## 1 10 1.810047 0.8721377 1.412023
## 1 11 1.821314 0.8714221 1.427124
## 1 12 1.831608 0.8700790 1.430044
## 1 13 1.839717 0.8686550 1.440537
## 1 14 1.849381 0.8672327 1.450876
## 1 15 1.856211 0.8663787 1.452430
## 2 2 4.447780 0.2248695 3.650597
## 2 3 3.737891 0.4543357 3.018103
## 2 4 2.854288 0.6832049 2.259488
## 2 5 2.513582 0.7550084 2.004730
## 2 6 2.387478 0.7799585 1.889787
## 2 7 2.044028 0.8354683 1.615415
## 2 8 1.910896 0.8568917 1.500375
## 2 9 1.810765 0.8703004 1.404288
## 2 10 1.677078 0.8885385 1.321634
## 2 11 1.561012 0.9045745 1.234778
## 2 12 1.503867 0.9112625 1.183593
## 2 13 1.507992 0.9112557 1.172444
## 2 14 1.505298 0.9114749 1.171595
## 2 15 1.527789 0.9091635 1.188885
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 12 and degree = 2.
plot(MARSmodel)
## DISCUSSION: Training - The final values used for the model were nprune = 12 and degree = 2. RMSE= 1.831608
MARS select the informative predictors using varIMP….
varImp(MARSmodel)
## earth variable importance
##
## Overall
## X1 100.00
## X4 75.40
## X2 49.00
## X5 15.72
## X3 0.00
using the test data..
marspred <- predict (MARSmodel, testData$x)
postResample(pred = marspred, obs = testData$y)
## RMSE Rsquared MAE
## 1.3227340 0.9291489 1.0524686
MARS test RMSE=1.322 as compared to training RMSE= 1.831608
set.seed(200)
SVMmodel <- train(x = trainingData$x,
y = trainingData$y,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "cv"))
SVMmodel
## Support Vector Machines with Radial Basis Function Kernel
##
## 200 samples
## 10 predictor
##
## Pre-processing: centered (10), scaled (10)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 2.525164 0.7810576 2.010680
## 0.50 2.270567 0.7944850 1.794902
## 1.00 2.099356 0.8155574 1.659376
## 2.00 2.005858 0.8302852 1.578799
## 4.00 1.934650 0.8435677 1.528373
## 8.00 1.915665 0.8475605 1.528648
## 16.00 1.923914 0.8463074 1.535991
## 32.00 1.923914 0.8463074 1.535991
## 64.00 1.923914 0.8463074 1.535991
## 128.00 1.923914 0.8463074 1.535991
##
## Tuning parameter 'sigma' was held constant at a value of 0.06299324
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.06299324 and C = 8.
plot(SVMmodel)
## DISCUSSION: The final values used for the model were sigma = 0.06299324 and C = 8. RMSE=1.915
So run test data…….
SVMpred <- predict(SVMmodel, newdata = testData$x)
postResample(pred = SVMpred, obs = testData$y)
## RMSE Rsquared MAE
## 2.0541197 0.8290353 1.5586411
test data RMSE=2.05
## Run a neural network model.
set.seed(200)
nnetmodel <- nnet(trainingData$x,
trainingData$y,
size = 5,
decay = 0.01,
linout = TRUE,
trace = FALSE,
maxit = 500,
MaxNWts = 5 * (ncol(trainingData$x) + 1) + 5 + 1)
nnetmodel
## a 10-5-1 network with 61 weights
## options were - linear output units decay=0.01
summary(nnetmodel)
## a 10-5-1 network with 61 weights
## options were - linear output units decay=0.01
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 i9->h1
## 5.22 -3.79 -0.54 -19.57 8.95 -2.17 -0.65 -12.82 -4.60 2.98
## i10->h1
## 0.57
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 i9->h2
## -2.22 1.19 1.56 -1.11 0.97 1.45 -0.61 0.88 -0.16 0.48
## i10->h2
## -0.49
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 i9->h3
## -7.16 16.91 6.31 2.79 7.63 -1.50 2.06 -6.56 7.40 -11.59
## i10->h3
## 2.73
## b->h4 i1->h4 i2->h4 i3->h4 i4->h4 i5->h4 i6->h4 i7->h4 i8->h4 i9->h4
## -18.14 -1.87 -2.41 17.38 5.84 4.31 2.78 -1.44 -3.03 -0.05
## i10->h4
## 4.10
## b->h5 i1->h5 i2->h5 i3->h5 i4->h5 i5->h5 i6->h5 i7->h5 i8->h5 i9->h5
## -13.30 0.34 1.96 16.28 12.50 -14.89 -3.94 -3.25 3.36 0.97
## i10->h5
## -2.89
## b->o h1->o h2->o h3->o h4->o h5->o
## -0.22 6.51 19.32 4.76 5.44 4.44
## DISCUSSION: Results:
a 10-5-1 network with 61 weights
Let’s look at variable importance…
varImp(nnetmodel)
## Overall
## X1 10.152501
## X2 7.395536
## X3 23.690690
## X4 14.521013
## X5 11.405236
## X6 4.833571
## X7 10.246607
## X8 6.754084
## X9 6.013435
## X10 4.987327
Let’s look at new samples…..
nnetpred <- predict(nnetmodel, newdata = testData$x)
postResample(pred = nnetpred, obs = testData$y)
## RMSE Rsquared MAE
## 2.8134222 0.7083039 2.1815463
## DISCUSSION:
The neural network RMSE =2.813 on test data
## Conclusion:
The MARS model is the best fitting model for this dataset with
MARS test RMSE=1.322 as compared to training RMSE= 1.831608
# Exercise 7.5 Exercise 6.3 describes data for a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several non linear regression models.
Impute, datasplit and preprocess…..
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
dim(ChemicalManufacturingProcess)
## [1] 176 58
str(ChemicalManufacturingProcess)
## 'data.frame': 176 obs. of 58 variables:
## $ Yield : num 38 42.4 42 41.4 42.5 ...
## $ BiologicalMaterial01 : num 6.25 8.01 8.01 8.01 7.47 6.12 7.48 6.94 6.94 6.94 ...
## $ BiologicalMaterial02 : num 49.6 61 61 61 63.3 ...
## $ BiologicalMaterial03 : num 57 67.5 67.5 67.5 72.2 ...
## $ BiologicalMaterial04 : num 12.7 14.6 14.6 14.6 14 ...
## $ BiologicalMaterial05 : num 19.5 19.4 19.4 19.4 17.9 ...
## $ BiologicalMaterial06 : num 43.7 53.1 53.1 53.1 54.7 ...
## $ BiologicalMaterial07 : num 100 100 100 100 100 100 100 100 100 100 ...
## $ BiologicalMaterial08 : num 16.7 19 19 19 18.2 ...
## $ BiologicalMaterial09 : num 11.4 12.6 12.6 12.6 12.8 ...
## $ BiologicalMaterial10 : num 3.46 3.46 3.46 3.46 3.05 3.78 3.04 3.85 3.85 3.85 ...
## $ BiologicalMaterial11 : num 138 154 154 154 148 ...
## $ BiologicalMaterial12 : num 18.8 21.1 21.1 21.1 21.1 ...
## $ ManufacturingProcess01: num NA 0 0 0 10.7 12 11.5 12 12 12 ...
## $ ManufacturingProcess02: num NA 0 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess03: num NA NA NA NA NA NA 1.56 1.55 1.56 1.55 ...
## $ ManufacturingProcess04: num NA 917 912 911 918 924 933 929 928 938 ...
## $ ManufacturingProcess05: num NA 1032 1004 1015 1028 ...
## $ ManufacturingProcess06: num NA 210 207 213 206 ...
## $ ManufacturingProcess07: num NA 177 178 177 178 178 177 178 177 177 ...
## $ ManufacturingProcess08: num NA 178 178 177 178 178 178 178 177 177 ...
## $ ManufacturingProcess09: num 43 46.6 45.1 44.9 45 ...
## $ ManufacturingProcess10: num NA NA NA NA NA NA 11.6 10.2 9.7 10.1 ...
## $ ManufacturingProcess11: num NA NA NA NA NA NA 11.5 11.3 11.1 10.2 ...
## $ ManufacturingProcess12: num NA 0 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess13: num 35.5 34 34.8 34.8 34.6 34 32.4 33.6 33.9 34.3 ...
## $ ManufacturingProcess14: num 4898 4869 4878 4897 4992 ...
## $ ManufacturingProcess15: num 6108 6095 6087 6102 6233 ...
## $ ManufacturingProcess16: num 4682 4617 4617 4635 4733 ...
## $ ManufacturingProcess17: num 35.5 34 34.8 34.8 33.9 33.4 33.8 33.6 33.9 35.3 ...
## $ ManufacturingProcess18: num 4865 4867 4877 4872 4886 ...
## $ ManufacturingProcess19: num 6049 6097 6078 6073 6102 ...
## $ ManufacturingProcess20: num 4665 4621 4621 4611 4659 ...
## $ ManufacturingProcess21: num 0 0 0 0 -0.7 -0.6 1.4 0 0 1 ...
## $ ManufacturingProcess22: num NA 3 4 5 8 9 1 2 3 4 ...
## $ ManufacturingProcess23: num NA 0 1 2 4 1 1 2 3 1 ...
## $ ManufacturingProcess24: num NA 3 4 5 18 1 1 2 3 4 ...
## $ ManufacturingProcess25: num 4873 4869 4897 4892 4930 ...
## $ ManufacturingProcess26: num 6074 6107 6116 6111 6151 ...
## $ ManufacturingProcess27: num 4685 4630 4637 4630 4684 ...
## $ ManufacturingProcess28: num 10.7 11.2 11.1 11.1 11.3 11.4 11.2 11.1 11.3 11.4 ...
## $ ManufacturingProcess29: num 21 21.4 21.3 21.3 21.6 21.7 21.2 21.2 21.5 21.7 ...
## $ ManufacturingProcess30: num 9.9 9.9 9.4 9.4 9 10.1 11.2 10.9 10.5 9.8 ...
## $ ManufacturingProcess31: num 69.1 68.7 69.3 69.3 69.4 68.2 67.6 67.9 68 68.5 ...
## $ ManufacturingProcess32: num 156 169 173 171 171 173 159 161 160 164 ...
## $ ManufacturingProcess33: num 66 66 66 68 70 70 65 65 65 66 ...
## $ ManufacturingProcess34: num 2.4 2.6 2.6 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ ManufacturingProcess35: num 486 508 509 496 468 490 475 478 491 488 ...
## $ ManufacturingProcess36: num 0.019 0.019 0.018 0.018 0.017 0.018 0.019 0.019 0.019 0.019 ...
## $ ManufacturingProcess37: num 0.5 2 0.7 1.2 0.2 0.4 0.8 1 1.2 1.8 ...
## $ ManufacturingProcess38: num 3 2 2 2 2 2 2 2 3 3 ...
## $ ManufacturingProcess39: num 7.2 7.2 7.2 7.2 7.3 7.2 7.3 7.3 7.4 7.1 ...
## $ ManufacturingProcess40: num NA 0.1 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess41: num NA 0.15 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess42: num 11.6 11.1 12 10.6 11 11.5 11.7 11.4 11.4 11.3 ...
## $ ManufacturingProcess43: num 3 0.9 1 1.1 1.1 2.2 0.7 0.8 0.9 0.8 ...
## $ ManufacturingProcess44: num 1.8 1.9 1.8 1.8 1.7 1.8 2 2 1.9 1.9 ...
## $ ManufacturingProcess45: num 2.4 2.2 2.3 2.1 2.1 2 2.2 2.2 2.1 2.4 ...
md.pattern(ChemicalManufacturingProcess)
## Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 152 1 1 1 1
## 6 1 1 1 1
## 1 1 1 1 1
## 7 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess09 ManufacturingProcess13 ManufacturingProcess15
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess32 ManufacturingProcess37 ManufacturingProcess38
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess39 ManufacturingProcess42 ManufacturingProcess43
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess44 ManufacturingProcess45 ManufacturingProcess01
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 0
## 0 0 1
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess07
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 0 0
## 1 1 1
## ManufacturingProcess08 ManufacturingProcess12 ManufacturingProcess14
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 0
## 1 0 0 1
## 1 1 1
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 0 0
## 1 1 1
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess06
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 0
## 1 1 1 1
## 1 0 0 0
## 1 1 2
## ManufacturingProcess02 ManufacturingProcess25 ManufacturingProcess26
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 0 0
## 2 0 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 1 1
## 3 5 5
## ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess33
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess03
## 152 1 1 1 0
## 6 1 1 0 1
## 1 1 0 1 1
## 7 0 0 0 3
## 5 1 1 1 11
## 2 1 1 1 1
## 1 1 1 1 1
## 1 0 0 0 4
## 1 0 0 0 16
## 9 10 15 106
countNA<-colSums(is.na(ChemicalManufacturingProcess))
countNA
## Yield BiologicalMaterial01 BiologicalMaterial02
## 0 0 0
## BiologicalMaterial03 BiologicalMaterial04 BiologicalMaterial05
## 0 0 0
## BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial08
## 0 0 0
## BiologicalMaterial09 BiologicalMaterial10 BiologicalMaterial11
## 0 0 0
## BiologicalMaterial12 ManufacturingProcess01 ManufacturingProcess02
## 0 1 3
## ManufacturingProcess03 ManufacturingProcess04 ManufacturingProcess05
## 15 1 1
## ManufacturingProcess06 ManufacturingProcess07 ManufacturingProcess08
## 2 1 1
## ManufacturingProcess09 ManufacturingProcess10 ManufacturingProcess11
## 0 9 10
## ManufacturingProcess12 ManufacturingProcess13 ManufacturingProcess14
## 1 0 1
## ManufacturingProcess15 ManufacturingProcess16 ManufacturingProcess17
## 0 0 0
## ManufacturingProcess18 ManufacturingProcess19 ManufacturingProcess20
## 0 0 0
## ManufacturingProcess21 ManufacturingProcess22 ManufacturingProcess23
## 0 1 1
## ManufacturingProcess24 ManufacturingProcess25 ManufacturingProcess26
## 1 5 5
## ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## 5 5 5
## ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess32
## 5 5 0
## ManufacturingProcess33 ManufacturingProcess34 ManufacturingProcess35
## 5 5 5
## ManufacturingProcess36 ManufacturingProcess37 ManufacturingProcess38
## 5 0 0
## ManufacturingProcess39 ManufacturingProcess40 ManufacturingProcess41
## 0 1 1
## ManufacturingProcess42 ManufacturingProcess43 ManufacturingProcess44
## 0 0 0
## ManufacturingProcess45
## 0
#Look at predictor part of dataset
pred <- ChemicalManufacturingProcess[,-c(1)]
#Imput with KNN
Imp_pred <- preProcess(pred, method="knnImpute")
#predict function
pred1 <- predict(Imp_pred, pred)
pred2 <- preProcess(pred1, method=c("center", "scale"))
pred3 <- predict(pred2, pred1)
#split
set.seed(200)
trainingRows <- createDataPartition(ChemicalManufacturingProcess$Yield,
p=0.75, list=FALSE)
train_X2 <- pred3[trainingRows, ]
train_Y2 <- ChemicalManufacturingProcess$Yield[trainingRows]
test_X2 <- pred3[-trainingRows, ]
test_Y2 <- ChemicalManufacturingProcess$Yield[-trainingRows]
## (a) Which nonlinear regression model gives the optimal resampling and test set performance?
set.seed(200)
knnModel2<-train(x=train_X2,
y=train_Y2,
method="knn",
preProc = c("center", "scale"),
tuneLength=10)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
knnModel2
## k-Nearest Neighbors
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 1.549684 0.3261105 1.223911
## 7 1.506840 0.3461634 1.199741
## 9 1.483347 0.3626527 1.191919
## 11 1.468087 0.3787644 1.182889
## 13 1.457101 0.3929912 1.171540
## 15 1.465710 0.3889257 1.188223
## 17 1.471753 0.3864912 1.188771
## 19 1.470451 0.3930200 1.187516
## 21 1.469651 0.4002355 1.189690
## 23 1.474788 0.4047451 1.192799
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 13.
The final value used for the model was k = 13. RMSE=1.457101 .
knnPred2<-predict(knnModel2, newdata=test_X2)
plot(knnModel2)
## The function 'portResample' can be used to get the test set performance values
postResample(pred=knnPred2, obs=test_Y2)
## RMSE Rsquared MAE
## 1.4451812 0.4074479 1.2009441
RMSE=1.4451812
#MARS
set.seed(200)
Mgrid2 <- expand.grid(.degree = 1:2, .nprune = 2:15)
MARSmodel2 <- train(x = train_X2,
y = train_Y2,
method = "earth",
tuneGrid = Mgrid2,
preProcess = c("center", "scale"),
tuneLength = 10)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut =
## 10, : These variables have zero variances: BiologicalMaterial07
MARSmodel2
## Multivariate Adaptive Regression Spline
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## degree nprune RMSE Rsquared MAE
## 1 2 1.490279 0.3755794 1.1718045
## 1 3 1.263663 0.5439061 1.0038594
## 1 4 1.240091 0.5595437 0.9794399
## 1 5 1.284713 0.5331047 1.0046833
## 1 6 1.312602 0.5267242 1.0363784
## 1 7 1.335864 0.5114358 1.0500932
## 1 8 1.388257 0.4859620 1.0834030
## 1 9 1.551251 0.4629298 1.1194819
## 1 10 1.596157 0.4532295 1.1346676
## 1 11 1.771924 0.4250801 1.1845418
## 1 12 1.833954 0.4185417 1.2066192
## 1 13 2.663280 0.3818380 1.3504608
## 1 14 2.668555 0.3793821 1.3613282
## 1 15 2.633944 0.3749723 1.3605887
## 2 2 1.491632 0.3730524 1.1716428
## 2 3 1.379036 0.4764277 1.0871948
## 2 4 1.445515 0.4488930 1.0973929
## 2 5 1.475670 0.4282088 1.1029835
## 2 6 1.524896 0.4137309 1.1229552
## 2 7 1.617197 0.3863619 1.1768719
## 2 8 1.669632 0.3740409 1.2006241
## 2 9 1.817293 0.3430625 1.2538592
## 2 10 1.872404 0.3309127 1.2766724
## 2 11 1.930613 0.3282800 1.3292641
## 2 12 2.039573 0.3038107 1.3849343
## 2 13 2.005695 0.3214613 1.3949230
## 2 14 2.064212 0.3151779 1.4109945
## 2 15 2.165026 0.3176985 1.4397028
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 4 and degree = 1.
plot(MARSmodel2)
Training - TThe final values used for the model were nprune = 4 and degree = 1. RMSE= 1.240091
MARS select the informative predictors using varIMP….
varImp(MARSmodel2)
## earth variable importance
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess09 46.54
## ManufacturingProcess13 0.00
using the test data..
marspred2 <- predict (MARSmodel2, test_X2)
postResample(pred = marspred2, obs = test_Y2)
## RMSE Rsquared MAE
## 1.1285523 0.6370613 0.8805678
set.seed(200)
SVMmodel2 <- train(x = train_X2,
y = train_Y2,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = trainControl(method = "cv"))
SVMmodel2
## Support Vector Machines with Radial Basis Function Kernel
##
## 132 samples
## 57 predictor
##
## Pre-processing: centered (57), scaled (57)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 117, 119, 119, 119, 120, 119, ...
## Resampling results across tuning parameters:
##
## C RMSE Rsquared MAE
## 0.25 1.421201 0.4526430 1.1627465
## 0.50 1.313353 0.5100836 1.0770470
## 1.00 1.238889 0.5483865 1.0060123
## 2.00 1.188412 0.5689254 0.9634596
## 4.00 1.181378 0.5668063 0.9639502
## 8.00 1.173178 0.5690772 0.9606567
## 16.00 1.171902 0.5695347 0.9589138
## 32.00 1.171902 0.5695347 0.9589138
## 64.00 1.171902 0.5695347 0.9589138
## 128.00 1.171902 0.5695347 0.9589138
##
## Tuning parameter 'sigma' was held constant at a value of 0.01390285
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.01390285 and C = 16.
plot(SVMmodel)
## DISCUSSION:
SVM - The final values used for the training model were sigma = 0.01390285 and C = 16. RMSE = 1.171902 .
So run SVM test data…….
SVMpred2 <- predict(SVMmodel2, newdata = test_X2)
postResample(pred = SVMpred2, obs = test_Y2)
## RMSE Rsquared MAE
## 1.1988287 0.5997168 0.9173792
RMSE = 1.198827
Let’s model nnet on the dataset…..
set.seed(200)
nnetmodel2 <- nnet(x=train_X2,
y=train_Y2,
size = 5,
decay = 0.01,
linout = TRUE,
trace = FALSE,
maxit = 500,
MaxNWts = 5 * (ncol(train_X2) + 1) + 5 + 1)
nnetmodel2
## a 57-5-1 network with 296 weights
## options were - linear output units decay=0.01
summary(nnetmodel2)
## a 57-5-1 network with 296 weights
## options were - linear output units decay=0.01
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 i9->h1
## 0.89 0.47 0.78 0.80 -0.23 0.28 0.72 0.59 0.82 -0.31
## i10->h1 i11->h1 i12->h1 i13->h1 i14->h1 i15->h1 i16->h1 i17->h1 i18->h1 i19->h1
## -0.42 -1.24 0.00 -0.53 -0.71 -0.36 0.47 -0.37 -0.22 -0.34
## i20->h1 i21->h1 i22->h1 i23->h1 i24->h1 i25->h1 i26->h1 i27->h1 i28->h1 i29->h1
## 0.35 0.12 -0.15 -0.27 -0.66 0.41 0.10 -0.34 -0.03 0.10
## i30->h1 i31->h1 i32->h1 i33->h1 i34->h1 i35->h1 i36->h1 i37->h1 i38->h1 i39->h1
## -0.04 -0.09 -0.02 -0.17 -0.59 -0.05 -0.42 0.03 -0.41 -0.13
## i40->h1 i41->h1 i42->h1 i43->h1 i44->h1 i45->h1 i46->h1 i47->h1 i48->h1 i49->h1
## 0.04 -0.01 0.14 0.46 0.95 -0.95 -0.11 0.08 -0.47 -0.40
## i50->h1 i51->h1 i52->h1 i53->h1 i54->h1 i55->h1 i56->h1 i57->h1
## 0.00 -0.49 0.94 -0.21 0.11 0.49 0.28 0.02
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 i9->h2
## 1.42 -0.45 -0.34 0.03 -0.03 0.70 0.19 0.05 -0.47 -0.30
## i10->h2 i11->h2 i12->h2 i13->h2 i14->h2 i15->h2 i16->h2 i17->h2 i18->h2 i19->h2
## 0.03 -0.35 0.57 -0.62 -1.04 0.29 -0.22 0.02 0.42 0.08
## i20->h2 i21->h2 i22->h2 i23->h2 i24->h2 i25->h2 i26->h2 i27->h2 i28->h2 i29->h2
## -0.12 0.09 0.44 -0.27 -0.02 -0.31 0.03 -0.21 -0.58 -0.42
## i30->h2 i31->h2 i32->h2 i33->h2 i34->h2 i35->h2 i36->h2 i37->h2 i38->h2 i39->h2
## 0.72 -0.55 -0.17 0.18 -0.59 0.17 0.20 0.04 -0.42 0.10
## i40->h2 i41->h2 i42->h2 i43->h2 i44->h2 i45->h2 i46->h2 i47->h2 i48->h2 i49->h2
## -0.37 0.23 0.11 -0.06 0.25 -0.10 -0.21 -0.14 -0.02 -0.23
## i50->h2 i51->h2 i52->h2 i53->h2 i54->h2 i55->h2 i56->h2 i57->h2
## -0.50 -0.05 -0.86 0.44 0.18 0.48 -0.32 0.72
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 i9->h3
## 1.45 0.97 -0.28 -0.48 -0.32 -0.07 -0.09 0.10 0.59 -0.11
## i10->h3 i11->h3 i12->h3 i13->h3 i14->h3 i15->h3 i16->h3 i17->h3 i18->h3 i19->h3
## 0.12 -0.09 -0.36 0.41 0.08 -0.34 0.37 -0.14 -0.28 0.19
## i20->h3 i21->h3 i22->h3 i23->h3 i24->h3 i25->h3 i26->h3 i27->h3 i28->h3 i29->h3
## -0.31 -0.22 -0.32 0.23 -0.03 -0.60 0.47 -0.35 0.03 -0.57
## i30->h3 i31->h3 i32->h3 i33->h3 i34->h3 i35->h3 i36->h3 i37->h3 i38->h3 i39->h3
## 0.23 0.04 0.53 -0.26 -0.04 -0.38 0.11 0.21 -0.09 -0.35
## i40->h3 i41->h3 i42->h3 i43->h3 i44->h3 i45->h3 i46->h3 i47->h3 i48->h3 i49->h3
## 0.14 0.70 0.45 0.40 0.00 0.13 -0.21 0.21 0.30 0.18
## i50->h3 i51->h3 i52->h3 i53->h3 i54->h3 i55->h3 i56->h3 i57->h3
## -0.49 -0.19 -0.12 -0.48 0.28 -0.43 0.65 -0.72
## b->h4 i1->h4 i2->h4 i3->h4 i4->h4 i5->h4 i6->h4 i7->h4 i8->h4 i9->h4
## 1.51 -0.29 0.04 0.32 0.05 -0.43 0.74 -1.09 0.01 0.32
## i10->h4 i11->h4 i12->h4 i13->h4 i14->h4 i15->h4 i16->h4 i17->h4 i18->h4 i19->h4
## 0.28 -0.62 -0.05 -0.28 0.83 0.12 -0.11 0.69 0.09 0.09
## i20->h4 i21->h4 i22->h4 i23->h4 i24->h4 i25->h4 i26->h4 i27->h4 i28->h4 i29->h4
## -0.02 -0.18 0.18 -0.35 0.38 -0.30 -0.06 0.70 -0.31 -0.23
## i30->h4 i31->h4 i32->h4 i33->h4 i34->h4 i35->h4 i36->h4 i37->h4 i38->h4 i39->h4
## 0.67 0.34 0.43 -0.07 0.61 0.20 -0.04 -0.19 -0.06 -0.06
## i40->h4 i41->h4 i42->h4 i43->h4 i44->h4 i45->h4 i46->h4 i47->h4 i48->h4 i49->h4
## -0.28 -0.10 -0.11 0.50 0.76 -0.52 -0.06 -0.39 0.32 0.37
## i50->h4 i51->h4 i52->h4 i53->h4 i54->h4 i55->h4 i56->h4 i57->h4
## 0.12 0.67 -0.21 0.06 0.39 0.37 -0.83 0.04
## b->h5 i1->h5 i2->h5 i3->h5 i4->h5 i5->h5 i6->h5 i7->h5 i8->h5 i9->h5
## 1.35 0.13 -0.26 -0.39 -0.36 0.19 -0.38 -0.11 -0.44 0.40
## i10->h5 i11->h5 i12->h5 i13->h5 i14->h5 i15->h5 i16->h5 i17->h5 i18->h5 i19->h5
## 0.24 0.63 0.01 0.21 -0.04 -0.09 0.29 -0.15 0.15 0.10
## i20->h5 i21->h5 i22->h5 i23->h5 i24->h5 i25->h5 i26->h5 i27->h5 i28->h5 i29->h5
## 0.27 0.36 0.74 -0.10 0.46 0.53 -0.16 0.56 0.30 0.36
## i30->h5 i31->h5 i32->h5 i33->h5 i34->h5 i35->h5 i36->h5 i37->h5 i38->h5 i39->h5
## 0.92 -0.51 -0.36 0.36 0.65 -0.11 -0.46 0.09 0.01 -0.05
## i40->h5 i41->h5 i42->h5 i43->h5 i44->h5 i45->h5 i46->h5 i47->h5 i48->h5 i49->h5
## -0.29 -0.37 -0.45 0.27 -0.62 -0.22 0.18 0.82 -0.81 -0.06
## i50->h5 i51->h5 i52->h5 i53->h5 i54->h5 i55->h5 i56->h5 i57->h5
## 0.65 0.32 -0.19 0.75 0.36 -0.23 -0.40 -0.29
## b->o h1->o h2->o h3->o h4->o h5->o
## 11.06 8.18 8.11 8.29 10.04 7.22
Results:
a 57-5-1 network with 296 weights options were - linear output units decay=0.01
nnetpred <- predict(nnetmodel2, newdata = test_X2)
postResample(pred = nnetpred, obs = test_X2)
## RMSE Rsquared MAE
## 39.34299 NA 39.13668
The SVM model is superior with an training = 1.1719 and test RMSE = 1.019. However, the MARS is a close contender.
varImp(SVMmodel2)
## loess r-squared variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess17 80.31
## BiologicalMaterial06 75.09
## ManufacturingProcess13 74.47
## BiologicalMaterial03 67.11
## ManufacturingProcess06 66.44
## ManufacturingProcess36 66.30
## BiologicalMaterial12 65.94
## ManufacturingProcess09 65.29
## BiologicalMaterial02 55.94
## ManufacturingProcess31 53.57
## ManufacturingProcess29 47.17
## ManufacturingProcess33 45.79
## BiologicalMaterial08 44.98
## ManufacturingProcess11 41.72
## ManufacturingProcess02 41.58
## BiologicalMaterial11 41.18
## BiologicalMaterial04 40.31
## BiologicalMaterial09 35.12
## BiologicalMaterial01 34.15
Similar to the linear model, 11 of the 20 predictors are manufacturing.
The top ten from the linear model elastnet were: ## Overall Elastic net var importance ### ManufacturingProcess13 100.00 ### ManufacturingProcess32 93.67 ### BiologicalMaterial03 92.86 ### BiologicalMaterial06 86.68 ### ManufacturingProcess17 80.34 ### BiologicalMaterial12 76.76 ### ManufacturingProcess09 76.15 ### ManufacturingProcess36 75.95 ### ManufacturingProcess06 63.29 ### BiologicalMaterial02 59.13 ### ManufacturingProcess11 54.38 ### BiologicalMaterial11 53.65 ### ManufacturingProcess31 51.81 ### BiologicalMaterial04 48.83 ### BiologicalMaterial09 47.43 ### ManufacturingProcess18 43.88 ### ManufacturingProcess30 42.31 ### BiologicalMaterial08 40.33 ### BiologicalMaterial01 40.08 ### ManufacturingProcess33 38.33
The top 5 important predictors from the linear (elasticnet) and nonlinear (SVM) models match exactly, only with an ordering difference. The second 5-10 important predictors from linear and nonlinear also match exactly with an ordering difference.
The top ten are exactly the same in both models.
Furthermore, 18 of the 20 variables matched exactly.
Let’s look at the top five variables of importance…
use1 <- pred3 %>% cbind(ChemicalManufacturingProcess$Yield)
# Manufacturing Process 32 1st
cor(use1$ManufacturingProcess32,use1$`ChemicalManufacturingProcess$Yield`)
## [1] 0.6083321
ggplot(use1, aes(use1$ManufacturingProcess32,use1$`ChemicalManufacturingProcess$Yield`)) +
geom_point()
## Warning: Use of `use1$ManufacturingProcess32` is discouraged. Use
## `ManufacturingProcess32` instead.
## Warning: Use of `use1$`ChemicalManufacturingProcess$Yield`` is discouraged. Use
## `ChemicalManufacturingProcess$Yield` instead.
# Manufacturing Process 17 2nd
cor(use1$ManufacturingProcess17,use1$`ChemicalManufacturingProcess$Yield`)
## [1] -0.4258069
ggplot(use1, aes(use1$ManufacturingProcess17,use1$`ChemicalManufacturingProcess$Yield`)) +
geom_point()
## Warning: Use of `use1$ManufacturingProcess17` is discouraged. Use
## `ManufacturingProcess17` instead.
## Warning: Use of `use1$`ChemicalManufacturingProcess$Yield`` is discouraged. Use
## `ChemicalManufacturingProcess$Yield` instead.
# Biological Process 06 3rd
cor(use1$BiologicalMaterial06,use1$`ChemicalManufacturingProcess$Yield`)
## [1] 0.4781634
ggplot(use1, aes(use1$BiologicalMaterial06,use1$`ChemicalManufacturingProcess$Yield`)) +
geom_point()
## Warning: Use of `use1$BiologicalMaterial06` is discouraged. Use
## `BiologicalMaterial06` instead.
## Warning: Use of `use1$`ChemicalManufacturingProcess$Yield`` is discouraged. Use
## `ChemicalManufacturingProcess$Yield` instead.
# Manufacturing Process 13 4th
cor(use1$ManufacturingProcess13,use1$`ChemicalManufacturingProcess$Yield`)
## [1] -0.5036797
ggplot(use1, aes(use1$ManufacturingProcess13,use1$`ChemicalManufacturingProcess$Yield`)) +
geom_point()
## Warning: Use of `use1$ManufacturingProcess13` is discouraged. Use
## `ManufacturingProcess13` instead.
## Warning: Use of `use1$`ChemicalManufacturingProcess$Yield`` is discouraged. Use
## `ChemicalManufacturingProcess$Yield` instead.
# Biological Process 03 5th
cor(use1$BiologicalMaterial03,use1$`ChemicalManufacturingProcess$Yield`)
## [1] 0.445086
ggplot(use1, aes(use1$BiologicalMaterial03,use1$`ChemicalManufacturingProcess$Yield`)) +
geom_point()
## Warning: Use of `use1$BiologicalMaterial03` is discouraged. Use
## `BiologicalMaterial03` instead.
## Warning: Use of `use1$`ChemicalManufacturingProcess$Yield`` is discouraged. Use
## `ChemicalManufacturingProcess$Yield` instead.
# Manufacturing Process 06 6th
cor(use1$ManufacturingProcess06,use1$`ChemicalManufacturingProcess$Yield`)
## [1] 0.3918329
ggplot(use1, aes(use1$ManufacturingProcess06,use1$`ChemicalManufacturingProcess$Yield`)) +
geom_point()
## Warning: Use of `use1$ManufacturingProcess06` is discouraged. Use
## `ManufacturingProcess06` instead.
## Warning: Use of `use1$`ChemicalManufacturingProcess$Yield`` is discouraged. Use
## `ChemicalManufacturingProcess$Yield` instead.
# Manufacturing Process 36 7th
cor(use1$ManufacturingProcess36,use1$`ChemicalManufacturingProcess$Yield`)
## [1] -0.5237389
ggplot(use1, aes(use1$ManufacturingProcess36,use1$`ChemicalManufacturingProcess$Yield`)) +
geom_point()
## Warning: Use of `use1$ManufacturingProcess36` is discouraged. Use
## `ManufacturingProcess36` instead.
## Warning: Use of `use1$`ChemicalManufacturingProcess$Yield`` is discouraged. Use
## `ChemicalManufacturingProcess$Yield` instead.
# Biological Process 12 8th
cor(use1$BiologicalMaterial12,use1$`ChemicalManufacturingProcess$Yield`)
## [1] 0.3674976
ggplot(use1, aes(use1$BiologicalMaterial12,use1$`ChemicalManufacturingProcess$Yield`)) +
geom_point()
## Warning: Use of `use1$BiologicalMaterial12` is discouraged. Use
## `BiologicalMaterial12` instead.
## Warning: Use of `use1$`ChemicalManufacturingProcess$Yield`` is discouraged. Use
## `ChemicalManufacturingProcess$Yield` instead.
# Manufacturing Process 09 9th
cor(use1$ManufacturingProcess09,use1$`ChemicalManufacturingProcess$Yield`)
## [1] 0.5034705
ggplot(use1, aes(use1$ManufacturingProcess09,use1$`ChemicalManufacturingProcess$Yield`)) +
geom_point()
## Warning: Use of `use1$ManufacturingProcess09` is discouraged. Use
## `ManufacturingProcess09` instead.
## Warning: Use of `use1$`ChemicalManufacturingProcess$Yield`` is discouraged. Use
## `ChemicalManufacturingProcess$Yield` instead.
# Biological Process 02 10th
cor(use1$BiologicalMaterial02,use1$`ChemicalManufacturingProcess$Yield`)
## [1] 0.4815158
ggplot(use1, aes(use1$BiologicalMaterial02,use1$`ChemicalManufacturingProcess$Yield`)) +
geom_point()
## Warning: Use of `use1$BiologicalMaterial02` is discouraged. Use
## `BiologicalMaterial02` instead.
## Warning: Use of `use1$`ChemicalManufacturingProcess$Yield`` is discouraged. Use
## `ChemicalManufacturingProcess$Yield` instead.
## DISCUSSION:
The correlation does not adjust for other variables. This information is helpful to know how one variables is correlated to another with no other variables in the mix.
This could help in future decision making.
In all of the 10 important variables, the correlation is in the range of |.39, .6| depicting moderate correlation.
3 of the 10 variables have negative correlation, M32, M13, M36. However, the top variable is Manufacturing 32 with the highest correlation of .6. This should be further understood, because perhaps a manufacturing process could be tweaked more easily than a biological process.