library(AppliedPredictiveModeling)
library(psych)
library(reshape2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x ggplot2::%+%() masks psych::%+%()
## x ggplot2::alpha() masks psych::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(caTools)
library(elasticnet)
## Loading required package: lars
## Loaded lars 1.2
##
## Attaching package: 'lars'
## The following object is masked from 'package:psych':
##
## error.bars
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
library(mlbench)
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
##
## Attaching package: 'plotrix'
## The following object is masked from 'package:psych':
##
## rescale
## Loading required package: TeachingDemos
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
## The following object is masked from 'package:psych':
##
## alpha
library(nnet)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:kernlab':
##
## prior
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
library(gbm)
## Loaded gbm 2.1.8
library(rpart)
library(Cubist)
Recreate the simulated data from Exercise 7.2:
library(mlbench)
set.seed(200)
simulated<-mlbench.friedman1(200, sd=1)
simulated<-cbind(simulated$x, simulated$y)
simulated<-as.data.frame(simulated)
colnames(simulated)[ncol(simulated)]<-"y"
Fit a random forest model to all of the predictors, then estimate the variable importance scores:
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:psych':
##
## outlier
library(caret)
model1<-randomForest(y~., data=simulated,
importance=TRUE,
ntree=1000)
rfImp1<-varImp(model1, scale=FALSE)
model1
##
## Call:
## randomForest(formula = y ~ ., data = simulated, importance = TRUE, ntree = 1000)
## Type of random forest: regression
## Number of trees: 1000
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 6.754258
## % Var explained: 72.3
rfImp1
## Overall
## V1 8.732235404
## V2 6.415369387
## V3 0.763591825
## V4 7.615118809
## V5 2.023524577
## V6 0.165111172
## V7 -0.005961659
## V8 -0.166362581
## V9 -0.095292651
## V10 -0.074944788
varImpPlot(model1)
Did the random forest model significantly use the uninformative predictors (V6-V10)?
The importance scores for V6-V10 are much smaller than those of the informative predictors V1-V5. The random forest model did not significantly use the uninformative predictors V6-V10.
Now add an additional predictor that is highly correlated with one of the informative predictors. For example:
simulated$duplicate1<-simulated$V1 +rnorm(200)*.1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9460206
Fit another random forest model to these data.
model2<-randomForest(y~., data=simulated,
importance=TRUE,
ntree=1000)
rfImp2<-varImp(model2, scale=FALSE)
model2
##
## Call:
## randomForest(formula = y ~ ., data = simulated, importance = TRUE, ntree = 1000)
## Type of random forest: regression
## Number of trees: 1000
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 6.922537
## % Var explained: 71.61
rfImp2
## Overall
## V1 5.69119973
## V2 6.06896061
## V3 0.62970218
## V4 7.04752238
## V5 1.87238438
## V6 0.13569065
## V7 -0.01345645
## V8 -0.04370565
## V9 0.00840438
## V10 0.02894814
## duplicate1 4.28331581
varImpPlot(model2)
Did the importance score for V1 change?
The importance score for V1 dropped from about 8.7 to about 5.7 after adding the highly correlated variable. The duplicate variable itself was ranked as important, while the uninformative variables remained near zero.
What happens when you add another predictor that is also highly correlated with V1?
simulated$duplicate2<-simulated$V1 +(rnorm(200)*.1+3)
cor(simulated$duplicate2, simulated$V1)
## [1] 0.9408631
model3<-randomForest(y~., data=simulated,
importance=TRUE,
ntree=1000)
rfImp3<-varImp(model3, scale=FALSE)
model3
##
## Call:
## randomForest(formula = y ~ ., data = simulated, importance = TRUE, ntree = 1000)
## Type of random forest: regression
## Number of trees: 1000
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 6.784205
## % Var explained: 72.18
rfImp3
## Overall
## V1 4.91687329
## V2 6.52816504
## V3 0.58711552
## V4 7.04870917
## V5 2.03115561
## V6 0.14213148
## V7 0.10991985
## V8 -0.08405687
## V9 -0.01075028
## V10 0.09230576
## duplicate1 3.80068234
## duplicate2 1.87721959
varImpPlot(model3)
What happens when you add another predictor that is also highly correlated with V1?
Because V1 was highly important, its highly correlated duplicates are ranked as important as well. Adding duplicate2 took additional importance away from V1 and a smaller amount away from duplicate1.
Use the cforest function in the party package to fit a random forest model using conditional inference trees. The party package function varimp can calculate predictor importance. The conditional argument of that function toggles between the traditional importance measure and the modified version described in Strobl et al. (2007). Do these importances show the same pattern as the traditional random forest model?
simulated_orig <- subset(simulated, select =c( -duplicate1,-duplicate2))
modelCF <- cforest(y ~ ., data = simulated_orig)
CF_imp <- varimp(modelCF)   # party's permutation importance (a named numeric vector)
CF_imp <- varImp(modelCF)   # caret's varImp method for cforest; overwrites the vector above with a data frame
modelCF
##
## Random Forest using Conditional Inference Trees
##
## Number of trees: 500
##
## Response: y
## Inputs: V1, V2, V3, V4, V5, V6, V7, V8, V9, V10
## Number of observations: 200
CF_imp
## Overall
## V1 9.088406489
## V2 6.616405032
## V3 -0.007299805
## V4 8.193748339
## V5 1.879879716
## V6 -0.058979879
## V7 0.028906702
## V8 -0.011912122
## V9 -0.046523639
## V10 -0.041972268
# Compare with the traditional random forest importances (rfImp1)
rfImp1
## Overall
## V1 8.732235404
## V2 6.415369387
## V3 0.763591825
## V4 7.615118809
## V5 2.023524577
## V6 0.165111172
## V7 -0.005961659
## V8 -0.166362581
## V9 -0.095292651
## V10 -0.074944788
The informative predictors remain at the top of both rankings; however, the conditional inference forest puts more emphasis on V4 and essentially none on V3, which received a small positive score from the traditional random forest.
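The conditional importance measure of Strobl et al. (2007) was not computed above; a minimal sketch of how to request it from party::varimp is shown below (it is considerably slower, and its values will differ from the unconditional scores printed above).
# Conditional permutation importance (Strobl et al. 2007); slower than the default
set.seed(200)
CF_imp_cond <- party::varimp(modelCF, conditional = TRUE)
sort(CF_imp_cond, decreasing = TRUE)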
Repeat this process with different tree models, such as boosted trees and Cubist. Does the same pattern occur?
#boosted trees
model_Boost <- gbm(y ~ ., data = simulated_orig, distribution="gaussian")
summary(model_Boost)
## var rel.inf
## V4 V4 33.1328535
## V1 V1 27.7045971
## V2 V2 21.8020792
## V5 V5 9.6321219
## V3 V3 7.5195471
## V6 V6 0.2088014
## V7 V7 0.0000000
## V8 V8 0.0000000
## V9 V9 0.0000000
## V10 V10 0.0000000
The same top predictors remain important in the boosted model, but V4 is now the most important.
simulated_orig2<-subset(simulated, select =c(-y))
model_cubist <- cubist(simulated_orig2, simulated_orig$y)
model_cubist
##
## Call:
## cubist.default(x = simulated_orig2, y = simulated_orig$y)
##
## Number of samples: 200
## Number of predictors: 12
##
## Number of committees: 1
## Number of rules: 1
summary(model_cubist)
##
## Call:
## cubist.default(x = simulated_orig2, y = simulated_orig$y)
##
##
## Cubist [Release 2.07 GPL Edition] Sun Nov 21 16:47:24 2021
## ---------------------------------
##
## Target attribute `outcome'
##
## Read 200 cases (13 attributes) from undefined.data
##
## Model:
##
## Rule 1: [200 cases, mean 14.416183, range 3.55596 to 28.38167, est err 1.936506]
##
## outcome = 0.269253 + 8.9 V4 + 7.1 V2 + 5.1 V5 + 4.8 V1 + 3.2 duplicate1
##
##
## Evaluation on training data (200 cases):
##
## Average |error| 2.012236
## Relative |error| 0.50
## Correlation coefficient 0.87
##
##
## Attribute usage:
## Conds Model
##
## 100% V1
## 100% V2
## 100% V4
## 100% V5
## 100% duplicate1
##
##
## Time: 0.0 secs
varImp(model_cubist)
## Overall
## V1 50
## V2 50
## V4 50
## V5 50
## duplicate1 50
## V3 0
## V6 0
## V7 0
## V8 0
## V9 0
## V10 0
## duplicate2 0
Cubist's single rule uses only V1, V2, V4, V5, and duplicate1 (the duplicate columns were left in the predictor set here); all other predictors receive zero importance.
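As a quick, hypothetical check (not run above), increasing the number of committees can pull additional predictors into the Cubist rules; a sketch:
# Sketch: Cubist with 10 committees instead of the default single committee
model_cubist10 <- cubist(simulated_orig2, simulated_orig$y, committees = 10)
varImp(model_cubist10)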
Use simulation to show tree bias with different granularities.
library(rpart.plot)
example <- twoClassSim(100, linearVars=10,
noiseVars = 6,
corrVar=4,
corrValue = 0.8,
mislabel= 0)
fit <- rpart(Linear01 ~ ., example)
rpart.plot(fit)
Discussion, from the text: “Finally, these trees suffer from selection bias: predictors with a higher number of distinct values are favored over more granular predictors…. The danger occurs when a data set consists of a mix of informative and noise variables and the noise variables have many more splits than the informative variables. Then there is a high probability that the noise variables will be chosen to split the top nodes of the tree. Pruning will produce either a tree with misleading structure or no tree at all.”
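A more direct illustration of this bias is sketched below (an assumption of this sketch: both predictors are pure noise, so any split is spurious); the predictor with many distinct values is chosen for the root split far more often than the two-level predictor.
set.seed(200)
firstSplit <- replicate(500, {
  dat <- data.frame(y = rnorm(100),
                    coarse = factor(sample(c("a", "b"), 100, replace = TRUE)),  # 2 distinct values
                    fine = rnorm(100))                                          # ~100 distinct values
  # force a single split even though neither predictor is informative
  fit <- rpart(y ~ coarse + fine, data = dat,
               control = rpart.control(maxdepth = 1, cp = -1, minsplit = 2))
  as.character(fit$frame$var[1])
})
table(firstSplit)  # 'fine' should dominate the root splits, illustrating the selection bias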
In stochastic gradient boosting the bagging fraction and learning rate will govern the construction of the trees as they are guided by the gradient. Although the optimal values of these parameters should be obtained through the tuning process, it is helpful to understand how the magnitudes of these parameters affect magnitudes of variable importance. Figure 8.24 provides the variable importance plots for boosting using two extreme values for the bagging fraction (0.1 and 0.9) and the learning rate (0.1 and 0.9) for the solubility data. The left-hand plot has both parameters set to 0.1, and the right-hand plot has both set to 0.9:
The learning rate takes values in (0, 1] and is a tuning parameter of the model. Because boosting greedily selects the optimal learner at each stage, it can overfit; shrinking each tree's contribution with a small learning rate regularizes the fit. Small values of the learning rate are generally recommended, although they require more trees and therefore more computing time.
Friedman added random sampling to boosting to reduce prediction variance: the bagging fraction, the proportion of the training data used to grow each tree, is a tuning parameter. This modification improved prediction while reducing computing resources, and Friedman suggests a bagging fraction of about 0.5.
A small learning rate spreads the model, and therefore the importance, across more predictors, whereas a large learning rate makes the fit greedier and more prone to overfitting. A small bagging fraction means each tree sees a different random subset of the data, which also allows other variables to be selected.
The right-hand plot depicts a large bagging fraction and a large learning rate; these two settings are the extremes. A review of the resampled RMSE would assist in picking a model, but since Ridgeway suggests small values of the learning rate and Friedman suggests a bagging fraction of about 0.5, the left-hand model may serve better.
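A sketch of how the two extremes in Figure 8.24 could be reproduced on the solubility data (not run in this report; the n.trees and interaction.depth values here are assumptions, not the settings used for the figure):
data(solubility)  # solTrainXtrans and solTrainY from AppliedPredictiveModeling
gbmLow <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                  n.trees = 100, interaction.depth = 7,
                  shrinkage = 0.1, bag.fraction = 0.1, verbose = FALSE)
gbmHigh <- gbm.fit(solTrainXtrans, solTrainY, distribution = "gaussian",
                   n.trees = 100, interaction.depth = 7,
                   shrinkage = 0.9, bag.fraction = 0.9, verbose = FALSE)
head(summary(gbmLow, plotit = FALSE), 10)   # importance spread over many predictors
head(summary(gbmHigh, plotit = FALSE), 10)  # importance concentrated in a few predictors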
Increasing the interaction depth increases the depth of each tree, i.e. the number of splits and terminal nodes. Deeper trees add complexity: the additional splits can bring more predictors into the model or further partition predictors that are already used, which spreads importance across more variables but also raises the risk of overfitting, so the trees may need pruning.
Refer to Exercises 6.3 and 7.5, which describe a chemical manufacturing process. Use the same data imputation, data splitting, and pre-processing steps as before and train several tree-based models.
Impute, split, and pre-process the data:
library(AppliedPredictiveModeling)
data("ChemicalManufacturingProcess")
dim(ChemicalManufacturingProcess)
## [1] 176 58
str(ChemicalManufacturingProcess)
## 'data.frame': 176 obs. of 58 variables:
## $ Yield : num 38 42.4 42 41.4 42.5 ...
## $ BiologicalMaterial01 : num 6.25 8.01 8.01 8.01 7.47 6.12 7.48 6.94 6.94 6.94 ...
## $ BiologicalMaterial02 : num 49.6 61 61 61 63.3 ...
## $ BiologicalMaterial03 : num 57 67.5 67.5 67.5 72.2 ...
## $ BiologicalMaterial04 : num 12.7 14.6 14.6 14.6 14 ...
## $ BiologicalMaterial05 : num 19.5 19.4 19.4 19.4 17.9 ...
## $ BiologicalMaterial06 : num 43.7 53.1 53.1 53.1 54.7 ...
## $ BiologicalMaterial07 : num 100 100 100 100 100 100 100 100 100 100 ...
## $ BiologicalMaterial08 : num 16.7 19 19 19 18.2 ...
## $ BiologicalMaterial09 : num 11.4 12.6 12.6 12.6 12.8 ...
## $ BiologicalMaterial10 : num 3.46 3.46 3.46 3.46 3.05 3.78 3.04 3.85 3.85 3.85 ...
## $ BiologicalMaterial11 : num 138 154 154 154 148 ...
## $ BiologicalMaterial12 : num 18.8 21.1 21.1 21.1 21.1 ...
## $ ManufacturingProcess01: num NA 0 0 0 10.7 12 11.5 12 12 12 ...
## $ ManufacturingProcess02: num NA 0 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess03: num NA NA NA NA NA NA 1.56 1.55 1.56 1.55 ...
## $ ManufacturingProcess04: num NA 917 912 911 918 924 933 929 928 938 ...
## $ ManufacturingProcess05: num NA 1032 1004 1015 1028 ...
## $ ManufacturingProcess06: num NA 210 207 213 206 ...
## $ ManufacturingProcess07: num NA 177 178 177 178 178 177 178 177 177 ...
## $ ManufacturingProcess08: num NA 178 178 177 178 178 178 178 177 177 ...
## $ ManufacturingProcess09: num 43 46.6 45.1 44.9 45 ...
## $ ManufacturingProcess10: num NA NA NA NA NA NA 11.6 10.2 9.7 10.1 ...
## $ ManufacturingProcess11: num NA NA NA NA NA NA 11.5 11.3 11.1 10.2 ...
## $ ManufacturingProcess12: num NA 0 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess13: num 35.5 34 34.8 34.8 34.6 34 32.4 33.6 33.9 34.3 ...
## $ ManufacturingProcess14: num 4898 4869 4878 4897 4992 ...
## $ ManufacturingProcess15: num 6108 6095 6087 6102 6233 ...
## $ ManufacturingProcess16: num 4682 4617 4617 4635 4733 ...
## $ ManufacturingProcess17: num 35.5 34 34.8 34.8 33.9 33.4 33.8 33.6 33.9 35.3 ...
## $ ManufacturingProcess18: num 4865 4867 4877 4872 4886 ...
## $ ManufacturingProcess19: num 6049 6097 6078 6073 6102 ...
## $ ManufacturingProcess20: num 4665 4621 4621 4611 4659 ...
## $ ManufacturingProcess21: num 0 0 0 0 -0.7 -0.6 1.4 0 0 1 ...
## $ ManufacturingProcess22: num NA 3 4 5 8 9 1 2 3 4 ...
## $ ManufacturingProcess23: num NA 0 1 2 4 1 1 2 3 1 ...
## $ ManufacturingProcess24: num NA 3 4 5 18 1 1 2 3 4 ...
## $ ManufacturingProcess25: num 4873 4869 4897 4892 4930 ...
## $ ManufacturingProcess26: num 6074 6107 6116 6111 6151 ...
## $ ManufacturingProcess27: num 4685 4630 4637 4630 4684 ...
## $ ManufacturingProcess28: num 10.7 11.2 11.1 11.1 11.3 11.4 11.2 11.1 11.3 11.4 ...
## $ ManufacturingProcess29: num 21 21.4 21.3 21.3 21.6 21.7 21.2 21.2 21.5 21.7 ...
## $ ManufacturingProcess30: num 9.9 9.9 9.4 9.4 9 10.1 11.2 10.9 10.5 9.8 ...
## $ ManufacturingProcess31: num 69.1 68.7 69.3 69.3 69.4 68.2 67.6 67.9 68 68.5 ...
## $ ManufacturingProcess32: num 156 169 173 171 171 173 159 161 160 164 ...
## $ ManufacturingProcess33: num 66 66 66 68 70 70 65 65 65 66 ...
## $ ManufacturingProcess34: num 2.4 2.6 2.6 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
## $ ManufacturingProcess35: num 486 508 509 496 468 490 475 478 491 488 ...
## $ ManufacturingProcess36: num 0.019 0.019 0.018 0.018 0.017 0.018 0.019 0.019 0.019 0.019 ...
## $ ManufacturingProcess37: num 0.5 2 0.7 1.2 0.2 0.4 0.8 1 1.2 1.8 ...
## $ ManufacturingProcess38: num 3 2 2 2 2 2 2 2 3 3 ...
## $ ManufacturingProcess39: num 7.2 7.2 7.2 7.2 7.3 7.2 7.3 7.3 7.4 7.1 ...
## $ ManufacturingProcess40: num NA 0.1 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess41: num NA 0.15 0 0 0 0 0 0 0 0 ...
## $ ManufacturingProcess42: num 11.6 11.1 12 10.6 11 11.5 11.7 11.4 11.4 11.3 ...
## $ ManufacturingProcess43: num 3 0.9 1 1.1 1.1 2.2 0.7 0.8 0.9 0.8 ...
## $ ManufacturingProcess44: num 1.8 1.9 1.8 1.8 1.7 1.8 2 2 1.9 1.9 ...
## $ ManufacturingProcess45: num 2.4 2.2 2.3 2.1 2.1 2 2.2 2.2 2.1 2.4 ...
md.pattern(ChemicalManufacturingProcess)
## Yield BiologicalMaterial01 BiologicalMaterial02 BiologicalMaterial03
## 152 1 1 1 1
## 6 1 1 1 1
## 1 1 1 1 1
## 7 1 1 1 1
## 5 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 0 0 0 0
## BiologicalMaterial04 BiologicalMaterial05 BiologicalMaterial06
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## BiologicalMaterial07 BiologicalMaterial08 BiologicalMaterial09
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## BiologicalMaterial10 BiologicalMaterial11 BiologicalMaterial12
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess09 ManufacturingProcess13 ManufacturingProcess15
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess16 ManufacturingProcess17 ManufacturingProcess18
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess19 ManufacturingProcess20 ManufacturingProcess21
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess32 ManufacturingProcess37 ManufacturingProcess38
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess39 ManufacturingProcess42 ManufacturingProcess43
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 0 0 0
## ManufacturingProcess44 ManufacturingProcess45 ManufacturingProcess01
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 0
## 0 0 1
## ManufacturingProcess04 ManufacturingProcess05 ManufacturingProcess07
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 0 0
## 1 1 1
## ManufacturingProcess08 ManufacturingProcess12 ManufacturingProcess14
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 0
## 1 0 0 1
## 1 1 1
## ManufacturingProcess22 ManufacturingProcess23 ManufacturingProcess24
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 0 0
## 1 1 1
## ManufacturingProcess40 ManufacturingProcess41 ManufacturingProcess06
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 1 1
## 2 1 1 1
## 1 1 1 0
## 1 1 1 1
## 1 0 0 0
## 1 1 2
## ManufacturingProcess02 ManufacturingProcess25 ManufacturingProcess26
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 1 0 0
## 2 0 1 1
## 1 1 1 1
## 1 1 1 1
## 1 0 1 1
## 3 5 5
## ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess33
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess34 ManufacturingProcess35 ManufacturingProcess36
## 152 1 1 1
## 6 1 1 1
## 1 1 1 1
## 7 1 1 1
## 5 0 0 0
## 2 1 1 1
## 1 1 1 1
## 1 1 1 1
## 1 1 1 1
## 5 5 5
## ManufacturingProcess10 ManufacturingProcess11 ManufacturingProcess03
## 152 1 1 1 0
## 6 1 1 0 1
## 1 1 0 1 1
## 7 0 0 0 3
## 5 1 1 1 11
## 2 1 1 1 1
## 1 1 1 1 1
## 1 0 0 0 4
## 1 0 0 0 16
## 9 10 15 106
countNA<-colSums(is.na(ChemicalManufacturingProcess))
countNA
## Yield BiologicalMaterial01 BiologicalMaterial02
## 0 0 0
## BiologicalMaterial03 BiologicalMaterial04 BiologicalMaterial05
## 0 0 0
## BiologicalMaterial06 BiologicalMaterial07 BiologicalMaterial08
## 0 0 0
## BiologicalMaterial09 BiologicalMaterial10 BiologicalMaterial11
## 0 0 0
## BiologicalMaterial12 ManufacturingProcess01 ManufacturingProcess02
## 0 1 3
## ManufacturingProcess03 ManufacturingProcess04 ManufacturingProcess05
## 15 1 1
## ManufacturingProcess06 ManufacturingProcess07 ManufacturingProcess08
## 2 1 1
## ManufacturingProcess09 ManufacturingProcess10 ManufacturingProcess11
## 0 9 10
## ManufacturingProcess12 ManufacturingProcess13 ManufacturingProcess14
## 1 0 1
## ManufacturingProcess15 ManufacturingProcess16 ManufacturingProcess17
## 0 0 0
## ManufacturingProcess18 ManufacturingProcess19 ManufacturingProcess20
## 0 0 0
## ManufacturingProcess21 ManufacturingProcess22 ManufacturingProcess23
## 0 1 1
## ManufacturingProcess24 ManufacturingProcess25 ManufacturingProcess26
## 1 5 5
## ManufacturingProcess27 ManufacturingProcess28 ManufacturingProcess29
## 5 5 5
## ManufacturingProcess30 ManufacturingProcess31 ManufacturingProcess32
## 5 5 0
## ManufacturingProcess33 ManufacturingProcess34 ManufacturingProcess35
## 5 5 5
## ManufacturingProcess36 ManufacturingProcess37 ManufacturingProcess38
## 5 0 0
## ManufacturingProcess39 ManufacturingProcess40 ManufacturingProcess41
## 0 1 1
## ManufacturingProcess42 ManufacturingProcess43 ManufacturingProcess44
## 0 0 0
## ManufacturingProcess45
## 0
# Look at the predictor part of the data set (Yield is column 1)
pred <- ChemicalManufacturingProcess[,-c(1)]
# Impute missing values with k-nearest neighbors
Imp_pred <- preProcess(pred, method="knnImpute")
# Apply the imputation to the predictors
pred1 <- predict(Imp_pred, pred)
# Center and scale the imputed predictors
pred2 <- preProcess(pred1, method=c("center", "scale"))
pred3 <- predict(pred2, pred1)
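One optional refinement (a sketch, not applied to the results below): the repeated gbm warning later in this report indicates that BiologicalMaterial07 has essentially no variation in the training resamples; caret::nearZeroVar can flag such predictors so they can be dropped before modeling.
# Optional: identify near-zero-variance predictors (e.g. BiologicalMaterial07)
nzv <- nearZeroVar(pred)
colnames(pred)[nzv]
# pred <- pred[, -nzv]  # uncomment to remove them before imputation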
#split
set.seed(200)
trainingRows <- createDataPartition(ChemicalManufacturingProcess$Yield,
p=0.75, list=FALSE)
train_X2 <- pred3[trainingRows, ]
train_Y2 <- ChemicalManufacturingProcess$Yield[trainingRows]
test_X2 <- pred3[-trainingRows, ]
test_Y2 <- ChemicalManufacturingProcess$Yield[-trainingRows]
Which tree-based regression model gives the optimal resampling and test set performance?
#RANDOM FOREST
set.seed(200)
rf_Model <- train(x = train_X2, y = train_Y2, method = "rf", tuneLength = 10)
rf_Model
## Random Forest
##
## 132 samples
## 57 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.348893 0.5293886 1.0696605
## 8 1.274199 0.5575346 0.9932945
## 14 1.251322 0.5630105 0.9659533
## 20 1.240094 0.5666342 0.9528162
## 26 1.235294 0.5657072 0.9453581
## 32 1.230988 0.5652645 0.9410238
## 38 1.232093 0.5626957 0.9411426
## 44 1.229151 0.5613501 0.9394080
## 50 1.235517 0.5549264 0.9452981
## 57 1.235908 0.5540221 0.9456433
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 44.
rf_ModelImp<-varImp(rf_Model, scale=FALSE)
rf_ModelImp
## rf variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 160.094
## ManufacturingProcess17 30.294
## BiologicalMaterial12 22.691
## ManufacturingProcess13 14.908
## ManufacturingProcess31 14.468
## BiologicalMaterial03 13.588
## ManufacturingProcess06 13.507
## ManufacturingProcess09 12.020
## BiologicalMaterial11 11.958
## BiologicalMaterial06 9.814
## BiologicalMaterial02 6.027
## BiologicalMaterial04 5.919
## ManufacturingProcess15 5.850
## ManufacturingProcess11 5.820
## ManufacturingProcess21 5.464
## BiologicalMaterial05 5.275
## ManufacturingProcess28 5.010
## ManufacturingProcess36 4.577
## ManufacturingProcess24 4.487
## ManufacturingProcess39 4.272
rf_Pred <- predict(rf_Model, newdata = test_X2)
postResample(pred = rf_Pred, obs = test_Y2)
## RMSE Rsquared MAE
## 1.1304686 0.6491234 0.8625628
Random forest: optimal mtry = 44, resampled RMSE = 1.229151.
Test set: RMSE = 1.1304686, Rsquared = 0.6491234, MAE = 0.8625628.
set.seed(200)
gbmGrid <- expand.grid(interaction.depth=seq(1,7,by=2),
n.trees=seq(100,1000,by=50),
shrinkage=c(0.01,0.1),
n.minobsinnode=c(5,10))
gb_Model <- train(x = train_X2, y = train_Y2, method = "gbm", tuneGrid = gbmGrid, verbose=FALSE)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 7: BiologicalMaterial07 has no variation.
gb_Model
## Stochastic Gradient Boosting
##
## 132 samples
## 57 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.minobsinnode n.trees RMSE Rsquared
## 0.01 1 5 100 1.499632 0.4947512
## 0.01 1 5 150 1.417461 0.5105176
## 0.01 1 5 200 1.360321 0.5260203
## 0.01 1 5 250 1.323412 0.5322827
## 0.01 1 5 300 1.296580 0.5385795
## 0.01 1 5 350 1.279495 0.5414004
## 0.01 1 5 400 1.268340 0.5433324
## 0.01 1 5 450 1.262272 0.5431086
## 0.01 1 5 500 1.258412 0.5429886
## 0.01 1 5 550 1.254102 0.5439175
## 0.01 1 5 600 1.251736 0.5438272
## 0.01 1 5 650 1.250824 0.5436833
## 0.01 1 5 700 1.250649 0.5424084
## 0.01 1 5 750 1.250381 0.5417903
## 0.01 1 5 800 1.249261 0.5422278
## 0.01 1 5 850 1.249046 0.5421169
## 0.01 1 5 900 1.249040 0.5420316
## 0.01 1 5 950 1.247811 0.5427731
## 0.01 1 5 1000 1.247777 0.5429090
## 0.01 1 10 100 1.501809 0.4864556
## 0.01 1 10 150 1.416911 0.5043683
## 0.01 1 10 200 1.360048 0.5164907
## 0.01 1 10 250 1.324937 0.5227843
## 0.01 1 10 300 1.301207 0.5278352
## 0.01 1 10 350 1.286024 0.5301827
## 0.01 1 10 400 1.276005 0.5325708
## 0.01 1 10 450 1.268655 0.5344481
## 0.01 1 10 500 1.263677 0.5362141
## 0.01 1 10 550 1.260768 0.5365805
## 0.01 1 10 600 1.258678 0.5372867
## 0.01 1 10 650 1.256590 0.5382483
## 0.01 1 10 700 1.255561 0.5384441
## 0.01 1 10 750 1.254105 0.5387985
## 0.01 1 10 800 1.253554 0.5391463
## 0.01 1 10 850 1.252346 0.5398259
## 0.01 1 10 900 1.252651 0.5397289
## 0.01 1 10 950 1.252674 0.5398174
## 0.01 1 10 1000 1.253202 0.5395628
## 0.01 3 5 100 1.403157 0.5216697
## 0.01 3 5 150 1.327696 0.5316870
## 0.01 3 5 200 1.289239 0.5378374
## 0.01 3 5 250 1.265997 0.5433461
## 0.01 3 5 300 1.251552 0.5478952
## 0.01 3 5 350 1.243133 0.5506859
## 0.01 3 5 400 1.237429 0.5530759
## 0.01 3 5 450 1.234209 0.5544853
## 0.01 3 5 500 1.230168 0.5565592
## 0.01 3 5 550 1.227400 0.5583432
## 0.01 3 5 600 1.226493 0.5588772
## 0.01 3 5 650 1.225415 0.5595342
## 0.01 3 5 700 1.223757 0.5605969
## 0.01 3 5 750 1.222957 0.5611199
## 0.01 3 5 800 1.221843 0.5617946
## 0.01 3 5 850 1.220169 0.5630096
## 0.01 3 5 900 1.219520 0.5635094
## 0.01 3 5 950 1.218799 0.5639823
## 0.01 3 5 1000 1.218722 0.5640007
## 0.01 3 10 100 1.398422 0.5211494
## 0.01 3 10 150 1.326115 0.5291885
## 0.01 3 10 200 1.289345 0.5354014
## 0.01 3 10 250 1.268787 0.5387175
## 0.01 3 10 300 1.257548 0.5411297
## 0.01 3 10 350 1.251224 0.5427314
## 0.01 3 10 400 1.246711 0.5443658
## 0.01 3 10 450 1.244690 0.5453073
## 0.01 3 10 500 1.242739 0.5459452
## 0.01 3 10 550 1.241280 0.5470684
## 0.01 3 10 600 1.240714 0.5471145
## 0.01 3 10 650 1.240585 0.5470726
## 0.01 3 10 700 1.240211 0.5473304
## 0.01 3 10 750 1.239491 0.5477385
## 0.01 3 10 800 1.239723 0.5474466
## 0.01 3 10 850 1.240295 0.5471081
## 0.01 3 10 900 1.240570 0.5469857
## 0.01 3 10 950 1.240199 0.5471966
## 0.01 3 10 1000 1.239808 0.5476067
## 0.01 5 5 100 1.377279 0.5325597
## 0.01 5 5 150 1.303276 0.5439147
## 0.01 5 5 200 1.268279 0.5489251
## 0.01 5 5 250 1.249024 0.5527714
## 0.01 5 5 300 1.238296 0.5555155
## 0.01 5 5 350 1.230647 0.5582922
## 0.01 5 5 400 1.225794 0.5606792
## 0.01 5 5 450 1.221577 0.5630869
## 0.01 5 5 500 1.218498 0.5647443
## 0.01 5 5 550 1.216291 0.5660116
## 0.01 5 5 600 1.214989 0.5669343
## 0.01 5 5 650 1.212653 0.5683785
## 0.01 5 5 700 1.211540 0.5690693
## 0.01 5 5 750 1.210538 0.5697310
## 0.01 5 5 800 1.209784 0.5703990
## 0.01 5 5 850 1.208856 0.5710570
## 0.01 5 5 900 1.207994 0.5716023
## 0.01 5 5 950 1.207048 0.5721661
## 0.01 5 5 1000 1.206383 0.5726665
## 0.01 5 10 100 1.386371 0.5280223
## 0.01 5 10 150 1.317433 0.5341875
## 0.01 5 10 200 1.281341 0.5397960
## 0.01 5 10 250 1.261588 0.5438357
## 0.01 5 10 300 1.252411 0.5447899
## 0.01 5 10 350 1.247644 0.5455782
## 0.01 5 10 400 1.243160 0.5477063
## 0.01 5 10 450 1.240467 0.5486866
## 0.01 5 10 500 1.237982 0.5496623
## 0.01 5 10 550 1.236580 0.5502976
## 0.01 5 10 600 1.236836 0.5499420
## 0.01 5 10 650 1.236150 0.5502862
## 0.01 5 10 700 1.236359 0.5500084
## 0.01 5 10 750 1.235464 0.5503440
## 0.01 5 10 800 1.234867 0.5506802
## 0.01 5 10 850 1.234515 0.5510357
## 0.01 5 10 900 1.235091 0.5507661
## 0.01 5 10 950 1.234740 0.5510170
## 0.01 5 10 1000 1.234985 0.5509279
## 0.01 7 5 100 1.369084 0.5391824
## 0.01 7 5 150 1.298645 0.5463421
## 0.01 7 5 200 1.260908 0.5533550
## 0.01 7 5 250 1.242951 0.5564316
## 0.01 7 5 300 1.230060 0.5616252
## 0.01 7 5 350 1.222710 0.5644305
## 0.01 7 5 400 1.218812 0.5662773
## 0.01 7 5 450 1.214538 0.5684453
## 0.01 7 5 500 1.211571 0.5701823
## 0.01 7 5 550 1.209251 0.5715768
## 0.01 7 5 600 1.206425 0.5732941
## 0.01 7 5 650 1.204975 0.5740483
## 0.01 7 5 700 1.203340 0.5749015
## 0.01 7 5 750 1.201958 0.5758507
## 0.01 7 5 800 1.200951 0.5765829
## 0.01 7 5 850 1.200257 0.5770871
## 0.01 7 5 900 1.199154 0.5778294
## 0.01 7 5 950 1.198399 0.5783426
## 0.01 7 5 1000 1.197772 0.5786886
## 0.01 7 10 100 1.390716 0.5224340
## 0.01 7 10 150 1.319261 0.5322520
## 0.01 7 10 200 1.284965 0.5358154
## 0.01 7 10 250 1.265928 0.5389826
## 0.01 7 10 300 1.254788 0.5420886
## 0.01 7 10 350 1.248902 0.5435873
## 0.01 7 10 400 1.244693 0.5449893
## 0.01 7 10 450 1.244009 0.5442071
## 0.01 7 10 500 1.242475 0.5451513
## 0.01 7 10 550 1.242331 0.5449894
## 0.01 7 10 600 1.242656 0.5445368
## 0.01 7 10 650 1.240951 0.5456534
## 0.01 7 10 700 1.240243 0.5461927
## 0.01 7 10 750 1.240354 0.5459520
## 0.01 7 10 800 1.240710 0.5458000
## 0.01 7 10 850 1.240776 0.5457145
## 0.01 7 10 900 1.241014 0.5454642
## 0.01 7 10 950 1.241368 0.5452337
## 0.01 7 10 1000 1.240766 0.5456602
## 0.10 1 5 100 1.265785 0.5286735
## 0.10 1 5 150 1.264905 0.5333988
## 0.10 1 5 200 1.262661 0.5358153
## 0.10 1 5 250 1.264866 0.5353327
## 0.10 1 5 300 1.267426 0.5338333
## 0.10 1 5 350 1.272374 0.5308585
## 0.10 1 5 400 1.275443 0.5293441
## 0.10 1 5 450 1.279062 0.5273558
## 0.10 1 5 500 1.282426 0.5258003
## 0.10 1 5 550 1.283628 0.5257270
## 0.10 1 5 600 1.284446 0.5255097
## 0.10 1 5 650 1.285621 0.5251681
## 0.10 1 5 700 1.286501 0.5246601
## 0.10 1 5 750 1.289708 0.5226108
## 0.10 1 5 800 1.290668 0.5223274
## 0.10 1 5 850 1.292281 0.5213446
## 0.10 1 5 900 1.294383 0.5201654
## 0.10 1 5 950 1.295763 0.5194885
## 0.10 1 5 1000 1.295946 0.5193919
## 0.10 1 10 100 1.272727 0.5244975
## 0.10 1 10 150 1.273977 0.5254529
## 0.10 1 10 200 1.279504 0.5255056
## 0.10 1 10 250 1.288944 0.5210394
## 0.10 1 10 300 1.296661 0.5185810
## 0.10 1 10 350 1.298479 0.5177224
## 0.10 1 10 400 1.305641 0.5149803
## 0.10 1 10 450 1.310729 0.5129133
## 0.10 1 10 500 1.314172 0.5118746
## 0.10 1 10 550 1.318305 0.5101500
## 0.10 1 10 600 1.320167 0.5102354
## 0.10 1 10 650 1.322979 0.5091117
## 0.10 1 10 700 1.325478 0.5083940
## 0.10 1 10 750 1.327746 0.5073821
## 0.10 1 10 800 1.329752 0.5067096
## 0.10 1 10 850 1.330285 0.5067085
## 0.10 1 10 900 1.331095 0.5064901
## 0.10 1 10 950 1.332304 0.5060501
## 0.10 1 10 1000 1.333372 0.5060864
## 0.10 3 5 100 1.272759 0.5265740
## 0.10 3 5 150 1.271500 0.5289225
## 0.10 3 5 200 1.269408 0.5310128
## 0.10 3 5 250 1.268993 0.5313417
## 0.10 3 5 300 1.269396 0.5314449
## 0.10 3 5 350 1.268542 0.5320807
## 0.10 3 5 400 1.267986 0.5325204
## 0.10 3 5 450 1.267900 0.5325379
## 0.10 3 5 500 1.267758 0.5326887
## 0.10 3 5 550 1.267755 0.5326764
## 0.10 3 5 600 1.267682 0.5327358
## 0.10 3 5 650 1.267603 0.5327943
## 0.10 3 5 700 1.267561 0.5328208
## 0.10 3 5 750 1.267524 0.5328497
## 0.10 3 5 800 1.267525 0.5328499
## 0.10 3 5 850 1.267522 0.5328521
## 0.10 3 5 900 1.267520 0.5328534
## 0.10 3 5 950 1.267517 0.5328564
## 0.10 3 5 1000 1.267518 0.5328562
## 0.10 3 10 100 1.266026 0.5352758
## 0.10 3 10 150 1.271463 0.5328325
## 0.10 3 10 200 1.275464 0.5315194
## 0.10 3 10 250 1.278644 0.5298383
## 0.10 3 10 300 1.280758 0.5291379
## 0.10 3 10 350 1.281332 0.5291723
## 0.10 3 10 400 1.282097 0.5290674
## 0.10 3 10 450 1.282747 0.5288042
## 0.10 3 10 500 1.283388 0.5286558
## 0.10 3 10 550 1.283546 0.5286606
## 0.10 3 10 600 1.283919 0.5285656
## 0.10 3 10 650 1.284115 0.5286389
## 0.10 3 10 700 1.284562 0.5284787
## 0.10 3 10 750 1.284766 0.5284683
## 0.10 3 10 800 1.285252 0.5282684
## 0.10 3 10 850 1.285444 0.5282580
## 0.10 3 10 900 1.285549 0.5282573
## 0.10 3 10 950 1.285646 0.5282253
## 0.10 3 10 1000 1.285779 0.5281892
## 0.10 5 5 100 1.256488 0.5367741
## 0.10 5 5 150 1.253949 0.5393069
## 0.10 5 5 200 1.252471 0.5408591
## 0.10 5 5 250 1.250927 0.5423133
## 0.10 5 5 300 1.250158 0.5430085
## 0.10 5 5 350 1.249968 0.5433103
## 0.10 5 5 400 1.249638 0.5435811
## 0.10 5 5 450 1.249477 0.5437474
## 0.10 5 5 500 1.249388 0.5438610
## 0.10 5 5 550 1.249371 0.5439014
## 0.10 5 5 600 1.249323 0.5439347
## 0.10 5 5 650 1.249308 0.5439522
## 0.10 5 5 700 1.249318 0.5439548
## 0.10 5 5 750 1.249310 0.5439692
## 0.10 5 5 800 1.249301 0.5439790
## 0.10 5 5 850 1.249303 0.5439792
## 0.10 5 5 900 1.249295 0.5439861
## 0.10 5 5 950 1.249293 0.5439885
## 0.10 5 5 1000 1.249296 0.5439865
## 0.10 5 10 100 1.262315 0.5266417
## 0.10 5 10 150 1.264619 0.5272806
## 0.10 5 10 200 1.264685 0.5289003
## 0.10 5 10 250 1.267190 0.5278543
## 0.10 5 10 300 1.267591 0.5279323
## 0.10 5 10 350 1.268888 0.5272508
## 0.10 5 10 400 1.269767 0.5269480
## 0.10 5 10 450 1.270295 0.5267430
## 0.10 5 10 500 1.271157 0.5265118
## 0.10 5 10 550 1.271586 0.5263102
## 0.10 5 10 600 1.272347 0.5259151
## 0.10 5 10 650 1.273000 0.5256591
## 0.10 5 10 700 1.273784 0.5252669
## 0.10 5 10 750 1.274088 0.5251450
## 0.10 5 10 800 1.274596 0.5249240
## 0.10 5 10 850 1.274818 0.5248358
## 0.10 5 10 900 1.275099 0.5247348
## 0.10 5 10 950 1.275380 0.5246277
## 0.10 5 10 1000 1.275639 0.5245199
## 0.10 7 5 100 1.231998 0.5493671
## 0.10 7 5 150 1.228463 0.5522580
## 0.10 7 5 200 1.227557 0.5533134
## 0.10 7 5 250 1.227144 0.5537407
## 0.10 7 5 300 1.226696 0.5541309
## 0.10 7 5 350 1.226404 0.5545032
## 0.10 7 5 400 1.226553 0.5544547
## 0.10 7 5 450 1.226693 0.5544716
## 0.10 7 5 500 1.226693 0.5545135
## 0.10 7 5 550 1.226736 0.5545266
## 0.10 7 5 600 1.226767 0.5545267
## 0.10 7 5 650 1.226792 0.5545323
## 0.10 7 5 700 1.226799 0.5545285
## 0.10 7 5 750 1.226804 0.5545257
## 0.10 7 5 800 1.226804 0.5545322
## 0.10 7 5 850 1.226811 0.5545272
## 0.10 7 5 900 1.226819 0.5545221
## 0.10 7 5 950 1.226818 0.5545242
## 0.10 7 5 1000 1.226821 0.5545232
## 0.10 7 10 100 1.272979 0.5252150
## 0.10 7 10 150 1.275732 0.5232129
## 0.10 7 10 200 1.275471 0.5243331
## 0.10 7 10 250 1.275679 0.5248403
## 0.10 7 10 300 1.275894 0.5249240
## 0.10 7 10 350 1.276357 0.5248143
## 0.10 7 10 400 1.276732 0.5250323
## 0.10 7 10 450 1.276978 0.5251458
## 0.10 7 10 500 1.277021 0.5251952
## 0.10 7 10 550 1.277488 0.5250215
## 0.10 7 10 600 1.278145 0.5246933
## 0.10 7 10 650 1.278404 0.5245881
## 0.10 7 10 700 1.278435 0.5246454
## 0.10 7 10 750 1.278813 0.5245265
## 0.10 7 10 800 1.279001 0.5244094
## 0.10 7 10 850 1.279166 0.5243588
## 0.10 7 10 900 1.279443 0.5242565
## 0.10 7 10 950 1.279416 0.5242819
## 0.10 7 10 1000 1.279562 0.5242121
## MAE
## 1.2051136
## 1.1318774
## 1.0785588
## 1.0409058
## 1.0124928
## 0.9934169
## 0.9823291
## 0.9754767
## 0.9706661
## 0.9655137
## 0.9617848
## 0.9590147
## 0.9577952
## 0.9569008
## 0.9547784
## 0.9536253
## 0.9523793
## 0.9507849
## 0.9498266
## 1.2075958
## 1.1289186
## 1.0729049
## 1.0361063
## 1.0096329
## 0.9928228
## 0.9810674
## 0.9729514
## 0.9668001
## 0.9636250
## 0.9609879
## 0.9578927
## 0.9564796
## 0.9546976
## 0.9537115
## 0.9519070
## 0.9517902
## 0.9518266
## 0.9512637
## 1.1186603
## 1.0432764
## 1.0039773
## 0.9801778
## 0.9649798
## 0.9550787
## 0.9483765
## 0.9432284
## 0.9382808
## 0.9335654
## 0.9321010
## 0.9302355
## 0.9275181
## 0.9263163
## 0.9242567
## 0.9231492
## 0.9223860
## 0.9218004
## 0.9216254
## 1.1139905
## 1.0398546
## 0.9987168
## 0.9765132
## 0.9643419
## 0.9564416
## 0.9519411
## 0.9491393
## 0.9463533
## 0.9448007
## 0.9433792
## 0.9424675
## 0.9419181
## 0.9409738
## 0.9412925
## 0.9417486
## 0.9417425
## 0.9417804
## 0.9417040
## 1.0932106
## 1.0202840
## 0.9823847
## 0.9607460
## 0.9482726
## 0.9377023
## 0.9304586
## 0.9250899
## 0.9209949
## 0.9178884
## 0.9161386
## 0.9139342
## 0.9124960
## 0.9113960
## 0.9105176
## 0.9096916
## 0.9087797
## 0.9080170
## 0.9074920
## 1.0993289
## 1.0303457
## 0.9913724
## 0.9695580
## 0.9578954
## 0.9513281
## 0.9465517
## 0.9436210
## 0.9406683
## 0.9390522
## 0.9388004
## 0.9376664
## 0.9372237
## 0.9367517
## 0.9363602
## 0.9359764
## 0.9359021
## 0.9354277
## 0.9355993
## 1.0882867
## 1.0169080
## 0.9766609
## 0.9549155
## 0.9390700
## 0.9293287
## 0.9237789
## 0.9191153
## 0.9158142
## 0.9133948
## 0.9106731
## 0.9090327
## 0.9074102
## 0.9063222
## 0.9053007
## 0.9046468
## 0.9036279
## 0.9030691
## 0.9026707
## 1.1052381
## 1.0312196
## 0.9925489
## 0.9710257
## 0.9583923
## 0.9511555
## 0.9460285
## 0.9452072
## 0.9430458
## 0.9426622
## 0.9426144
## 0.9416499
## 0.9408055
## 0.9412515
## 0.9416895
## 0.9417553
## 0.9420172
## 0.9419374
## 0.9416010
## 0.9650825
## 0.9605987
## 0.9561264
## 0.9572718
## 0.9579809
## 0.9599097
## 0.9621161
## 0.9652683
## 0.9671470
## 0.9684269
## 0.9706056
## 0.9715003
## 0.9721887
## 0.9743213
## 0.9755779
## 0.9763642
## 0.9778163
## 0.9785409
## 0.9785893
## 0.9637605
## 0.9661677
## 0.9684562
## 0.9788428
## 0.9860489
## 0.9868310
## 0.9922677
## 0.9958767
## 0.9985568
## 1.0030230
## 1.0048913
## 1.0072345
## 1.0089213
## 1.0107214
## 1.0131644
## 1.0140346
## 1.0147737
## 1.0158271
## 1.0173670
## 0.9626265
## 0.9624324
## 0.9602410
## 0.9592744
## 0.9594281
## 0.9589471
## 0.9584877
## 0.9584212
## 0.9582477
## 0.9582920
## 0.9582773
## 0.9582411
## 0.9582107
## 0.9581893
## 0.9581956
## 0.9581994
## 0.9581984
## 0.9581957
## 0.9581997
## 0.9672163
## 0.9748987
## 0.9793514
## 0.9824668
## 0.9843933
## 0.9852521
## 0.9854203
## 0.9860380
## 0.9865861
## 0.9868320
## 0.9872792
## 0.9875621
## 0.9878971
## 0.9880899
## 0.9885050
## 0.9888291
## 0.9890563
## 0.9892266
## 0.9894029
## 0.9566232
## 0.9542732
## 0.9525324
## 0.9512686
## 0.9507258
## 0.9504082
## 0.9501267
## 0.9499844
## 0.9499349
## 0.9499153
## 0.9498459
## 0.9498417
## 0.9498493
## 0.9498470
## 0.9498320
## 0.9498307
## 0.9498257
## 0.9498195
## 0.9498215
## 0.9625243
## 0.9643228
## 0.9650804
## 0.9666755
## 0.9673584
## 0.9682105
## 0.9689919
## 0.9694512
## 0.9699356
## 0.9703704
## 0.9710086
## 0.9714197
## 0.9720841
## 0.9723179
## 0.9727676
## 0.9729902
## 0.9731977
## 0.9734864
## 0.9737665
## 0.9504910
## 0.9440653
## 0.9428057
## 0.9421130
## 0.9417730
## 0.9414214
## 0.9413808
## 0.9414553
## 0.9415004
## 0.9414834
## 0.9414622
## 0.9414868
## 0.9414980
## 0.9414919
## 0.9414894
## 0.9414935
## 0.9415040
## 0.9415050
## 0.9415082
## 0.9705664
## 0.9730958
## 0.9731751
## 0.9722419
## 0.9726411
## 0.9735421
## 0.9740486
## 0.9744827
## 0.9746127
## 0.9751904
## 0.9756646
## 0.9760272
## 0.9761953
## 0.9764737
## 0.9768073
## 0.9769678
## 0.9772846
## 0.9773485
## 0.9775540
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 1000, interaction.depth =
## 7, shrinkage = 0.01 and n.minobsinnode = 5.
varImp(gb_Model)
## gbm variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess17 27.047
## ManufacturingProcess13 16.090
## BiologicalMaterial12 14.913
## ManufacturingProcess06 14.888
## ManufacturingProcess09 13.583
## BiologicalMaterial03 12.066
## ManufacturingProcess31 11.651
## BiologicalMaterial09 8.734
## BiologicalMaterial11 8.325
## ManufacturingProcess24 6.523
## ManufacturingProcess01 6.310
## BiologicalMaterial05 6.292
## ManufacturingProcess04 5.900
## ManufacturingProcess14 5.586
## BiologicalMaterial08 5.571
## BiologicalMaterial04 5.468
## BiologicalMaterial06 5.467
## ManufacturingProcess05 5.274
## ManufacturingProcess11 4.948
gb_Pred <- predict(gb_Model, newdata = test_X2)
postResample(pred = gb_Pred, obs = test_Y2)
## RMSE Rsquared MAE
## 1.1202764 0.6565479 0.8919856
BOOST (gradient boosting): RMSE was used to select the optimal model using the smallest value. The final tuning values were n.trees = 1000, interaction.depth = 7, shrinkage = 0.01 and n.minobsinnode = 5, giving a resampling RMSE of 1.197772. Test set: RMSE = 1.1202764, Rsquared = 0.6565479, MAE = 0.8919856.
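As a cross-check, the selected hyperparameters could be pinned in an explicit grid and refit; a minimal sketch, reusing the train_X2 / train_Y2 objects from above (the grid values are the ones caret reported; the object name gb_refit is made up here):
# sketch: refit gbm at the hyperparameters selected above
gbmGrid <- expand.grid(n.trees = 1000, interaction.depth = 7,
                       shrinkage = 0.01, n.minobsinnode = 5)
set.seed(200)
gb_refit <- train(x = train_X2, y = train_Y2, method = "gbm",
                  tuneGrid = gbmGrid, verbose = FALSE)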
#CUBIST
set.seed(200)
cubist_Model <- train(x = train_X2, y = train_Y2, method = "cubist")
## Warning in cubist.default(x, y, committees = param$committees, ...): NAs
## introduced by coercion
## (this warning was repeated for each of the resampled Cubist fits)
cubist_Model
## Cubist
##
## 132 samples
## 57 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## committees neighbors RMSE Rsquared MAE
## 1 0 1.914375 0.3209284 1.3469601
## 1 5 1.878481 0.3374995 1.3129553
## 1 9 1.885257 0.3330481 1.3186288
## 10 0 1.294170 0.5354825 0.9938812
## 10 5 1.266547 0.5538839 0.9683847
## 10 9 1.269738 0.5507295 0.9721896
## 20 0 1.232534 0.5685524 0.9510921
## 20 5 1.206580 0.5842537 0.9265917
## 20 9 1.210972 0.5808630 0.9305998
##
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 20 and neighbors = 5.
varImp(cubist_Model)
## cubist variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess17 64.84
## ManufacturingProcess28 56.04
## ManufacturingProcess39 40.66
## ManufacturingProcess33 37.36
## BiologicalMaterial06 28.57
## ManufacturingProcess09 26.37
## ManufacturingProcess04 23.08
## BiologicalMaterial02 19.78
## ManufacturingProcess01 15.38
## BiologicalMaterial03 15.38
## ManufacturingProcess24 15.38
## BiologicalMaterial11 14.29
## ManufacturingProcess27 13.19
## BiologicalMaterial12 13.19
## ManufacturingProcess06 12.09
## ManufacturingProcess13 10.99
## ManufacturingProcess29 10.99
## ManufacturingProcess11 10.99
## ManufacturingProcess31 10.99
cubist_Pred <- predict(cubist_Model, newdata = test_X2)
postResample(pred = cubist_Pred, obs = test_Y2)
## RMSE Rsquared MAE
## 0.8929243 0.7793957 0.6829956
CUBIST: resampling RMSE = 1.206580 (committees = 20, neighbors = 5); test: RMSE = 0.8929243, Rsquared = 0.7793957, MAE = 0.6829956.
##DISCUSSION:
Of the three tree-based models (random forest, gradient boosting, Cubist), Cubist gives the best resampling and test-set performance: resampling RMSE = 1.206, test RMSE = 0.893.
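To make that comparison explicit, the test-set metrics could be stacked side by side; a minimal sketch, where rf_Pred is a hypothetical name for the earlier random forest test predictions and gb_Pred / cubist_Pred are the objects created above:
# sketch: test-set performance of the three tree-based models in one table
rbind(RandomForest = postResample(pred = rf_Pred, obs = test_Y2),
      GBM          = postResample(pred = gb_Pred, obs = test_Y2),
      Cubist       = postResample(pred = cubist_Pred, obs = test_Y2))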
Which predictors are most important in the optimal tree-based regression model? Do either the biological or process variables dominate the list? How do the top 10 important predictors compare to the top 10 predictors from the optimal linear and nonlinear models?
#CUBIST VARIABLE IMPORTANCE
varImp(cubist_Model)
## cubist variable importance
##
## only 20 most important variables shown (out of 57)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess17 64.84
## ManufacturingProcess28 56.04
## ManufacturingProcess39 40.66
## ManufacturingProcess33 37.36
## BiologicalMaterial06 28.57
## ManufacturingProcess09 26.37
## ManufacturingProcess04 23.08
## BiologicalMaterial02 19.78
## ManufacturingProcess24 15.38
## BiologicalMaterial03 15.38
## ManufacturingProcess01 15.38
## BiologicalMaterial11 14.29
## ManufacturingProcess27 13.19
## BiologicalMaterial12 13.19
## ManufacturingProcess06 12.09
## ManufacturingProcess29 10.99
## ManufacturingProcess11 10.99
## ManufacturingProcess13 10.99
## ManufacturingProcess31 10.99
Manufacturing process predictors dominate the list, taking 8 of the top 10 spots.
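That split can be checked programmatically from the importance scores returned above; a short sketch:
# sketch: count process vs. biological predictors in the Cubist top 10
cub_imp <- varImp(cubist_Model)$importance
top10   <- rownames(cub_imp)[order(cub_imp$Overall, decreasing = TRUE)][1:10]
table(ifelse(grepl("^Manufacturing", top10), "Process", "Biological"))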
Recall the nonlinear SVM variable importance (top 20, Overall score):
ManufacturingProcess32 100.00
ManufacturingProcess17 80.31
BiologicalMaterial06 75.09
ManufacturingProcess13 74.47
BiologicalMaterial03 67.11
ManufacturingProcess06 66.44
ManufacturingProcess36 66.30
BiologicalMaterial12 65.94
ManufacturingProcess09 65.29
BiologicalMaterial02 55.94
ManufacturingProcess31 53.57
ManufacturingProcess29 47.17
ManufacturingProcess33 45.79
BiologicalMaterial08 44.98
ManufacturingProcess11 41.72
ManufacturingProcess02 41.58
BiologicalMaterial11 41.18
BiologicalMaterial04 40.31
BiologicalMaterial09 35.12
BiologicalMaterial01 34.15
For the nonlinear SVM, 6 of the top 10 predictors are manufacturing process variables, and the linear elastic net shows the same split (6 of 10). The top predictors from the elastic net (Overall score) were:
ManufacturingProcess13 100.00
ManufacturingProcess32 93.67
BiologicalMaterial03 92.86
BiologicalMaterial06 86.68
ManufacturingProcess17 80.34
BiologicalMaterial12 76.76
ManufacturingProcess09 76.15
ManufacturingProcess36 75.95
ManufacturingProcess06 63.29
BiologicalMaterial02 59.13
ManufacturingProcess11 54.38
BiologicalMaterial11 53.65
ManufacturingProcess31 51.81
BiologicalMaterial04 48.83
BiologicalMaterial09 47.43
ManufacturingProcess18 43.88
ManufacturingProcess30 42.31
BiologicalMaterial0
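The overlap among the three top-10 lists could also be pulled straight from the fitted objects; a hedged sketch, assuming the earlier SVM and elastic net fits were saved as svm_Model and enet_Model (hypothetical names):
# sketch: predictors common to the top 10 of the Cubist, SVM and elastic net fits
topTen <- function(fit) {
  imp <- varImp(fit)$importance
  rownames(imp)[order(imp$Overall, decreasing = TRUE)][1:10]
}
Reduce(intersect, list(topTen(cubist_Model), topTen(svm_Model), topTen(enet_Model)))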
Plot the optimal single tree with the distribution of yield in the terminal nodes. Does this view of the data provide additional knowledge about the biological or process predictors and their relationship with yield?
set.seed(200)
st_Model <- train(x = train_X2, y = train_Y2, method = "rpart",
                  tuneLength = 10, control = rpart.control(maxdepth = 2))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
st_Model
## CART
##
## 132 samples
## 57 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 132, 132, 132, 132, 132, 132, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.01329166 1.524074 0.3550603 1.213918
## 0.01915705 1.524074 0.3550603 1.213918
## 0.02096592 1.524074 0.3550603 1.213918
## 0.02929826 1.524074 0.3550603 1.213918
## 0.03394084 1.524074 0.3550603 1.213918
## 0.04299920 1.522016 0.3566972 1.213177
## 0.04755934 1.522458 0.3563845 1.211768
## 0.06066735 1.529012 0.3534197 1.215071
## 0.09672549 1.541091 0.3375758 1.221246
## 0.39433963 1.698655 0.2806679 1.366262
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.0429992.
rpart.plot::rpart.plot(st_Model$finalModel)
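The question also asks about the distribution of yield in the terminal nodes; a minimal sketch, assuming rpart kept every training row so the node assignments line up with train_Y2:
# sketch: yield distribution by terminal node of the final single tree
# (rpart stores each observation's terminal node in finalModel$where)
node <- factor(st_Model$finalModel$where)
boxplot(train_Y2 ~ node, xlab = "terminal node", ylab = "yield")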
##DISCUSSION:
ManufacturingProcess32 sits at the top of every model's importance ranking, and ManufacturingProcess17 shows up at number 2 or 3 in the other models. Overall, the analysis suggests that a handful of manufacturing process variables, rather than the biological measurements, are the strongest predictors of yield.
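One quick way to back that impression up is to look at the marginal correlation of the leading process predictors with yield; a hedged sketch, assuming those columns are numeric in train_X2:
# sketch: marginal correlation of the top process predictors with yield
top_mp <- c("ManufacturingProcess32", "ManufacturingProcess17", "ManufacturingProcess13")
cor(train_X2[, top_mp], train_Y2, use = "pairwise.complete.obs")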