library(mlbench)
library(AppliedPredictiveModeling)
library(mice)
library(glmnet)
library(rpart.plot)
library(rpart)
library(randomForest)
library(caret)
library(gbm)
library(party)
set.seed(200)
simulated <- mlbench.friedman1(200,sd=1)
simulated <- cbind(simulated$x,simulated$y)
simulated <- as.data.frame(simulated)
colnames(simulated)[ncol(simulated)] <- "y"
model1 <- randomForest(y ~ ., data = simulated,
importance = T,
ntree = 1000)
rfImp1 <- varImp(model1, scale = F)
rfImp1
## Overall
## V1 8.83890885
## V2 6.49023056
## V3 0.67583163
## V4 7.58822553
## V5 2.27426009
## V6 0.17436781
## V7 0.15136583
## V8 -0.03078937
## V9 -0.02989832
## V10 -0.08529218
Answer: No. The random forest did not make meaningful use of the uninformative predictors V6-V10; their importance scores are all close to zero (and some are slightly negative).
simulated$duplicate1 <- simulated$V1 + rnorm(200) * .1
cor(simulated$duplicate1, simulated$V1)
## [1] 0.9396216
set.seed(200)
model2 <- randomForest(y ~ .,data = simulated,
importance = T,
ntree=1000)
rfImp2 <- varImp(model2,scale = F)
rfImp2
## Overall
## V1 5.98437487
## V2 6.25489202
## V3 0.43837543
## V4 7.15358387
## V5 2.48949664
## V6 0.11440487
## V7 -0.02794148
## V8 -0.06304965
## V9 0.01960016
## V10 0.04054481
## duplicate1 3.73591550
The importance of V1 dropped (from about 8.84 to 5.98). When a highly correlated predictor is added, the random forest splits on either copy more or less interchangeably, so the importance is shared between them and the original variable's score is reduced.
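To see the effect compound, here is a minimal sketch (not run above; duplicate2 and model2b are hypothetical names) that adds a second correlated copy of V1 and refits. V1's importance should drop further as the credit is shared three ways.
# Hypothetical follow-up: a second highly correlated copy of V1
simulated$duplicate2 <- simulated$V1 + rnorm(200) * .1
set.seed(200)
model2b <- randomForest(y ~ ., data = simulated, importance = T, ntree = 1000)
varImp(model2b, scale = F)    # expect V1's score to drop again
simulated$duplicate2 <- NULL  # remove it so the later models are unaffected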
## Conditional inference trees (cforest)
model3 <- cforest(y~.,data = simulated)
rfImp3 <- varImp(model3,conditional = T)
rfImp3
## Overall
## V1 3.1693280188
## V2 4.8840435653
## V3 0.0098451007
## V4 6.2260531322
## V5 1.1189835747
## V6 -0.0018071530
## V7 0.0281145471
## V8 -0.0028838278
## V9 -0.0004290913
## V10 -0.0049792526
## duplicate1 0.8346559974
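Note how the conditional importance sharply shrinks duplicate1 (0.83 here versus 3.74 in the traditional random forest), because it accounts for the correlation with V1. For contrast, a minimal sketch computing the unconditional importance on the same cforest fit, where duplicate1 should score much higher:
# Unconditional (traditional) importance on the same fit, for comparison
varImp(model3, conditional = F)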
## Boosted Trees
gbmG <- expand.grid(.interaction.depth = seq(1, 7, by = 2),
.n.trees = seq(100, 1000, by = 100),
.shrinkage = c(0.01, 0.1),
.n.minobsinnode = 8)
set.seed(100)
gbmModel <- train(y ~ ., data = simulated,
method = "gbm",
tuneGrid = gbmG,
verbose = F
)
gbmModel
## Stochastic Gradient Boosting
##
## 200 samples
## 11 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 200, 200, 200, 200, 200, 200, ...
## Resampling results across tuning parameters:
##
## shrinkage interaction.depth n.trees RMSE Rsquared MAE
## 0.01 1 100 4.178012 0.5995731 3.440232
## 0.01 1 200 3.622404 0.6786334 2.988966
## 0.01 1 300 3.244692 0.7172428 2.682294
## 0.01 1 400 2.973813 0.7460880 2.453410
## 0.01 1 500 2.762667 0.7702920 2.273950
## 0.01 1 600 2.604577 0.7883159 2.136035
## 0.01 1 700 2.469156 0.8038703 2.015820
## 0.01 1 800 2.362118 0.8156714 1.922345
## 0.01 1 900 2.273324 0.8248678 1.845458
## 0.01 1 1000 2.199770 0.8322394 1.782315
## 0.01 3 100 3.634945 0.7002001 2.986401
## 0.01 3 200 2.954822 0.7600813 2.421691
## 0.01 3 300 2.568118 0.7990696 2.095650
## 0.01 3 400 2.328693 0.8235169 1.888299
## 0.01 3 500 2.180180 0.8376818 1.759354
## 0.01 3 600 2.085584 0.8464500 1.676836
## 0.01 3 700 2.026024 0.8516805 1.624534
## 0.01 3 800 1.989089 0.8548408 1.591619
## 0.01 3 900 1.962865 0.8571933 1.568312
## 0.01 3 1000 1.945740 0.8586485 1.552526
## 0.01 5 100 3.446506 0.7285909 2.831779
## 0.01 5 200 2.767486 0.7803475 2.267982
## 0.01 5 300 2.414322 0.8137915 1.963803
## 0.01 5 400 2.217119 0.8329981 1.792983
## 0.01 5 500 2.106997 0.8431495 1.696944
## 0.01 5 600 2.043564 0.8489449 1.641401
## 0.01 5 700 2.004394 0.8526798 1.607344
## 0.01 5 800 1.981411 0.8547352 1.587122
## 0.01 5 900 1.966312 0.8560527 1.574756
## 0.01 5 1000 1.954311 0.8572874 1.564144
## 0.01 7 100 3.372906 0.7370016 2.763059
## 0.01 7 200 2.705003 0.7851620 2.210439
## 0.01 7 300 2.378531 0.8150526 1.928110
## 0.01 7 400 2.205493 0.8317320 1.777105
## 0.01 7 500 2.113172 0.8404191 1.695506
## 0.01 7 600 2.060774 0.8455228 1.649824
## 0.01 7 700 2.030694 0.8484757 1.623990
## 0.01 7 800 2.013265 0.8500857 1.609038
## 0.01 7 900 1.999323 0.8515334 1.597335
## 0.01 7 1000 1.989479 0.8525312 1.589274
## 0.10 1 100 2.195764 0.8285276 1.776892
## 0.10 1 200 1.959457 0.8503871 1.582230
## 0.10 1 300 1.932871 0.8526404 1.558672
## 0.10 1 400 1.926477 0.8526955 1.551107
## 0.10 1 500 1.932369 0.8517473 1.552430
## 0.10 1 600 1.934477 0.8511304 1.554695
## 0.10 1 700 1.946886 0.8490138 1.563973
## 0.10 1 800 1.948370 0.8486842 1.560990
## 0.10 1 900 1.953661 0.8479323 1.563938
## 0.10 1 1000 1.960700 0.8468495 1.569715
## 0.10 3 100 1.988825 0.8500829 1.585613
## 0.10 3 200 1.938198 0.8555845 1.542416
## 0.10 3 300 1.933977 0.8556436 1.539291
## 0.10 3 400 1.929703 0.8558850 1.534560
## 0.10 3 500 1.929368 0.8558105 1.533742
## 0.10 3 600 1.929929 0.8556187 1.535045
## 0.10 3 700 1.931114 0.8554105 1.536152
## 0.10 3 800 1.931348 0.8553588 1.536404
## 0.10 3 900 1.931773 0.8552943 1.536791
## 0.10 3 1000 1.931845 0.8552686 1.536923
## 0.10 5 100 2.049572 0.8406216 1.652940
## 0.10 5 200 2.022033 0.8436971 1.628273
## 0.10 5 300 2.018530 0.8438815 1.625497
## 0.10 5 400 2.017191 0.8439611 1.624523
## 0.10 5 500 2.016495 0.8439874 1.623664
## 0.10 5 600 2.016455 0.8439491 1.623628
## 0.10 5 700 2.016606 0.8439161 1.623759
## 0.10 5 800 2.016657 0.8438950 1.623783
## 0.10 5 900 2.016749 0.8438773 1.623799
## 0.10 5 1000 2.016787 0.8438720 1.623811
## 0.10 7 100 2.072232 0.8366470 1.664053
## 0.10 7 200 2.048919 0.8391755 1.642860
## 0.10 7 300 2.044841 0.8397148 1.639547
## 0.10 7 400 2.044210 0.8397409 1.638574
## 0.10 7 500 2.043446 0.8398404 1.637736
## 0.10 7 600 2.043324 0.8398537 1.637507
## 0.10 7 700 2.043368 0.8398486 1.637590
## 0.10 7 800 2.043500 0.8398325 1.637673
## 0.10 7 900 2.043454 0.8398480 1.637652
## 0.10 7 1000 2.043402 0.8398584 1.637585
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 8
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 400, interaction.depth =
## 1, shrinkage = 0.1 and n.minobsinnode = 8.
varImp(gbmModel)
## gbm variable importance
##
## Overall
## V4 100.0000
## V2 84.7556
## V1 67.9084
## V5 43.4834
## V3 32.3890
## duplicate1 28.4273
## V7 3.0259
## V6 3.0160
## V8 1.4249
## V10 0.7694
## V9 0.0000
## Cubist
library(Cubist)
cubistmodel <- train(y ~ .,data = simulated,method ="cubist")
varImp(cubistmodel)## cubist variable importance
##
## Overall
## V1 100.000
## V2 75.694
## V4 68.750
## V3 58.333
## V5 56.944
## V6 13.889
## duplicate1 2.083
## V7 0.000
## V8 0.000
## V9 0.000
## V10 0.000
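As a side note, a quick sketch to inspect which tuning values caret's resampling selected for the Cubist model:
cubistmodel$bestTune  # committees and neighbors chosen by resampling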
## Cubist, removing the duplicate variable
cubistModel2 <- train(y ~ ., data = subset(simulated, select = -duplicate1), method = "cubist")
varImp(cubistModel2)
## cubist variable importance
##
## Overall
## V1 100.00
## V2 75.69
## V4 68.06
## V3 58.33
## V5 55.56
## V6 15.28
## V9 0.00
## V7 0.00
## V8 0.00
## V10 0.00
## Boosted Trees, removing the duplicate variable
gbmModel <- train(y ~ ., data = subset(simulated, select = -duplicate1),
method = "gbm",
tuneGrid = gbmG,
verbose = F
)
varImp(gbmModel)
## gbm variable importance
##
## Overall
## V4 100.0000
## V1 92.3936
## V2 78.5856
## V5 40.7876
## V3 25.7773
## V7 3.2176
## V6 2.0752
## V8 0.4303
## V9 0.4180
## V10 0.0000
x1 <- rep(1, times = 200)              # 1 distinct value (constant)
x2 <- rep(1:10, times = 20)            # 10 distinct values
x3 <- rep(6:7, each = 5, times = 20)   # 2 distinct values
x4 <- rep(4:8, each = 2, times = 20)   # 5 distinct values
y <- x1 + x2 + x3 + x4
data <- data.frame(cbind(y,x1,x2,x3,x4))
model <- rpart(y ~.,data=data)
varImp(model)
## Overall
## x2 4.1270925
## x3 0.7945946
## x4 4.0832487
## x1 0.0000000
This demonstrates the selection bias of single regression trees: predictors with more distinct values (finer granularity) are favored over predictors with fewer. x2 and x4, which take the most distinct values, receive the highest importance, while x1, which is constant, receives an importance of 0.
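To make the bias concrete, a hedged sketch (the noise column and model_b are hypothetical): a continuous pure-noise predictor with many distinct values can still attract splits in a single tree, while the constant x1 never can.
set.seed(10)
data$noise <- rnorm(200)   # uninformative, but 200 distinct values
model_b <- rpart(y ~ ., data = data)
varImp(model_b)            # noise may receive a nonzero importance
data$noise <- NULL         # clean up the illustration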
a-)
The right-hand model was tuned with a high bagging fraction and a high learning rate (shrinkage). A high learning rate makes each tree contribute a large share of the final prediction, so the model commits quickly to the strongest predictors; a high bagging fraction means every tree sees nearly the same data. Both choices make the model sharper and more selective, concentrating importance on a few predictors.
b-)
The model on the right is more selective and leans heavily on a few important variables, which can make it accurate with few predictors; the model on the left spreads importance across more variables and can also predict well, just using more of them. Because it depends less on a handful of predictors, the left-hand model is likely to generalize better to other samples.
c-)
Increasing the interaction depth lets each tree use more predictors, which spreads importance across more variables and flattens the slope of the predictor importance plot: the deeper the trees, the less the top predictors dominate.
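A minimal sketch of the comparison behind these answers, assuming the two models in the figure differ in bagging fraction and shrinkage (0.1 versus 0.9), fit here with gbm on the simulated data (n.minobsinnode is lowered so the small bags can still split):
gbmLeft <- gbm(y ~ ., data = simulated, distribution = "gaussian",
n.trees = 100, interaction.depth = 3,
shrinkage = 0.1, bag.fraction = 0.1, n.minobsinnode = 5)
gbmRight <- gbm(y ~ ., data = simulated, distribution = "gaussian",
n.trees = 100, interaction.depth = 3,
shrinkage = 0.9, bag.fraction = 0.9, n.minobsinnode = 5)
summary(gbmLeft, plotit = F)   # importance spread over more predictors
summary(gbmRight, plotit = F)  # importance concentrated on a few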
data("ChemicalManufacturingProcess")
data <- ChemicalManufacturingProcess
Amelia::missmap(data)
temp <- mice(data, m = 5, maxit = 50, method = 'pmm', seed = 500, printFlag = F)
## Warning: Number of logged events: 6750
imputed.data <- complete(temp)
imputed.data <- imputed.data[,-nearZeroVar(imputed.data)]
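A quick sanity check (a minimal sketch) that imputation and the near-zero-variance filter left a complete data set:
sum(is.na(imputed.data))  # expect 0 after mice imputation
dim(imputed.data)         # rows by remaining columns (including Yield)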
Amelia::missmap(imputed.data)
set.seed(250)
yield_data <- createDataPartition(imputed.data$Yield, p=0.8, list=F)
train_data <- imputed.data[yield_data, ]
test_data <- imputed.data[-yield_data, ]
## Model 1
model_1 <- randomForest(Yield ~ .,data = train_data,
importance = T,
ntree=1000)
varImp(model_1, scale = F)
## Overall
## BiologicalMaterial01 0.0426966936
## BiologicalMaterial02 0.1240280327
## BiologicalMaterial03 0.2670477431
## BiologicalMaterial04 0.0738353195
## BiologicalMaterial05 0.0390409516
## BiologicalMaterial06 0.3027663975
## BiologicalMaterial08 0.0729511657
## BiologicalMaterial09 0.0372654223
## BiologicalMaterial10 0.0131264695
## BiologicalMaterial11 0.1105449282
## BiologicalMaterial12 0.2926017443
## ManufacturingProcess01 0.0512100922
## ManufacturingProcess02 0.0230209516
## ManufacturingProcess03 0.0046418270
## ManufacturingProcess04 0.0175594873
## ManufacturingProcess05 0.0203529768
## ManufacturingProcess06 0.0613674512
## ManufacturingProcess07 -0.0012368999
## ManufacturingProcess08 0.0018919718
## ManufacturingProcess09 0.0969428522
## ManufacturingProcess10 0.0048648135
## ManufacturingProcess11 0.0278007086
## ManufacturingProcess12 0.0069989400
## ManufacturingProcess13 0.1662447869
## ManufacturingProcess14 0.0015902013
## ManufacturingProcess15 0.0059216136
## ManufacturingProcess16 0.0202062122
## ManufacturingProcess17 0.1087102678
## ManufacturingProcess18 0.0405865801
## ManufacturingProcess19 0.0135716556
## ManufacturingProcess20 0.0369963520
## ManufacturingProcess21 0.0112075876
## ManufacturingProcess22 0.0090416783
## ManufacturingProcess23 0.0029466812
## ManufacturingProcess24 0.0192405975
## ManufacturingProcess25 0.0408411742
## ManufacturingProcess26 0.0079980450
## ManufacturingProcess27 0.0359580098
## ManufacturingProcess28 0.1069922929
## ManufacturingProcess29 0.0242011249
## ManufacturingProcess30 0.0333595148
## ManufacturingProcess31 0.1588404515
## ManufacturingProcess32 1.1421379654
## ManufacturingProcess33 0.0627240975
## ManufacturingProcess34 0.0062974536
## ManufacturingProcess35 -0.0021251947
## ManufacturingProcess36 0.1853098443
## ManufacturingProcess37 0.0071797826
## ManufacturingProcess38 0.0040607220
## ManufacturingProcess39 0.0548080692
## ManufacturingProcess40 -0.0010118684
## ManufacturingProcess41 0.0036438697
## ManufacturingProcess42 0.0197711583
## ManufacturingProcess43 0.0180009404
## ManufacturingProcess44 0.0065751749
## ManufacturingProcess45 0.0005136061
m1Pred <- predict(model_1,newdata = test_data)
postResample(pred = m1Pred, obs = test_data$Yield)
## RMSE Rsquared MAE
## 1.0613079 0.7141271 0.7959005
## Model 2
model_2 <- cforest(Yield~.,data = train_data)
varImp(model_2, conditional = T)
## Overall
## BiologicalMaterial01 0.0030101489
## BiologicalMaterial02 0.0552226104
## BiologicalMaterial03 0.0471211332
## BiologicalMaterial04 0.0210819842
## BiologicalMaterial05 0.0084635541
## BiologicalMaterial06 0.0621491894
## BiologicalMaterial08 0.0102516839
## BiologicalMaterial09 0.0001837210
## BiologicalMaterial10 -0.0003240165
## BiologicalMaterial11 0.0102918968
## BiologicalMaterial12 0.0297864445
## ManufacturingProcess01 0.0020023506
## ManufacturingProcess02 0.0066310054
## ManufacturingProcess03 -0.0038548565
## ManufacturingProcess04 -0.0001491352
## ManufacturingProcess05 0.0029402878
## ManufacturingProcess06 0.0418093763
## ManufacturingProcess07 -0.0009710851
## ManufacturingProcess08 -0.0018998929
## ManufacturingProcess09 0.0865705386
## ManufacturingProcess10 0.0016134526
## ManufacturingProcess11 0.0160208912
## ManufacturingProcess12 0.0095589829
## ManufacturingProcess13 0.0701623767
## ManufacturingProcess14 -0.0007784407
## ManufacturingProcess15 0.0018814450
## ManufacturingProcess16 0.0021811429
## ManufacturingProcess17 0.0404237411
## ManufacturingProcess18 0.0049236461
## ManufacturingProcess19 0.0039359128
## ManufacturingProcess20 0.0109191683
## ManufacturingProcess21 -0.0015598975
## ManufacturingProcess22 0.0006088870
## ManufacturingProcess23 0.0006669005
## ManufacturingProcess24 0.0015587443
## ManufacturingProcess25 0.0048959455
## ManufacturingProcess26 0.0031421256
## ManufacturingProcess27 0.0077562831
## ManufacturingProcess28 0.0036051715
## ManufacturingProcess29 0.0243083982
## ManufacturingProcess30 0.0030013839
## ManufacturingProcess31 0.0373292819
## ManufacturingProcess32 0.2284602646
## ManufacturingProcess33 0.0371538119
## ManufacturingProcess34 0.0074333863
## ManufacturingProcess35 0.0020324927
## ManufacturingProcess36 0.0857178519
## ManufacturingProcess37 0.0071399660
## ManufacturingProcess38 -0.0019010303
## ManufacturingProcess39 0.0077861400
## ManufacturingProcess40 -0.0003153308
## ManufacturingProcess41 0.0003226814
## ManufacturingProcess42 -0.0008832979
## ManufacturingProcess43 0.0038618663
## ManufacturingProcess44 0.0062521291
## ManufacturingProcess45 0.0007253181
m2Pred <- predict(model_2,newdata = test_data)
postResample(pred = m2Pred, obs = test_data$Yield)
## RMSE Rsquared MAE
## 1.3013265 0.5610503 1.0313082
## Model 3
gbmModel <- train(Yield ~ ., data = train_data,
method = "gbm",
tuneGrid = gbmG,
verbose = F
)
varImp(gbmModel)
## gbm variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess09 18.349
## ManufacturingProcess06 15.948
## BiologicalMaterial12 15.750
## ManufacturingProcess17 11.702
## ManufacturingProcess13 11.172
## BiologicalMaterial11 8.631
## ManufacturingProcess31 8.455
## BiologicalMaterial03 8.372
## BiologicalMaterial09 7.724
## BiologicalMaterial06 7.263
## ManufacturingProcess01 7.211
## ManufacturingProcess05 7.000
## ManufacturingProcess25 6.271
## ManufacturingProcess28 5.278
## BiologicalMaterial02 5.137
## BiologicalMaterial05 5.129
## ManufacturingProcess37 4.591
## BiologicalMaterial04 4.562
## ManufacturingProcess18 4.530
gbmPred <- predict(gbmModel,newdata = test_data)
postResample(pred = gbmPred, obs = test_data$Yield)
## RMSE Rsquared MAE
## 0.9293387 0.7664653 0.7422395
## Model 4
plsModel <- train(
Yield~., data = train_data, method = "pls",
trControl = trainControl("cv", number = 10),
scale = T,
tuneLength = 20
)
varImp(plsModel)
##
## Attaching package: 'pls'
## The following object is masked from 'package:caret':
##
## R2
## The following object is masked from 'package:stats':
##
## loadings
## pls variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess36 76.62
## ManufacturingProcess13 71.23
## ManufacturingProcess09 69.26
## ManufacturingProcess17 66.09
## ManufacturingProcess06 61.12
## ManufacturingProcess33 61.00
## BiologicalMaterial02 56.78
## BiologicalMaterial06 54.14
## BiologicalMaterial08 51.87
## BiologicalMaterial12 48.38
## BiologicalMaterial11 48.19
## BiologicalMaterial03 48.05
## ManufacturingProcess12 48.02
## BiologicalMaterial01 47.92
## ManufacturingProcess29 47.55
## BiologicalMaterial04 45.74
## ManufacturingProcess04 41.77
## ManufacturingProcess28 38.63
## ManufacturingProcess37 38.16
plsPred <- predict(plsModel,newdata = test_data)
postResample(pred = plsPred, obs = test_data$Yield)
## RMSE Rsquared MAE
## 1.6880579 0.3808667 1.1458658
data.frame(rbind(randomForest = postResample(pred = m1Pred, obs = test_data$Yield),
cforest = postResample(pred = m2Pred, obs = test_data$Yield),
Gbm = postResample(pred = gbmPred, obs = test_data$Yield),
pls = postResample(pred = plsPred, obs = test_data$Yield)))
## RMSE Rsquared MAE
## randomForest 1.0613079 0.7141271 0.7959005
## cforest 1.3013265 0.5610503 1.0313082
## Gbm 0.9293387 0.7664653 0.7422395
## pls 1.6880579 0.3808667 1.1458658
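As a visual companion, a short sketch using caret's plot method for varImp objects to display the top predictors of the best-performing GBM model:
plot(varImp(gbmModel), top = 10)  # top 10 predictors of the GBM model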
varImp(plsModel)
## pls variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.00
## ManufacturingProcess36 76.62
## ManufacturingProcess13 71.23
## ManufacturingProcess09 69.26
## ManufacturingProcess17 66.09
## ManufacturingProcess06 61.12
## ManufacturingProcess33 61.00
## BiologicalMaterial02 56.78
## BiologicalMaterial06 54.14
## BiologicalMaterial08 51.87
## BiologicalMaterial12 48.38
## BiologicalMaterial11 48.19
## BiologicalMaterial03 48.05
## ManufacturingProcess12 48.02
## BiologicalMaterial01 47.92
## ManufacturingProcess29 47.55
## BiologicalMaterial04 45.74
## ManufacturingProcess04 41.77
## ManufacturingProcess28 38.63
## ManufacturingProcess37 38.16
varImp(gbmModel)
## gbm variable importance
##
## only 20 most important variables shown (out of 56)
##
## Overall
## ManufacturingProcess32 100.000
## ManufacturingProcess09 18.349
## ManufacturingProcess06 15.948
## BiologicalMaterial12 15.750
## ManufacturingProcess17 11.702
## ManufacturingProcess13 11.172
## BiologicalMaterial11 8.631
## ManufacturingProcess31 8.455
## BiologicalMaterial03 8.372
## BiologicalMaterial09 7.724
## BiologicalMaterial06 7.263
## ManufacturingProcess01 7.211
## ManufacturingProcess05 7.000
## ManufacturingProcess25 6.271
## ManufacturingProcess28 5.278
## BiologicalMaterial02 5.137
## BiologicalMaterial05 5.129
## ManufacturingProcess37 4.591
## BiologicalMaterial04 4.562
## ManufacturingProcess18 4.530
Comparing the GBM model with the linear PLS model, the "ManufacturingProcess" variables dominate both importance lists. The ordering shifts somewhat between the two, but the top ten predictors are very similar. On the test-set metrics (RMSE and R-squared), the GBM model clearly performs better.
rpartModel <- train(Yield ~ ., data = train_data,
method = "rpart2",
tuneLength = 10,
trControl = trainControl(method = "cv")
)
rpart.plot(rpartModel$finalModel)
Observing the tree, we can see more clearly that ManufacturingProcess32 has to be less than 160, ManufacturingProcess06 less than 208, and the biological material less than 20. These splits involve the three top predictors, the variables with the most influence on the model and on the Yield results.
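As a complement to the plot, a minimal sketch: rpart.plot also provides rpart.rules(), which prints the same splits as plain-text rules and makes the thresholds easier to read off.
rpart.rules(rpartModel$finalModel)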