Select the most accurate model for your predictive analytics project.
When working on a machine learning project, you often have several good models to choose from, and each candidate model needs to be measured for accuracy.
To select the best and final model(s), you should use several different methods to estimate the accuracy of your machine learning models.
One of these methods is the vertical box-and-whisker plot. This type of plot compares the accuracy distributions of all the models and ranks them from highest to lowest.
Here are the steps for selecting the best and final model(s) with the vertical box-and-whisker plot method: load the libraries and data, set the factor reference level, partition the data into training and test sets, train each model with repeated cross-validation, pool the resampling results with resamples(), and compare the models in a box-and-whisker plot.
setwd("~/Documents/Compare 13 Classification Models And Select The Best Using The Caret R Package")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlbench)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
data(PimaIndiansDiabetes)
str(PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
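# Optional check: several physiological variables above contain zeros
# (pressure, triceps, insulin, mass) that are implausible and likely encode
# missing values. A minimal sketch to count them:
sapply(PimaIndiansDiabetes[, c("glucose", "pressure", "triceps", "insulin", "mass")],
       function(x) sum(x == 0))
# This walkthrough keeps the data as-is; imputing these zeros could improve the models.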
# change the reference level to 'pos' using the relevel() function
PimaIndiansDiabetes$diabetes <- relevel(PimaIndiansDiabetes$diabetes, ref = "pos")
levels(PimaIndiansDiabetes$diabetes)
## [1] "pos" "neg"
str(PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "pos","neg": 1 2 1 2 1 2 1 2 1 1 ...
set.seed(3456)
trainIndex <- createDataPartition(PimaIndiansDiabetes$diabetes, p = .8, list = FALSE, times = 1)
df_Train <- PimaIndiansDiabetes[trainIndex, ]
df_Test <- PimaIndiansDiabetes[-trainIndex, ]
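# Optional check: confirm the stratified split preserved the pos/neg balance
# (createDataPartition samples within each class). A minimal sketch:
prop.table(table(df_Train$diabetes))
prop.table(table(df_Test$diabetes))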
names(getModelInfo())
## [1] "ada" "AdaBag" "AdaBoost.M1"
## [4] "adaboost" "amdai" "ANFIS"
## [7] "avNNet" "awnb" "awtan"
## [10] "bag" "bagEarth" "bagEarthGCV"
## [13] "bagFDA" "bagFDAGCV" "bam"
## [16] "bartMachine" "bayesglm" "binda"
## [19] "blackboost" "blasso" "blassoAveraged"
## [22] "bridge" "brnn" "BstLm"
## [25] "bstSm" "bstTree" "C5.0"
## [28] "C5.0Cost" "C5.0Rules" "C5.0Tree"
## [31] "cforest" "chaid" "CSimca"
## [34] "ctree" "ctree2" "cubist"
## [37] "dda" "deepboost" "DENFIS"
## [40] "dnn" "dwdLinear" "dwdPoly"
## [43] "dwdRadial" "earth" "elm"
## [46] "enet" "evtree" "extraTrees"
## [49] "fda" "FH.GBML" "FIR.DM"
## [52] "foba" "FRBCS.CHI" "FRBCS.W"
## [55] "FS.HGD" "gam" "gamboost"
## [58] "gamLoess" "gamSpline" "gaussprLinear"
## [61] "gaussprPoly" "gaussprRadial" "gbm_h2o"
## [64] "gbm" "gcvEarth" "GFS.FR.MOGUL"
## [67] "GFS.LT.RS" "GFS.THRIFT" "glm.nb"
## [70] "glm" "glmboost" "glmnet_h2o"
## [73] "glmnet" "glmStepAIC" "gpls"
## [76] "hda" "hdda" "hdrda"
## [79] "HYFIS" "icr" "J48"
## [82] "JRip" "kernelpls" "kknn"
## [85] "knn" "krlsPoly" "krlsRadial"
## [88] "lars" "lars2" "lasso"
## [91] "lda" "lda2" "leapBackward"
## [94] "leapForward" "leapSeq" "Linda"
## [97] "lm" "lmStepAIC" "LMT"
## [100] "loclda" "logicBag" "LogitBoost"
## [103] "logreg" "lssvmLinear" "lssvmPoly"
## [106] "lssvmRadial" "lvq" "M5"
## [109] "M5Rules" "manb" "mda"
## [112] "Mlda" "mlp" "mlpKerasDecay"
## [115] "mlpKerasDecayCost" "mlpKerasDropout" "mlpKerasDropoutCost"
## [118] "mlpML" "mlpSGD" "mlpWeightDecay"
## [121] "mlpWeightDecayML" "monmlp" "msaenet"
## [124] "multinom" "mxnet" "mxnetAdam"
## [127] "naive_bayes" "nb" "nbDiscrete"
## [130] "nbSearch" "neuralnet" "nnet"
## [133] "nnls" "nodeHarvest" "null"
## [136] "OneR" "ordinalNet" "ordinalRF"
## [139] "ORFlog" "ORFpls" "ORFridge"
## [142] "ORFsvm" "ownn" "pam"
## [145] "parRF" "PART" "partDSA"
## [148] "pcaNNet" "pcr" "pda"
## [151] "pda2" "penalized" "PenalizedLDA"
## [154] "plr" "pls" "plsRglm"
## [157] "polr" "ppr" "pre"
## [160] "PRIM" "protoclass" "qda"
## [163] "QdaCov" "qrf" "qrnn"
## [166] "randomGLM" "ranger" "rbf"
## [169] "rbfDDA" "Rborist" "rda"
## [172] "regLogistic" "relaxo" "rf"
## [175] "rFerns" "RFlda" "rfRules"
## [178] "ridge" "rlda" "rlm"
## [181] "rmda" "rocc" "rotationForest"
## [184] "rotationForestCp" "rpart" "rpart1SE"
## [187] "rpart2" "rpartCost" "rpartScore"
## [190] "rqlasso" "rqnc" "RRF"
## [193] "RRFglobal" "rrlda" "RSimca"
## [196] "rvmLinear" "rvmPoly" "rvmRadial"
## [199] "SBC" "sda" "sdwd"
## [202] "simpls" "SLAVE" "slda"
## [205] "smda" "snn" "sparseLDA"
## [208] "spikeslab" "spls" "stepLDA"
## [211] "stepQDA" "superpc" "svmBoundrangeString"
## [214] "svmExpoString" "svmLinear" "svmLinear2"
## [217] "svmLinear3" "svmLinearWeights" "svmLinearWeights2"
## [220] "svmPoly" "svmRadial" "svmRadialCost"
## [223] "svmRadialSigma" "svmRadialWeights" "svmSpectrumString"
## [226] "tan" "tanSearch" "treebag"
## [229] "vbmpRadial" "vglmAdjCat" "vglmContRatio"
## [232] "vglmCumulative" "widekernelpls" "WM"
## [235] "wsrf" "xgbDART" "xgbLinear"
## [238] "xgbTree" "xyf"
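# Optional: the list above mixes regression and classification learners. The
# metadata returned by getModelInfo() can filter it, e.g. to models that
# support classification (a minimal sketch):
info <- getModelInfo()
class_models <- names(info)[sapply(info, function(m) "Classification" %in% m$type)]
head(class_models, 20)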
modelLookup(model = "lvq")
## model parameter label forReg forClass probModel
## 1 lvq size Codebook Size FALSE TRUE FALSE
## 2 lvq k #Prototypes FALSE TRUE FALSE
modelLookup(model = "gbm")
## model parameter label forReg forClass probModel
## 1 gbm n.trees # Boosting Iterations TRUE TRUE TRUE
## 2 gbm interaction.depth Max Tree Depth TRUE TRUE TRUE
## 3 gbm shrinkage Shrinkage TRUE TRUE TRUE
## 4 gbm n.minobsinnode Min. Terminal Node Size TRUE TRUE TRUE
modelLookup(model = "svmRadial")
## model parameter label forReg forClass probModel
## 1 svmRadial sigma Sigma TRUE TRUE TRUE
## 2 svmRadial C Cost TRUE TRUE TRUE
modelLookup(model = "glm")
## model parameter label forReg forClass probModel
## 1 glm parameter parameter TRUE TRUE TRUE
modelLookup(model = "treebag")
## model parameter label forReg forClass probModel
## 1 treebag parameter parameter TRUE TRUE TRUE
modelLookup(model = "rf")
## model parameter label forReg forClass probModel
## 1 rf mtry #Randomly Selected Predictors TRUE TRUE TRUE
modelLookup(model = "C5.0")
## model parameter label forReg forClass probModel
## 1 C5.0 trials # Boosting Iterations FALSE TRUE TRUE
## 2 C5.0 model Model Type FALSE TRUE TRUE
## 3 C5.0 winnow Winnow FALSE TRUE TRUE
modelLookup(model = "lda")
## model parameter label forReg forClass probModel
## 1 lda parameter parameter FALSE TRUE TRUE
modelLookup(model = "glmnet")
## model parameter label forReg forClass probModel
## 1 glmnet alpha Mixing Percentage TRUE TRUE TRUE
## 2 glmnet lambda Regularization Parameter TRUE TRUE TRUE
modelLookup(model = "knn")
## model parameter label forReg forClass probModel
## 1 knn k #Neighbors TRUE TRUE TRUE
modelLookup(model = "rpart")
## model parameter label forReg forClass probModel
## 1 rpart cp Complexity Parameter TRUE TRUE TRUE
modelLookup(model = "nb")
## model parameter label forReg forClass probModel
## 1 nb fL Laplace Correction FALSE TRUE TRUE
## 2 nb usekernel Distribution Type FALSE TRUE TRUE
## 3 nb adjust Bandwidth Adjustment FALSE TRUE TRUE
modelLookup(model = "xgbTree")
## model parameter label forReg forClass probModel
## 1 xgbTree nrounds # Boosting Iterations TRUE TRUE TRUE
## 2 xgbTree max_depth Max Tree Depth TRUE TRUE TRUE
## 3 xgbTree eta Shrinkage TRUE TRUE TRUE
## 4 xgbTree gamma Minimum Loss Reduction TRUE TRUE TRUE
## 5 xgbTree colsample_bytree Subsample Ratio of Columns TRUE TRUE TRUE
## 6 xgbTree min_child_weight Minimum Sum of Instance Weight TRUE TRUE TRUE
## 7 xgbTree subsample Subsample Percentage TRUE TRUE TRUE
# 10-fold cross-validation repeated 3 times (30 resamples per model)
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
set.seed(7)
modelLvq <- train(diabetes~., data=df_Train, method="lvq", metric="Accuracy", trControl=control)
# estimate variable importance
importanceLvq <- varImp(modelLvq, scale=FALSE)
# summarize importance
print(importanceLvq)
## ROC curve variable importance
##
## Importance
## glucose 0.8068
## age 0.6782
## mass 0.6729
## pedigree 0.6194
## pregnant 0.6159
## pressure 0.5867
## triceps 0.5617
## insulin 0.5496
# plot importance
plot(importanceLvq, main = "Variable Importance LVQ Model")
# use the model to predict diabetes status on the test set
Lvq_pred <- predict(modelLvq, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, Lvq_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 29 24
## neg 30 70
##
## Accuracy : 0.6471
## 95% CI : (0.5658, 0.7225)
## No Information Rate : 0.6144
## P-Value [Acc > NIR] : 0.2283
##
## Kappa : 0.2408
##
## Mcnemar's Test P-Value : 0.4962
##
## Sensitivity : 0.4915
## Specificity : 0.7447
## Pos Pred Value : 0.5472
## Neg Pred Value : 0.7000
## Prevalence : 0.3856
## Detection Rate : 0.1895
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.6181
##
## 'Positive' Class : pos
##
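# Optional: instead of reading metrics off the printed summary, store the
# confusionMatrix object and index into it (a minimal sketch):
cm_lvq <- confusionMatrix(df_Test$diabetes, Lvq_pred)
cm_lvq$overall["Accuracy"]    # 0.6471, as printed above
cm_lvq$byClass["Sensitivity"]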
library(gbm)
## Loaded gbm 2.1.8.1
set.seed(7)
modelGbm <- train(diabetes~., data=df_Train, method="gbm", metric="Accuracy", trControl=control, verbose=FALSE)
# estimate variable importance
importanceGbm <- varImp(modelGbm, scale=FALSE)
# summarize importance
print(importanceGbm)
## gbm variable importance
##
## Overall
## glucose 111.807
## mass 28.584
## age 24.290
## pedigree 12.337
## pregnant 6.508
## pressure 1.832
## insulin 0.000
## triceps 0.000
# plot importance
plot(importanceGbm, main = "Variable Importance GBM Model")
# use the model to predict diabetes status on the test set
Gbm_pred <- predict(modelGbm, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, Gbm_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 27 26
## neg 17 83
##
## Accuracy : 0.719
## 95% CI : (0.6407, 0.7886)
## No Information Rate : 0.7124
## P-Value [Acc > NIR] : 0.4695
##
## Kappa : 0.3535
##
## Mcnemar's Test P-Value : 0.2225
##
## Sensitivity : 0.6136
## Specificity : 0.7615
## Pos Pred Value : 0.5094
## Neg Pred Value : 0.8300
## Prevalence : 0.2876
## Detection Rate : 0.1765
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.6876
##
## 'Positive' Class : pos
##
set.seed(7)
modelSvm <- train(diabetes~., data=df_Train, method="svmRadial", metric="Accuracy", trControl=control)
# estimate variable importance
importanceSvm <- varImp(modelSvm, scale=FALSE)
# summarize importance
print(importanceSvm)
## ROC curve variable importance
##
## Importance
## glucose 0.8068
## age 0.6782
## mass 0.6729
## pedigree 0.6194
## pregnant 0.6159
## pressure 0.5867
## triceps 0.5617
## insulin 0.5496
# plot importance
plot(importanceSvm, main = "Variable Importance SVM Model")
# use the model to predict diabetes status on the test set
Svm_pred <- predict(modelSvm, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, Svm_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 24 29
## neg 10 90
##
## Accuracy : 0.7451
## 95% CI : (0.6684, 0.812)
## No Information Rate : 0.7778
## P-Value [Acc > NIR] : 0.857162
##
## Kappa : 0.3853
##
## Mcnemar's Test P-Value : 0.003948
##
## Sensitivity : 0.7059
## Specificity : 0.7563
## Pos Pred Value : 0.4528
## Neg Pred Value : 0.9000
## Prevalence : 0.2222
## Detection Rate : 0.1569
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.7311
##
## 'Positive' Class : pos
##
set.seed(7)
modelGlm <- train(diabetes~., data=df_Train, method="glm", metric="Accuracy", trControl=control)
# estimate variable importance
importanceGlm <- varImp(modelGlm, scale=FALSE)
# summarize importance
print(importanceGlm)
## glm variable importance
##
## Overall
## glucose 9.3086
## mass 4.6648
## pregnant 3.7379
## pedigree 2.9118
## pressure 2.1757
## insulin 1.5014
## age 0.5091
## triceps 0.4291
# plot importance
plot(importanceGlm, main = "Variable Importance GLM Model")
# use the model to predict diabetes status on the test set
Glm_pred <- predict(modelGlm, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, Glm_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 26 27
## neg 15 85
##
## Accuracy : 0.7255
## 95% CI : (0.6476, 0.7945)
## No Information Rate : 0.732
## P-Value [Acc > NIR] : 0.61284
##
## Kappa : 0.3597
##
## Mcnemar's Test P-Value : 0.08963
##
## Sensitivity : 0.6341
## Specificity : 0.7589
## Pos Pred Value : 0.4906
## Neg Pred Value : 0.8500
## Prevalence : 0.2680
## Detection Rate : 0.1699
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.6965
##
## 'Positive' Class : pos
##
set.seed(7)
modelbCART <- train(diabetes~., data=df_Train, method="treebag", metric="Accuracy", trControl=control)
# estimate variable importance
importancebCART <- varImp(modelbCART, scale=FALSE)
# summarize importance
print(importancebCART)
## treebag variable importance
##
## Overall
## glucose 168.38
## mass 147.83
## age 124.19
## pedigree 115.02
## pregnant 86.31
## pressure 80.69
## insulin 70.42
## triceps 58.83
# plot importance
plot(importancebCART, main = "Variable Importance Bagged CART Model")
# use the model to predict diabetes status on the test set
bCART_pred <- predict(modelbCART, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, bCART_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 29 24
## neg 21 79
##
## Accuracy : 0.7059
## 95% CI : (0.6269, 0.7767)
## No Information Rate : 0.6732
## P-Value [Acc > NIR] : 0.2202
##
## Kappa : 0.3417
##
## Mcnemar's Test P-Value : 0.7656
##
## Sensitivity : 0.5800
## Specificity : 0.7670
## Pos Pred Value : 0.5472
## Neg Pred Value : 0.7900
## Prevalence : 0.3268
## Detection Rate : 0.1895
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.6735
##
## 'Positive' Class : pos
##
set.seed(7)
modelRF <- train(diabetes~., data=df_Train, method="rf", metric="Accuracy", trControl=control)
# estimate variable importance
importanceRF <- varImp(modelRF, scale=FALSE)
# summarize importance
print(importanceRF)
## rf variable importance
##
## Overall
## glucose 76.73
## mass 43.78
## age 35.91
## pedigree 35.28
## pressure 23.50
## pregnant 23.44
## insulin 20.59
## triceps 18.57
# plot importance
plot(importanceRF, main = "Variable Importance Random Forest Model")
# use the model to predict diabetes status on the test set
RF_pred <- predict(modelRF, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, RF_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 29 24
## neg 19 81
##
## Accuracy : 0.719
## 95% CI : (0.6407, 0.7886)
## No Information Rate : 0.6863
## P-Value [Acc > NIR] : 0.2178
##
## Kappa : 0.3653
##
## Mcnemar's Test P-Value : 0.5419
##
## Sensitivity : 0.6042
## Specificity : 0.7714
## Pos Pred Value : 0.5472
## Neg Pred Value : 0.8100
## Prevalence : 0.3137
## Detection Rate : 0.1895
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.6878
##
## 'Positive' Class : pos
##
set.seed(7)
modelC50 <- train(diabetes~., data=df_Train, method="C5.0", metric="Accuracy", trControl=control)
# estimate variable importance
importanceC50 <- varImp(modelC50, scale=FALSE)
# summarize importance
print(importanceC50)
## C5.0 variable importance
##
## Overall
## mass 100.00
## age 100.00
## pregnant 100.00
## glucose 100.00
## pedigree 92.36
## pressure 61.30
## insulin 40.81
## triceps 28.46
# plot importance
plot(importanceC50, main = "Variable Importance C5.0 Model")
# use the model to predict diabetes status on the test set
C50_pred <- predict(modelC50, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, C50_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 29 24
## neg 21 79
##
## Accuracy : 0.7059
## 95% CI : (0.6269, 0.7767)
## No Information Rate : 0.6732
## P-Value [Acc > NIR] : 0.2202
##
## Kappa : 0.3417
##
## Mcnemar's Test P-Value : 0.7656
##
## Sensitivity : 0.5800
## Specificity : 0.7670
## Pos Pred Value : 0.5472
## Neg Pred Value : 0.7900
## Prevalence : 0.3268
## Detection Rate : 0.1895
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.6735
##
## 'Positive' Class : pos
##
set.seed(7)
modelLDA <- train(diabetes~., data=df_Train, method="lda", metric="Accuracy", trControl=control)
# estimate variable importance
importanceLDA <- varImp(modelLDA, scale=FALSE)
# summarize importance
print(importanceLDA)
## ROC curve variable importance
##
## Importance
## glucose 0.8068
## age 0.6782
## mass 0.6729
## pedigree 0.6194
## pregnant 0.6159
## pressure 0.5867
## triceps 0.5617
## insulin 0.5496
# plot importance
plot(importanceLDA, main = "Variable Importance Linear Discriminant Analysis (LDA) Model")
# use the model to predict diabetes status on the test set
LDA_pred <- predict(modelLDA, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, LDA_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 26 27
## neg 14 86
##
## Accuracy : 0.732
## 95% CI : (0.6545, 0.8003)
## No Information Rate : 0.7386
## P-Value [Acc > NIR] : 0.61385
##
## Kappa : 0.372
##
## Mcnemar's Test P-Value : 0.06092
##
## Sensitivity : 0.6500
## Specificity : 0.7611
## Pos Pred Value : 0.4906
## Neg Pred Value : 0.8600
## Prevalence : 0.2614
## Detection Rate : 0.1699
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.7055
##
## 'Positive' Class : pos
##
set.seed(7)
modelglmnet <- train(diabetes~., data=df_Train, method="glmnet", metric="Accuracy", trControl=control)
# estimate variable importance
importanceglmnet <- varImp(modelglmnet, scale=FALSE)
# summarize importance
print(importanceglmnet)
## glmnet variable importance
##
## Overall
## pedigree 0.968546
## pregnant 0.135017
## mass 0.077501
## glucose 0.040184
## pressure 0.012490
## age 0.005887
## triceps 0.003089
## insulin 0.001410
# plot importance
plot(importanceglmnet, main = "Variable Importance Regularized Logistic Regression Model")
# use the model to predict diabetes status on the test set
glmnet_pred <- predict(modelglmnet, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, glmnet_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 26 27
## neg 14 86
##
## Accuracy : 0.732
## 95% CI : (0.6545, 0.8003)
## No Information Rate : 0.7386
## P-Value [Acc > NIR] : 0.61385
##
## Kappa : 0.372
##
## Mcnemar's Test P-Value : 0.06092
##
## Sensitivity : 0.6500
## Specificity : 0.7611
## Pos Pred Value : 0.4906
## Neg Pred Value : 0.8600
## Prevalence : 0.2614
## Detection Rate : 0.1699
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.7055
##
## 'Positive' Class : pos
##
set.seed(7)
modelKNN <- train(diabetes~., data=df_Train, method="knn", metric="Accuracy", trControl=control)
# estimate variable importance
importanceKNN <- varImp(modelKNN, scale=FALSE)
# summarize importance
print(importanceKNN)
## ROC curve variable importance
##
## Importance
## glucose 0.8068
## age 0.6782
## mass 0.6729
## pedigree 0.6194
## pregnant 0.6159
## pressure 0.5867
## triceps 0.5617
## insulin 0.5496
# plot importance
plot(importanceKNN, main = "Variable Importance k-Nearest Neighbors Model")
# use the model to predict diabetes status on the test set
KNN_pred <- predict(modelKNN, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, KNN_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 28 25
## neg 21 79
##
## Accuracy : 0.6993
## 95% CI : (0.62, 0.7707)
## No Information Rate : 0.6797
## P-Value [Acc > NIR] : 0.3356
##
## Kappa : 0.324
##
## Mcnemar's Test P-Value : 0.6583
##
## Sensitivity : 0.5714
## Specificity : 0.7596
## Pos Pred Value : 0.5283
## Neg Pred Value : 0.7900
## Prevalence : 0.3203
## Detection Rate : 0.1830
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.6655
##
## 'Positive' Class : pos
##
set.seed(7)
modelrpart <- train(diabetes~., data=df_Train, method="rpart", metric="Accuracy", trControl=control)
# estimate variable importance
importancerpart <- varImp(modelrpart, scale=FALSE)
# summarize importance
print(importancerpart)
## rpart variable importance
##
## Overall
## glucose 70.568
## mass 40.092
## age 32.935
## pregnant 15.792
## insulin 15.272
## pedigree 4.645
## triceps 0.000
## pressure 0.000
# plot importance
plot(importancerpart, main = "Variable Importance Classification and Regression Trees (CART) Model")
# use the model to predict diabetes status on the test set
rpart_pred <- predict(modelrpart, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, rpart_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 26 27
## neg 16 84
##
## Accuracy : 0.719
## 95% CI : (0.6407, 0.7886)
## No Information Rate : 0.7255
## P-Value [Acc > NIR] : 0.6119
##
## Kappa : 0.3475
##
## Mcnemar's Test P-Value : 0.1273
##
## Sensitivity : 0.6190
## Specificity : 0.7568
## Pos Pred Value : 0.4906
## Neg Pred Value : 0.8400
## Prevalence : 0.2745
## Detection Rate : 0.1699
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.6879
##
## 'Positive' Class : pos
##
set.seed(7)
# naive Bayes tuning grid: kernel density estimate, default bandwidth
# adjustment, and three candidate Laplace corrections
Grid <- expand.grid(usekernel = TRUE, adjust = 1, fL = c(0.2, 0.5, 0.8))
modelNB <- train(diabetes~., data=df_Train, method="nb", metric="Accuracy", trControl=control, tuneGrid=Grid)
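# Optional: when a tuneGrid is supplied, train() picks the best row by the
# chosen metric; the winner and the full grid results can be inspected:
modelNB$bestTune
modelNB$results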
# estimate variable importance
importanceNB <- varImp(modelNB, scale=FALSE)
# summarize importance
print(importanceNB)
## ROC curve variable importance
##
## Importance
## glucose 0.8068
## age 0.6782
## mass 0.6729
## pedigree 0.6194
## pregnant 0.6159
## pressure 0.5867
## triceps 0.5617
## insulin 0.5496
# plot importance
plot(importanceNB, main = "Variable Importance Naive Bayes (NB) Model")
# use the model to predict diabetes status on the test set
NB_pred <- predict(modelNB, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, NB_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 28 25
## neg 15 85
##
## Accuracy : 0.7386
## 95% CI : (0.6615, 0.8062)
## No Information Rate : 0.719
## P-Value [Acc > NIR] : 0.3304
##
## Kappa : 0.3959
##
## Mcnemar's Test P-Value : 0.1547
##
## Sensitivity : 0.6512
## Specificity : 0.7727
## Pos Pred Value : 0.5283
## Neg Pred Value : 0.8500
## Prevalence : 0.2810
## Detection Rate : 0.1830
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.7119
##
## 'Positive' Class : pos
##
set.seed(7)
# fix a single xgbTree hyperparameter setting so train() skips the grid search
Grid <- expand.grid(
  nrounds = 100,
  max_depth = 6,
  eta = 0.3,
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1)
modelxgBoost <- train(diabetes~., data=df_Train, method="xgbTree", metric="Accuracy", trControl=control, tuneGrid=Grid)
# estimate variable importance
importancexgBoost <- varImp(modelxgBoost, scale=FALSE)
# summarize importance
print(importancexgBoost)
## xgbTree variable importance
##
## Overall
## glucose 0.33453
## mass 0.18695
## pedigree 0.13812
## age 0.10412
## pressure 0.07310
## pregnant 0.06602
## insulin 0.05986
## triceps 0.03729
# plot importance
plot(importancexgBoost, main = "Variable Importance xgBoost Model")
# use the model to predict diabetes status on the test set
xgBoost_pred <- predict(modelxgBoost, df_Test)
# create the confusion matrix (add positive = "neg" to the call to change the positive class)
confusionMatrix(df_Test$diabetes, xgBoost_pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction pos neg
## pos 30 23
## neg 22 78
##
## Accuracy : 0.7059
## 95% CI : (0.6269, 0.7767)
## No Information Rate : 0.6601
## P-Value [Acc > NIR] : 0.1331
##
## Kappa : 0.3476
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.5769
## Specificity : 0.7723
## Pos Pred Value : 0.5660
## Neg Pred Value : 0.7800
## Prevalence : 0.3399
## Detection Rate : 0.1961
## Detection Prevalence : 0.3464
## Balanced Accuracy : 0.6746
##
## 'Positive' Class : pos
##
# model results plots ----
models_results <- resamples(list(bCART=modelbCART, C50=modelC50, GBM=modelGbm,
GLM=modelGlm, GLMNET=modelglmnet, KNN=modelKNN,
LDA=modelLDA, LVQ=modelLvq, NB=modelNB,
RF=modelRF, RPART=modelrpart, SVM=modelSvm,
xgBoost=modelxgBoost))
# summarize the distributions
summary(models_results)
##
## Call:
## summary.resamples(object = models_results)
##
## Models: bCART, C50, GBM, GLM, GLMNET, KNN, LDA, LVQ, NB, RF, RPART, SVM, xgBoost
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## bCART 0.6721311 0.7287811 0.7540984 0.7593161 0.7837123 0.8524590 0
## C50 0.6290323 0.7377049 0.7642782 0.7697074 0.8000397 0.9016393 0
## GBM 0.6774194 0.7387626 0.7723427 0.7773312 0.8056584 0.8870968 0
## GLM 0.6721311 0.7580645 0.7886039 0.7827428 0.8218535 0.8524590 0
## GLMNET 0.6721311 0.7580645 0.7886039 0.7849198 0.8225806 0.8688525 0
## KNN 0.6557377 0.6963908 0.7398202 0.7365591 0.7732681 0.8196721 0
## LDA 0.6612903 0.7550899 0.7805394 0.7811387 0.8225806 0.8548387 0
## LVQ 0.6065574 0.6774194 0.6992332 0.7003173 0.7258065 0.7868852 0
## NB 0.6557377 0.7419355 0.7723427 0.7658734 0.8024194 0.8524590 0
## RF 0.6612903 0.7377049 0.7741935 0.7766790 0.8185484 0.8852459 0
## RPART 0.6774194 0.7377049 0.7704918 0.7626476 0.7868852 0.8524590 0
## SVM 0.6393443 0.7387626 0.7723427 0.7723250 0.8064516 0.8524590 0
## xgBoost 0.6393443 0.6963908 0.7419355 0.7436453 0.7868852 0.8688525 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## bCART 0.25700365 0.3759470 0.4585153 0.4584325 0.5050348 0.6617375 0
## C50 0.16412661 0.4190476 0.4548652 0.4810380 0.5621338 0.7718204 0
## GBM 0.26540284 0.3984304 0.4644479 0.4813847 0.5773023 0.7559055 0
## GLM 0.23267327 0.4196654 0.5169877 0.4973651 0.5968179 0.6617375 0
## GLMNET 0.23267327 0.4275028 0.5169877 0.5019271 0.6002345 0.6883780 0
## KNN 0.19179811 0.3177098 0.4033574 0.4014736 0.5030303 0.5865681 0
## LDA 0.20318237 0.4292239 0.4965825 0.4922640 0.6002345 0.6729191 0
## LVQ -0.03536068 0.2141653 0.2807425 0.2908548 0.3668287 0.4996845 0
## NB 0.21072089 0.4123223 0.4772394 0.4689893 0.5605594 0.6694762 0
## RF 0.23681125 0.4056029 0.4745313 0.4945616 0.6042909 0.7369070 0
## RPART 0.24939467 0.3960902 0.4630482 0.4593244 0.5147285 0.6361829 0
## SVM 0.14303959 0.3966294 0.4784215 0.4657342 0.5645034 0.6403712 0
## xgBoost 0.19194313 0.3352380 0.4397387 0.4277546 0.5084701 0.7159488 0
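# Optional: beyond eyeballing the summary, caret can test whether the
# accuracy differences between models are statistically meaningful via
# pairwise comparisons on the resample estimates (a minimal sketch):
model_diffs <- diff(models_results)
summary(model_diffs)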
# box-and-whisker plots of the accuracy and Kappa distributions across models
bwplot(models_results)
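# Optional: a ggplot2 alternative that draws vertical box-and-whisker plots
# with the models explicitly ordered by median accuracy (a minimal sketch;
# it uses the resamples "values" data frame and the tidyverse loaded above):
models_results$values %>%
  select(ends_with("~Accuracy")) %>%
  pivot_longer(everything(), names_to = "model", values_to = "accuracy") %>%
  mutate(model = str_remove(model, "~Accuracy")) %>%
  ggplot(aes(x = reorder(model, accuracy, FUN = median), y = accuracy)) +
  geom_boxplot() +
  labs(x = "Model", y = "Accuracy", title = "Model Accuracy, Ranked by Median")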
# save the three top-performing models (GLMNET, GLM, LDA) for later use
saveRDS(modelglmnet, file = "finalModel_glmnet.rds")
saveRDS(modelGlm, file = "finalModel_Glm.rds")
saveRDS(modelLDA, file = "finalModel_LDA.rds")
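# Optional: to reuse a saved model later (e.g., in a new session), read it
# back with readRDS() and predict as before (a minimal sketch):
final_glmnet <- readRDS("finalModel_glmnet.rds")
head(predict(final_glmnet, df_Test))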
A.M.D.G.