# load the Pima Indians dataset from the mlbench dataset
library(mlbench)
data("PimaIndiansDiabetes")
# Work with a shorter alias for the dataset
diabetes <- PimaIndiansDiabetes
# install.packages(c('caret', 'skimr', 'RANN', 'randomForest', 'fastAdaboost', 'gbm', 'xgboost', 'caretEnsemble', 'C50', 'earth'))
# Load the caret package
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Compact overview of the columns and their types
str(object = diabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
# Preview the first six rows
head(diabetes)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
# Split the data into training (80%) and test sets
set.seed(100)
# Row indices that make up the training partition
trainRowNumbers <- createDataPartition(diabetes$diabetes, p = 0.8, list = FALSE)
# Training rows ...
trainData <- diabetes[trainRowNumbers, ]
# ... and every remaining row as the test set
testData <- diabetes[-trainRowNumbers, ]
# Keep the outcome aside for later use.
# x <- trainData[, -1]
y <- trainData$diabetes
library(skimr)
# Per-column summary statistics of the training data
skimmed <- skim(trainData)
skimmed
## | Name | trainData |
## | Number of rows | 615 |
## | Number of columns | 9 |
## | _______________________ | |
## | Column type frequency: | |
## | factor | 1 |
## | numeric | 8 |
## | ________________________ | |
## | Group variables | None |
## Variable type: factor
## | skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
## |---|---|---|---|---|---|
## | diabetes | 0 | 1 | FALSE | 2 | neg: 400, pos: 215 |
## Variable type: numeric
## | skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## |---|---|---|---|---|---|---|---|---|---|---|
## | pregnant | 0 | 1 | 3.88 | 3.42 | 0.00 | 1.00 | 3.00 | 6.00 | 17.00 | ▇▃▂▁▁ |
## | glucose | 0 | 1 | 120.87 | 32.58 | 0.00 | 99.00 | 116.00 | 140.00 | 199.00 | ▁▁▇▅▂ |
## | pressure | 0 | 1 | 69.33 | 19.44 | 0.00 | 62.00 | 72.00 | 80.00 | 122.00 | ▁▁▇▇▁ |
## | triceps | 0 | 1 | 19.97 | 15.87 | 0.00 | 0.00 | 22.00 | 32.00 | 99.00 | ▇▇▂▁▁ |
## | insulin | 0 | 1 | 78.00 | 114.39 | 0.00 | 0.00 | 18.00 | 125.00 | 846.00 | ▇▁▁▁▁ |
## | mass | 0 | 1 | 32.08 | 7.97 | 0.00 | 27.50 | 32.00 | 36.80 | 67.10 | ▁▂▇▂▁ |
## | pedigree | 0 | 1 | 0.46 | 0.33 | 0.08 | 0.24 | 0.36 | 0.61 | 2.29 | ▇▃▁▁▁ |
## | age | 0 | 1 | 33.41 | 11.77 | 21.00 | 24.00 | 29.00 | 41.00 | 81.00 | ▇▃▁▁▁ |
# Fit a k-nearest-neighbour imputation transform on the training data.
# NOTE(review): the skim summary reports n_missing = 0 for every column, so
# there is nothing to impute here; per its printout, the fitted transform also
# centers and scales the 8 numeric columns.
preProcess_missingdata_model <- preProcess(trainData, method = "knnImpute")
preProcess_missingdata_model
## Created from 615 samples and 9 variables
##
## Pre-processing:
## - centered (8)
## - ignored (1)
## - 5 nearest neighbor imputation (8)
## - scaled (8)
# Apply the fitted imputation transform to the training data
library(RANN) # required for knnImpute
trainData <- predict(object = preProcess_missingdata_model, newdata = trainData)
# Confirm that no missing values remain
anyNA(trainData)
## [1] FALSE
# One-hot encoding
# Creating dummy variables converts a categorical variable into as many binary
# variables as there are categories.
dummies_model <- dummyVars(diabetes ~ ., data = trainData)
# Build the encoded matrix with predict(). The outcome (diabetes) is not
# present in trainData_mat.
trainData_mat <- predict(dummies_model, newdata = trainData)
## Warning in model.frame.default(Terms, newdata, na.action = na.action, xlev =
## object$lvls): variable 'diabetes' is not a factor
# Convert the matrix back to a data frame
trainData <- data.frame(trainData_mat)
# Inspect the structure of the encoded data
str(trainData)
## 'data.frame': 615 obs. of 8 variables:
## $ pregnant: num -0.843 1.205 -0.843 -1.136 0.327 ...
## $ glucose : num -1.101 1.907 -0.978 0.495 -0.15 ...
## $ pressure: num -0.171 -0.274 -0.171 -1.508 0.24 ...
## $ triceps : num 0.569 -1.258 0.191 0.947 -1.258 ...
## $ insulin : num -0.682 -0.682 0.14 0.787 -0.682 ...
## $ mass : num -0.687 -1.101 -0.499 1.382 -0.812 ...
## $ pedigree: num -0.349 0.637 -0.915 5.601 -0.81 ...
## $ age : num -0.205 -0.1201 -1.0548 -0.0351 -0.29 ...
# Rescale every predictor to the [0, 1] range with a transform fitted on the
# training data.
preProcess_range_model <- preProcess(trainData, method = "range")
trainData <- predict(preProcess_range_model, newdata = trainData)
# Re-attach the outcome saved earlier; it stays a factor and is not normalised.
trainData$diabetes <- y
# Check the range of each predictor. Only the eight numeric predictor columns
# are inspected: dropping column 1 would skip `pregnant`, and including the
# factor outcome would coerce the whole result to character.
vapply(trainData[, 1:8], function(x) c(min = min(x), max = max(x)), numeric(2))
##     pregnant glucose pressure triceps insulin mass pedigree age
## min        0       0        0       0       0    0        0   0
## max        1       1        1       1       1    1        1   1
str(trainData)
## 'data.frame': 615 obs. of 9 variables:
## $ pregnant: num 0.0588 0.4706 0.0588 0 0.2941 ...
## $ glucose : num 0.427 0.92 0.447 0.688 0.583 ...
## $ pressure: num 0.541 0.525 0.541 0.328 0.607 ...
## $ triceps : num 0.293 0 0.232 0.354 0 ...
## $ insulin : num 0 0 0.111 0.199 0 ...
## $ mass : num 0.396 0.347 0.419 0.642 0.382 ...
## $ pedigree: num 0.1235 0.2688 0.0403 1 0.0557 ...
## $ age : num 0.167 0.183 0 0.2 0.15 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 1 2 1 2 1 2 1 2 2 2 ...
# Lattice settings shared by both feature plots: smaller strip labels and
# independent axes per panel.
strip_small <- strip.custom(par.strip.text = list(cex = .7))
free_axes <- list(x = list(relation = "free"), y = list(relation = "free"))
# Box plot of each predictor, split by outcome class
featurePlot(x = trainData[, 1:8], y = trainData$diabetes, plot = "box",
            strip = strip_small, scales = free_axes)
# Density plot of each predictor, split by outcome class
featurePlot(x = trainData[, 1:8], y = trainData$diabetes, plot = "density",
            strip = strip_small, scales = free_axes)
library(corrplot)
## corrplot 0.92 loaded
# Correlation matrix of the predictors (column 9, the outcome, is excluded)
predictor_cor <- cor(trainData[, -9])
corrplot(predictor_cor)
# Recursive feature elimination (RFE) with random-forest ranking.
# In the printed result the selected subset size is marked with an asterisk
# in the last column ("Selected"); here it is the 8th row.
set.seed(100)
# Silence warnings only for the duration of the RFE run, then restore the
# previous setting so that later steps still surface their warnings.
old_opts <- options(warn = -1)
# Candidate subset sizes to evaluate
subsets <- 1:8
ctrl <- rfeControl(
  functions = rfFuncs,   # random forest algorithm
  method = "repeatedcv", # k-fold cross validation ...
  repeats = 5,           # ... repeated 5 times
  verbose = FALSE
)
lmProfile <- rfe(
  x = trainData[, 1:8],
  y = trainData$diabetes,
  sizes = subsets,
  rfeControl = ctrl
)
options(old_opts)
lmProfile
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.6952 0.2841 0.04915 0.10605
## 2 0.7275 0.3815 0.04359 0.09609
## 3 0.7642 0.4673 0.04216 0.09490
## 4 0.7620 0.4631 0.04762 0.10945
## 5 0.7571 0.4534 0.05152 0.11813
## 6 0.7627 0.4679 0.04949 0.11218
## 7 0.7620 0.4619 0.05210 0.12019
## 8 0.7682 0.4728 0.04620 0.10576 *
##
## The top 5 variables (out of 8):
## glucose, mass, age, pregnant, insulin
# Look up features of all models in R
# Names of every algorithm available in caret, as one comma-separated string
modelnames <- toString(names(getModelInfo()))
# Tuning parameters of the xgbTree model
modelLookup("xgbTree")
## model parameter label forReg forClass
## 1 xgbTree nrounds # Boosting Iterations TRUE TRUE
## 2 xgbTree max_depth Max Tree Depth TRUE TRUE
## 3 xgbTree eta Shrinkage TRUE TRUE
## 4 xgbTree gamma Minimum Loss Reduction TRUE TRUE
## 5 xgbTree colsample_bytree Subsample Ratio of Columns TRUE TRUE
## 6 xgbTree min_child_weight Minimum Sum of Instance Weight TRUE TRUE
## 7 xgbTree subsample Subsample Percentage TRUE TRUE
## probModel
## 1 TRUE
## 2 TRUE
## 3 TRUE
## 4 TRUE
## 5 TRUE
## 6 TRUE
## 7 TRUE
# Multivariate Adaptive Regression Splines (MARS)
# Set the seed for reproducibility
set.seed(100)
# Train a MARS model (caret method "earth") with caret's default settings.
model_mars <- train(
  diabetes ~ .,
  data = trainData,
  method = "earth"
)
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
# Predict on the training data itself
fitted <- predict(model_mars)
model_mars
## Multivariate Adaptive Regression Spline
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 615, 615, 615, 615, 615, 615, ...
## Resampling results across tuning parameters:
##
## nprune Accuracy Kappa
## 2 0.7451922 0.4023855
## 8 0.7680686 0.4748261
## 14 0.7603116 0.4581491
##
## Tuning parameter 'degree' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nprune = 8 and degree = 1.
# NOTE(review): the following fragment of tutorial prose was fused onto the
# plot() call and made the line unparseable; it is preserved as a comment.
# interaction.depth and n.trees.
# Resampled accuracy for each tuning-parameter value tried for the MARS model
plot(model_mars, main = "Model Accuracies with MARS")
# Variable importance of the fitted MARS model
varimp_mars <- varImp(model_mars)
plot(varimp_mars, main = "Variable Importance with MARS")
# Imputation, dummy encoding, and normalization of the test data
# Step 1: Impute missing values
# Apply the imputation transform that was fitted on the training data
testData2 <- predict(preProcess_missingdata_model, newdata = testData)
# Step 2: One-hot encode with the dummy-variable model from training
testData3 <- predict(dummies_model, newdata = testData2)
# Step 3: Rescale with the range transform from training
testData4 <- predict(preProcess_range_model, newdata = testData3)
# Preview the transformed test predictors
head(testData4)
## pregnant glucose pressure triceps insulin mass pedigree
## 1 0.35294118 0.7437186 0.5901639 0.3535354 0.0000000 0.5007452 0.24841629
## 11 0.23529412 0.5527638 0.7540984 0.0000000 0.0000000 0.5603577 0.05113122
## 21 0.17647059 0.6331658 0.7213115 0.4141414 0.2777778 0.5856930 0.28325792
## 24 0.52941176 0.5979899 0.6557377 0.3535354 0.0000000 0.4321908 0.08371041
## 28 0.05882353 0.4874372 0.5409836 0.1515152 0.1654846 0.3457526 0.18506787
## 37 0.64705882 0.6934673 0.6229508 0.0000000 0.0000000 0.4947839 0.15475113
## age
## 1 0.48333333
## 11 0.15000000
## 21 0.10000000
## 24 0.13333333
## 28 0.01666667
## 37 0.23333333
# Class predictions of the default MARS model on the test set
predicted <- predict(model_mars, newdata = testData4)
head(predicted)
## [1] pos neg neg neg neg pos
## Levels: neg pos
# Confusion matrix of predictions against the true test labels.
# NOTE(review): the printed result treats "neg" as the positive class, so
# Sensitivity describes the "neg" class - confirm this is intended.
confusionMatrix(data = predicted, reference = as.factor(testData$diabetes))
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 86 22
## pos 14 31
##
## Accuracy : 0.7647
## 95% CI : (0.6894, 0.8294)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 0.001988
##
## Kappa : 0.4613
##
## Mcnemar's Test P-Value : 0.243345
##
## Sensitivity : 0.8600
## Specificity : 0.5849
## Pos Pred Value : 0.7963
## Neg Pred Value : 0.6889
## Prevalence : 0.6536
## Detection Rate : 0.5621
## Detection Prevalence : 0.7059
## Balanced Accuracy : 0.7225
##
## 'Positive' Class : neg
##
# tuneLength, tuneGrid
# Define the training control
# Resampling set-up shared by the tuned models below: 5-fold cross-validation
# with class probabilities, summarised by twoClassSummary.
fitControl <- trainControl(
  method = "cv",                     # k-fold cross validation
  number = 5,                        # number of folds
  savePredictions = "final",         # saves predictions for optimal tuning parameter
  classProbs = TRUE,                 # should class probabilities be returned
  summaryFunction = twoClassSummary  # results summary function
)
# Step 1: Candidate hyperparameters - five pruning sizes crossed with three
# interaction degrees
marsGrid <- expand.grid(
  nprune = seq(2, 10, by = 2),
  degree = c(1, 2, 3)
)
# Step 2: Fit MARS over the grid and select the best combination by ROC
set.seed(100)
model_mars3 <- train(
  diabetes ~ .,
  data = trainData,
  method = "earth",
  metric = "ROC",
  tuneGrid = marsGrid,
  trControl = fitControl
)
model_mars3
## Multivariate Adaptive Regression Spline
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 492, 492, 492, 492, 492
## Resampling results across tuning parameters:
##
## degree nprune ROC Sens Spec
## 1 2 0.7962500 0.8850 0.4976744
## 1 4 0.8400581 0.8725 0.6046512
## 1 6 0.8410465 0.8825 0.6046512
## 1 8 0.8471512 0.8850 0.5860465
## 1 10 0.8437209 0.8775 0.6093023
## 2 2 0.7962500 0.8850 0.4976744
## 2 4 0.8284593 0.8775 0.6000000
## 2 6 0.8224419 0.8725 0.5488372
## 2 8 0.8237209 0.8700 0.5395349
## 2 10 0.8212209 0.8650 0.5395349
## 3 2 0.7962500 0.8850 0.4976744
## 3 4 0.8245058 0.8825 0.6000000
## 3 6 0.8205814 0.8750 0.5627907
## 3 8 0.8191860 0.8725 0.5581395
## 3 10 0.8195349 0.8650 0.5627907
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nprune = 8 and degree = 1.
# Step 3: Evaluate the grid-tuned MARS model on the test set
predicted3 <- predict(model_mars3, newdata = testData4)
confusionMatrix(data = predicted3, reference = as.factor(testData$diabetes))
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 86 22
## pos 14 31
##
## Accuracy : 0.7647
## 95% CI : (0.6894, 0.8294)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 0.001988
##
## Kappa : 0.4613
##
## Mcnemar's Test P-Value : 0.243345
##
## Sensitivity : 0.8600
## Specificity : 0.5849
## Pos Pred Value : 0.7963
## Neg Pred Value : 0.6889
## Prevalence : 0.6536
## Detection Rate : 0.5621
## Detection Prevalence : 0.7059
## Balanced Accuracy : 0.7225
##
## 'Positive' Class : neg
##
set.seed(100)
# Train the model using adaboost.
# The metric is stated explicitly: fitControl summarises with twoClassSummary,
# whose results contain no "Accuracy" column, so train()'s default metric
# would only work through its warning fallback to ROC.
model_adaboost <- train(
  diabetes ~ .,
  data = trainData,
  method = "adaboost",
  metric = "ROC",
  tuneLength = 2,
  trControl = fitControl
)
model_adaboost
## AdaBoost Classification Trees
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 492, 492, 492, 492, 492
## Resampling results across tuning parameters:
##
## nIter method ROC Sens Spec
## 50 Adaboost.M1 0.7856395 0.8025 0.5906977
## 50 Real adaboost 0.6250000 0.8350 0.5534884
## 100 Adaboost.M1 0.7852907 0.8050 0.6325581
## 100 Real adaboost 0.6051163 0.8450 0.5581395
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were nIter = 50 and method = Adaboost.M1.
set.seed(100)
# Train the model using random forest (rf).
# The metric is stated explicitly: fitControl summarises with twoClassSummary,
# whose results contain no "Accuracy" column, so train()'s default metric
# would only work through its warning fallback to ROC.
model_rf <- train(
  diabetes ~ .,
  data = trainData,
  method = "rf",
  metric = "ROC",
  tuneLength = 5,
  trControl = fitControl
)
model_rf
## Random Forest
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 492, 492, 492, 492, 492
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.8210756 0.8600 0.5906977
## 3 0.8217733 0.8575 0.6046512
## 5 0.8145640 0.8550 0.5906977
## 6 0.8152616 0.8575 0.6093023
## 8 0.8145349 0.8500 0.6000000
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# set.seed(100)
#
# # Train the model using xgbDART
# model_xgbDART = train(diabetes ~ ., data=trainData, method='xgbDART', tuneLength=5, trControl = fitControl, verbose=F)
# model_xgbDART
set.seed(100)
# Train the model using a radial-kernel SVM (svmRadial).
# The metric is stated explicitly: fitControl summarises with twoClassSummary,
# whose results contain no "Accuracy" column, so train()'s default metric
# would only work through its warning fallback to ROC.
model_svmRadial <- train(
  diabetes ~ .,
  data = trainData,
  method = "svmRadial",
  metric = "ROC",
  tuneLength = 15,
  trControl = fitControl
)
model_svmRadial
## Support Vector Machines with Radial Basis Function Kernel
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 492, 492, 492, 492, 492
## Resampling results across tuning parameters:
##
## C ROC Sens Spec
## 0.25 0.8306395 0.8650 0.5813953
## 0.50 0.8308140 0.8800 0.5720930
## 1.00 0.8279070 0.8750 0.5348837
## 2.00 0.8216860 0.8825 0.4976744
## 4.00 0.8204070 0.8925 0.4883721
## 8.00 0.8080814 0.8800 0.4790698
## 16.00 0.7892442 0.8825 0.4651163
## 32.00 0.7677326 0.8875 0.4093023
## 64.00 0.7430814 0.8925 0.3674419
## 128.00 0.7165698 0.8825 0.3255814
## 256.00 0.7062209 0.8975 0.3255814
## 512.00 0.7051163 0.9100 0.2930233
## 1024.00 0.7005814 0.9025 0.3023256
## 2048.00 0.6955233 0.9000 0.3162791
## 4096.00 0.6948837 0.8925 0.3348837
##
## Tuning parameter 'sigma' was held constant at a value of 0.1161195
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.1161195 and C = 0.5.
set.seed(100)
# Train the model using k-nearest neighbours (knn).
# The metric is stated explicitly: fitControl summarises with twoClassSummary,
# whose results contain no "Accuracy" column, so train()'s default metric
# would only work through its warning fallback to ROC.
model_knn <- train(
  diabetes ~ .,
  data = trainData,
  method = "knn",
  metric = "ROC",
  tuneLength = 15,
  trControl = fitControl
)
model_knn
## k-Nearest Neighbors
##
## 615 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 492, 492, 492, 492, 492
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 5 0.7401744 0.8250 0.4651163
## 7 0.7655523 0.8425 0.4744186
## 9 0.7707849 0.8500 0.4930233
## 11 0.7797384 0.8800 0.4976744
## 13 0.7876744 0.8725 0.4790698
## 15 0.7951163 0.8800 0.4837209
## 17 0.7933721 0.8775 0.4651163
## 19 0.7997965 0.8825 0.4465116
## 21 0.8001163 0.8975 0.4418605
## 23 0.8024709 0.9050 0.4651163
## 25 0.8037791 0.9050 0.4744186
## 27 0.8082267 0.9050 0.4790698
## 29 0.8069767 0.9150 0.4697674
## 31 0.8083430 0.9100 0.4418605
## 33 0.8064244 0.9175 0.4372093
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 31.
# Gather the resampling results of the tuned models under display names
tuned_models <- list(
  ADABOOST = model_adaboost,
  RF = model_rf,
  knn = model_knn,
  MARS = model_mars3,
  SVM = model_svmRadial
)
models_compare <- resamples(tuned_models)
# Per-metric summary across the resamples
summary(models_compare)
##
## Call:
## summary.resamples(object = models_compare)
##
## Models: ADABOOST, RF, knn, MARS, SVM
## Number of resamples: 5
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## ADABOOST 0.7436047 0.7750000 0.7851744 0.7856395 0.7889535 0.8354651 0
## RF 0.7697674 0.8155523 0.8170058 0.8217733 0.8497093 0.8568314 0
## knn 0.7646802 0.7680233 0.8209302 0.8083430 0.8297965 0.8582849 0
## MARS 0.8238372 0.8430233 0.8450581 0.8471512 0.8613372 0.8625000 0
## SVM 0.7648256 0.8279070 0.8436047 0.8308140 0.8441860 0.8735465 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## ADABOOST 0.7500 0.7750 0.8125 0.8025 0.8250 0.8500 0
## RF 0.8250 0.8375 0.8625 0.8575 0.8750 0.8875 0
## knn 0.8875 0.9000 0.9000 0.9100 0.9125 0.9500 0
## MARS 0.8500 0.8625 0.8875 0.8850 0.9125 0.9125 0
## SVM 0.8625 0.8750 0.8750 0.8800 0.8875 0.9000 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## ADABOOST 0.5348837 0.5348837 0.5581395 0.5906977 0.6046512 0.7209302 0
## RF 0.5581395 0.5581395 0.5813953 0.6046512 0.6511628 0.6744186 0
## knn 0.4186047 0.4418605 0.4418605 0.4418605 0.4418605 0.4651163 0
## MARS 0.5348837 0.5348837 0.5813953 0.5860465 0.6046512 0.6744186 0
## SVM 0.5116279 0.5581395 0.5813953 0.5720930 0.6046512 0.6046512 0
# Box-and-whisker comparison of the models, with independent axes per panel
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
bwplot(models_compare, scales = scales)
library(caretEnsemble)
##
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
##
## autoplot
# Stacking Algorithms - Run multiple algos in one call.
# The control object is deliberately not named `trainControl`, so it does not
# mask caret's trainControl() function in the global environment.
baseControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  savePredictions = TRUE,
  classProbs = TRUE
)
algorithmList <- c("rf", "adaboost", "earth", "knn", "svmRadial")
set.seed(100)
# Train every listed algorithm with the same resampling set-up
models <- caretList(
  diabetes ~ .,
  data = trainData,
  trControl = baseControl,
  methodList = algorithmList
)
# Compare the base learners on their resampling results
results <- resamples(models)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: rf, adaboost, earth, knn, svmRadial
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rf 0.6774194 0.7540984 0.7704918 0.7723956 0.8056584 0.8548387 0
## adaboost 0.6612903 0.7224352 0.7540984 0.7539221 0.7868852 0.8387097 0
## earth 0.6612903 0.7419355 0.7741935 0.7745725 0.8032787 0.8709677 0
## knn 0.6229508 0.7049180 0.7398202 0.7372466 0.7704918 0.8360656 0
## svmRadial 0.6935484 0.7387626 0.7805394 0.7712938 0.8000397 0.8387097 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rf 0.2654028 0.4262678 0.4828423 0.4872370 0.5579818 0.6796785 0
## adaboost 0.2203593 0.3688623 0.4526472 0.4369009 0.4864705 0.6403712 0
## earth 0.2368113 0.4089641 0.4935398 0.4820558 0.5378447 0.7122970 0
## knn 0.1553281 0.3014894 0.4019217 0.3890542 0.4644021 0.6007853 0
## svmRadial 0.3237658 0.3923780 0.5012975 0.4765574 0.5300695 0.6403712 0
# Box-and-whisker comparison of the base learners, independent axes per panel
scales <- list(
  x = list(relation = "free"),
  y = list(relation = "free")
)
bwplot(results, scales = scales)
# Resampling set-up for the meta-learner
set.seed(101)
stackControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  savePredictions = TRUE,
  classProbs = TRUE
)
# Combine the predictions of `models` through a glm meta-model
stack.glm <- caretStack(
  models,
  method = "glm",
  metric = "Accuracy",
  trControl = stackControl
)
print(stack.glm)
## A glm ensemble of 5 base models: rf, adaboost, earth, knn, svmRadial
##
## Ensemble results:
## Generalized Linear Model
##
## 1845 samples
## 5 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 1660, 1660, 1661, 1661, 1661, 1660, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7799569 0.4938476
# Evaluate the stacked ensemble on the test set
stack_predicteds <- predict(stack.glm, newdata = testData4)
confusionMatrix(data = stack_predicteds, reference = as.factor(testData$diabetes))
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 84 22
## pos 16 31
##
## Accuracy : 0.7516
## 95% CI : (0.6754, 0.8179)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 0.005891
##
## Kappa : 0.4365
##
## Mcnemar's Test P-Value : 0.417304
##
## Sensitivity : 0.8400
## Specificity : 0.5849
## Pos Pred Value : 0.7925
## Neg Pred Value : 0.6596
## Prevalence : 0.6536
## Detection Rate : 0.5490
## Detection Prevalence : 0.6928
## Balanced Accuracy : 0.7125
##
## 'Positive' Class : neg
##