We will show how to integrate caret models into the FRESA.CAD benchmarking procedure.
library("FRESA.CAD")
library("mlbench")
library(fastAdaboost)
library(gbm)
library(caret)
library(doParallel)
# Register a parallel back end so caret can tune the models on 6 workers
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
Load the Sonar data from the mlbench package and recode the outcome as numeric (1 = mine, 0 = rock).
data(Sonar, package = "mlbench")
Sonar$Class <- 1*(Sonar$Class == "M")
table(Sonar$Class)
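The Sonar data set contains 208 sonar returns, so after recoding, the table should report 97 rocks (0) and 111 mines (1):
#>   0   1 
#>  97 111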
We will set the experiment name, the number of times the experiment will be repeated, and the training fraction. We also keep two copies of the data: SonarF with a factor outcome for caret, and Sonar with the numeric outcome for the FRESA.CAD functions.
SonarF <- Sonar
SonarF$Class <- as.factor(SonarF$Class)
ExperimentName <- "Sonar_Caret"
theData <- Sonar
theOutcome <- "Class"
reps <- 20
fraction <- 0.75
# A single random train/test split shared by the caret and FRESA.CAD fits
trainSet <- sample(nrow(SonarF), fraction*nrow(SonarF))
sonarTrain <- SonarF[trainSet,]   # factor outcome: for caret
sonarTest <- SonarF[-trainSet,]
sonarNTrain <- Sonar[trainSet,]   # numeric outcome: for FRESA.CAD
sonarNTest <- Sonar[-trainSet,]
CVFileName <- paste(ExperimentName,"CVMethod_CARET_web.RDATA",sep = "_")
We define two caret training controls: a repeated 5-fold cross-validation used for hyperparameter tuning, and a "none" control that skips tuning altogether.
tuningctrl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 3
)
noTuningControl <- trainControl(method = "none")
set.seed(107)
gbm_fit <- train(Class ~ ., sonarTrain,
                 method = "gbm",
                 trControl = tuningctrl,
                 preProc = c("center", "scale"),
                 verbose = FALSE)
set.seed(107)
avNNet_fit <- train(Class ~ ., sonarTrain,
                    method = "avNNet",
                    trControl = tuningctrl,
                    trace = FALSE
)
set.seed(107)
lda2_fit <- train(Class ~ ., sonarTrain,
                  method = "lda2",
                  trControl = tuningctrl
)
set.seed(107)
mda_fit <- train(Class ~ ., sonarTrain,
                 method = "mda",
                 trControl = tuningctrl
)
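Before moving on, we can peek at the hyperparameters that the repeated cross-validation selected for each model; bestTune is a standard field of every caret train object (shown here as a quick check, output omitted):
# Winning hyperparameter combination found by the repeated 5-fold CV
gbm_fit$bestTune
avNNet_fit$bestTune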
# HLCM_EM is FRESA.CAD's latent-class model fitted via EM; it takes the
# numeric 0/1 outcome, hence the sonarNTrain copy of the data
LCmodel <- HLCM_EM(Class ~ ., sonarNTrain, hysteresis = 0.0)
par(mfrow = c(2,2),cex = 0.6);
gbm_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                       predict(gbm_fit, sonarTest, type = "prob")[,"1"]),
                                 "caret gbm", cex = 0.8)
#> caret gbm
avNNet_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                          predict(avNNet_fit, sonarTest, type = "prob")[,"1"]),
                                    "caret avNNet", cex = 0.8)
#> caret avNNet
lda2_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                        predict(lda2_fit, sonarTest, type = "prob")[,"1"]),
                                  "caret lda2", cex = 0.8)
#> caret lda2
mda_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                       predict(mda_fit, sonarTest, type = "prob")[,"1"]),
                                 "caret Mixture Discriminant Analysis", cex = 0.8)
#> caret Mixture Discriminant Analysis
HLCM_EM_bs <- predictionStats_binary(cbind(sonarNTest$Class,
                                           predict(LCmodel, sonarNTest)),
                                     "Latent Class Model", cex = 0.8)
#> Latent Class Model
par(mfrow = c(1,1),cex = 1.0);
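Besides the ROC plot, each predictionStats_binary call returns a list of performance estimates with 95% confidence intervals; for example, the hold-out accuracy of the gbm model (the accc field is used again below):
# Accuracy estimate with its lower and upper confidence bounds
gbm_bs$accc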
resamps <- resamples(list(gbm = gbm_fit, avNNet = avNNet_fit, lda2 = lda2_fit, mda = mda_fit))
pander::pander(summary(resamps))
values:
| gbm~Accuracy | gbm~Kappa | avNNet~Accuracy | avNNet~Kappa | lda2~Accuracy | lda2~Kappa | mda~Accuracy | mda~Kappa |
|---|---|---|---|---|---|---|---|
| 0.871 | 0.7395 | 0.8387 | 0.6764 | 0.7419 | 0.479 | 0.6774 | 0.3487 |
| 0.7742 | 0.5412 | 0.7097 | 0.4026 | 0.6774 | 0.3404 | 0.7419 | 0.4723 |
| 0.9375 | 0.871 | 0.8438 | 0.6748 | 0.5938 | 0.1811 | 0.625 | 0.2381 |
| 0.7742 | 0.547 | 0.9032 | 0.8034 | 0.8387 | 0.6764 | 0.8387 | 0.6764 |
| 0.871 | 0.7395 | 0.8387 | 0.6638 | 0.7097 | 0.4318 | 0.8387 | 0.6764 |
| 0.8387 | 0.6681 | 0.8387 | 0.6764 | 0.7097 | 0.4247 | 0.7097 | 0.4318 |
| 0.8065 | 0.6043 | 0.7742 | 0.5353 | 0.8387 | 0.6681 | 0.7742 | 0.5412 |
| 0.7097 | 0.4026 | 0.6774 | 0.3404 | 0.7097 | 0.4101 | 0.7097 | 0.4101 |
| 0.7097 | 0.4247 | 0.8065 | 0.6043 | 0.6452 | 0.2791 | 0.5806 | 0.1587 |
| 0.9355 | 0.8681 | 0.7742 | 0.5231 | 0.6774 | 0.3319 | 0.6452 | 0.2791 |
| 0.9688 | 0.936 | 0.8125 | 0.6129 | 0.7188 | 0.4146 | 0.6875 | 0.3548 |
| 0.7742 | 0.5353 | 0.9032 | 0.8034 | 0.7742 | 0.547 | 0.8065 | 0.6043 |
| 0.75 | 0.4754 | 0.7812 | 0.552 | 0.5312 | 0.09774 | 0.75 | 0.5 |
| 0.8065 | 0.6092 | 0.871 | 0.7328 | 0.6452 | 0.2791 | 0.7419 | 0.479 |
| 0.871 | 0.7395 | 0.9032 | 0.8034 | 0.7742 | 0.547 | 0.9355 | 0.8681 |
call: summary.resamples(object = resamps)
statistics:
Accuracy:
| | Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. | NA's |
|---|---|---|---|---|---|---|---|
| gbm | 0.7097 | 0.7742 | 0.8065 | 0.8265 | 0.871 | 0.9688 | 0 |
| avNNet | 0.6774 | 0.7777 | 0.8387 | 0.8184 | 0.8574 | 0.9032 | 0 |
| lda2 | 0.5312 | 0.6613 | 0.7097 | 0.7057 | 0.7581 | 0.8387 | 0 |
| mda | 0.5806 | 0.6825 | 0.7419 | 0.7375 | 0.7903 | 0.9355 | 0 |
Kappa:
| | Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. | NA's |
|---|---|---|---|---|---|---|---|
| gbm | 0.4026 | 0.5383 | 0.6092 | 0.6468 | 0.7395 | 0.936 | 0 |
| avNNet | 0.3404 | 0.5437 | 0.6638 | 0.627 | 0.7046 | 0.8034 | 0 |
| lda2 | 0.09774 | 0.3055 | 0.4146 | 0.4072 | 0.513 | 0.6764 | 0 |
| mda | 0.1587 | 0.3518 | 0.4723 | 0.4693 | 0.5727 | 0.8681 | 0 |
models: gbm, avNNet, lda2 and mda
metrics: Accuracy and Kappa
methods: gbm, avNNet, lda2 and mda
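caret also ships lattice methods for resamples objects, so the distributions above can be compared graphically; a minimal sketch:
# Box-and-whisker comparison of the resampled Accuracy and Kappa values
bwplot(resamps)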
# Collect the hold-out accuracy (with 95% CI) returned by predictionStats_binary
HoldOutAccuracy <- rbind(gbm = gbm_bs$accc,
                         avNNet = avNNet_bs$accc,
                         lda2 = lda2_bs$accc,
                         mda = mda_bs$accc,
                         HLCM_EM = HLCM_EM_bs$accc)
pander::pander(HoldOutAccuracy)
| | est | lower | upper |
|---|---|---|---|
| gbm | 0.8654 | 0.7421 | 0.9441 |
| avNNet | 0.8846 | 0.7656 | 0.9565 |
| lda2 | 0.8269 | 0.6967 | 0.9177 |
| mda | 0.8077 | 0.6747 | 0.9037 |
| HLCM_EM | 0.7885 | 0.653 | 0.8894 |
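The overlapping confidence intervals show that the hold-out differences between the top methods are not conclusive. A quick base-R visualization of the table above (a minimal sketch; the column names follow the accc vector):
# Bar plot of hold-out accuracies with their 95% confidence intervals
bp <- barplot(HoldOutAccuracy[,"est"], ylim = c(0,1), ylab = "Accuracy", las = 2)
arrows(bp, HoldOutAccuracy[,"lower"], bp, HoldOutAccuracy[,"upper"],
       angle = 90, code = 3, length = 0.05)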
Let us now run a repeated holdout cross-validation on the full data set. FRESA.CAD::randomCV repeatedly splits the data, fits the supplied fitting function on each training split, and collects the test predictions. The first call creates the train/test partitions; the subsequent calls reuse them via trainSampleSets, so all methods are evaluated on identical splits.
par(mfrow = c(2,2),cex = 0.6);
caretlda2cv <- randomCV(theData, theOutcome,
                        train,
                        trainFraction = fraction,
                        repetitions = reps,
                        asFactor = TRUE,
                        method = "lda2",
                        preProc = c("center", "scale"),
                        featureSelectionFunction = univariate_Wilcoxon,
                        featureSelection.control = list(thr = 0.95, limit = 0.1)
)
caretgbmcv <- randomCV(fittingFunction = train,
                       trainSampleSets = caretlda2cv$trainSamplesSets,
                       asFactor = TRUE,
                       method = "gbm",
                       verbose = FALSE
)
caretavNNetcv <- randomCV(fittingFunction = train,
                          trainSampleSets = caretlda2cv$trainSamplesSets,
                          asFactor = TRUE,
                          method = "avNNet",
                          trControl = tuningctrl,
                          trace = FALSE
)
caretgbmNocv <- randomCV(fittingFunction = train,
                         trainSampleSets = caretlda2cv$trainSamplesSets,
                         asFactor = TRUE,
                         method = "gbm",
                         trControl = noTuningControl,
                         tuneGrid = data.frame(interaction.depth = 3,
                                               n.trees = 75,
                                               shrinkage = 0.1,
                                               n.minobsinnode = 20),
                         verbose = FALSE
)
ADABOOSTcv <- randomCV(fittingFunction = adaboost,
                       trainSampleSets = caretlda2cv$trainSamplesSets,
                       featureSelectionFunction = univariate_Wilcoxon,
                       featureSelection.control = list(thr = 0.95),
                       asFactor = TRUE,
                       nIter = 10
)
HCLAS_BSWiMScv <- randomCV(fittingFunction = HLCM_EM,
                           trainSampleSets = caretlda2cv$trainSamplesSets,
                           hysteresis = 0.10)
par(mfrow = c(1,1),cex = 1.0);
Running the FRESA.CAD::BinaryBenchmark function will compare the performance of these models to RF, RPART, LASSO, KNN, and SVM.
par(mfrow = c(2,2),cex = 0.6);
cp <- BinaryBenchmark(referenceCV = list(lda2 = caretlda2cv,
                                         gbm = caretgbmcv,
                                         avNNet = caretavNNetcv,
                                         gbmNot = caretgbmNocv,
                                         adaboost = ADABOOSTcv,
                                         LCLAS = HCLAS_BSWiMScv
))
save(cp, file = CVFileName)
#load(file = CVFileName)
par(mfrow = c(1,1),cex = 1.0);
Once done, we can compare the CV test results using the plot() function, which also generates summary tables of the CV results.
par(mfrow = c(1,1),cex = 1.0,xpd = T,pty='m', mar = c(3,3,3,10)) # Making space for the legend
prBenchmark <- plot(cp)
pander::pander(prBenchmark$metrics,caption = "Classifier Performance",round = 3)
| | gbm | gbmNot | LCLAS | RF | ENS | avNNet | adaboost | lda2 | SVM | RPART | KNN | LASSO |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| BER | 0.152 | 0.155 | 0.163 | 0.171 | 0.171 | 0.172 | 0.174 | 0.235 | 0.238 | 0.243 | 0.253 | 0.255 |
| ACC | 0.85 | 0.845 | 0.841 | 0.831 | 0.831 | 0.831 | 0.826 | 0.763 | 0.763 | 0.758 | 0.754 | 0.744 |
| AUC | 0.928 | 0.91 | 0.917 | 0.925 | 0.919 | 0.905 | 0.896 | 0.84 | 0.86 | 0.82 | 0.869 | 0.826 |
| SEN | 0.883 | 0.865 | 0.874 | 0.865 | 0.865 | 0.874 | 0.847 | 0.739 | 0.793 | 0.784 | 0.838 | 0.721 |
| SPE | 0.812 | 0.823 | 0.802 | 0.792 | 0.792 | 0.781 | 0.802 | 0.792 | 0.729 | 0.729 | 0.656 | 0.771 |
| CIDX | 0.913 | 0.899 | 0.882 | 0.918 | 0.921 | 0.898 | 0.868 | 0.835 | 0.85 | 0.743 | 0.848 | 0.821 |
Here BER is the balanced error rate, ACC the accuracy, AUC the area under the ROC curve, SEN the sensitivity, SPE the specificity, and CIDX the concordance index; ENS is the ensemble of the benchmarked methods.
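Since prBenchmark$metrics holds the table above, the classifiers can also be ranked programmatically; for example, by balanced error rate (a minimal sketch assuming the metrics are stored with one row per statistic, as printed):
# Order the classifiers from best (lowest) to worst balanced error rate
sort(unlist(prBenchmark$metrics["BER",]))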
Finally, we will compare the features selected by the different methods by plotting their selection frequency as a heat map.
par(mfrow = c(1,1),cex = 1.0,xpd = F,pty='m', mar = c(3,3,3,3)) # Restore the default margins
# Drop the selection-frequency columns of the caret-based fits and adaboost,
# keeping only the methods whose feature selection we want to compare
cp$featureSelectionFrequency$FS_gbm <- NULL
cp$featureSelectionFrequency$FS_avNNet <- NULL
cp$featureSelectionFrequency$FS_gbmNot <- NULL
cp$featureSelectionFrequency$FS_adaboost <- NULL
# Mean selection frequency of each feature, used to order the heat-map rows
topf <- apply(cp$featureSelectionFrequency, 1, mean)
gplots::heatmap.2(as.matrix(cp$featureSelectionFrequency[order(-topf),]),
                  Rowv = FALSE,
                  dendrogram = "column",
                  trace = "none",
                  mar = c(5,10),
                  main = "Features",
                  cexRow = 0.5,
                  cexCol = 0.5,
                  srtCol = 25)
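Once the benchmark is finished, it is good practice to release the parallel workers registered at the start:
# Shut down the parallel back end and return to sequential execution
stopCluster(cl)
registerDoSEQ()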