1 CARET FRESA.CAD Sonar Benchmark

We will show how to integrate caret models into FRESA.CAD benchmarking.

1.1 The required libraries

library("FRESA.CAD")
library("mlbench")
library(fastAdaboost)
library(gbm)
library(caret)
library(doParallel)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
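
If six free cores are not available, the cluster can instead be sized from the machine itself; a minimal sketch using parallel::detectCores():

# Alternative: size the cluster from the available cores, leaving one free
cl <- makePSOCKcluster(max(1, parallel::detectCores() - 1))
registerDoParallel(cl)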

1.2 Sonar Data Set

Loading the data from the mlbench package.

data(Sonar, package = "mlbench")
# Recode the outcome: mine (M) = 1, rock (R) = 0
Sonar$Class <- 1*(Sonar$Class == "M")
table(Sonar$Class)
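
The Sonar data set contains 208 echoes, 97 from rocks and 111 from mines, so the table should report:

#> 
#>   0   1 
#>  97 111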

We set the experiment name, the number of times the experiment will be repeated, and the training fraction.

# Keep a factor version of the outcome for caret
SonarF <- Sonar
SonarF$Class <- as.factor(SonarF$Class)


ExperimentName <- "Sonar_Caret"
theData <- Sonar;
theOutcome <- "Class";
reps <- 20;
fraction <- 0.75;

# Random hold-out split (consider set.seed() for a reproducible split)
trainSet <- sample(nrow(SonarF), fraction*nrow(SonarF))
sonarTrain <- SonarF[trainSet,]
sonarTest <- SonarF[-trainSet,]

# Numeric-outcome versions of the same split for the FRESA.CAD models
sonarNTrain <- Sonar[trainSet,]
sonarNTest <- Sonar[-trainSet,]

CVFileName <- paste(ExperimentName, "CVMethod_CARET_web.RDATA", sep = "_")

1.3 Caret training control


# Repeated 5-fold cross-validation (3 repeats) for hyperparameter tuning
tuningctrl <- trainControl(
  method = "repeatedcv", 
  number = 5,
  repeats = 3
)

# Control object that skips hyperparameter tuning altogether
noTuningControl <- trainControl(method = "none")

1.4 Simple hold-out validation of caret methods


set.seed(107)
gbm_fit <- train(Class ~ ., data = sonarTrain, 
             method = "gbm",  
             trControl = tuningctrl,
             preProc = c("center", "scale"),
             verbose = FALSE)
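
caret stores the winning hyperparameter combination in the bestTune element of the fitted object, which can be inspected directly:

gbm_fit$bestTune  # selected n.trees, interaction.depth, shrinkage, and n.minobsinnode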


set.seed(107)
avNNet_fit <- train(Class ~ ., data = sonarTrain, 
             method = "avNNet",
             trControl = tuningctrl,
             trace = FALSE
             )


set.seed(107)
lda2_fit <- train(Class ~ ., data = sonarTrain, 
             method = "lda2",
             trControl = tuningctrl
             )

set.seed(107)
mda_fit <- train(Class ~ ., data = sonarTrain, 
             method = "mda",
             trControl = tuningctrl
             )
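
As a quick sanity check before the ROC analysis, caret's confusionMatrix() summarizes the hold-out class-level errors; here for the mda fit (the other fits work the same way):

# Hold-out confusion matrix for one of the caret fits
caret::confusionMatrix(predict(mda_fit, sonarTest), sonarTest$Class)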

1.4.1 Compare to FRESA.CAD latent class modeling

# FRESA.CAD latent class model fitted on the numeric-outcome training set
LCmodel <- HLCM_EM(Class ~ ., sonarNTrain, hysteresis = 0.0)

1.4.2 ROC Plots and performance

par(mfrow = c(2,2),cex = 0.6);

gbm_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                   predict(gbm_fit,sonarTest,type="prob")[,"1"]),
                                 "caret gbm",
                                 cex=0.8)
#> caret gbm
avNNet_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                   predict(avNNet_fit,sonarTest,type="prob")[,"1"]),
                                   "caret avNNet",
                                   cex=0.8)
#> caret avNNet
lda2_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                   predict(lda2_fit,sonarTest,type="prob")[,"1"]),
                                  "caret lda2",
                                  cex=0.8)
#> caret lda2
mda_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                   predict(mda_fit,sonarTest,type="prob")[,"1"]),
                                 "caret Mixture Discriminant Analysis",
                                 cex=0.8)
#> caret Mixture Discriminant Analysis

HLCM_EM_bs <- predictionStats_binary(cbind(sonarNTest$Class,
                                   predict(LCmodel,sonarNTest)),
                                   "Latent Class Model",
                                   cex=0.8)
#> Latent Class Model

par(mfrow = c(1,1),cex = 1.0);
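
To compare the five hold-out fits side by side, the AUC estimates can be collected into a single table. This sketch assumes that predictionStats_binary() returns the ROC AUC and its 95% confidence interval in the aucs element, as in other FRESA.CAD examples:

aucTable <- rbind(gbm = gbm_bs$aucs,
                  avNNet = avNNet_bs$aucs,
                  lda2 = lda2_bs$aucs,
                  mda = mda_bs$aucs,
                  LCM = HLCM_EM_bs$aucs)
pander::pander(aucTable, caption = "Hold-out ROC AUC with 95% CI")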

1.5 FRESA.CAD Cross-validation of Caret methods

Let us perform a repeated cross-validation using all of the data.

par(mfrow = c(2,2),cex = 0.6);
# caret lda2 with univariate Wilcoxon feature selection;
# randomCV generates the repeated train/test splits
caretlda2cv <- randomCV(theData,theOutcome,
                  train,
                  trainFraction = fraction,
                  repetitions = reps,
                  asFactor = TRUE,  
                  method = "lda2",
                  preProc = c("center", "scale"),
                  featureSelectionFunction = univariate_Wilcoxon,
                  featureSelection.control = list(thr = 0.95, limit = 0.1)
                  )
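
The pooled cross-validation test predictions can be scored with the same predictionStats_binary() function. This assumes the randomCV object stores the per-subject (outcome, median test prediction) pairs in its medianTest element, as described in the FRESA.CAD documentation:

# CV test performance of the lda2 model
lda2cv_bs <- predictionStats_binary(caretlda2cv$medianTest,
                                    "caret lda2 (CV)",
                                    cex = 0.8)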


# Reuse the lda2 train/test partitions so every method sees identical splits
caretgbmcv <- randomCV(fittingFunction = train,
                  trainSampleSets = caretlda2cv$trainSamplesSets,
                  asFactor = TRUE,
                  method = "gbm",
                  verbose = FALSE
                  )


caretavNNetcv <- randomCV(fittingFunction = train,
                  trainSampleSets = caretlda2cv$trainSamplesSets,
                  asFactor = TRUE,
                  method = "avNNet",
                  trControl = tuningctrl,
                  trace = FALSE
                  )


# gbm with a fixed hyperparameter set: no tuning loop
caretgbmNocv <- randomCV(fittingFunction = train,
                  trainSampleSets = caretlda2cv$trainSamplesSets,
                  asFactor = TRUE,
                  method = "gbm",
                  trControl = noTuningControl,
                  tuneGrid = data.frame(interaction.depth = 3,
                                       n.trees = 75,
                                       shrinkage = .1,
                                       n.minobsinnode = 20),
                  verbose = FALSE
                  )

# fastAdaboost::adaboost with univariate Wilcoxon feature selection
ADABOOSTcv <- randomCV(fittingFunction = adaboost,
                  trainSampleSets = caretlda2cv$trainSamplesSets,
                  featureSelectionFunction = univariate_Wilcoxon,
                  featureSelection.control = list(thr = 0.95),
                  asFactor = TRUE,
                  nIter = 10
)


# FRESA.CAD latent class model evaluated on the same CV splits
HCLAS_BSWiMScv <- randomCV(fittingFunction = HLCM_EM,
                   trainSampleSets = caretlda2cv$trainSamplesSets,
                   hysteresis = 0.10)
par(mfrow = c(1,1),cex = 1.0);

1.6 FRESA.CAD::BinaryBenchmark and Comparing Methods

Running the FRESA.CAD::BinaryBenchmark function will compare the performance of the above methods to RF, RPART, LASSO, KNN, and SVM.


par(mfrow = c(2,2),cex = 0.6);
cp <- BinaryBenchmark(referenceCV = list(lda2 = caretlda2cv,
                                         gbm = caretgbmcv,
                                         avNNet = caretavNNetcv,
                                         gbmNot = caretgbmNocv,
                                         adaboost = ADABOOSTcv,
                                         LCLAS = HCLAS_BSWiMScv
                                         ))


# Store the benchmark results; uncomment the load() call to restore them later
save(cp, file = CVFileName)
#load(file = CVFileName)

par(mfrow = c(1,1),cex = 1.0);

1.7 Reporting the results of the Benchmark procedure

Once done, we can compare the CV test results using the plot() function, which also generates summary tables of the CV results.

par(mfrow = c(1,1),cex = 1.0,xpd = T,pty='m', mar = c(3,3,3,10)) # Making space for the legend
prBenchmark <- plot(cp)

pander::pander(prBenchmark$metrics,caption = "Classifier Performance",round = 3)
Classifier Performance

        gbm  gbmNot  LCLAS     RF    ENS  avNNet  adaboost   lda2    SVM  RPART    KNN  LASSO
BER   0.152   0.155  0.163  0.171  0.171   0.172     0.174  0.235  0.238  0.243  0.253  0.255
ACC   0.850   0.845  0.841  0.831  0.831   0.831     0.826  0.763  0.763  0.758  0.754  0.744
AUC   0.928   0.910  0.917  0.925  0.919   0.905     0.896  0.840  0.860  0.820  0.869  0.826
SEN   0.883   0.865  0.874  0.865  0.865   0.874     0.847  0.739  0.793  0.784  0.838  0.721
SPE   0.812   0.823  0.802  0.792  0.792   0.781     0.802  0.792  0.729  0.729  0.656  0.771
CIDX  0.913   0.899  0.882  0.918  0.921   0.898     0.868  0.835  0.850  0.743  0.848  0.821

Finally, we compare the features selected by the different methods by plotting the selection frequency in a heat map.


par(mfrow = c(1,1),cex = 1.0,xpd = F,pty='m', mar = c(3,3,3,3)) # Restoring the default margins

# Remove the caret and adaboost selection-frequency columns before plotting
cp$featureSelectionFrequency$FS_gbm <- NULL
cp$featureSelectionFrequency$FS_avNNet <- NULL
cp$featureSelectionFrequency$FS_gbmNot <- NULL
cp$featureSelectionFrequency$FS_adaboost <- NULL
# Mean selection frequency of each feature, used to order the heat-map rows
topf <- apply(cp$featureSelectionFrequency,1,mean)

gplots::heatmap.2(as.matrix(cp$featureSelectionFrequency[order(-topf),]),
                  Rowv = FALSE,
                  dendrogram = "column",
                  trace = "none",
                  mar = c(5,10),
                  main = "Features",
                  cexRow = 0.5,
                  cexCol = 0.5,
                  srtCol = 25)
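
Since a parallel backend was registered at the start, the workers should be released once the benchmark is complete:

stopCluster(cl)           # shut down the PSOCK workers
foreach::registerDoSEQ()  # return foreach to sequential execution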