1 CARET FRESA.CAD Sonar Benchmark

We will show how to integrate caret models into FRESA.CAD benchmarking.

1.1 The required libraries

library("FRESA.CAD")
library("mlbench")
library(fastAdaboost)
library(gbm)
library(caret)
library(doParallel)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
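
The parallel back-end speeds up the caret tuning below. Once all the model fitting in this document has finished, the workers should be released; a minimal sketch (run it only at the very end of the session):

# After all training is done, release the PSOCK workers
stopCluster(cl)
registerDoSEQ()   # return foreach to sequential execution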

1.2 Sonar Data Set

Loading the data from the mlbench package.

data(Sonar, package = "mlbench")
Sonar$Class <- 1*(Sonar$Class == "M")
table(Sonar$Class)
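
Before modeling, it is worth confirming the size of the data set and the class balance; a quick check (Sonar has 208 observations of 60 sonar-band features plus the outcome):

# Dimensions of the data and proportion of mines (Class == 1)
dim(Sonar)
mean(Sonar$Class)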

We will set the experiment name, the number of times the experiment will be repeated, and the training fraction. A factor-outcome copy of the data is kept for caret, and both copies are split into the same training and hold-out sets.

SonarF <- Sonar
SonarF$Class <- as.factor(SonarF$Class)


ExperimentName <- "Sonar_Caret"
theData <- Sonar;
theOutcome <- "Class";
reps <- 20;
fraction <- 0.75;

tranSet <- sample(nrow(SonarF),fraction*nrow(SonarF))
sonarTrain <- SonarF[tranSet,]
sonarTest <- SonarF[-tranSet,]

sonarNTrain <- Sonar[tranSet,]
sonarNTest <- Sonar[-tranSet,]
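
Note that the call to sample() above is not seeded, so every run of the document produces a different hold-out split. If a reproducible split is needed, seed the generator first; a minimal sketch (the seed value is arbitrary):

# Fix the RNG state so the hold-out split can be reproduced
set.seed(42)
tranSet <- sample(nrow(SonarF),fraction*nrow(SonarF))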

CVFileName <- paste(ExperimentName,"CVMethod_CARET_web.RDATA",sep = "_")

1.3 Caret training control


tunningctrl <- trainControl(
  method = "repeatedcv", 
  number = 5,
  repeats = 3
)

noTuningControl <- trainControl(method = "none")
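
The repeated cross-validation control evaluates every candidate hyper-parameter setting on number x repeats resamples, while the "none" control skips tuning altogether and therefore needs a single-row tuneGrid when it is used later for gbm. A quick sanity check of the resampling budget:

# 5 folds x 3 repeats = 15 resamples per candidate setting
tunningctrl$number * tunningctrl$repeats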

1.4 Simple hold-out validation of caret methods


set.seed(107)
gbm_fit <- train(Class ~ .,sonarTrain, 
             method = "gbm",  
             trControl = tunningctrl,
             preProc = c("center", "scale"),
             verbose = FALSE)


set.seed(107)
avNNet_fit <- train(Class ~ .,sonarTrain, 
             method = "avNNet",
             trControl = tunningctrl,
             trace = FALSE
             )


set.seed(107)
lda2_fit <- train(Class ~ .,sonarTrain, 
             method = "lda2",
             trControl = tunningctrl
             )

set.seed(107)
mda_fit <- train(Class ~ .,sonarTrain, 
             method = "mda",
             trControl = tunningctrl
             )
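
Before looking at the hold-out performance, we can inspect which hyper-parameters the repeated cross-validation selected for each caret model; bestTune is a standard component of every train object:

# Hyper-parameters selected by the repeated cross-validation
gbm_fit$bestTune
avNNet_fit$bestTune
lda2_fit$bestTune
mda_fit$bestTune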

1.4.1 Compare to FRESA.CAD latent class modeling

LCmodel <- HLCM_EM(Class ~ .,sonarNTrain)
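
As a rough in-sample check of the latent-class model (the hold-out evaluation follows in the next section), we can tabulate its training predictions; this sketch assumes the returned score is probability-like and thresholds it at 0.5, and trainPred is just an illustrative name:

# In-sample confusion table of the latent-class model (0.5 threshold assumed)
trainPred <- predict(LCmodel,sonarNTrain)
table(observed = sonarNTrain$Class, predicted = 1*(trainPred >= 0.5))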

1.4.2 ROC Plots and performance

par(mfrow = c(2,2),cex = 0.6);

gbm_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                   predict(gbm_fit,sonarTest,type="prob")[,"1"]),
                                 "caret gbm",
                                 cex=0.8)
#> caret gbm
avNNet_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                   predict(avNNet_fit,sonarTest,type="prob")[,"1"]),
                                   "caret avNNet",
                                   cex=0.8)
#> caret avNNet
lda2_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                   predict(lda2_fit,sonarTest,type="prob")[,"1"]),
                                  "caret lda2",
                                  cex=0.8)
#> caret lda2
mda_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                   predict(mda_fit,sonarTest,type="prob")[,"1"]),
                                 "caret Mixture Discriminant Analysis",
                                 cex=0.8)
#> caret Mixture Discriminant Analysis

HLCM_EM_bs <- predictionStats_binary(cbind(sonarNTest$Class,
                                   predict(LCmodel,sonarNTest)),
                                   "Latent Class Model",
                                   cex=0.8)
#> Latent Class Model

par(mfrow = c(1,1),cex = 1.0);
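
As a cross-check of the ROC panels above, the hold-out AUCs can also be computed directly with the pROC package (an extra dependency not used elsewhere in this document; testClass is just a helper variable, and only the gbm and avNNet fits are shown):

# Hold-out AUCs computed independently with pROC
library(pROC)
testClass <- as.numeric(as.character(sonarTest$Class))
auc(roc(testClass, predict(gbm_fit,sonarTest,type="prob")[,"1"]))
auc(roc(testClass, predict(avNNet_fit,sonarTest,type="prob")[,"1"]))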

1.5 FRESA.CAD Cross-validation of Caret methods

Let us do a repeated cross-validation using all the data. The first randomCV call generates the train/test partitions; the subsequent calls reuse caretlda2cv$trainSamplesSets so that every method is evaluated on exactly the same random splits.

par(mfrow = c(2,2),cex = 0.6);
caretlda2cv <- randomCV(theData,theOutcome,
                  train,
                  trainFraction = fraction,
                  repetitions = reps,
                  asFactor = TRUE,  
                  method = "lda2",
                  preProc = c("center", "scale"),
                  featureSelectionFunction = univariate_Wilcoxon,
                  featureSelection.control = list(thr = 0.95,limit=0.1)
                  )
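
Each randomCV object keeps the accumulated test predictions of all repetitions, so the pooled cross-validated performance of this first pipeline can be checked before fitting the remaining methods (medianTest is used the same way later for the latent-class model; the object name lda2cvStats is ours):

# Pooled cross-validated test performance of the caret lda2 pipeline
lda2cvStats <- predictionStats_binary(caretlda2cv$medianTest,
                                      "caret lda2 (randomCV)",
                                      cex = 0.8)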


caregbmcv <- randomCV(fittingFunction=train,
                  trainSampleSets=caretlda2cv$trainSamplesSets,
                  asFactor = TRUE,
                  method = "gbm",
                  verbose = FALSE
                  )


careavNNetcv <- randomCV(fittingFunction=train,
                  trainSampleSets=caretlda2cv$trainSamplesSets,
                  asFactor = TRUE,
                  method = "avNNet",
                  trControl = tunningctrl,
                  trace = FALSE
                  )


caregbmNocv <- randomCV(fittingFunction=train,
                  trainSampleSets=caretlda2cv$trainSamplesSets,
                  asFactor = TRUE,
                  method = "gbm",
                  trControl = noTuningControl,
                  tuneGrid = data.frame(interaction.depth = 3,
                                       n.trees = 75,
                                       shrinkage = .1,
                                       n.minobsinnode = 20),
                  verbose = FALSE
                  )

ADABOOSTcv <- randomCV(fittingFunction=adaboost,
                  trainSampleSets=caretlda2cv$trainSamplesSets,
                  featureSelectionFunction = univariate_Wilcoxon,
                  featureSelection.control = list(thr = 0.95),
                  asFactor = TRUE,
                  nIter=10
)


HCLAS_BSWiMScv <- randomCV(fittingFunction=HLCM_EM,
                   trainSampleSets=caretlda2cv$trainSamplesSets,hysteresis = 0.10)
par(mfrow = c(1,1),cex = 1.0);

1.6 FRESA.CAD::BinaryBenchmark and Comparing Methods

Running the FRESA.CAD::BinaryBenchmark function will compare the performance of the cross-validated models above to RF, RPART, LASSO, KNN, and SVM.


# Quick check of the latent-class cross-validation results before running the benchmark
cStats <- predictionStats_binary(HCLAS_BSWiMScv$medianTest,plotname = "CHK",center = TRUE,cex=0.8);


  
par(mfrow = c(2,2),cex = 0.6);
cp <- BinaryBenchmark(referenceCV = list(lda2=caretlda2cv,
                                         gbm = caregbmcv,
                                         avNNet = careavNNetcv,
                                         gbmNot = caregbmNocv,
                                         adaboost = ADABOOSTcv,
                                         LCLAS = HCLAS_BSWiMScv
                                         ))


save(cp, file = CVFileName)
#load(file = CVFileName)

par(mfrow = c(1,1),cex = 1.0);

1.7 Reporting the results of the Benchmark procedure

Once done, we can compare the CV test results using the plot() function. The plot function also generates summary tables of the CV results.

par(mfrow = c(1,1),cex = 1.0,xpd = T,pty='m', mar = c(3,3,3,10)) # Making space for the legend
prBenchmark <- plot(cp)

pander::pander(prBenchmark$metrics,caption = "Classifier Performance",round = 3)
Classifier Performance

            gbm   gbmNot   avNNet       RF      ENS adaboost      KNN     lda2    RPART    LCLAS      SVM    LASSO
BER       0.151    0.161    0.175    0.181    0.189    0.195    0.200    0.212    0.227    0.231    0.232    0.245
ACC       0.851    0.841    0.827    0.822    0.812    0.808    0.803    0.784    0.774    0.769    0.769    0.755
AUC       0.930    0.916    0.900    0.927    0.920    0.904    0.908    0.864    0.818    0.868    0.886    0.830
SEN       0.892    0.892    0.883    0.874    0.856    0.856    0.847    0.757    0.811    0.766    0.802    0.748
SPE       0.804    0.784    0.763    0.763    0.763    0.753    0.753    0.814    0.732    0.773    0.732    0.763
CIDX      0.925    0.913    0.899    0.926    0.921    0.879    0.886    0.859    0.772    0.864    0.876    0.842

Finally, we compare the features selected by the different methods by plotting their selection frequency in a heat map.


par(mfrow = c(1,1),cex = 1.0,xpd = F,pty='m', mar = c(3,3,3,3)) # Restoring symmetric margins for the heat map

# Drop the selection-frequency columns of the boosting and neural-network models
cp$featureSelectionFrequency$FS_gbm <- NULL
cp$featureSelectionFrequency$FS_avNNet <- NULL
cp$featureSelectionFrequency$FS_gbmNot <- NULL
cp$featureSelectionFrequency$FS_adaboost <- NULL
# Mean selection frequency of each feature across the remaining methods
topf <- apply(cp$featureSelectionFrequency,1,mean)

gplots::heatmap.2(as.matrix(cp$featureSelectionFrequency[order(-topf),]),
                  Rowv = FALSE,
                  dendrogram = "column",
                  trace = "none",
                  mar = c(5,10),
                  main = "Features",
                  cexRow = 0.5,
                  cexCol = 0.5,
                  srtCol = 25)
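
The same frequency matrix can also be used to list the sonar bands that were selected most consistently across methods; a small sketch (the top-10 cut-off is arbitrary):

# Ten features with the highest mean selection frequency
pander::pander(head(topf[order(-topf)],10),
               caption = "Most frequently selected features")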