1 FRESA.CAD PimaIndiansDiabetes Benchmark

We will use FRESA.CAD to evaluate the performance of classifiers on the PimaIndiansDiabetes data set. The following classifiers will be evaluated:

BSWiMS and eBSWiMS; discriminant analysis (LDA and QDA); naive Bayes (raw and PCA); boosting (ADABOOST and gradient boosting); BeSS (golden section and sequential); LASSO; RPART; random forest; SVM; KNN; and the ensemble of all methods.

Furthermore, this vignette will show how to interface a machine-learning method with the FRESA.CAD::randomCV function.
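As a preview of that interface, here is a minimal sketch of the contract randomCV expects, the same pattern the gbm wrapper below follows: a fitting function takes a formula and a data frame and returns a classed object, and an S3 predict method for that class returns one numeric score per test sample. The names DUMMY_fit and DUMMY_FIT are hypothetical, and glm stands in for an arbitrary learner.

DUMMY_fit <- function(formula, data, ...)
{
  # fit any binary classifier here and store whatever predict() will need
  fit <- glm(formula, data = data, family = binomial(), ...)
  result <- list(fit = fit)
  class(result) <- "DUMMY_FIT"
  return(result)
}

predict.DUMMY_FIT <- function(object, ...)
{
  testData <- list(...)[[1]] # randomCV passes the test set as the first argument
  # return one probability-like score per test sample
  return(predict(object$fit, testData, type = "response"))
}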

1.1 The required libraries

library("FRESA.CAD")
library("mlbench")
library(fastAdaboost)
library(gbm)

1.2 PimaIndiansDiabetes Data Set

Loading the data from the mlbench package.

data("PimaIndiansDiabetes2", package = "mlbench")

We keep only the complete cases and encode the outcome as a binary indicator. Here the model matrix contains only the main effects; the commented-out line shows how interaction terms between features could be generated instead:

PimaIndiansDiabetes  <- PimaIndiansDiabetes2[complete.cases(PimaIndiansDiabetes2),]
PimaIndiansDiabetes$diabetes <- 1*(PimaIndiansDiabetes$diabetes == "pos")

#PimaIndiansDiabetes.mat <- as.data.frame(model.matrix(diabetes~.*.*.*.,PimaIndiansDiabetes))
PimaIndiansDiabetes.mat <- as.data.frame(model.matrix(diabetes ~ .,PimaIndiansDiabetes))
PimaIndiansDiabetes.mat$`(Intercept)` <- NULL
PimaIndiansDiabetes.mat$diabetes <- PimaIndiansDiabetes$diabetes
PimaIndiansDiabetes.mat[,1:ncol(PimaIndiansDiabetes.mat)] <- sapply(PimaIndiansDiabetes.mat,as.numeric)
fnames <- colnames(PimaIndiansDiabetes.mat)
fnames <- str_replace_all(fnames," ","_")
fnames <- str_replace_all(fnames,"/","_")
fnames <- str_replace_all(fnames,":","_x_")
colnames(PimaIndiansDiabetes.mat) <- fnames
table(PimaIndiansDiabetes.mat$diabetes)
#> 
#>   0   1 
#> 262 130

We will set the experiment name, the number of times the experiment will be repeated, and the training fraction:


ExperimentName <- "PimaIndiansDiabetes"
theData <- PimaIndiansDiabetes.mat;
theOutcome <- "diabetes";
reps <- 60;
fraction <- 0.50;

CVFileName <- paste(ExperimentName,"CVMethod_v2.RDATA",sep = "_")
op <- par(no.readonly = TRUE)

1.3 Gradient boosting from the gbm package

Here we create a model-fitting wrapper for the gradient boosting function of the gbm package. The following code adapts the gbm function API to the API expected by FRESA.CAD.


GBM_fit <- function(formula = formula, data=NULL,distribution = "bernoulli",n.trees = 1000,
                  shrinkage = 0.01, interaction.depth = 4,...)
{
  fit <- gbm(formula=formula,data = data,distribution = distribution,n.trees = n.trees,
                  shrinkage = shrinkage, interaction.depth = interaction.depth,...);
  selectedfeatures <- summary(fit,plotit = FALSE);
  sum <- 0;
  sfeat = 1;
  while (sum < 90) {sum <- sum + selectedfeatures[sfeat,2]; sfeat <- sfeat + 1;} #keep the ones that add to 90%

    result <- list(fit=fit,n.trees=n.trees,selectedfeatures=rownames(selectedfeatures[1:sfeat,]))
    class(result) <- "GBM_FIT";
    return(result)
}
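As a side note, the accumulation loop inside GBM_fit could be replaced by a vectorized equivalent. This is a sketch of our own rewrite (not FRESA.CAD code) that reproduces the loop's behavior, including its post-increment, which keeps one feature past the 90% cutoff:

  # rel.inf sums to 100, so find the first index where the cumulative
  # influence reaches 90 and, matching the loop's post-increment, keep
  # one extra feature (guarded so the index never runs past the last row)
  sfeat <- min(which(cumsum(selectedfeatures[,2]) >= 90)[1] + 1, nrow(selectedfeatures))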

We also need a proper predict function for the boosting model:

predict.GBM_FIT <- function(object,...) 
{
        parameters <- list(...);
        testData <- parameters[[1]];
        n.trees = seq(from=(0.1*object$n.trees),to=object$n.trees, by=object$n.trees/25) # a sequence of tree counts spanning 10% to 100% of n.trees
        pLS <- predict(object$fit,testData,n.trees = n.trees);
        pLS <- 1.0/(1.0+exp(-apply(pLS,1,mean))) # average the link-scale predictions over the sequence, then map to a probability
        return(pLS);
}

Let us check that everything is working:

gfit <- GBM_fit(formula = formula(paste(theOutcome," ~ .")),theData)
gfit$selectedfeatures
pr <- predict(gfit,theData)
bs <- predictionStats_binary(cbind(theData$diabetes,pr),"Gradient Boost: Training")

Now let us check the cross-validation performance of the gradient boosting method:


GradBOOSTcv <- randomCV(theData,theOutcome,GBM_fit,
                        trainFraction = fraction,
                        repetitions = reps)
bs <- predictionStats_binary(GradBOOSTcv$medianTest,"Gradient Boost: CV")

1.4 Cross-Validation of different ML-Methods

We will run FRESA.CAD::randomCV on several machine learning methods. First, let us run the cross-validation on the BSWiMS method, reusing the train and test partitions generated by the gradient boosting run; the same partitions will be used for all the other ML methods.

BSWiMScv <- randomCV(fittingFunction=BSWiMS.model,
                     trainSampleSets=GradBOOSTcv$trainSamplesSets)
#> .[+-].[+-].[++].[++].[+-].[++-].[++].[++].[++].[+-].[++-].[++-].[++].[++].[++-].[++-].[+-].[++-].[+-].[++-].[+-].[+-].[++-].[++].[++].[+-].[+-].[++].[++-].[++].[+-].[++].[++-].[++].[++-].[+-].[+++].[+-].[++-].[+-].[++-].[++-].[++].[+-].[++-].[++].[++].[++].[++].[++-].[+-].[++-].[++-].[++].[+-].[+-].[++-].[++-].[++-].[++]

bs <- predictionStats_binary(BSWiMScv$medianTest,"BSWiMS")
#> BSWiMS

Let us do the same for another set of common ML methods:

eBSWiMScv <- randomCV(fittingFunction=BSWiMS.model,
                      trainSampleSets=GradBOOSTcv$trainSamplesSets,
                      NumberofRepeats = -1)

bs <- predictionStats_binary(eBSWiMScv$medianTest,"eBSWiMS")



QDAcv <- randomCV(fittingFunction=MASS::qda,
                  trainSampleSets=GradBOOSTcv$trainSamplesSets,
                  featureSelectionFunction = BSWiMScv$selectedFeaturesSet,
                  method="mve")

bs <- predictionStats_binary(QDAcv$medianTest,"QDA")


LDAcv <- randomCV(fittingFunction=MASS::lda,
                  trainSampleSets=GradBOOSTcv$trainSamplesSets,
                  featureSelectionFunction = BSWiMScv$selectedFeaturesSet)

bs <- predictionStats_binary(LDAcv$medianTest,"LDA")



NAIVEBAYEScv <- randomCV(fittingFunction=NAIVE_BAYES,
                  trainSampleSets=GradBOOSTcv$trainSamplesSets,
                  featureSelectionFunction = BSWiMScv$selectedFeaturesSet,pca=TRUE,usekernel = TRUE)

bs <- predictionStats_binary(NAIVEBAYEScv$medianTest,"Naive Bayes:PCA")


RawNAIVEBAYEScv <- randomCV(fittingFunction=NAIVE_BAYES,
                  trainSampleSets=GradBOOSTcv$trainSamplesSets,
                  featureSelectionFunction = BSWiMScv$selectedFeaturesSet,pca=FALSE)
bs <- predictionStats_binary(RawNAIVEBAYEScv$medianTest,"Naive Bayes:RAW")


ADABOOSTcv <- randomCV(fittingFunction=adaboost,
                  trainSampleSets=GradBOOSTcv$trainSamplesSets,
                  featureSelectionFunction = univariate_Wilcoxon,
                  featureSelection.control = list(thr = 0.95),asFactor = TRUE,nIter=10
)
bs <- predictionStats_binary(ADABOOSTcv$medianTest,"Ada Boost with Wilcoxon Filter")


BESScv <- randomCV(fittingFunction=BESS,
                   trainSampleSets=GradBOOSTcv$trainSamplesSets)
bs <- predictionStats_binary(BESScv$medianTest,"BeSS Sequential")


BESSGoldencv <- randomCV(fittingFunction=BESS,
                         trainSampleSets=GradBOOSTcv$trainSamplesSets,
                         method="gsection")
bs <- predictionStats_binary(BESSGoldencv$medianTest,"BeSS Golden Section")

1.5 FRESA.CAD::BinaryBenchmark and Comparing Methods

Running the FRESA.CAD::BinaryBenchmark function compares the performance of the above methods to RF, RPART, LASSO, KNN, and SVM.

par(op);

par(mfrow = c(2,2),cex = 0.6);
cp <- BinaryBenchmark(referenceCV = list(Gradient_Boost = GradBOOSTcv,
                                         BSWiMS = BSWiMScv,
                                         eBSWiMS = eBSWiMScv,
                                         LDA = LDAcv,
                                         QDA = QDAcv,
                                         BeSS_Seq = BESScv,
                                         BeSS_Golden = BESSGoldencv,
                                         ADA_BOOST = ADABOOSTcv,
                                         NAIVEBAYES_PCA = NAIVEBAYEScv,
                                         NAIVEBAYES_RAW = RawNAIVEBAYEScv))


save(cp, file = CVFileName)
load(file = CVFileName)
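
Because the benchmark is expensive, the save()/load() pair above lends itself to a caching pattern. The following is a minimal sketch of that idea (our assumption about the intent, not part of the original vignette): rerun BinaryBenchmark only when no cached file exists.

refCV <- list(Gradient_Boost = GradBOOSTcv, BSWiMS = BSWiMScv, eBSWiMS = eBSWiMScv,
              LDA = LDAcv, QDA = QDAcv, BeSS_Seq = BESScv, BeSS_Golden = BESSGoldencv,
              ADA_BOOST = ADABOOSTcv, NAIVEBAYES_PCA = NAIVEBAYEScv,
              NAIVEBAYES_RAW = RawNAIVEBAYEScv)
if (!file.exists(CVFileName)) {
  cp <- BinaryBenchmark(referenceCV = refCV) # the expensive step
  save(cp, file = CVFileName)                # cache the results
}
load(file = CVFileName)                      # restores 'cp'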

1.6 Reporting the results of the Benchmark procedure

Once done, we can compare the CV test results using the plot() function. The plot function also generates summary tables of the CV results.

#par(op);
par(mfrow = c(1,1),cex = 1.0,xpd = T,pty='m', mar = c(3,3,3,10)) # Making space for the legend
prBenchmark <- plot(cp)

pander::pander(prBenchmark$metrics,caption = "Classifier Performance",round = 3)
Classifier Performance (continued below)
  ADA_BOOST RPART RF SVM ENS Gradient_Boost
BER 0.21 0.213 0.215 0.228 0.235 0.237
ACC 0.781 0.781 0.783 0.765 0.768 0.758
AUC 0.85 0.838 0.849 0.836 0.847 0.844
SEN 0.815 0.8 0.792 0.792 0.754 0.777
SPE 0.763 0.771 0.779 0.752 0.775 0.748
CIDX 0.814 0.774 0.844 0.831 0.848 0.839
Table continues below
  BeSS_Golden BeSS_Seq LASSO LDA BSWiMS NAIVEBAYES_PCA
BER 0.243 0.243 0.247 0.248 0.249 0.252
ACC 0.763 0.768 0.763 0.765 0.76 0.745
AUC 0.845 0.84 0.847 0.845 0.842 0.82
SEN 0.738 0.723 0.723 0.708 0.723 0.754
SPE 0.775 0.79 0.782 0.794 0.779 0.74
CIDX 0.845 0.839 0.848 0.846 0.837 0.806
  eBSWiMS KNN NAIVEBAYES_RAW QDA
BER 0.255 0.255 0.259 0.271
ACC 0.758 0.737 0.76 0.704
AUC 0.844 0.83 0.828 0.8
SEN 0.708 0.769 0.685 0.808
SPE 0.782 0.721 0.798 0.653
CIDX 0.833 0.824 0.827 0.798
pander::pander(prBenchmark$mcnemar,caption = "McNemar's Test",round = 3)
McNemar’s Test (continued below)
  Gradient_Boost BSWiMS eBSWiMS LDA QDA
Gradient_Boost 1 0.019 0.004 0.001 0.001
BSWiMS 0.019 1 0.405 0.134 0
eBSWiMS 0.004 0.405 1 0.18 0
LDA 0.001 0.134 0.18 1 0
QDA 0.001 0 0 0 1
BeSS_Seq 0.005 0.405 1 0.083 0
BeSS_Golden 0.058 0.491 0.083 0.013 0
ADA_BOOST 0.827 0.016 0.003 0.001 0.001
NAIVEBAYES_PCA 0.893 0.027 0.006 0.002 0
NAIVEBAYES_RAW 0.001 0.025 0.209 0.48 0
RF 0.201 0.128 0.04 0.014 0
LASSO 0 0 0 0 0
RPART 0.631 0.083 0.032 0.011 0
KNN 0 0.077 0.25 0.527 0
SVM.mRMR 0.873 0.005 0.001 0 0
ENS 0.086 0.132 0.011 0.002 0
Table continues below
  BeSS_Seq BeSS_Golden ADA_BOOST NAIVEBAYES_PCA
Gradient_Boost 0.005 0.058 0.827 0.893
BSWiMS 0.405 0.491 0.016 0.027
eBSWiMS 1 0.083 0.003 0.006
LDA 0.083 0.013 0.001 0.002
QDA 0 0 0.001 0
BeSS_Seq 1 0.083 0.005 0.008
BeSS_Golden 0.083 1 0.042 0.093
ADA_BOOST 0.005 0.042 1 0.789
NAIVEBAYES_PCA 0.008 0.093 0.789 1
NAIVEBAYES_RAW 0.209 0.033 0.001 0
RF 0.046 0.343 0.144 0.466
LASSO 0 0 0 0
RPART 0.029 0.208 0.546 0.799
KNN 0.25 0.037 0 0
SVM.mRMR 0 0.012 1 0.746
ENS 0.021 0.593 0.063 0.128
Table continues below
  NAIVEBAYES_RAW RF LASSO RPART KNN SVM.mRMR
Gradient_Boost 0.001 0.201 0 0.631 0 0.873
BSWiMS 0.025 0.128 0 0.083 0.077 0.005
eBSWiMS 0.209 0.04 0 0.032 0.25 0.001
LDA 0.48 0.014 0 0.011 0.527 0
QDA 0 0 0 0 0 0
BeSS_Seq 0.209 0.046 0 0.029 0.25 0
BeSS_Golden 0.033 0.343 0 0.208 0.037 0.012
ADA_BOOST 0.001 0.144 0 0.546 0 1
NAIVEBAYES_PCA 0 0.466 0 0.799 0 0.746
NAIVEBAYES_RAW 1 0.007 0 0.003 1 0
RF 0.007 1 0 0.631 0.003 0.223
LASSO 0 0 1 0 0 0
RPART 0.003 0.631 0 1 0.004 0.564
KNN 1 0.003 0 0.004 1 0
SVM.mRMR 0 0.223 0 0.564 0 1
ENS 0.003 0.45 0 0.307 0.009 0.022
  ENS
Gradient_Boost 0.086
BSWiMS 0.132
eBSWiMS 0.011
LDA 0.002
QDA 0
BeSS_Seq 0.021
BeSS_Golden 0.593
ADA_BOOST 0.063
NAIVEBAYES_PCA 0.128
NAIVEBAYES_RAW 0.003
RF 0.45
LASSO 0
RPART 0.307
KNN 0.009
SVM.mRMR 0.022
ENS 1

Finally, we compare the features selected by the different methods by plotting the selection frequency in a heat map:


par(mfrow = c(1,1),cex = 1.0,xpd = F,pty='m', mar = c(3,3,3,3)) # Restore standard margins

# Drop the methods that reused the BSWiMS feature selection; their
# selection frequencies would duplicate the BSWiMS column
cp$featureSelectionFrequency$FS_QDA <- NULL
cp$featureSelectionFrequency$FS_LDA <- NULL
cp$featureSelectionFrequency$FS_NAIVEBAYES_PCA <- NULL
cp$featureSelectionFrequency$FS_NAIVEBAYES_RAW <- NULL
topf <- apply(cp$featureSelectionFrequency,1,mean)
gplots::heatmap.2((as.matrix(cp$featureSelectionFrequency[order(-topf),])),Rowv=FALSE,dendrogram = "column",trace = "none",mar = c(5,10),main = "Features",cexRow = 0.5,cexCol = 0.75,srtCol = 25)