1 FRESA.CAD PimaIndiansDiabetes Benchmark

We will use FRESA.CAD to evaluate the performance of classifiers on the PimaIndiansDiabetes data set. The following classifiers will be evaluated:

BSWiMS and eBSWiMS, Discriminant Analysis: LDA and QDA, Naive Bayes: Raw and PCA, Boosting ADABOOST and Gradient Boost, BESS: Golden Section and Sequential, LASSO, RPART, Random Forest, SVM: Default, KNN, and the ensemble of Methods.

Furthermore, this vignette will teach how to interface a ML method into the FRESA.CAD::randomCV function.

1.1 The required libraries

library("FRESA.CAD")
library("mlbench")
library(fastAdaboost)
library(gbm)

1.2 PimaIndiansDiabetes Data Set

Loading the data from the mlbech package.

op <- par(no.readonly = TRUE,mar=c(5,5,5,5))
data("PimaIndiansDiabetes2", package = "mlbench")

We will test second-order interactions between features:

PimaIndiansDiabetes  <- PimaIndiansDiabetes2[complete.cases(PimaIndiansDiabetes2),]
PimaIndiansDiabetes$diabetes <- 1*(PimaIndiansDiabetes$diabetes == "pos")

PimaIndiansDiabetes.mat <- as.data.frame(model.matrix(diabetes~.*.,PimaIndiansDiabetes))
#PimaIndiansDiabetes.mat <- as.data.frame(model.matrix(diabetes ~ .,PimaIndiansDiabetes))
PimaIndiansDiabetes.mat$`(Intercept)` <- NULL
PimaIndiansDiabetes.mat$diabetes <- PimaIndiansDiabetes$diabetes
PimaIndiansDiabetes.mat[,1:ncol(PimaIndiansDiabetes.mat)] <- sapply(PimaIndiansDiabetes.mat,as.numeric)
fnames <- colnames(PimaIndiansDiabetes.mat)
fnames <- str_replace_all(fnames," ","_")
fnames <- str_replace_all(fnames,"/","_")
fnames <- str_replace_all(fnames,":","_x_")
colnames(PimaIndiansDiabetes.mat) <- fnames
table(PimaIndiansDiabetes.mat$diabetes)
#> 
#>   0   1 
#> 262 130

We will set the experiment name. The number of times that the experiment will be carried out. And the training fraction


ExperimentName <- "PimaIndiansDiabetes"
theData <- PimaIndiansDiabetes.mat;
theOutcome <- "diabetes";
reps <- 100;
fraction <- "Bootstrap";

CVFileName <- paste(ExperimentName,"CVMethod_BootStrap.RDATA",sep = "_")

1.3 Gradient boosting from the gbm package

Creating model fitting wrapper for the graddient boost function of thegbm package. The following code will will take the gbm function API, and convert into FRESA.CAD API.


GBM_fit <- function(formula = formula, data=NULL,distribution = "bernoulli",n.trees = 1000,
                  shrinkage = 0.01, interaction.depth = 4,...)
{
  fit <- gbm(formula=formula,data = data,distribution = distribution,n.trees = n.trees,
                  shrinkage = shrinkage, interaction.depth = interaction.depth,...);
  selectedfeatures <- summary(fit,plotit = FALSE);
  sum <- 0;
  sfeat = 1;
  while (sum < 90) {sum <- sum + selectedfeatures[sfeat,2]; sfeat <- sfeat + 1;} #keep the ones that add to 90%

    result <- list(fit=fit,n.trees=n.trees,selectedfeatures=rownames(selectedfeatures[1:sfeat,]))
    class(result) <- "GBM_FIT";
    return(result)
}

We need a proper predict function for the boosting algorithm

predict.GBM_FIT <- function(object,...) 
{
        parameters <- list(...);
        testData <- parameters[[1]];
        n.trees = seq(from=(0.1*object$n.trees),to=object$n.trees, by=object$n.trees/25) #no of trees-a vector of 25 values 
        pLS <- predict(object$fit,testData,n.trees = n.trees);
        pLS <- 1.0/(1.0+exp(-apply(pLS,1,mean)))
        return(pLS);
}

Let check that evething is working:

gfit <- GBM_fit(formula = formula(paste(theOutcome," ~ .")),theData)
gfit$selectedfeatures
pr <- predict(gfit,theData)
bs <- predictionStats_binary(cbind(theData$diabetes,pr),"Gradient Boost: Training")

Now let me check the cross validation performance of the gradient boosting method


GradBOOSTcv <- randomCV(theData,theOutcome,GBM_fit,
                        trainFraction = fraction,
                        repetitions = reps)
bs <- predictionStats_binary(GradBOOSTcv$medianTest,"Gradient Boost: CV")

1.4 Cross-Validation of different ML-Methods

The FRESA.CAD::randomCV will be run several machine learning methods. But first let run the cross-validation on the BSWiMS method.
We will use the same train and testing partition on the other ML methods

BSWiMScv <- randomCV(theData,theOutcome,
                  BSWiMS.model,
                  trainSampleSets=GradBOOSTcv$trainSamplesSets)
#> .[+++++++-].[+++++-].[++++++-].[++++-].[++++++-].[+++++-].[++++++++-].[++++++-].[+++++++-].[++++++++-].[+++++++++-].[++++++++-].[+++++-].[+++++++++-].[++++++-].[++++++-].[++-].[++++++-].[++++++-].[++++++++++-]..[++++++-].[+++++++-].[+++++++-].[+++++-].[+++++++-].[++++++-].[++++-].[++++++++-].[+++++++-].[++++++-].[+++++++-].[+++++-].[+++++++++-].[+++++++-].[+++++++-].[+++++-].[+++++++++-].[+++++++-].[++++++-].[+++++++-].[+++++-].[++++++++-].[+++++++-].[+++++-].[+++++++++-].[++++++-].[++++++++++-]..[++++++-].[++++++-].[+++++++++-].[++++++++-].[+++++++++-].[+++++++-].[+++++++++-].[+++++-].[++++-].[+++++-].[+++++++-].[+++++-].[+++++++-].[++++++++-].[++++-].[+++++++++-].[+++++++-].[+++++-].[+++++++-].[++++-].[+++++-].[+++++-].[+++++-].[+++++-].[++++++-].[++++++++-].[+++++++++-].[++++++-].[+++++++-].[+++++++++-].[++++++-].[++++++-].[++++++-].[+++++-].[++++++-].[+++++++-].[+++++-].[++-].[++++-].[+++++++-].[++++++-].[+++-].[++++++++-].[+++++-].[++++++++-].[+++++++-].[+++++++++-].[++++++-].[+++++++-].[++++-].[+++++++-].[++++-].[++++++++-]
bs <- predictionStats_binary(BSWiMScv$medianTest,"BSWiMS")
#> BSWiMS

Let us do the same for another set of common ML methods

eBSWiMScv <- randomCV(theData,theOutcome,
                  BSWiMS.model,trainSampleSets=GradBOOSTcv$trainSamplesSets,
                  NumberofRepeats = -1)

bs <- predictionStats_binary(eBSWiMScv$medianTest,"eBSWiMS")



QDAcv <- randomCV(theData,theOutcome,
                  MASS::qda,
                  trainSampleSets=GradBOOSTcv$trainSamplesSets,
                  featureSelectionFunction = BSWiMScv$selectedFeaturesSet,method="mve")
bs <- predictionStats_binary(QDAcv$medianTest,"QDA")


LDAcv <- randomCV(theData,theOutcome,
                  MASS::lda,
                  trainSampleSets=GradBOOSTcv$trainSamplesSets,
                  featureSelectionFunction = BSWiMScv$selectedFeaturesSet)
bs <- predictionStats_binary(LDAcv$medianTest,"LDA")



NAIVEBAYEScv <- randomCV(theData,theOutcome,
                  NAIVE_BAYES,
                  trainSampleSets=GradBOOSTcv$trainSamplesSets,
                  featureSelectionFunction = BSWiMScv$selectedFeaturesSet,pca=TRUE,usekernel = TRUE)
bs <- predictionStats_binary(NAIVEBAYEScv$medianTest,"Naive Bayes:PCA")


RawNAIVEBAYEScv <- randomCV(theData,theOutcome,
                  NAIVE_BAYES,
                  trainSampleSets=GradBOOSTcv$trainSamplesSets,
                  featureSelectionFunction = BSWiMScv$selectedFeaturesSet,pca=FALSE)
bs <- predictionStats_binary(RawNAIVEBAYEScv$medianTest,"Naive Bayes:RAW")


ADABOOSTcv <- randomCV(theData,theOutcome,
                  adaboost,
                  trainSampleSets=GradBOOSTcv$trainSamplesSets,
                  featureSelectionFunction = univariate_Wilcoxon,
                  featureSelection.control = list(thr = 0.95),asFactor = TRUE,nIter=10
)
bs <- predictionStats_binary(ADABOOSTcv$medianTest,"Ada Boost with Wilcoxon Filter")


BESScv <- randomCV(theData,theOutcome,BESS,trainSampleSets=GradBOOSTcv$trainSamplesSets)
bs <- predictionStats_binary(BESScv$medianTest,"BeSS Sequential")

BESSGoldencv <- randomCV(theData,theOutcome,BESS,trainSampleSets=GradBOOSTcv$trainSamplesSets,method="gsection")
bs <- predictionStats_binary(BESSGoldencv$medianTest,"BeSS Golden Section")

1.5 FRESA.CAD::BinaryBenchmark and Comparing Methods

By running the FRESA.CAD::BinaryBenchmark function will compare the performance to RF, RPART, LASSO,KNN, and SVM

par(op);

par(mfrow = c(2,2),cex = 0.6);
cp <- BinaryBenchmark(referenceCV = list(Gradient_Boost = GradBOOSTcv,
                                         BSWiMS = BSWiMScv,
                                         eBSWiMS = eBSWiMScv,
                                         LDA = LDAcv,
                                         QDA = QDAcv,
                                         BeSS_Seq = BESScv,
                                         BeSS_Golden = BESSGoldencv,
                                         ADA_BOOST = ADABOOSTcv,
                                         NAIVEBAYES_PCA = NAIVEBAYEScv,
                                         NAIVEBAYES_RAW = RawNAIVEBAYEScv))


save(cp, file = CVFileName)
load(file = CVFileName)

1.6 Reporting the results of the Benchmark procedure

Once done, we can compare the CV test results using the plot() function. The plot function also generates summary tables of the CV results.

par(mfrow = c(1,1),cex = 1.0,xpd = T,pty='m', mar = c(3,3,3,10)) # Making space for the legend
prBenchmark <- plot(cp)

pander::pander(prBenchmark$metrics,caption = "Classifier Performance",round = 3)
Classifier Performance (continued below)
  SVM LDA BeSS_Seq NAIVEBAYES_PCA RPART ENS
BER 0.232 0.239 0.241 0.247 0.247 0.247
ACC 0.773 0.763 0.768 0.753 0.765 0.76
AUC 0.833 0.842 0.844 0.816 0.831 0.845
SEN 0.754 0.754 0.731 0.754 0.715 0.731
SPE 0.782 0.767 0.786 0.752 0.79 0.775
CIDX 0.823 0.821 0.826 0.772 0.756 0.845
Table continues below
  ADA_BOOST LASSO KNN Gradient_Boost RF QDA
BER 0.253 0.254 0.254 0.255 0.256 0.256
ACC 0.776 0.755 0.735 0.763 0.768 0.737
AUC 0.833 0.837 0.827 0.84 0.833 0.798
SEN 0.662 0.715 0.777 0.692 0.669 0.762
SPE 0.832 0.775 0.714 0.798 0.817 0.725
CIDX 0.795 0.823 0.812 0.836 0.831 0.744
  BeSS_Golden BSWiMS NAIVEBAYES_RAW eBSWiMS
BER 0.258 0.259 0.261 0.261
ACC 0.753 0.758 0.77 0.755
AUC 0.812 0.847 0.835 0.85
SEN 0.708 0.692 0.646 0.692
SPE 0.775 0.79 0.832 0.786
CIDX 0.781 0.841 0.835 0.842

Finally, I will compare the selected features of the different methods by plotting the selection frequency in a heat-map plot


par(mfrow = c(1,1),cex = 1.0,xpd = F,pty='m', mar = c(3,3,3,3)) # Making space for the legend

cp$featureSelectionFrequency$FS_QDA <- NULL
cp$featureSelectionFrequency$FS_LDA <- NULL
cp$featureSelectionFrequency$FS_NAIVEBAYES_PCA <- NULL
cp$featureSelectionFrequency$FS_NAIVEBAYES_RAW <- NULL
topf <- apply(cp$featureSelectionFrequency,1,mean)

gplots::heatmap.2((as.matrix(cp$featureSelectionFrequency[order(-topf),])),Rowv=FALSE,dendrogram = "column",trace = "none",mar = c(5,10),main = "Features",cexRow = 0.5,cexCol = 0.5,srtCol = 25)