We will use FRESA.CAD to evaluate the performance of classifiers on the PimaIndiansDiabetes data set. The following classifiers will be evaluated:
BSWiMS and eBSWiMS, Discriminant Analysis: LDA and QDA, Naive Bayes: Raw and PCA, Boosting ADABOOST and Gradient Boost, BESS: Golden Section and Sequential, LASSO, RPART, Random Forest, SVM: Default, KNN, and the ensemble of Methods.
Furthermore, this vignette will teach how to interface a ML method into the FRESA.CAD::randomCV function.
library("FRESA.CAD")
library("mlbench")
library(fastAdaboost)
library(gbm)
Loading the data from the mlbech package.
data("PimaIndiansDiabetes2", package = "mlbench")
We will test second-order interactions between features:
PimaIndiansDiabetes <- PimaIndiansDiabetes2[complete.cases(PimaIndiansDiabetes2),]
PimaIndiansDiabetes$diabetes <- 1*(PimaIndiansDiabetes$diabetes == "pos")
#PimaIndiansDiabetes.mat <- as.data.frame(model.matrix(diabetes~.*.*.*.,PimaIndiansDiabetes))
PimaIndiansDiabetes.mat <- as.data.frame(model.matrix(diabetes ~ .,PimaIndiansDiabetes))
PimaIndiansDiabetes.mat$`(Intercept)` <- NULL
PimaIndiansDiabetes.mat$diabetes <- PimaIndiansDiabetes$diabetes
PimaIndiansDiabetes.mat[,1:ncol(PimaIndiansDiabetes.mat)] <- sapply(PimaIndiansDiabetes.mat,as.numeric)
fnames <- colnames(PimaIndiansDiabetes.mat)
fnames <- str_replace_all(fnames," ","_")
fnames <- str_replace_all(fnames,"/","_")
fnames <- str_replace_all(fnames,":","_x_")
colnames(PimaIndiansDiabetes.mat) <- fnames
table(PimaIndiansDiabetes.mat$diabetes)
#>
#> 0 1
#> 262 130
We will set the experiment name. The number of times that the experiment will be carried out. And the training fraction
ExperimentName <- "PimaIndiansDiabetes"
theData <- PimaIndiansDiabetes.mat;
theOutcome <- "diabetes";
reps <- 60;
fraction <- 0.50;
CVFileName <- paste(ExperimentName,"CVMethod_v2.RDATA",sep = "_")
op <- par(no.readonly = TRUE)
Creating model fitting wrapper for the graddient boost function of thegbm package. The following code will will take the gbm function API, and convert into FRESA.CAD API.
GBM_fit <- function(formula = formula, data=NULL,distribution = "bernoulli",n.trees = 1000,
shrinkage = 0.01, interaction.depth = 4,...)
{
fit <- gbm(formula=formula,data = data,distribution = distribution,n.trees = n.trees,
shrinkage = shrinkage, interaction.depth = interaction.depth,...);
selectedfeatures <- summary(fit,plotit = FALSE);
sum <- 0;
sfeat = 1;
while (sum < 90) {sum <- sum + selectedfeatures[sfeat,2]; sfeat <- sfeat + 1;} #keep the ones that add to 90%
result <- list(fit=fit,n.trees=n.trees,selectedfeatures=rownames(selectedfeatures[1:sfeat,]))
class(result) <- "GBM_FIT";
return(result)
}
We need a proper predict function for the boosting algorithm
predict.GBM_FIT <- function(object,...)
{
parameters <- list(...);
testData <- parameters[[1]];
n.trees = seq(from=(0.1*object$n.trees),to=object$n.trees, by=object$n.trees/25) #no of trees-a vector of 25 values
pLS <- predict(object$fit,testData,n.trees = n.trees);
pLS <- 1.0/(1.0+exp(-apply(pLS,1,mean)))
return(pLS);
}
Let check that evething is working:
gfit <- GBM_fit(formula = formula(paste(theOutcome," ~ .")),theData)
gfit$selectedfeatures
pr <- predict(gfit,theData)
bs <- predictionStats_binary(cbind(theData$diabetes,pr),"Gradient Boost: Training")
Now let me check the cross validation performance of the gradient boosting method
GradBOOSTcv <- randomCV(theData,theOutcome,GBM_fit,
trainFraction = fraction,
repetitions = reps)
bs <- predictionStats_binary(GradBOOSTcv$medianTest,"Gradient Boost: CV")
The FRESA.CAD::randomCV will be run several machine learning methods. But first let run the cross-validation on the BSWiMS method.
We will use the same train and testing partition on the other ML methods
BSWiMScv <- randomCV(fittingFunction=BSWiMS.model,
trainSampleSets=GradBOOSTcv$trainSamplesSets)
#> .[+-].[+-].[++].[++].[+-].[++-].[++].[++].[++].[+-].[++-].[++-].[++].[++].[++-].[++-].[+-].[++-].[+-].[++-].[+-].[+-].[++-].[++].[++].[+-].[+-].[++].[++-].[++].[+-].[++].[++-].[++].[++-].[+-].[+++].[+-].[++-].[+-].[++-].[++-].[++].[+-].[++-].[++].[++].[++].[++].[++-].[+-].[++-].[++-].[++].[+-].[+-].[++-].[++-].[++-].[++]
bs <- predictionStats_binary(BSWiMScv$medianTest,"BSWiMS")
#> BSWiMS
Let us do the same for another set of common ML methods
eBSWiMScv <- randomCV(fittingFunction=BSWiMS.model,
trainSampleSets=GradBOOSTcv$trainSamplesSets,
NumberofRepeats = -1)
bs <- predictionStats_binary(eBSWiMScv$medianTest,"eBSWiMS")
QDAcv <- randomCV(fittingFunction=MASS::qda,
trainSampleSets=GradBOOSTcv$trainSamplesSets,
featureSelectionFunction = BSWiMScv$selectedFeaturesSet,
method="mve")
bs <- predictionStats_binary(QDAcv$medianTest,"QDA")
LDAcv <- randomCV(fittingFunction=MASS::lda,
trainSampleSets=GradBOOSTcv$trainSamplesSets,
featureSelectionFunction = BSWiMScv$selectedFeaturesSet)
bs <- predictionStats_binary(LDAcv$medianTest,"LDA")
NAIVEBAYEScv <- randomCV(fittingFunction=NAIVE_BAYES,
trainSampleSets=GradBOOSTcv$trainSamplesSets,
featureSelectionFunction = BSWiMScv$selectedFeaturesSet,pca=TRUE,usekernel = TRUE)
bs <- predictionStats_binary(NAIVEBAYEScv$medianTest,"Naive Bayes:PCA")
RawNAIVEBAYEScv <- randomCV(fittingFunction=NAIVE_BAYES,
trainSampleSets=GradBOOSTcv$trainSamplesSets,
featureSelectionFunction = BSWiMScv$selectedFeaturesSet,pca=FALSE)
bs <- predictionStats_binary(RawNAIVEBAYEScv$medianTest,"Naive Bayes:RAW")
ADABOOSTcv <- randomCV(fittingFunction=adaboost,
trainSampleSets=GradBOOSTcv$trainSamplesSets,
featureSelectionFunction = univariate_Wilcoxon,
featureSelection.control = list(thr = 0.95),asFactor = TRUE,nIter=10
)
bs <- predictionStats_binary(ADABOOSTcv$medianTest,"Ada Boost with Wilcoxon Filter")
BESScv <- randomCV(fittingFunction=BESS,
trainSampleSets=GradBOOSTcv$trainSamplesSets)
bs <- predictionStats_binary(BESScv$medianTest,"BeSS Sequential")
BESSGoldencv <- randomCV(fittingFunction=BESS,
trainSampleSets=GradBOOSTcv$trainSamplesSets,
method="gsection")
bs <- predictionStats_binary(BESSGoldencv$medianTest,"BeSS Golden Section")
By running the FRESA.CAD::BinaryBenchmark function will compare the performance to RF, RPART, LASSO,KNN, and SVM
par(op);
par(mfrow = c(2,2),cex = 0.6);
cp <- BinaryBenchmark(referenceCV = list(Gradient_Boost = GradBOOSTcv,
BSWiMS = BSWiMScv,
eBSWiMS = eBSWiMScv,
LDA = LDAcv,
QDA = QDAcv,
BeSS_Seq = BESScv,
BeSS_Golden = BESSGoldencv,
ADA_BOOST = ADABOOSTcv,
NAIVEBAYES_PCA = NAIVEBAYEScv,
NAIVEBAYES_RAW = RawNAIVEBAYEScv))
save(cp, file = CVFileName)
load(file = CVFileName)
Once done, we can compare the CV test results using the plot() function. The plot function also generates summary tables of the CV results.
#par(op);
par(mfrow = c(1,1),cex = 1.0,xpd = T,pty='m', mar = c(3,3,3,10)) # Making space for the legend
prBenchmark <- plot(cp)
pander::pander(prBenchmark$metrics,caption = "Classifier Performance",round = 3)
| ADA_BOOST | RPART | RF | SVM | ENS | Gradient_Boost | |
|---|---|---|---|---|---|---|
| BER | 0.21 | 0.213 | 0.215 | 0.228 | 0.235 | 0.237 |
| ACC | 0.781 | 0.781 | 0.783 | 0.765 | 0.768 | 0.758 |
| AUC | 0.85 | 0.838 | 0.849 | 0.836 | 0.847 | 0.844 |
| SEN | 0.815 | 0.8 | 0.792 | 0.792 | 0.754 | 0.777 |
| SPE | 0.763 | 0.771 | 0.779 | 0.752 | 0.775 | 0.748 |
| CIDX | 0.814 | 0.774 | 0.844 | 0.831 | 0.848 | 0.839 |
| BeSS_Golden | BeSS_Seq | LASSO | LDA | BSWiMS | NAIVEBAYES_PCA | |
|---|---|---|---|---|---|---|
| BER | 0.243 | 0.243 | 0.247 | 0.248 | 0.249 | 0.252 |
| ACC | 0.763 | 0.768 | 0.763 | 0.765 | 0.76 | 0.745 |
| AUC | 0.845 | 0.84 | 0.847 | 0.845 | 0.842 | 0.82 |
| SEN | 0.738 | 0.723 | 0.723 | 0.708 | 0.723 | 0.754 |
| SPE | 0.775 | 0.79 | 0.782 | 0.794 | 0.779 | 0.74 |
| CIDX | 0.845 | 0.839 | 0.848 | 0.846 | 0.837 | 0.806 |
| eBSWiMS | KNN | NAIVEBAYES_RAW | QDA | |
|---|---|---|---|---|
| BER | 0.255 | 0.255 | 0.259 | 0.271 |
| ACC | 0.758 | 0.737 | 0.76 | 0.704 |
| AUC | 0.844 | 0.83 | 0.828 | 0.8 |
| SEN | 0.708 | 0.769 | 0.685 | 0.808 |
| SPE | 0.782 | 0.721 | 0.798 | 0.653 |
| CIDX | 0.833 | 0.824 | 0.827 | 0.798 |
pander::pander(prBenchmark$mcnemar,caption = "McNemar's Test",round = 3)
| Gradient_Boost | BSWiMS | eBSWiMS | LDA | QDA | |
|---|---|---|---|---|---|
| Gradient_Boost | 1 | 0.019 | 0.004 | 0.001 | 0.001 |
| BSWiMS | 0.019 | 1 | 0.405 | 0.134 | 0 |
| eBSWiMS | 0.004 | 0.405 | 1 | 0.18 | 0 |
| LDA | 0.001 | 0.134 | 0.18 | 1 | 0 |
| QDA | 0.001 | 0 | 0 | 0 | 1 |
| BeSS_Seq | 0.005 | 0.405 | 1 | 0.083 | 0 |
| BeSS_Golden | 0.058 | 0.491 | 0.083 | 0.013 | 0 |
| ADA_BOOST | 0.827 | 0.016 | 0.003 | 0.001 | 0.001 |
| NAIVEBAYES_PCA | 0.893 | 0.027 | 0.006 | 0.002 | 0 |
| NAIVEBAYES_RAW | 0.001 | 0.025 | 0.209 | 0.48 | 0 |
| RF | 0.201 | 0.128 | 0.04 | 0.014 | 0 |
| LASSO | 0 | 0 | 0 | 0 | 0 |
| RPART | 0.631 | 0.083 | 0.032 | 0.011 | 0 |
| KNN | 0 | 0.077 | 0.25 | 0.527 | 0 |
| SVM.mRMR | 0.873 | 0.005 | 0.001 | 0 | 0 |
| ENS | 0.086 | 0.132 | 0.011 | 0.002 | 0 |
| BeSS_Seq | BeSS_Golden | ADA_BOOST | NAIVEBAYES_PCA | |
|---|---|---|---|---|
| Gradient_Boost | 0.005 | 0.058 | 0.827 | 0.893 |
| BSWiMS | 0.405 | 0.491 | 0.016 | 0.027 |
| eBSWiMS | 1 | 0.083 | 0.003 | 0.006 |
| LDA | 0.083 | 0.013 | 0.001 | 0.002 |
| QDA | 0 | 0 | 0.001 | 0 |
| BeSS_Seq | 1 | 0.083 | 0.005 | 0.008 |
| BeSS_Golden | 0.083 | 1 | 0.042 | 0.093 |
| ADA_BOOST | 0.005 | 0.042 | 1 | 0.789 |
| NAIVEBAYES_PCA | 0.008 | 0.093 | 0.789 | 1 |
| NAIVEBAYES_RAW | 0.209 | 0.033 | 0.001 | 0 |
| RF | 0.046 | 0.343 | 0.144 | 0.466 |
| LASSO | 0 | 0 | 0 | 0 |
| RPART | 0.029 | 0.208 | 0.546 | 0.799 |
| KNN | 0.25 | 0.037 | 0 | 0 |
| SVM.mRMR | 0 | 0.012 | 1 | 0.746 |
| ENS | 0.021 | 0.593 | 0.063 | 0.128 |
| NAIVEBAYES_RAW | RF | LASSO | RPART | KNN | SVM.mRMR | |
|---|---|---|---|---|---|---|
| Gradient_Boost | 0.001 | 0.201 | 0 | 0.631 | 0 | 0.873 |
| BSWiMS | 0.025 | 0.128 | 0 | 0.083 | 0.077 | 0.005 |
| eBSWiMS | 0.209 | 0.04 | 0 | 0.032 | 0.25 | 0.001 |
| LDA | 0.48 | 0.014 | 0 | 0.011 | 0.527 | 0 |
| QDA | 0 | 0 | 0 | 0 | 0 | 0 |
| BeSS_Seq | 0.209 | 0.046 | 0 | 0.029 | 0.25 | 0 |
| BeSS_Golden | 0.033 | 0.343 | 0 | 0.208 | 0.037 | 0.012 |
| ADA_BOOST | 0.001 | 0.144 | 0 | 0.546 | 0 | 1 |
| NAIVEBAYES_PCA | 0 | 0.466 | 0 | 0.799 | 0 | 0.746 |
| NAIVEBAYES_RAW | 1 | 0.007 | 0 | 0.003 | 1 | 0 |
| RF | 0.007 | 1 | 0 | 0.631 | 0.003 | 0.223 |
| LASSO | 0 | 0 | 1 | 0 | 0 | 0 |
| RPART | 0.003 | 0.631 | 0 | 1 | 0.004 | 0.564 |
| KNN | 1 | 0.003 | 0 | 0.004 | 1 | 0 |
| SVM.mRMR | 0 | 0.223 | 0 | 0.564 | 0 | 1 |
| ENS | 0.003 | 0.45 | 0 | 0.307 | 0.009 | 0.022 |
| ENS | |
|---|---|
| Gradient_Boost | 0.086 |
| BSWiMS | 0.132 |
| eBSWiMS | 0.011 |
| LDA | 0.002 |
| QDA | 0 |
| BeSS_Seq | 0.021 |
| BeSS_Golden | 0.593 |
| ADA_BOOST | 0.063 |
| NAIVEBAYES_PCA | 0.128 |
| NAIVEBAYES_RAW | 0.003 |
| RF | 0.45 |
| LASSO | 0 |
| RPART | 0.307 |
| KNN | 0.009 |
| SVM.mRMR | 0.022 |
| ENS | 1 |
Finally, I will compare the selected features of the different methods by plotting the selection frequency in a heat-map plot
par(mfrow = c(1,1),cex = 1.0,xpd = F,pty='m', mar = c(3,3,3,3)) # Making space for the legend
cp$featureSelectionFrequency$FS_QDA <- NULL
cp$featureSelectionFrequency$FS_LDA <- NULL
cp$featureSelectionFrequency$FS_NAIVEBAYES_PCA <- NULL
cp$featureSelectionFrequency$FS_NAIVEBAYES_RAW <- NULL
topf <- apply(cp$featureSelectionFrequency,1,mean)
gplots::heatmap.2((as.matrix(cp$featureSelectionFrequency[order(-topf),])),Rowv=FALSE,dendrogram = "column",trace = "none",mar = c(5,10),main = "Features",cexRow = 0.5,cexCol = 0.75,srtCol = 25)