We will use FRESA.CAD to evaluate the performance of classifiers on the PimaIndiansDiabetes data set. The following classifiers will be evaluated:
BSWiMS and eBSWiMS; Discriminant Analysis (LDA and QDA); Naive Bayes (raw and PCA-transformed); boosting (AdaBoost and gradient boosting); BeSS (golden section and sequential); LASSO; RPART; Random Forest; SVM (default); KNN; and the ensemble of all methods.
Furthermore, this vignette shows how to interface an external machine learning method with the FRESA.CAD::randomCV function; the general wrapper pattern is sketched right below, and a complete worked example based on gbm follows later.
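The following is a minimal sketch of that wrapper pattern (the MY_fit / MY_FIT names and the glm learner are hypothetical, used only for illustration): the fitting function receives a formula and a data frame and returns an object of a custom class, and a matching predict method returns the predicted probabilities for a test set.
MY_fit <- function(formula, data, ...)
{
  # Hypothetical wrapper: any binary learner could replace glm() here
  fit <- glm(formula, data = data, family = binomial(), ...)
  result <- list(fit = fit)
  class(result) <- "MY_FIT"
  return(result)
}
predict.MY_FIT <- function(object, ...)
{
  testData <- list(...)[[1]]   # the test set is passed as the first argument
  return(predict(object$fit, testData, type = "response"))
}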
library("FRESA.CAD")
library("mlbench")
library("fastAdaboost")
library("gbm")
library("stringr")  # str_replace_all() is used below to clean the feature names
Loading the data from the mlbench package:
op <- par(no.readonly = TRUE,mar=c(5,5,5,5))
data("PimaIndiansDiabetes2", package = "mlbench")
We will add all second-order interactions between features to the data matrix:
PimaIndiansDiabetes <- PimaIndiansDiabetes2[complete.cases(PimaIndiansDiabetes2),]
PimaIndiansDiabetes$diabetes <- 1*(PimaIndiansDiabetes$diabetes == "pos")
PimaIndiansDiabetes.mat <- as.data.frame(model.matrix(diabetes~.*.,PimaIndiansDiabetes))
#PimaIndiansDiabetes.mat <- as.data.frame(model.matrix(diabetes ~ .,PimaIndiansDiabetes))
PimaIndiansDiabetes.mat$`(Intercept)` <- NULL
PimaIndiansDiabetes.mat$diabetes <- PimaIndiansDiabetes$diabetes
PimaIndiansDiabetes.mat[,1:ncol(PimaIndiansDiabetes.mat)] <- sapply(PimaIndiansDiabetes.mat,as.numeric)
fnames <- colnames(PimaIndiansDiabetes.mat)
fnames <- str_replace_all(fnames," ","_")
fnames <- str_replace_all(fnames,"/","_")
fnames <- str_replace_all(fnames,":","_x_")
colnames(PimaIndiansDiabetes.mat) <- fnames
table(PimaIndiansDiabetes.mat$diabetes)
#>
#> 0 1
#> 262 130
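We can quickly check how many candidate features the interaction expansion produced:
# Number of candidate features after adding the second-order interactions
ncol(PimaIndiansDiabetes.mat) - 1
head(colnames(PimaIndiansDiabetes.mat))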
We will set the experiment name, the number of times the experiment will be repeated, and the training-set sampling scheme:
ExperimentName <- "PimaIndiansDiabetes"
theData <- PimaIndiansDiabetes.mat;
theOutcome <- "diabetes";
reps <- 100;
fraction <- "Bootstrap";
CVFileName <- paste(ExperimentName,"CVMethod_BootStrap.RDATA",sep = "_")
Creating a model-fitting wrapper for the gradient boosting function of the gbm package. The following code takes the gbm function API and adapts it to the FRESA.CAD fitting API.
GBM_fit <- function(formula = formula, data = NULL, distribution = "bernoulli", n.trees = 1000,
                    shrinkage = 0.01, interaction.depth = 4, ...)
{
  # Fit the gradient boosting model with the supplied hyper-parameters
  fit <- gbm(formula = formula, data = data, distribution = distribution, n.trees = n.trees,
             shrinkage = shrinkage, interaction.depth = interaction.depth, ...);
  # Rank the features by relative influence and keep the ones whose
  # cumulative relative influence reaches 90%
  selectedfeatures <- summary(fit, plotit = FALSE);
  sum <- 0;
  sfeat <- 0;
  while (sum < 90) { sfeat <- sfeat + 1; sum <- sum + selectedfeatures[sfeat, 2]; }
  result <- list(fit = fit, n.trees = n.trees, selectedfeatures = rownames(selectedfeatures)[1:sfeat])
  class(result) <- "GBM_FIT";
  return(result)
}
We also need a proper predict function for the boosting wrapper:
predict.GBM_FIT <- function(object, ...)
{
  parameters <- list(...);
  testData <- parameters[[1]];   # randomCV passes the test set as the first argument
  # Evaluate the model on a grid of tree counts, from 10% to 100% of n.trees
  n.trees <- seq(from = 0.1*object$n.trees, to = object$n.trees, by = object$n.trees/25)
  pLS <- predict(object$fit, testData, n.trees = n.trees);
  # Average the link-scale predictions over the grid and map them to probabilities
  pLS <- 1.0/(1.0 + exp(-apply(pLS, 1, mean)))
  return(pLS);
}
Let's check that everything is working:
gfit <- GBM_fit(formula = formula(paste(theOutcome," ~ .")),theData)
gfit$selectedfeatures
pr <- predict(gfit,theData)
bs <- predictionStats_binary(cbind(theData$diabetes,pr),"Gradient Boost: Training")
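In addition to the summary statistics, a simple cross-tabulation of the training outcome against the thresholded predictions gives a quick feel for the apparent (and hence optimistic) classification performance:
# Confusion table at a 0.5 probability cut-off (computed on the training data, so optimistic)
table(observed = theData$diabetes, predicted = 1*(pr > 0.5))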
Now let us check the cross-validation performance of the gradient boosting method:
GradBOOSTcv <- randomCV(theData,theOutcome,GBM_fit,
trainFraction = fraction,
repetitions = reps)
bs <- predictionStats_binary(GradBOOSTcv$medianTest,"Gradient Boost: CV")
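The medianTest element of the randomCV output stores, for every subject that appeared in at least one test partition, the observed outcome together with the median cross-validated prediction; a quick look at it (assuming this two-column layout, which is the same format passed to predictionStats_binary above):
# Observed outcome and median CV prediction per test subject
head(GradBOOSTcv$medianTest)
# Number of subjects with an out-of-training prediction
nrow(GradBOOSTcv$medianTest)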
FRESA.CAD::randomCV will be used to run several machine learning methods. But first, let us run the cross-validation of the BSWiMS method.
We will reuse the same train and test partitions for the other ML methods.
BSWiMScv <- randomCV(theData,theOutcome,
BSWiMS.model,
trainSampleSets=GradBOOSTcv$trainSamplesSets)
bs <- predictionStats_binary(BSWiMScv$medianTest,"BSWiMS")
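Before moving on, we can tabulate how often each feature was selected by BSWiMS across the repetitions (this assumes that selectedFeaturesSet is a list with one character vector of selected features per repetition):
# Selection frequency of each feature across the CV repetitions
featFreq <- table(unlist(BSWiMScv$selectedFeaturesSet))
head(sort(featFreq, decreasing = TRUE), 10)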
Let us do the same for another set of common ML methods
eBSWiMScv <- randomCV(theData,theOutcome,
BSWiMS.model,trainSampleSets=GradBOOSTcv$trainSamplesSets,
NumberofRepeats = -1)
bs <- predictionStats_binary(eBSWiMScv$medianTest,"eBSWiMS")
QDAcv <- randomCV(theData,theOutcome,
MASS::qda,
trainSampleSets=GradBOOSTcv$trainSamplesSets,
featureSelectionFunction = BSWiMScv$selectedFeaturesSet,method="mve")
bs <- predictionStats_binary(QDAcv$medianTest,"QDA")
LDAcv <- randomCV(theData,theOutcome,
MASS::lda,
trainSampleSets=GradBOOSTcv$trainSamplesSets,
featureSelectionFunction = BSWiMScv$selectedFeaturesSet)
bs <- predictionStats_binary(LDAcv$medianTest,"LDA")
NAIVEBAYEScv <- randomCV(theData,theOutcome,
NAIVE_BAYES,
trainSampleSets=GradBOOSTcv$trainSamplesSets,
featureSelectionFunction = BSWiMScv$selectedFeaturesSet,pca=TRUE,usekernel = TRUE)
bs <- predictionStats_binary(NAIVEBAYEScv$medianTest,"Naive Bayes:PCA")
RawNAIVEBAYEScv <- randomCV(theData,theOutcome,
NAIVE_BAYES,
trainSampleSets=GradBOOSTcv$trainSamplesSets,
featureSelectionFunction = BSWiMScv$selectedFeaturesSet,pca=FALSE)
bs <- predictionStats_binary(RawNAIVEBAYEScv$medianTest,"Naive Bayes:RAW")
ADABOOSTcv <- randomCV(theData,theOutcome,
adaboost,
trainSampleSets=GradBOOSTcv$trainSamplesSets,
featureSelectionFunction = univariate_Wilcoxon,
featureSelection.control = list(thr = 0.95),asFactor = TRUE,nIter=10
)
bs <- predictionStats_binary(ADABOOSTcv$medianTest,"Ada Boost with Wilcoxon Filter")
BESScv <- randomCV(theData,theOutcome,BESS,trainSampleSets=GradBOOSTcv$trainSamplesSets)
bs <- predictionStats_binary(BESScv$medianTest,"BeSS Sequential")
BESSGoldencv <- randomCV(theData,theOutcome,BESS,trainSampleSets=GradBOOSTcv$trainSamplesSets,method="gsection")
bs <- predictionStats_binary(BESSGoldencv$medianTest,"BeSS Golden Section")
Running the FRESA.CAD::BinaryBenchmark function will compare the performance of the above methods to RF, RPART, LASSO, KNN, and SVM:
par(op);
par(mfrow = c(2,2),cex = 0.6);
cp <- BinaryBenchmark(referenceCV = list(Gradient_Boost = GradBOOSTcv,
BSWiMS = BSWiMScv,
eBSWiMS = eBSWiMScv,
LDA = LDAcv,
QDA = QDAcv,
BeSS_Seq = BESScv,
BeSS_Golden = BESSGoldencv,
ADA_BOOST = ADABOOSTcv,
NAIVEBAYES_PCA = NAIVEBAYEScv,
NAIVEBAYES_RAW = RawNAIVEBAYEScv))
save(cp, file = CVFileName)
load(file = CVFileName)
Once done, we can compare the CV test results using the plot() function. The plot function also generates summary tables of the CV results.
par(mfrow = c(1,1),cex = 1.0,xpd = T,pty='m', mar = c(3,3,3,10)) # Making space for the legend
prBenchmark <- plot(cp)
pander::pander(prBenchmark$metrics,caption = "Classifier Performance",round = 3)
|  | SVM | LDA | BeSS_Seq | NAIVEBAYES_PCA | RPART | ENS |
|------|-------|-------|----------|----------------|-------|-------|
| BER | 0.232 | 0.239 | 0.241 | 0.247 | 0.247 | 0.247 |
| ACC | 0.773 | 0.763 | 0.768 | 0.753 | 0.765 | 0.76 |
| AUC | 0.833 | 0.842 | 0.844 | 0.816 | 0.831 | 0.845 |
| SEN | 0.754 | 0.754 | 0.731 | 0.754 | 0.715 | 0.731 |
| SPE | 0.782 | 0.767 | 0.786 | 0.752 | 0.79 | 0.775 |
| CIDX | 0.823 | 0.821 | 0.826 | 0.772 | 0.756 | 0.845 |

|  | ADA_BOOST | LASSO | KNN | Gradient_Boost | RF | QDA |
|------|-----------|-------|-------|----------------|-------|-------|
| BER | 0.253 | 0.254 | 0.254 | 0.255 | 0.256 | 0.256 |
| ACC | 0.776 | 0.755 | 0.735 | 0.763 | 0.768 | 0.737 |
| AUC | 0.833 | 0.837 | 0.827 | 0.84 | 0.833 | 0.798 |
| SEN | 0.662 | 0.715 | 0.777 | 0.692 | 0.669 | 0.762 |
| SPE | 0.832 | 0.775 | 0.714 | 0.798 | 0.817 | 0.725 |
| CIDX | 0.795 | 0.823 | 0.812 | 0.836 | 0.831 | 0.744 |

|  | BeSS_Golden | BSWiMS | NAIVEBAYES_RAW | eBSWiMS |
|------|-------------|--------|----------------|---------|
| BER | 0.258 | 0.259 | 0.261 | 0.261 |
| ACC | 0.753 | 0.758 | 0.77 | 0.755 |
| AUC | 0.812 | 0.847 | 0.835 | 0.85 |
| SEN | 0.708 | 0.692 | 0.646 | 0.692 |
| SPE | 0.775 | 0.79 | 0.832 | 0.786 |
| CIDX | 0.781 | 0.841 | 0.835 | 0.842 |
Finally, we compare the features selected by the different methods by plotting their selection frequency in a heat map.
par(mfrow = c(1,1),cex = 1.0,xpd = F,pty='m', mar = c(3,3,3,3)) # Restore standard margins
# The QDA, LDA, and Naive Bayes models reused the BSWiMS feature sets,
# so their selection-frequency columns are redundant and removed
cp$featureSelectionFrequency$FS_QDA <- NULL
cp$featureSelectionFrequency$FS_LDA <- NULL
cp$featureSelectionFrequency$FS_NAIVEBAYES_PCA <- NULL
cp$featureSelectionFrequency$FS_NAIVEBAYES_RAW <- NULL
# Order the features by their mean selection frequency across methods
topf <- apply(cp$featureSelectionFrequency,1,mean)
gplots::heatmap.2(as.matrix(cp$featureSelectionFrequency[order(-topf),]),
                  Rowv = FALSE, dendrogram = "column", trace = "none",
                  mar = c(5,10), main = "Features",
                  cexRow = 0.5, cexCol = 0.5, srtCol = 25)
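As a small complement to the heat map, we can also list the features with the highest mean selection frequency using the topf vector computed above:
# Features with the highest average selection frequency across methods
head(sort(topf, decreasing = TRUE), 10)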