1 CARET FRESA.CAD Sonar Benchmark

We will show hot to integrate caret models into FRESA benchmarking

1.1 The required libraries

library("FRESA.CAD")
library("mlbench")
library(fastAdaboost)
library(gbm)
library(caret)
library(doParallel)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)

1.2 Sonar Data Set

Loading the data from the mlbech package.

data(Sonar, package = "mlbench")
Sonar$Class <- 1*(Sonar$Class == "M")
table(Sonar$Class)

We will set the experiment name. The number of times that the experiment will be carried out. And the training fraction

SonarF <- Sonar
SonarF$Class <- as.factor(SonarF$Class)


ExperimentName <- "Sonar_Caret"
theData <- Sonar;
theOutcome <- "Class";
reps <- 20;
fraction <- 0.75;

tranSet <- sample(nrow(SonarF),fraction*nrow(SonarF))
sonarTrain <- SonarF[tranSet,]
sonarTest <- SonarF[-tranSet,]

sonarNTrain <- Sonar[tranSet,]
sonarNTest <- Sonar[-tranSet,]

CVFileName <- paste(ExperimentName,"CVMethod_CARET_web.RDATA",sep = "_")

1.3 Caret taining control


tunningctrl <- trainControl(
  method = "repeatedcv", 
  number = 5,
  repeats = 3
)

noTuningControl <- trainControl(method = "none")

1.4 Simple hold-out validation of caret methods


set.seed(107)
gbm_fit <- train(Class ~ .,sonarTrain, 
             method = "gbm",  
             trControl = tunningctrl,
             preProc = c("center", "scale"),
             verbose = FALSE)


set.seed(107)
avNNet_fit <- train(Class ~ .,sonarTrain, 
             method = "avNNet",
             trControl = tunningctrl,
             trace = FALSE
             )


set.seed(107)
lda2_fit <- train(Class ~ .,sonarTrain, 
             method = "lda2",
             trControl = tunningctrl
             )

set.seed(107)
mda_fit <- train(Class ~ .,sonarTrain, 
             method = "mda",
             trControl = tunningctrl
             )

1.4.1 Compare to FRESA.CAD latent class modeling

LCmodel <- HLCM_EM(Class ~ .,sonarNTrain)

1.4.2 ROC Plots and performance

par(mfrow = c(2,2),cex = 0.6);

gbm_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                   predict(gbm_fit,sonarTest,type="prob")[,"1"]),
                                 "caret gbm",
                                 cex=0.8)
#> caret gbm
avNNet_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                   predict(avNNet_fit,sonarTest,type="prob")[,"1"]),
                                   "caret avNNet",
                                   cex=0.8)
#> caret avNNet
lda2_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                   predict(lda2_fit,sonarTest,type="prob")[,"1"]),
                                  "caret lda2",
                                  cex=0.8)
#> caret lda2
mda_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
                                   predict(mda_fit,sonarTest,type="prob")[,"1"]),
                                 "caret Mixture Discriminant Analysis",
                                 cex=0.8)
#> caret Mixture Discriminant Analysis

HLCM_EM_bs <- predictionStats_binary(cbind(sonarNTest$Class,
                                   predict(LCmodel,sonarNTest)),
                                   "Latent Class Model",
                                   cex=0.8)
#> Latent Class Model

par(mfrow = c(1,1),cex = 1.0);

1.4.3 Print the results


resamps <- resamples(list(gbm = gbm_fit, avNNet = avNNet_fit,lda2=lda2_fit,mda_fit))
dotplot(resamps,metric="Kappa")


pander::pander(summary(resamps))

values:

Table continues below
gbm~Accuracy	gbm~Kappa	avNNet~Accuracy	avNNet~Kappa	lda2~Accuracy
0.8125	0.6206	0.625	0.2411	0.5625
0.8065	0.6092	0.871	0.7362	0.7419
0.8387	0.6764	0.9355	0.8703	0.8065
0.8333	0.6606	0.9	0.7964	0.8
0.9355	0.8708	0.8387	0.6764	0.7419
0.6875	0.3774	0.7188	0.4331	0.5625
0.9355	0.8697	0.9355	0.8697	0.7742
0.8438	0.685	0.8438	0.6825	0.7812
0.8065	0.6092	0.8065	0.6043	0.6774
0.8387	0.6804	0.8387	0.6723	0.8387
0.871	0.7362	0.7419	0.4585	0.6129
0.8065	0.5991	0.871	0.7362	0.7097
0.7812	0.552	0.7812	0.5556	0.625
0.6774	0.3487	0.7742	0.5353	0.5806
0.8387	0.6681	0.8065	0.6043	0.7419

lda2~Kappa	Model4~Accuracy	Model4~Kappa
0.1284	0.625	0.2529
0.479	0.7419	0.4723
0.6125	0.8387	0.6791
0.5982	0.8333	0.6637
0.4812	0.8387	0.6764
0.1351	0.625	0.2587
0.5353	0.7097	0.3948
0.5625	0.75	0.4941
0.3568	0.7097	0.4101
0.6764	0.8387	0.6681
0.177	0.8387	0.6764
0.4175	0.7097	0.4175
0.2289	0.6875	0.3676
0.1587	0.5484	0.09959
0.4723	0.8065	0.5991

call: summary.resamples(object = resamps)

statistics:

Accuracy:

	Min.	1st Qu.	Median	Mean	3rd Qu.	Max.
gbm	0.6774	0.8065	0.8333	0.8209	0.8412	0.9355
avNNet	0.625	0.7777	0.8387	0.8192	0.871	0.9355
lda2	0.5625	0.619	0.7419	0.7038	0.7777	0.8387
Model4	0.5484	0.6986	0.7419	0.7401	0.836	0.8387

Kappa:

	Min.	1st Qu.	Median	Mean	3rd Qu.	Max.
gbm	0.3487	0.6042	0.6606	0.6376	0.6827	0.8708
avNNet	0.2411	0.5454	0.6723	0.6315	0.7362	0.8703
lda2	0.1284	0.203	0.4723	0.4013	0.5489	0.6764
Model4	0.09959	0.3812	0.4723	0.4754	0.6659	0.6791

models: gbm, avNNet, lda2 and Model4
metrics: Accuracy and Kappa
methods:

gbm avNNet lda2

gbm avNNet lda2 mda

gbm	avNNet	lda2
gbm	avNNet	lda2	mda


HouldOutaccuracys <- rbind(gbm = gbm_bs$accc, 
                           avNNet = avNNet_bs$accc,
                           lda2=lda2_bs$accc,
                           mda=mda_bs$accc,
                           HLCM_EM=HLCM_EM_bs$accc)
pander::pander(HouldOutaccuracys)

	est	lower	upper
gbm	0.8462	0.7192	0.9312
avNNet	0.8269	0.6967	0.9177
lda2	0.8462	0.7192	0.9312
mda	0.8269	0.6967	0.9177
HLCM_EM	0.75	0.6105	0.8597

1.5 FRESA.CAD Cross-validation of Caret methods

Let us do a repeated cross validation using all data

par(mfrow = c(2,2),cex = 0.6);
caretlda2cv <- randomCV(theData,theOutcome,
                  train,
                  trainFraction = fraction,
                  repetitions = reps,
                  asFactor = TRUE,  
                  method = "lda2",
                  preProc = c("center", "scale"),
                  featureSelectionFunction = univariate_Wilcoxon,
                  featureSelection.control = list(thr = 0.95,limit=0.1)
                  )


caregbmcv <- randomCV(fittingFunction=train,
                  trainSampleSets=caretlda2cv$trainSamplesSets,
                  asFactor = TRUE,
                  method = "gbm",
                  verbose = FALSE
                  )


careavNNetcv <- randomCV(fittingFunction=train,
                  trainSampleSets=caretlda2cv$trainSamplesSets,
                  asFactor = TRUE,
                  method = "avNNet",
                  trControl = tunningctrl,
                  trace = FALSE
                  )


caregbmNocv <- randomCV(fittingFunction=train,
                  trainSampleSets=caretlda2cv$trainSamplesSets,
                  asFactor = TRUE,
                  method = "gbm",
                  trControl = noTuningControl,
                  tuneGrid = data.frame(interaction.depth = 3,
                                       n.trees = 75,
                                       shrinkage = .1,
                                       n.minobsinnode = 20),
                  verbose = FALSE
                  )

ADABOOSTcv <- randomCV(fittingFunction=adaboost,
                  trainSampleSets=caretlda2cv$trainSamplesSets,
                  featureSelectionFunction = univariate_Wilcoxon,
                  featureSelection.control = list(thr = 0.95),
                  asFactor = TRUE,
                  nIter=10
)


HCLAS_BSWiMScv <- randomCV(fittingFunction=HLCM_EM,
                   trainSampleSets=caretlda2cv$trainSamplesSets,hysteresis = 0.10)
par(mfrow = c(1,1),cex = 1.0);

1.6 FRESA.CAD::BinaryBenchmark and Comparing Methods

By running the FRESA.CAD::BinaryBenchmark function will compare the performance to RF, RPART, LASSO,KNN, and SVM


cStats <- predictionStats_binary(HCLAS_BSWiMScv$medianTest,plotname = "CHK",center = TRUE,cex=0.8);


  
par(mfrow = c(2,2),cex = 0.6);
cp <- BinaryBenchmark(referenceCV = list(lda2=caretlda2cv,
                                         gbm = caregbmcv,
                                         avNNet = careavNNetcv,
                                         gbmNot = caregbmNocv,
                                         adaboost = ADABOOSTcv,
                                         LCLAS = HCLAS_BSWiMScv
                                         ))


save(cp, file = CVFileName)
#load(file = CVFileName)

par(mfrow = c(1,1),cex = 1.0);

1.7 Reporting the results of the Benchmark procedure

Once done, we can compare the CV test results using the plot() function. The plot function also generates summary tables of the CV results.

par(mfrow = c(1,1),cex = 1.0,xpd = T,pty='m', mar = c(3,3,3,10)) # Making space for the legend
prBenchmark <- plot(cp)

pander::pander(prBenchmark$metrics,caption = "Classifier Performance",round = 3)

Classifier Performance (continued below)
	gbm	gbmNot	avNNet	RF	ENS	adaboost	KNN	lda2
BER	0.151	0.161	0.175	0.181	0.189	0.195	0.2	0.212
ACC	0.851	0.841	0.827	0.822	0.812	0.808	0.803	0.784
AUC	0.93	0.916	0.9	0.927	0.92	0.904	0.908	0.864
SEN	0.892	0.892	0.883	0.874	0.856	0.856	0.847	0.757
SPE	0.804	0.784	0.763	0.763	0.763	0.753	0.753	0.814
CIDX	0.925	0.913	0.899	0.926	0.921	0.879	0.886	0.859

	RPART	LCLAS	SVM	LASSO
BER	0.227	0.231	0.232	0.245
ACC	0.774	0.769	0.769	0.755
AUC	0.818	0.868	0.886	0.83
SEN	0.811	0.766	0.802	0.748
SPE	0.732	0.773	0.732	0.763
CIDX	0.772	0.864	0.876	0.842

Finally, I will compare the selected features of the different methods by plotting the selection frequency in a heat-map plot


par(mfrow = c(1,1),cex = 1.0,xpd = F,pty='m', mar = c(3,3,3,3)) # Making space for the legend

cp$featureSelectionFrequency$FS_gbm <- NULL
cp$featureSelectionFrequency$FS_avNNet <- NULL
cp$featureSelectionFrequency$FS_gbmNot <- NULL
cp$featureSelectionFrequency$FS_adaboost <- NULL
topf <- apply(cp$featureSelectionFrequency,1,mean)

gplots::heatmap.2((as.matrix(cp$featureSelectionFrequency[order(-topf),])),Rowv=FALSE,dendrogram = "column",trace = "none",mar = c(5,10),main = "Features",cexRow = 0.5,cexCol = 0.5,srtCol = 25)

Cross-Validation with CARET and FRESA.CAD

José Tamez-Peña

Feb 18, 2019