We will show hot to integrate caret models into FRESA benchmarking
library("FRESA.CAD")
library("mlbench")
library(fastAdaboost)
library(gbm)
library(caret)
library(doParallel)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
Loading the data from the mlbech package.
data(Sonar, package = "mlbench")
Sonar$Class <- 1*(Sonar$Class == "M")
table(Sonar$Class)
We will set the experiment name. The number of times that the experiment will be carried out. And the training fraction
SonarF <- Sonar
SonarF$Class <- as.factor(SonarF$Class)
ExperimentName <- "Sonar_Caret"
theData <- Sonar;
theOutcome <- "Class";
reps <- 20;
fraction <- 0.75;
tranSet <- sample(nrow(SonarF),fraction*nrow(SonarF))
sonarTrain <- SonarF[tranSet,]
sonarTest <- SonarF[-tranSet,]
sonarNTrain <- Sonar[tranSet,]
sonarNTest <- Sonar[-tranSet,]
CVFileName <- paste(ExperimentName,"CVMethod_CARET_web.RDATA",sep = "_")
tunningctrl <- trainControl(
method = "repeatedcv",
number = 5,
repeats = 3
)
noTuningControl <- trainControl(method = "none")
set.seed(107)
gbm_fit <- train(Class ~ .,sonarTrain,
method = "gbm",
trControl = tunningctrl,
preProc = c("center", "scale"),
verbose = FALSE)
set.seed(107)
avNNet_fit <- train(Class ~ .,sonarTrain,
method = "avNNet",
trControl = tunningctrl,
trace = FALSE
)
set.seed(107)
lda2_fit <- train(Class ~ .,sonarTrain,
method = "lda2",
trControl = tunningctrl
)
set.seed(107)
mda_fit <- train(Class ~ .,sonarTrain,
method = "mda",
trControl = tunningctrl
)
LCmodel <- HLCM_EM(Class ~ .,sonarNTrain)
par(mfrow = c(2,2),cex = 0.6);
gbm_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
predict(gbm_fit,sonarTest,type="prob")[,"1"]),
"caret gbm",
cex=0.8)
#> caret gbm
avNNet_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
predict(avNNet_fit,sonarTest,type="prob")[,"1"]),
"caret avNNet",
cex=0.8)
#> caret avNNet
lda2_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
predict(lda2_fit,sonarTest,type="prob")[,"1"]),
"caret lda2",
cex=0.8)
#> caret lda2
mda_bs <- predictionStats_binary(cbind(as.numeric(as.character(sonarTest$Class)),
predict(mda_fit,sonarTest,type="prob")[,"1"]),
"caret Mixture Discriminant Analysis",
cex=0.8)
#> caret Mixture Discriminant Analysis
HLCM_EM_bs <- predictionStats_binary(cbind(sonarNTest$Class,
predict(LCmodel,sonarNTest)),
"Latent Class Model",
cex=0.8)
#> Latent Class Model
par(mfrow = c(1,1),cex = 1.0);
resamps <- resamples(list(gbm = gbm_fit, avNNet = avNNet_fit,lda2=lda2_fit,mda_fit))
dotplot(resamps,metric="Kappa")
pander::pander(summary(resamps))
values:
| gbm~Accuracy | gbm~Kappa | avNNet~Accuracy | avNNet~Kappa | lda2~Accuracy |
|---|---|---|---|---|
| 0.8125 | 0.6206 | 0.625 | 0.2411 | 0.5625 |
| 0.8065 | 0.6092 | 0.871 | 0.7362 | 0.7419 |
| 0.8387 | 0.6764 | 0.9355 | 0.8703 | 0.8065 |
| 0.8333 | 0.6606 | 0.9 | 0.7964 | 0.8 |
| 0.9355 | 0.8708 | 0.8387 | 0.6764 | 0.7419 |
| 0.6875 | 0.3774 | 0.7188 | 0.4331 | 0.5625 |
| 0.9355 | 0.8697 | 0.9355 | 0.8697 | 0.7742 |
| 0.8438 | 0.685 | 0.8438 | 0.6825 | 0.7812 |
| 0.8065 | 0.6092 | 0.8065 | 0.6043 | 0.6774 |
| 0.8387 | 0.6804 | 0.8387 | 0.6723 | 0.8387 |
| 0.871 | 0.7362 | 0.7419 | 0.4585 | 0.6129 |
| 0.8065 | 0.5991 | 0.871 | 0.7362 | 0.7097 |
| 0.7812 | 0.552 | 0.7812 | 0.5556 | 0.625 |
| 0.6774 | 0.3487 | 0.7742 | 0.5353 | 0.5806 |
| 0.8387 | 0.6681 | 0.8065 | 0.6043 | 0.7419 |
| lda2~Kappa | Model4~Accuracy | Model4~Kappa |
|---|---|---|
| 0.1284 | 0.625 | 0.2529 |
| 0.479 | 0.7419 | 0.4723 |
| 0.6125 | 0.8387 | 0.6791 |
| 0.5982 | 0.8333 | 0.6637 |
| 0.4812 | 0.8387 | 0.6764 |
| 0.1351 | 0.625 | 0.2587 |
| 0.5353 | 0.7097 | 0.3948 |
| 0.5625 | 0.75 | 0.4941 |
| 0.3568 | 0.7097 | 0.4101 |
| 0.6764 | 0.8387 | 0.6681 |
| 0.177 | 0.8387 | 0.6764 |
| 0.4175 | 0.7097 | 0.4175 |
| 0.2289 | 0.6875 | 0.3676 |
| 0.1587 | 0.5484 | 0.09959 |
| 0.4723 | 0.8065 | 0.5991 |
call: summary.resamples(object = resamps)
statistics:
Accuracy:
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. | NA’s | |
|---|---|---|---|---|---|---|---|
| gbm | 0.6774 | 0.8065 | 0.8333 | 0.8209 | 0.8412 | 0.9355 | 0 |
| avNNet | 0.625 | 0.7777 | 0.8387 | 0.8192 | 0.871 | 0.9355 | 0 |
| lda2 | 0.5625 | 0.619 | 0.7419 | 0.7038 | 0.7777 | 0.8387 | 0 |
| Model4 | 0.5484 | 0.6986 | 0.7419 | 0.7401 | 0.836 | 0.8387 | 0 |
Kappa:
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. | NA’s | |
|---|---|---|---|---|---|---|---|
| gbm | 0.3487 | 0.6042 | 0.6606 | 0.6376 | 0.6827 | 0.8708 | 0 |
| avNNet | 0.2411 | 0.5454 | 0.6723 | 0.6315 | 0.7362 | 0.8703 | 0 |
| lda2 | 0.1284 | 0.203 | 0.4723 | 0.4013 | 0.5489 | 0.6764 | 0 |
| Model4 | 0.09959 | 0.3812 | 0.4723 | 0.4754 | 0.6659 | 0.6791 | 0 |
models: gbm, avNNet, lda2 and Model4
metrics: Accuracy and Kappa
methods:
| gbm | avNNet | lda2 | |
|---|---|---|---|
| gbm | avNNet | lda2 | mda |
HouldOutaccuracys <- rbind(gbm = gbm_bs$accc,
avNNet = avNNet_bs$accc,
lda2=lda2_bs$accc,
mda=mda_bs$accc,
HLCM_EM=HLCM_EM_bs$accc)
pander::pander(HouldOutaccuracys)
| est | lower | upper | |
|---|---|---|---|
| gbm | 0.8462 | 0.7192 | 0.9312 |
| avNNet | 0.8269 | 0.6967 | 0.9177 |
| lda2 | 0.8462 | 0.7192 | 0.9312 |
| mda | 0.8269 | 0.6967 | 0.9177 |
| HLCM_EM | 0.75 | 0.6105 | 0.8597 |
Let us do a repeated cross validation using all data
par(mfrow = c(2,2),cex = 0.6);
caretlda2cv <- randomCV(theData,theOutcome,
train,
trainFraction = fraction,
repetitions = reps,
asFactor = TRUE,
method = "lda2",
preProc = c("center", "scale"),
featureSelectionFunction = univariate_Wilcoxon,
featureSelection.control = list(thr = 0.95,limit=0.1)
)
caregbmcv <- randomCV(fittingFunction=train,
trainSampleSets=caretlda2cv$trainSamplesSets,
asFactor = TRUE,
method = "gbm",
verbose = FALSE
)
careavNNetcv <- randomCV(fittingFunction=train,
trainSampleSets=caretlda2cv$trainSamplesSets,
asFactor = TRUE,
method = "avNNet",
trControl = tunningctrl,
trace = FALSE
)
caregbmNocv <- randomCV(fittingFunction=train,
trainSampleSets=caretlda2cv$trainSamplesSets,
asFactor = TRUE,
method = "gbm",
trControl = noTuningControl,
tuneGrid = data.frame(interaction.depth = 3,
n.trees = 75,
shrinkage = .1,
n.minobsinnode = 20),
verbose = FALSE
)
ADABOOSTcv <- randomCV(fittingFunction=adaboost,
trainSampleSets=caretlda2cv$trainSamplesSets,
featureSelectionFunction = univariate_Wilcoxon,
featureSelection.control = list(thr = 0.95),
asFactor = TRUE,
nIter=10
)
HCLAS_BSWiMScv <- randomCV(fittingFunction=HLCM_EM,
trainSampleSets=caretlda2cv$trainSamplesSets,hysteresis = 0.10)
par(mfrow = c(1,1),cex = 1.0);
By running the FRESA.CAD::BinaryBenchmark function will compare the performance to RF, RPART, LASSO,KNN, and SVM
cStats <- predictionStats_binary(HCLAS_BSWiMScv$medianTest,plotname = "CHK",center = TRUE,cex=0.8);
par(mfrow = c(2,2),cex = 0.6);
cp <- BinaryBenchmark(referenceCV = list(lda2=caretlda2cv,
gbm = caregbmcv,
avNNet = careavNNetcv,
gbmNot = caregbmNocv,
adaboost = ADABOOSTcv,
LCLAS = HCLAS_BSWiMScv
))
save(cp, file = CVFileName)
#load(file = CVFileName)
par(mfrow = c(1,1),cex = 1.0);
Once done, we can compare the CV test results using the plot() function. The plot function also generates summary tables of the CV results.
par(mfrow = c(1,1),cex = 1.0,xpd = T,pty='m', mar = c(3,3,3,10)) # Making space for the legend
prBenchmark <- plot(cp)
pander::pander(prBenchmark$metrics,caption = "Classifier Performance",round = 3)
| gbm | gbmNot | avNNet | RF | ENS | adaboost | KNN | lda2 | |
|---|---|---|---|---|---|---|---|---|
| BER | 0.151 | 0.161 | 0.175 | 0.181 | 0.189 | 0.195 | 0.2 | 0.212 |
| ACC | 0.851 | 0.841 | 0.827 | 0.822 | 0.812 | 0.808 | 0.803 | 0.784 |
| AUC | 0.93 | 0.916 | 0.9 | 0.927 | 0.92 | 0.904 | 0.908 | 0.864 |
| SEN | 0.892 | 0.892 | 0.883 | 0.874 | 0.856 | 0.856 | 0.847 | 0.757 |
| SPE | 0.804 | 0.784 | 0.763 | 0.763 | 0.763 | 0.753 | 0.753 | 0.814 |
| CIDX | 0.925 | 0.913 | 0.899 | 0.926 | 0.921 | 0.879 | 0.886 | 0.859 |
| RPART | LCLAS | SVM | LASSO | |
|---|---|---|---|---|
| BER | 0.227 | 0.231 | 0.232 | 0.245 |
| ACC | 0.774 | 0.769 | 0.769 | 0.755 |
| AUC | 0.818 | 0.868 | 0.886 | 0.83 |
| SEN | 0.811 | 0.766 | 0.802 | 0.748 |
| SPE | 0.732 | 0.773 | 0.732 | 0.763 |
| CIDX | 0.772 | 0.864 | 0.876 | 0.842 |
Finally, I will compare the selected features of the different methods by plotting the selection frequency in a heat-map plot
par(mfrow = c(1,1),cex = 1.0,xpd = F,pty='m', mar = c(3,3,3,3)) # Making space for the legend
cp$featureSelectionFrequency$FS_gbm <- NULL
cp$featureSelectionFrequency$FS_avNNet <- NULL
cp$featureSelectionFrequency$FS_gbmNot <- NULL
cp$featureSelectionFrequency$FS_adaboost <- NULL
topf <- apply(cp$featureSelectionFrequency,1,mean)
gplots::heatmap.2((as.matrix(cp$featureSelectionFrequency[order(-topf),])),Rowv=FALSE,dendrogram = "column",trace = "none",mar = c(5,10),main = "Features",cexRow = 0.5,cexCol = 0.5,srtCol = 25)