Libraries

library(pander)
library(FRESA.CAD)
library("mlbench")
library(dplyr)
library(beepr)
library(twosamples)


models <- c(BSWiMS.model, NAIVE_BAYES, LASSO_1SE, LASSO_MIN,
            GLMNET_RIDGE_MIN, GLMNET_ELASTICNET_MIN)
modelsnames <- c("BSWiMS.model", "NAIVE_BAYES", "LASSO_1SE", "LASSO_MIN",
                 "GLMNET_RIDGE_MIN", "GLMNET_ELASTICNET_MIN")

20x cv using 70% training and 30% holdout (for LC models)

lc.cvlist <- list()

lc.filteredFitcv <- randomCV_V3(theDataset,
    theOutcome,
    HLCM_EM,
    trainFraction = 0.7,
    repetitions = 5,
    method = filteredFit,
    hysteresis=0.1,
    fitmethod=glm,
    filtermethod=univariate_BinEnsemble,
    filtermethod.control = list(pvalue=0.05),
    family = "binomial")




save(lc.filteredFitcv,file="LC_filteredFitcv.RDATA")        
lc.cvlist[["LC_filteredFit"]] <-lc.filteredFitcv           

i = 1 # index into modelsnames
for (model in models){
  modelname = paste0("LC_", modelsnames[i])
  
  cv <- randomCV_V3(theDataset,
                theOutcome,
                HLCM_EM,
                trainSampleSets  = lc.filteredFitcv$trainSamplesSets,
                method = model,
                hysteresis=0.1)
  
  save(cv,file=paste0(modelname,"cv.RDATA"))        
            
  lc.cvlist[[modelname]] <-cv
  i = i+1
                    
  }

save(lc.cvlist, file = "lc.cvlist.RData")
cvlist <- list()
i = 1 # model index; filteredFit is run first, outside the loop
filteredFitcv <- randomCV(AdjustedFrame,
               "theClass",
                filteredFit,
                trainFraction = 0.7,
                repetitions = 20,
                fitmethod=glm,
                filtermethod=univariate_BinEnsemble,
                filtermethod.control = list(pvalue=0.05),
                family = "binomial")

save(filteredFitcv,file="filteredFitcv.RDATA")        
cvlist[["filteredFit"]] <-filteredFitcv           

for (model in models){
  modelname= modelsnames[i]
  
  cv <- randomCV(theDataset,
              theOutcome,
                model,
                trainSampleSets  = filteredFitcv$trainSamplesSets)
  
  save(cv,file=paste0(modelname,"cv.RDATA"))        
            
  cvlist[[modelname]] <-cv
  i = i+1
                    
  }

save(cvlist, file = "cvlist.RData")

ROC plots (latent class AUC vs vanilla AUC)

par(mfrow = c(1,2), cex = 0.5)
# combine and adapt the cv lists into one combined list
combined.cvlist <- combine.cvlist(lc.cvlist, cvlist)
cp.combined <- BinaryBenchmark(referenceCV = combined.cvlist)

save(cp.combined, file = "cp.combined.RData")
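
combine.cvlist, used above, is a project helper rather than a FRESA.CAD function. Assuming both inputs are named lists of randomCV results (the latent-class entries already carrying the "LC_" prefix), a minimal sketch is given below. BinaryBenchmark also fits its own set of reference classifiers on the same partitions, which is presumably where the RF and SVM rows in the tables below come from.

# Sketch only (assumption): merge the two named lists of randomCV objects so
# BinaryBenchmark can benchmark the vanilla and latent-class runs side by side.
combine.cvlist <- function(lc.cvlist, cvlist) {
  c(cvlist, lc.cvlist)
}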

AUC table, vanilla vs latent class classifier and mean proportion of classes formed during the CVs

load("lc.cvlist.RData")
load("cvlist.RData")
load("cp.combined.RData")

# combine and adapt the cv lists into one combined list
combined.cvlist <- combine.cvlist(lc.cvlist,cvlist)

synt_auctable <- get.combined_auctable(cp.combined,lc.cvlist)
pander::pander(synt_auctable,
               caption = "AUC table, vanilla vs latent class classifier and mean proportion of classes formed during the CV",round = 3)
AUC table, vanilla vs latent class classifier and mean proportion of classes formed during the CV

| Model                 | AUC   | CI            | LC_AUC | LC_CI         | train mean obs. LC1, LC2, LC3  | test mean obs. LC1, LC2, LC3   |
|-----------------------|-------|---------------|--------|---------------|--------------------------------|--------------------------------|
| filteredFit           | 0.975 | [0.971,0.979] | 0.964  | [0.956,0.972] | 4593 (92%), 377 (8%), 0 (0%)   | 2846 (93%), 218 (7%), 0 (0%)   |
| BSWiMS.model          | 0.977 | [0.974,0.98]  | 0.976  | [0.971,0.981] | 4904 (99%), 65 (1%), 1 (0%)    | 3040 (99%), 24 (1%), 0 (0%)    |
| NAIVE_BAYES           | 0.841 | [0.826,0.857] | 0.822  | [0.796,0.848] | 4962 (100%), 8 (0%), 0 (0%)    | 3061 (100%), 3 (0%), 0 (0%)    |
| LASSO_1SE             | 0.971 | [0.967,0.975] | 0.941  | [0.927,0.955] | 4523 (91%), 30 (1%), 417 (8%)  | 2597 (85%), 33 (1%), 434 (14%) |
| LASSO_MIN             | 0.97  | [0.966,0.974] | 0.958  | [0.947,0.969] | 3078 (62%), 41 (1%), 1851 (37%)| 1067 (35%), 48 (2%), 1949 (64%)|
| GLMNET_RIDGE_MIN      | 0.969 | [0.964,0.973] | 0.958  | [0.948,0.968] | 3037 (61%), 10 (0%), 1923 (39%)| 1014 (33%), 9 (0%), 2041 (67%) |
| GLMNET_ELASTICNET_MIN | 0.969 | [0.964,0.973] | 0.958  | [0.948,0.968] | 3038 (61%), 9 (0%), 1923 (39%) | 1015 (33%), 8 (0%), 2041 (67%) |
| RF                    | 0.98  | [0.976,0.984] | -      | -             | -                              | -                              |
| SVM                   | 0.98  | [0.976,0.985] | -      | -             | -                              | -                              |
write.csv(synt_auctable, "TADPOLE_merged_auctable.csv")
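
For a single entry of the table, the AUC and its confidence interval can be re-derived from the cross-validation object itself, assuming the usual randomCV output where medianTest holds the outcome in column 1 and the per-subject median test prediction in column 2:

# Sketch: reproduce one AUC/CI cell of the table from the raw CV predictions
# (assumes the standard randomCV element medianTest: outcome, prediction).
bswims.stats <- predictionStats_binary(cvlist[["BSWiMS.model"]]$medianTest,
                                       "BSWiMS.model")
bswims.stats$aucs   # AUC estimate with its confidence interval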

Statistics to assess the difference between the classes found by the LC scheme

ks.list =  list()
dts.list = list()
wilcox.list = list()
concat.table = data.frame(matrix(ncol=length(modelsnames)+1,nrow=0,
                                   dimnames=
                                   list(NULL, c("filteredFit",modelsnames))))

load("lc.sets.RData")
result.stats <- get.lc.statistics(lc.sets, AdjustedFrame)
save(result.stats, file = "result.stats.RData")
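
get.lc.statistics, used above, is a project helper. Assuming lc.sets holds, for each method, the latent-class label assigned to every row of AdjustedFrame, a minimal sketch of what it is assumed to compute is given below: per-feature two-sample KS, DTS (twosamples) and Wilcoxon rank-sum p-values between the first two latent classes. The actual helper additionally tabulates, per test and method, how many of the 352 features fall below p = 0.05, which is what concat.table reports.

# Sketch only (assumptions): lc.sets is a named list of per-observation latent
# class labels; compare latent class 1 vs latent class 2 for every numeric
# feature and return the raw p-values per test and per method.
get.lc.statistics <- function(lc.sets, data) {
  numericFeatures <- names(data)[sapply(data, is.numeric)]
  ks.list <- list(); dts.list <- list(); wilcox.list <- list()
  for (m in names(lc.sets)) {
    labels <- lc.sets[[m]]
    g1 <- data[labels == 1, , drop = FALSE]
    g2 <- data[labels == 2, , drop = FALSE]
    ks.list[[m]] <- sapply(numericFeatures,
                           function(f) ks.test(g1[[f]], g2[[f]])$p.value)
    dts.list[[m]] <- sapply(numericFeatures,
                            function(f) unname(twosamples::dts_test(g1[[f]], g2[[f]])["P-Value"]))
    wilcox.list[[m]] <- sapply(numericFeatures,
                               function(f) wilcox.test(g1[[f]], g2[[f]])$p.value)
  }
  list(ks.list = ks.list, dts.list = dts.list, wilcox.list = wilcox.list)
}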

#load("result.stats.RData")

pander::pander(result.stats$concat.table,
               caption = "compressed table of statistically significant features per test per method")
compressed table of statistically significant features per test per method

| Test   | filteredFit | BSWiMS.model | NAIVE_BAYES | LASSO_1SE | LASSO_MIN | GLMNET_RIDGE_MIN | GLMNET_ELASTICNET_MIN |
|--------|-------------|--------------|-------------|-----------|-----------|------------------|-----------------------|
| KS     | 168/352     | 128/352      | 0           | 0/352     | 0/352     | 0                | 0/352                 |
| DTS    | 136/352     | 89/352       | 0           | 0/352     | 0/352     | 0                | 0/352                 |
| Wilcox | 169/352     | 138/352      | 0           | 0/352     | 0/352     | 0                | 0/352                 |
write.csv(result.stats$concat.table,"concat.table.csv")

ks.list <- result.stats$ks.list
pander::pander(ks.list[[1]],
               caption = "statistically significant features using KS test", round = 3)
statistically significant features using KS test
(ks.res[ks.ss.rows, ]: the KS p-values of the significant features, all below 0.05; feature names are omitted in this printout)
dts.list <- result.stats$dts.list
pander::pander(dts.list[[1]],
               caption = "statistically significant features using DTS test", round = 3)
statistically significant features using DTS test
(dts.res[dts.ss.rows, ]: the DTS p-values of the significant features, all below 0.05; feature names are omitted in this printout)
wilcox.list <- result.stats$wilcox.list
pander::pander(wilcox.list[[1]],
               caption = "statistically significant features using Wilcoxon test", round = 3)
statistically significant features using Wilcoxon test
(wilcox.res[wilcox.ss.rows, ]: the Wilcoxon rank-sum p-values of the significant features, all below 0.05; feature names are omitted in this printout)
save(ks.list, file ="ks.list.RData")
save(dts.list, file ="dts.list.RData")
save(wilcox.list , file ="wilcox.list.RData")
par(mfrow = c(1,1),
    cex = 0.7,
    xpd = TRUE,
    pty = 'm',          # maximal plotting region
    mar = c(3,3,3,10))

cp.combined <- trim.cp(cp.combined)

prBenchmark <- plot(cp.combined)

Performance metrics of the LC CV

pander::pander(prBenchmark$metrics,
               caption = "LC vs vanilla classifier performance", round = 3)
LC vs vanilla classifier performance
(BER: balanced error rate; ACC: accuracy; SEN: sensitivity; SPE: specificity; CIDX: concordance index)

| Model                    | BER   | ACC   | AUC   | SEN   | SPE   | CIDX  |
|--------------------------|-------|-------|-------|-------|-------|-------|
| RF                       | 0.077 | 0.925 | 0.98  | 0.919 | 0.926 | 0.98  |
| SVM                      | 0.084 | 0.937 | 0.98  | 0.892 | 0.94  | 0.98  |
| BSWiMS.model             | 0.086 | 0.919 | 0.977 | 0.904 | 0.922 | 0.975 |
| filteredFit              | 0.087 | 0.925 | 0.975 | 0.894 | 0.931 | 0.971 |
| LC_BSWiMS.model          | 0.093 | 0.917 | 0.976 | 0.897 | 0.919 | 0.975 |
| LC_filteredFit           | 0.094 | 0.921 | 0.964 | 0.886 | 0.923 | 0.961 |
| LC_LASSO_MIN             | 0.115 | 0.845 | 0.958 | 0.93  | 0.839 | 0.956 |
| LC_GLMNET_RIDGE_MIN      | 0.115 | 0.849 | 0.958 | 0.924 | 0.844 | 0.957 |
| LC_GLMNET_ELASTICNET_MIN | 0.116 | 0.849 | 0.958 | 0.924 | 0.844 | 0.958 |
| NAIVE_BAYES              | 0.308 | 0.879 | 0.841 | 0.416 | 0.969 | 0.79  |
| LC_NAIVE_BAYES           | 0.323 | 0.92  | 0.822 | 0.398 | 0.956 | 0.819 |
| LC_LASSO_1SE             | 0.352 | 0.355 | 0.941 | 0.984 | 0.312 | 0.963 |
| LASSO_MIN                | 0.369 | 0.383 | 0.97  | 1     | 0.264 | 0.97  |
| GLMNET_ELASTICNET_MIN    | 0.375 | 0.371 | 0.969 | 1     | 0.249 | 0.968 |
| GLMNET_RIDGE_MIN         | 0.375 | 0.371 | 0.969 | 1     | 0.249 | 0.968 |
| LASSO_1SE                | 0.385 | 0.356 | 0.971 | 1     | 0.231 | 0.97  |
metrics <- data.frame(prBenchmark$metrics)

write.csv(metrics,"TADPOLE_metrics.csv")
beep()