Libraries

library(pander)
library(FRESA.CAD)
library("mlbench")
library(dplyr)
library(beepr)
library(twosamples)


models <- c(BSWiMS.model, NAIVE_BAYES, LASSO_1SE, LASSO_MIN,
            GLMNET_RIDGE_MIN, GLMNET_ELASTICNET_MIN)
modelsnames <- c("BSWiMS.model", "NAIVE_BAYES", "LASSO_1SE", "LASSO_MIN",
                 "GLMNET_RIDGE_MIN", "GLMNET_ELASTICNET_MIN")

20x cv using 70% training and 30% holdout (for LC models)

lc.cvlist <- list()

lc.filteredFitcv <- randomCV_V3(theDataset,
    theOutcome,
    HLCM_EM,
    trainFraction = 0.7,
    repetitions = 5,
    method = filteredFit,
    hysteresis=0.1,
    fitmethod=glm,
    filtermethod=univariate_BinEnsemble,
    filtermethod.control = list(pvalue=0.05),
    family = "binomial")




save(lc.filteredFitcv,file="LC_filteredFitcv.RDATA")        
lc.cvlist[["LC_filteredFit"]] <-lc.filteredFitcv           

i = 1 # index into modelsnames
for (model in models){
  modelname = paste0("LC_", modelsnames[i])
  
  cv <- randomCV_V3(theDataset,
                theOutcome,
                HLCM_EM,
                trainSampleSets  = lc.filteredFitcv$trainSamplesSets,
                method = model,
                hysteresis=0.1)
  
  save(cv,file=paste0(modelname,"cv.RDATA"))        
            
  lc.cvlist[[modelname]] <-cv
  i = i+1
                    
  }

save(lc.cvlist, file = "lc.cvlist.RData")
cvlist <- list()
i = 1 # model index; filteredFit is run first, outside the loop
filteredFitcv <- randomCV(AdjustedFrame,
               "theClass",
                filteredFit,
                trainFraction = 0.7,
                repetitions = 20,
                fitmethod=glm,
                filtermethod=univariate_BinEnsemble,
                filtermethod.control = list(pvalue=0.05),
                family = "binomial")

save(filteredFitcv,file="filteredFitcv.RDATA")        
cvlist[["filteredFit"]] <-filteredFitcv           

for (model in models){
  modelname= modelsnames[i]
  
  cv <- randomCV(theDataset,
              theOutcome,
                model,
                trainSampleSets  = filteredFitcv$trainSamplesSets)
  
  save(cv,file=paste0(modelname,"cv.RDATA"))        
            
  cvlist[[modelname]] <-cv
  i = i+1
                    
  }

save(cvlist, file = "cvlist.RData")

ROC plots (latent class AUC vs vanilla AUC)

par(mfrow = c(1,2), cex = 0.5)
# combine and adapt the cv lists into one combined list
combined.cvlist <- combine.cvlist(lc.cvlist, cvlist)
cp.combined <- BinaryBenchmark(referenceCV = combined.cvlist)

save(cp.combined, file = "cp.combined.RData")
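
combine.cvlist, used above, is a project helper rather than a FRESA.CAD function. Assuming both inputs are named lists of randomCV results (the latent-class entries already carrying the "LC_" prefix), a minimal sketch is given below. BinaryBenchmark also fits its own set of reference classifiers on the same partitions, which is presumably where the RF and SVM rows in the tables below come from.

# Sketch only (assumption): merge the two named lists of randomCV objects so
# BinaryBenchmark can benchmark the vanilla and latent-class runs side by side.
combine.cvlist <- function(lc.cvlist, cvlist) {
  c(cvlist, lc.cvlist)
}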

AUC table, vanilla vs latent class classifier and mean proportion of classes formed during the CVs

load("lc.cvlist.RData")
load("cvlist.RData")
load("cp.combined.RData")

# combine and adapt the cv lists into one combined list
combined.cvlist <- combine.cvlist(lc.cvlist,cvlist)

synt_auctable <- get.combined_auctable(cp.combined,lc.cvlist)
pander::pander(synt_auctable,
               caption = "AUC table, vanilla vs latent class classifier and mean proportion of classes formed during the CV",round = 3)
AUC table, vanilla vs latent class classifier and mean proportion of classes formed during the CV

| Model                 | AUC   | CI            | LC_AUC | LC_CI         | train mean obs. LC1, LC2, LC3  | test mean obs. LC1, LC2, LC3   |
|-----------------------|-------|---------------|--------|---------------|--------------------------------|--------------------------------|
| filteredFit           | 0.975 | [0.971,0.979] | 0.964  | [0.956,0.972] | 4593 (92%), 377 (8%), 0 (0%)   | 2846 (93%), 218 (7%), 0 (0%)   |
| BSWiMS.model          | 0.977 | [0.974,0.98]  | 0.976  | [0.971,0.981] | 4904 (99%), 65 (1%), 1 (0%)    | 3040 (99%), 24 (1%), 0 (0%)    |
| NAIVE_BAYES           | 0.841 | [0.826,0.857] | 0.822  | [0.796,0.848] | 4962 (100%), 8 (0%), 0 (0%)    | 3061 (100%), 3 (0%), 0 (0%)    |
| LASSO_1SE             | 0.971 | [0.967,0.975] | 0.941  | [0.927,0.955] | 4523 (91%), 30 (1%), 417 (8%)  | 2597 (85%), 33 (1%), 434 (14%) |
| LASSO_MIN             | 0.97  | [0.966,0.974] | 0.958  | [0.947,0.969] | 3078 (62%), 41 (1%), 1851 (37%)| 1067 (35%), 48 (2%), 1949 (64%)|
| GLMNET_RIDGE_MIN      | 0.969 | [0.964,0.973] | 0.958  | [0.948,0.968] | 3037 (61%), 10 (0%), 1923 (39%)| 1014 (33%), 9 (0%), 2041 (67%) |
| GLMNET_ELASTICNET_MIN | 0.969 | [0.964,0.973] | 0.958  | [0.948,0.968] | 3038 (61%), 9 (0%), 1923 (39%) | 1015 (33%), 8 (0%), 2041 (67%) |
| RF                    | 0.98  | [0.976,0.984] | -      | -             | -                              | -                              |
| SVM                   | 0.98  | [0.976,0.985] | -      | -             | -                              | -                              |
write.csv(synt_auctable, "TADPOLE_merged_auctable.csv")
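
For a single entry of the table, the AUC and its confidence interval can be re-derived from the cross-validation object itself, assuming the usual randomCV output where medianTest holds the outcome in column 1 and the per-subject median test prediction in column 2:

# Sketch: reproduce one AUC/CI cell of the table from the raw CV predictions
# (assumes the standard randomCV element medianTest: outcome, prediction).
bswims.stats <- predictionStats_binary(cvlist[["BSWiMS.model"]]$medianTest,
                                       "BSWiMS.model")
bswims.stats$aucs   # AUC estimate with its confidence interval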

Statistics to assess the difference between the classes found by the LC scheme

ks.list =  list()
dts.list = list()
wilcox.list = list()
concat.table = data.frame(matrix(ncol=length(modelsnames)+1,nrow=0,
                                   dimnames=
                                   list(NULL, c("filteredFit",modelsnames))))

load("lc.sets.RData")
result.stats <- get.lc.statistics(lc.sets, AdjustedFrame)
save(result.stats, file = "result.stats.RData")
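
get.lc.statistics, used above, is a project helper. Assuming lc.sets holds, for each method, the latent-class label assigned to every row of AdjustedFrame, a minimal sketch of what it is assumed to compute is given below: per-feature two-sample KS, DTS (twosamples) and Wilcoxon rank-sum p-values between the first two latent classes. The actual helper additionally tabulates, per test and method, how many of the 352 features fall below p = 0.05, which is what concat.table reports.

# Sketch only (assumptions): lc.sets is a named list of per-observation latent
# class labels; compare latent class 1 vs latent class 2 for every numeric
# feature and return the raw p-values per test and per method.
get.lc.statistics <- function(lc.sets, data) {
  numericFeatures <- names(data)[sapply(data, is.numeric)]
  ks.list <- list(); dts.list <- list(); wilcox.list <- list()
  for (m in names(lc.sets)) {
    labels <- lc.sets[[m]]
    g1 <- data[labels == 1, , drop = FALSE]
    g2 <- data[labels == 2, , drop = FALSE]
    ks.list[[m]] <- sapply(numericFeatures,
                           function(f) ks.test(g1[[f]], g2[[f]])$p.value)
    dts.list[[m]] <- sapply(numericFeatures,
                            function(f) unname(twosamples::dts_test(g1[[f]], g2[[f]])["P-Value"]))
    wilcox.list[[m]] <- sapply(numericFeatures,
                               function(f) wilcox.test(g1[[f]], g2[[f]])$p.value)
  }
  list(ks.list = ks.list, dts.list = dts.list, wilcox.list = wilcox.list)
}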

#load("result.stats.RData")

pander::pander(result.stats$concat.table,
               caption = "compressed table of statistically significant features per test per method")
compressed table of statistically significant features per test per method

| Test   | filteredFit | BSWiMS.model | NAIVE_BAYES | LASSO_1SE | LASSO_MIN | GLMNET_RIDGE_MIN | GLMNET_ELASTICNET_MIN |
|--------|-------------|--------------|-------------|-----------|-----------|------------------|-----------------------|
| KS     | 168/352     | 128/352      | 0           | 0/352     | 0/352     | 0                | 0/352                 |
| DTS    | 136/352     | 89/352       | 0           | 0/352     | 0/352     | 0                | 0/352                 |
| Wilcox | 169/352     | 138/352      | 0           | 0/352     | 0/352     | 0                | 0/352                 |
write.csv(result.stats$concat.table,"concat.table.csv")

ks.list <- result.stats$ks.list
pander::pander(ks.list[[1]],
               caption = "statistically significant features using KS test", round = 3)
statistically significant features using KS test
(ks.res[ks.ss.rows, ]: the KS p-values of the significant features, all below 0.05; feature names are omitted in this printout)
dts.list <- result.stats$dts.list
pander::pander(dts.list[[1]],
               caption = "statistically significant features using DTS test", round = 3)
statistically significant features using DTS test
(dts.res[dts.ss.rows, ]: the DTS p-values of the significant features, all below 0.05; feature names are omitted in this printout)
wilcox.list <- result.stats$wilcox.list
pander::pander(wilcox.list[[1]],
               caption = "statistically significant features using Wilcoxon test", round = 3)
statistically significant features using Wilcoxon test
(wilcox.res[wilcox.ss.rows, ]: the Wilcoxon rank-sum p-values of the significant features, all below 0.05; feature names are omitted in this printout)
save(ks.list, file ="ks.list.RData")
save(dts.list, file ="dts.list.RData")
save(wilcox.list , file ="wilcox.list.RData")
par(mfrow = c(1,1),
    cex = 0.7,
    xpd = TRUE,
    pty = 'm',          # maximal plotting region
    mar = c(3,3,3,10))

cp.combined <- trim.cp(cp.combined)

prBenchmark <- plot(cp.combined)

Performance metrics of the LC CV

pander::pander(prBenchmark$metrics,
               caption = "LC vs vanilla classifier performance", round = 3)
LC vs vanilla classifier performance
(BER: balanced error rate; ACC: accuracy; SEN: sensitivity; SPE: specificity; CIDX: concordance index)

| Model                    | BER   | ACC   | AUC   | SEN   | SPE   | CIDX  |
|--------------------------|-------|-------|-------|-------|-------|-------|
| RF                       | 0.077 | 0.925 | 0.98  | 0.919 | 0.926 | 0.98  |
| SVM                      | 0.084 | 0.937 | 0.98  | 0.892 | 0.94  | 0.98  |
| BSWiMS.model             | 0.086 | 0.919 | 0.977 | 0.904 | 0.922 | 0.975 |
| filteredFit              | 0.087 | 0.925 | 0.975 | 0.894 | 0.931 | 0.971 |
| LC_BSWiMS.model          | 0.093 | 0.917 | 0.976 | 0.897 | 0.919 | 0.975 |
| LC_filteredFit           | 0.094 | 0.921 | 0.964 | 0.886 | 0.923 | 0.961 |
| LC_LASSO_MIN             | 0.115 | 0.845 | 0.958 | 0.93  | 0.839 | 0.956 |
| LC_GLMNET_RIDGE_MIN      | 0.115 | 0.849 | 0.958 | 0.924 | 0.844 | 0.957 |
| LC_GLMNET_ELASTICNET_MIN | 0.116 | 0.849 | 0.958 | 0.924 | 0.844 | 0.958 |
| NAIVE_BAYES              | 0.308 | 0.879 | 0.841 | 0.416 | 0.969 | 0.79  |
| LC_NAIVE_BAYES           | 0.323 | 0.92  | 0.822 | 0.398 | 0.956 | 0.819 |
| LC_LASSO_1SE             | 0.352 | 0.355 | 0.941 | 0.984 | 0.312 | 0.963 |
| LASSO_MIN                | 0.369 | 0.383 | 0.97  | 1     | 0.264 | 0.97  |
| GLMNET_ELASTICNET_MIN    | 0.375 | 0.371 | 0.969 | 1     | 0.249 | 0.968 |
| GLMNET_RIDGE_MIN         | 0.375 | 0.371 | 0.969 | 1     | 0.249 | 0.968 |
| LASSO_1SE                | 0.385 | 0.356 | 0.971 | 1     | 0.231 | 0.97  |
metrics <- data.frame(prBenchmark$metrics)

write.csv(metrics,"TADPOLE_metrics.csv")
beep()