Libraries

library(FRESA.CAD)
library("mlbench")
library("ggplot2")
library(pander)
library(beepr)
library(twosamples)


models <-c(BSWiMS.model,NAIVE_BAYES,LASSO_1SE,LASSO_MIN,GLMNET_RIDGE_MIN,GLMNET_ELASTICNET_MIN)
modelsnames <- c("BSWiMS.model","NAIVE_BAYES","LASSO_1SE","LASSO_MIN",
                 "GLMNET_RIDGE_MIN","GLMNET_ELASTICNET_MIN")

20x cv using 70% training and 30% holdout (for LC models)

lc.cvlist <- list()
lc.filteredFitcv <- randomCV_V3(dataSet,
    "label",
    HLCM_EM,
    trainFraction = 0.7,
    repetitions = 20,
    method = filteredFit,
    hysteresis=0.1,
    fitmethod=glm,
    filtermethod=univariate_BinEnsemble,
    filtermethod.control = list(pvalue=0.05),
    family = "binomial")
lc.cvlist[["LC_filteredFit"]] <-lc.filteredFitcv           
i=1 #

for (model in models){
  modelname= paste0("LC_",modelsnames[i])
  cv <- randomCV_V3(dataSet,
                "label",
                HLCM_EM,
                trainSampleSets  = lc.filteredFitcv$trainSamplesSets,
                method = model,
                hysteresis=0.1)
  
  lc.cvlist[[modelname]] <-cv
  i = i+1
                    
  }
save(lc.cvlist, file = "lc.cvlist.RData")

20x cv using 70% training and 30% holdout (for vanilla models)

cvlist <- list()
filteredFitcv <- randomCV(dataSet,
               "label",
                filteredFit,
                trainSampleSets  = lc.filteredFitcv$trainSamplesSets,
                fitmethod=glm,
                filtermethod=univariate_BinEnsemble,
                filtermethod.control = list(pvalue=0.05),
                family = "binomial")

cvlist[["filteredFit"]] <-filteredFitcv           

save(filteredFitcv, file = "filteredFitcv.RData")

i=1 #starts from filteredfit
for (model in models){
  modelname= modelsnames[i]
  #beep()
  cv <- randomCV(dataSet,
                "label",
                model,
                trainSampleSets  = lc.filteredFitcv$trainSamplesSets)
  
  cvlist[[modelname]] <-cv
  i = i+1
                    
  }
save(cvlist, file = "cvlist.RData")

ROC plots (latent class AUC vs vanilla AUC)

par(mfrow = c(1,2),cex = 0.5);
#combine and adapt the cvlists into one combided 
combined.cvlist <- combine.cvlist(lc.cvlist,cvlist)
cp.combined <- BinaryBenchmark(referenceCV = combined.cvlist)

save(cp.combined, file = "cp.combined.RData")

AUC table, vanilla vs latent class classifier and mean proportion of classes formed during the CVs

load("lc.cvlist.RData")
load("cvlist.RData")
load("cp.combined.RData")

#combine and adapt the cvlists into one combided 
combined.cvlist <- combine.cvlist(lc.cvlist,cvlist)

synt_auctable <- get.combined_auctable(cp.combined,lc.cvlist)
pander::pander(synt_auctable,
               caption = "AUC table, vanilla vs latent class classifier and mean proportion of classes formed during the CV",round = 3)
AUC table, vanilla vs latent class classifier and mean proportion of classes formed during the CV (continued below)
  AUC CI LC_AUC LC_CI
filteredFit 0.728 [0.71,0.746] 0.857 [0.845,0.87]
BSWiMS.model 0.732 [0.714,0.75] 0.983 [0.979,0.987]
NAIVE_BAYES 0.676 [0.658,0.695] 0.685 [0.667,0.704]
LASSO_1SE 0.723 [0.705,0.741] 0.989 [0.985,0.993]
LASSO_MIN 0.728 [0.71,0.746] 0.836 [0.822,0.85]
GLMNET_RIDGE_MIN 0.701 [0.683,0.719] 0.724 [0.707,0.742]
GLMNET_ELASTICNET_MIN 0.701 [0.683,0.719] 0.725 [0.707,0.742]
RF 0.984 [0.98,0.988] - -
SVM 0.899 [0.889,0.91] - -
Table continues below
  train mean obs. LC1, LC2, LC3
filteredFit 2020 (90%), 220 (10%), 0 (0%)
BSWiMS.model 1664 (74%), 576 (26%), 0 (0%)
NAIVE_BAYES 2240 (100%), 0 (0%), 0 (0%)
LASSO_1SE 1051 (47%), 340 (15%), 850 (38%)
LASSO_MIN 970 (43%), 886 (40%), 383 (17%)
GLMNET_RIDGE_MIN 908 (41%), 215 (10%), 1117 (50%)
GLMNET_ELASTICNET_MIN 970 (43%), 206 (9%), 1064 (48%)
RF -
SVM -
  test mean obs. LC1, LC2, LC3
filteredFit 867 (90%), 93 (10%), 0 (0%)
BSWiMS.model 727 (76%), 233 (24%), 0 (0%)
NAIVE_BAYES 960 (100%), 0 (0%), 0 (0%)
LASSO_1SE 267 (28%), 178 (19%), 515 (54%)
LASSO_MIN 294 (31%), 463 (48%), 203 (21%)
GLMNET_RIDGE_MIN 322 (34%), 92 (10%), 546 (57%)
GLMNET_ELASTICNET_MIN 351 (37%), 89 (9%), 519 (54%)
RF -
SVM -

Statitistics to acces the difference between the classes found by the LC scheme

ks.list =  list()
dts.list = list()
wilcox.list = list()
concat.table = data.frame(matrix(ncol=length(modelsnames)+1,nrow=0,
                                   dimnames=list(NULL, c("filteredFit", modelsnames))))

for (i in 1:length(lc.cvlist)) {
  
  lc.sets <- split_df_into_lc.sets(lc.cvlist[[i]],dataSet)
  if (is.null(lc.sets)){
  concat.table[1,i] <- 0
  concat.table[2,i] <- 0
  concat.table[3,i] <- 0

  ks.list[[i]] <- 0
  dts.list[[i]] <- 0
  wilcox.list[[i]] <- 0

  }else{
  ####################### KS############################
  ks.res <- get_ks.test(lc.sets)
  ks.res <- apply(ks.res, 2,p.adjust,method ="BH")

  total.obs <- apply(matrix(dim(ks.res)), 2, prod) 
  ks.ss.obs <- sum(ks.res <0.05)
  concat.table[1,i] <- paste0(ks.ss.obs,"/",total.obs)
  #rows that have a pvalue lower than 0.05 (statistically significant) 
  ks.ss.rows<-  sort(unique(which(ks.res < 0.05) %% nrow(ks.res)))
  
  colnames(ks.res) <- apply(combn(1:max(ncol(ks.res),2), 2), 2, 
                            function(x) {paste0("LC",paste0(x,collapse = " VS ")) }) 
  
  ks.list[[i]] <- ks.res[ks.ss.rows,]
  ####################### DTS############################
  dts.res <- get_dts.test(lc.sets)
  dts.res <- apply(dts.res, 2,p.adjust,method ="BH")

  dts.ss.obs <- sum(dts.res <0.05)
  concat.table[2,i] <- paste0(dts.ss.obs,"/",total.obs)
  #rows that have a pvalue lower than 0.05 (statistically significant) 
  dts.ss.rows<-  sort(unique(which(dts.res < 0.05) %% nrow(dts.res)))
  colnames(dts.res) <- apply(combn(1:max(ncol(ks.res),2), 2), 2, 
                            function(x) {paste0("LC",paste0(x,collapse = " VS ")) }) 
  dts.list[[i]] <- dts.res[dts.ss.rows,]
  ####################### wilcox############################
  wilcox.res <- get_wilcox.test(lc.sets)
  wilcox.res <- apply(wilcox.res, 2,p.adjust,method ="BH")
  #apply(wilcox.res.adj, 2, function(x){table(x<0.5)})
  #apply(wilcox.res, 2, function(x){table(x<0.5)})
  
  wilcox.ss.obs <- sum(wilcox.res <0.05)
  concat.table[3,i] <- paste0(wilcox.ss.obs,"/",total.obs)
  #rows that have a pvalue lower than 0.05 (statistically significant) 
  wilcox.ss.rows<-  sort(unique(which(wilcox.res < 0.05) %% nrow(wilcox.res)))
  colnames(wilcox.res) <- apply(combn(1:max(ncol(ks.res),2), 2), 2, 
                            function(x) {paste0("LC",paste0(x,collapse = " VS ")) }) 
  wilcox.list[[i]] <- wilcox.res[wilcox.ss.rows,]
  wilcox.list.adj <- lapply(wilcox.list,function(x){prod(x,total.obs)} )
  
  }
}
rownames(concat.table) <- c("KS","DTS","Wilcox")

pander::pander(concat.table,
               caption = "compressed table of statistical significant features per test per method")
compressed table of statistical significant features per test per method (continued below)
  filteredFit BSWiMS.model NAIVE_BAYES LASSO_1SE LASSO_MIN
KS 5/104 3/104 0 12/312 5/104
DTS 5/104 5/104 0 14/312 4/104
Wilcox 5/104 4/104 0 12/312 4/104
  GLMNET_RIDGE_MIN GLMNET_ELASTICNET_MIN
KS 14/312 16/312
DTS 16/312 14/312
Wilcox 14/312 13/312
write.csv(concat.table,"concat.table.csv")
pander::pander(ks.res[ks.ss.rows,],
               caption = "statistical significant features using KS test", round = 3)
statistical significant features using KS test
LC1 VS 2 LC1 VS 3 LC2 VS 3
0 0 0
0 0 0
0 0 0
0 0 0
0.979 0.133 0.046
0.595 0.04 0.447
0.132 0.04 0.959
0.777 0.04 0.216
pander::pander( dts.res[dts.ss.rows,],
               caption = "statistical significant features using DTS test", round = 3)
statistical significant features using DTS test
LC1 VS 2 LC1 VS 3 LC2 VS 3
0.026 0.021 0.026
0.026 0.021 0.026
0.026 0.021 0.026
0.026 0.021 0.026
0.636 0.035 0.288
0.208 0.021 0.967
pander::pander(as.data.frame(wilcox.list[[1]]),
               caption = "statistical significant features using Wilcoxon test", round =3 )
statistical significant features using Wilcoxon test
wilcox.list[[1]]
0
0
0
0
0.003
save(ks.list, file ="ks.list.RData")
save(dts.list, file ="dts.list.RData")
save(wilcox.list , file ="wilcox.list.RData")
par(mfrow = c(1,1),
    cex = 0.7,
    xpd = T, 
    pty = 'm', #maximal plotting region
    mar = c(3,3,3,10))

cp.combined <-trim.cp(cp.combined)

prBenchmark <- plot(cp.combined)

Perfomance metrics of LC CV

pander::pander(prBenchmark$metrics,
               caption = "Lc vs vanilla Classifier Performance",round = 3)
Lc vs vanilla Classifier Performance (continued below)
  LC_LASSO_1SE LC_BSWiMS.model RF SVM LC_filteredFit
BER 0.026 0.075 0.091 0.166 0.271
ACC 0.975 0.928 0.899 0.83 0.731
AUC 0.989 0.983 0.984 0.899 0.857
SEN 0.968 0.908 0.99 0.867 0.713
SPE 0.981 0.943 0.828 0.801 0.745
CIDX 0.99 0.982 0.982 0.886 0.841
Table continues below
  BSWiMS.model filteredFit LASSO_MIN LASSO_1SE
BER 0.291 0.294 0.297 0.304
ACC 0.715 0.711 0.71 0.703
AUC 0.732 0.728 0.728 0.723
SEN 0.663 0.662 0.657 0.646
SPE 0.755 0.749 0.751 0.747
CIDX 0.728 0.721 0.722 0.721
Table continues below
  GLMNET_RIDGE_MIN GLMNET_ELASTICNET_MIN LC_NAIVE_BAYES
BER 0.336 0.337 0.351
ACC 0.668 0.668 0.651
AUC 0.701 0.701 0.685
SEN 0.629 0.627 0.635
SPE 0.699 0.699 0.662
CIDX 0.69 0.688 0.628
Table continues below
  NAIVE_BAYES LC_LASSO_MIN LC_GLMNET_ELASTICNET_MIN
BER 0.367 0.396 0.403
ACC 0.634 0.643 0.628
AUC 0.676 0.836 0.725
SEN 0.621 0.29 0.349
SPE 0.643 0.916 0.842
CIDX 0.607 0.834 0.714
  LC_GLMNET_RIDGE_MIN
BER 0.406
ACC 0.627
AUC 0.724
SEN 0.341
SPE 0.848
CIDX 0.715
metrics <- data.frame(prBenchmark$metrics)

write.csv(metrics,"synt_metrics.csv")
## [++][+][+-]( 1147 )[+][+-]( 149 )[+][+]( 39 )[+][+]( 12 )[+][+]( 2 )[+][+]( 0 )< 2082 , 2125 , 1084 , 0 >
## 
##    1    2 
## 2124 1076