R Notebook

Libraries

library(FRESA.CAD)
library("mlbench")
library("ggplot2")
library(pander)
library(beepr)
library(twosamples)


models <-c(BSWiMS.model,NAIVE_BAYES,LASSO_1SE,LASSO_MIN,GLMNET_RIDGE_MIN,GLMNET_ELASTICNET_MIN)
modelsnames <- c("BSWiMS.model","NAIVE_BAYES","LASSO_1SE","LASSO_MIN",
                 "GLMNET_RIDGE_MIN","GLMNET_ELASTICNET_MIN")

20x cv using 70% training and 30% holdout (for LC models)

lc.cvlist <- list()
lc.filteredFitcv <- randomCV_V3(dataSet,
    "label",
    HLCM_EM,
    trainFraction = 0.7,
    repetitions = 20,
    method = filteredFit,
    hysteresis=0.1,
    fitmethod=glm,
    filtermethod=univariate_BinEnsemble,
    filtermethod.control = list(pvalue=0.05),
    family = "binomial")
lc.cvlist[["LC_filteredFit"]] <-lc.filteredFitcv           
i=1 #

for (model in models){
  modelname= paste0("LC_",modelsnames[i])
  cv <- randomCV_V3(dataSet,
                "label",
                HLCM_EM,
                trainSampleSets  = lc.filteredFitcv$trainSamplesSets,
                method = model,
                hysteresis=0.1)
  
  lc.cvlist[[modelname]] <-cv
  i = i+1
                    
  }
save(lc.cvlist, file = "lc.cvlist.RData")

20x cv using 70% training and 30% holdout (for vanilla models)

cvlist <- list()
filteredFitcv <- randomCV(dataSet,
               "label",
                filteredFit,
                trainSampleSets  = lc.filteredFitcv$trainSamplesSets,
                fitmethod=glm,
                filtermethod=univariate_BinEnsemble,
                filtermethod.control = list(pvalue=0.05),
                family = "binomial")

cvlist[["filteredFit"]] <-filteredFitcv           

save(filteredFitcv, file = "filteredFitcv.RData")

i=1 #starts from filteredfit
for (model in models){
  modelname= modelsnames[i]
  #beep()
  cv <- randomCV(dataSet,
                "label",
                model,
                trainSampleSets  = lc.filteredFitcv$trainSamplesSets)
  
  cvlist[[modelname]] <-cv
  i = i+1
                    
  }
save(cvlist, file = "cvlist.RData")

ROC plots (latent class AUC vs vanilla AUC)

par(mfrow = c(1,2),cex = 0.5);
#combine and adapt the cvlists into one combided 
combined.cvlist <- combine.cvlist(lc.cvlist,cvlist)
cp.combined <- BinaryBenchmark(referenceCV = combined.cvlist)

save(cp.combined, file = "cp.combined.RData")

AUC table, vanilla vs latent class classifier and mean proportion of classes formed during the CVs

load("lc.cvlist.RData")
load("cvlist.RData")
load("cp.combined.RData")

#combine and adapt the cvlists into one combided 
combined.cvlist <- combine.cvlist(lc.cvlist,cvlist)

synt_auctable <- get.combined_auctable(cp.combined,lc.cvlist)
pander::pander(synt_auctable,
               caption = "AUC table, vanilla vs latent class classifier and mean proportion of classes formed during the CV",round = 3)

AUC table, vanilla vs latent class classifier and mean proportion of classes formed during the CV (continued below)
	AUC	CI	LC_AUC	LC_CI
filteredFit	0.728	[0.71,0.746]	0.857	[0.845,0.87]
BSWiMS.model	0.732	[0.714,0.75]	0.983	[0.979,0.987]
NAIVE_BAYES	0.676	[0.658,0.695]	0.685	[0.667,0.704]
LASSO_1SE	0.723	[0.705,0.741]	0.989	[0.985,0.993]
LASSO_MIN	0.728	[0.71,0.746]	0.836	[0.822,0.85]
GLMNET_RIDGE_MIN	0.701	[0.683,0.719]	0.724	[0.707,0.742]
GLMNET_ELASTICNET_MIN	0.701	[0.683,0.719]	0.725	[0.707,0.742]
RF	0.984	[0.98,0.988]	-	-
SVM	0.899	[0.889,0.91]	-	-

Table continues below
	train mean obs. LC1, LC2, LC3
filteredFit	2020 (90%), 220 (10%), 0 (0%)
BSWiMS.model	1664 (74%), 576 (26%), 0 (0%)
NAIVE_BAYES	2240 (100%), 0 (0%), 0 (0%)
LASSO_1SE	1051 (47%), 340 (15%), 850 (38%)
LASSO_MIN	970 (43%), 886 (40%), 383 (17%)
GLMNET_RIDGE_MIN	908 (41%), 215 (10%), 1117 (50%)
GLMNET_ELASTICNET_MIN	970 (43%), 206 (9%), 1064 (48%)
RF	-
SVM	-

	test mean obs. LC1, LC2, LC3
filteredFit	867 (90%), 93 (10%), 0 (0%)
BSWiMS.model	727 (76%), 233 (24%), 0 (0%)
NAIVE_BAYES	960 (100%), 0 (0%), 0 (0%)
LASSO_1SE	267 (28%), 178 (19%), 515 (54%)
LASSO_MIN	294 (31%), 463 (48%), 203 (21%)
GLMNET_RIDGE_MIN	322 (34%), 92 (10%), 546 (57%)
GLMNET_ELASTICNET_MIN	351 (37%), 89 (9%), 519 (54%)
RF	-
SVM	-

Statitistics to acces the difference between the classes found by the LC scheme

ks.list =  list()
dts.list = list()
wilcox.list = list()
concat.table = data.frame(matrix(ncol=length(modelsnames)+1,nrow=0,
                                   dimnames=list(NULL, c("filteredFit", modelsnames))))

for (i in 1:length(lc.cvlist)) {
  
  lc.sets <- split_df_into_lc.sets(lc.cvlist[[i]],dataSet)
  if (is.null(lc.sets)){
  concat.table[1,i] <- 0
  concat.table[2,i] <- 0
  concat.table[3,i] <- 0

  ks.list[[i]] <- 0
  dts.list[[i]] <- 0
  wilcox.list[[i]] <- 0

  }else{
  ####################### KS############################
  ks.res <- get_ks.test(lc.sets)
  ks.res <- apply(ks.res, 2,p.adjust,method ="BH")

  total.obs <- apply(matrix(dim(ks.res)), 2, prod) 
  ks.ss.obs <- sum(ks.res <0.05)
  concat.table[1,i] <- paste0(ks.ss.obs,"/",total.obs)
  #rows that have a pvalue lower than 0.05 (statistically significant) 
  ks.ss.rows<-  sort(unique(which(ks.res < 0.05) %% nrow(ks.res)))
  
  colnames(ks.res) <- apply(combn(1:max(ncol(ks.res),2), 2), 2, 
                            function(x) {paste0("LC",paste0(x,collapse = " VS ")) }) 
  
  ks.list[[i]] <- ks.res[ks.ss.rows,]
  ####################### DTS############################
  dts.res <- get_dts.test(lc.sets)
  dts.res <- apply(dts.res, 2,p.adjust,method ="BH")

  dts.ss.obs <- sum(dts.res <0.05)
  concat.table[2,i] <- paste0(dts.ss.obs,"/",total.obs)
  #rows that have a pvalue lower than 0.05 (statistically significant) 
  dts.ss.rows<-  sort(unique(which(dts.res < 0.05) %% nrow(dts.res)))
  colnames(dts.res) <- apply(combn(1:max(ncol(ks.res),2), 2), 2, 
                            function(x) {paste0("LC",paste0(x,collapse = " VS ")) }) 
  dts.list[[i]] <- dts.res[dts.ss.rows,]
  ####################### wilcox############################
  wilcox.res <- get_wilcox.test(lc.sets)
  wilcox.res <- apply(wilcox.res, 2,p.adjust,method ="BH")
  #apply(wilcox.res.adj, 2, function(x){table(x<0.5)})
  #apply(wilcox.res, 2, function(x){table(x<0.5)})
  
  wilcox.ss.obs <- sum(wilcox.res <0.05)
  concat.table[3,i] <- paste0(wilcox.ss.obs,"/",total.obs)
  #rows that have a pvalue lower than 0.05 (statistically significant) 
  wilcox.ss.rows<-  sort(unique(which(wilcox.res < 0.05) %% nrow(wilcox.res)))
  colnames(wilcox.res) <- apply(combn(1:max(ncol(ks.res),2), 2), 2, 
                            function(x) {paste0("LC",paste0(x,collapse = " VS ")) }) 
  wilcox.list[[i]] <- wilcox.res[wilcox.ss.rows,]
  wilcox.list.adj <- lapply(wilcox.list,function(x){prod(x,total.obs)} )
  
  }
}
rownames(concat.table) <- c("KS","DTS","Wilcox")

pander::pander(concat.table,
               caption = "compressed table of statistical significant features per test per method")

compressed table of statistical significant features per test per method (continued below)
	filteredFit	BSWiMS.model	LASSO_1SE	LASSO_MIN
KS	5/104	3/104	12/312	5/104
DTS	5/104	5/104	14/312	4/104
Wilcox	5/104	4/104	12/312	4/104

	GLMNET_RIDGE_MIN	GLMNET_ELASTICNET_MIN
KS	14/312	16/312
DTS	16/312	14/312
Wilcox	14/312	13/312

write.csv(concat.table,"concat.table.csv")
pander::pander(ks.res[ks.ss.rows,],
               caption = "statistical significant features using KS test", round = 3)

statistical significant features using KS test
LC1 VS 2	LC1 VS 3	LC2 VS 3
0	0	0
0	0	0
0	0	0
0	0	0
0.979	0.133	0.046
0.595	0.04	0.447
0.132	0.04	0.959
0.777	0.04	0.216

pander::pander( dts.res[dts.ss.rows,],
               caption = "statistical significant features using DTS test", round = 3)

statistical significant features using DTS test
LC1 VS 2	LC1 VS 3	LC2 VS 3
0.026	0.021	0.026
0.026	0.021	0.026
0.026	0.021	0.026
0.026	0.021	0.026
0.636	0.035	0.288
0.208	0.021	0.967

pander::pander(as.data.frame(wilcox.list[[1]]),
               caption = "statistical significant features using Wilcoxon test", round =3 )

statistical significant features using Wilcoxon test
wilcox.list[[1]]
0
0
0
0
0.003

save(ks.list, file ="ks.list.RData")
save(dts.list, file ="dts.list.RData")
save(wilcox.list , file ="wilcox.list.RData")

par(mfrow = c(1,1),
    cex = 0.7,
    xpd = T, 
    pty = 'm', #maximal plotting region
    mar = c(3,3,3,10))

cp.combined <-trim.cp(cp.combined)

prBenchmark <- plot(cp.combined)

Perfomance metrics of LC CV

pander::pander(prBenchmark$metrics,
               caption = "Lc vs vanilla Classifier Performance",round = 3)

Lc vs vanilla Classifier Performance (continued below)
	LC_LASSO_1SE	LC_BSWiMS.model	RF	SVM	LC_filteredFit
BER	0.026	0.075	0.091	0.166	0.271
ACC	0.975	0.928	0.899	0.83	0.731
AUC	0.989	0.983	0.984	0.899	0.857
SEN	0.968	0.908	0.99	0.867	0.713
SPE	0.981	0.943	0.828	0.801	0.745
CIDX	0.99	0.982	0.982	0.886	0.841

Table continues below
	BSWiMS.model	filteredFit	LASSO_MIN	LASSO_1SE
BER	0.291	0.294	0.297	0.304
ACC	0.715	0.711	0.71	0.703
AUC	0.732	0.728	0.728	0.723
SEN	0.663	0.662	0.657	0.646
SPE	0.755	0.749	0.751	0.747
CIDX	0.728	0.721	0.722	0.721

Table continues below
	GLMNET_RIDGE_MIN	GLMNET_ELASTICNET_MIN	LC_NAIVE_BAYES
BER	0.336	0.337	0.351
ACC	0.668	0.668	0.651
AUC	0.701	0.701	0.685
SEN	0.629	0.627	0.635
SPE	0.699	0.699	0.662
CIDX	0.69	0.688	0.628

Table continues below
	NAIVE_BAYES	LC_LASSO_MIN	LC_GLMNET_ELASTICNET_MIN
BER	0.367	0.396	0.403
ACC	0.634	0.643	0.628
AUC	0.676	0.836	0.725
SEN	0.621	0.29	0.349
SPE	0.643	0.916	0.842
CIDX	0.607	0.834	0.714

	LC_GLMNET_RIDGE_MIN
BER	0.406
ACC	0.627
AUC	0.724
SEN	0.341
SPE	0.848
CIDX	0.715

metrics <- data.frame(prBenchmark$metrics)

write.csv(metrics,"synt_metrics.csv")

## [++][+][+-]( 1147 )[+][+-]( 149 )[+][+]( 39 )[+][+]( 12 )[+][+]( 2 )[+][+]( 0 )< 2082 , 2125 , 1084 , 0 >

## 
##    1    2 
## 2124 1076