# NOTE(review): wiping the global environment is kept only for backward
# compatibility; running the script in a fresh R session is the safer habit.
rm(list = ls())
############################### input data
# Find the consensus table somewhere below `dir_path`, load it, and keep only
# the rows whose search_label is "HCV".
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\DATA\\"
dir_path_name <- list.files(
  path = dir_path,
  pattern = "\\.csv$",
  full.names = TRUE,
  recursive = TRUE
)
#dir_path_name
# Match the file name literally (fixed = TRUE stops "." acting as a regex
# wildcard) and insist on exactly one hit: read.csv() cannot work with zero
# or several paths and would otherwise fail with an unhelpful message.
input_file <- grep(
  "data_for_consensus.csv",
  dir_path_name,
  value = TRUE,
  fixed = TRUE
)
if (length(input_file) != 1L) {
  stop(
    "Expected exactly one 'data_for_consensus.csv' under ", dir_path,
    " but found ", length(input_file), ".",
    call. = FALSE
  )
}
data_1 <- read.csv(input_file, header = TRUE, stringsAsFactors = FALSE)
# %in% never returns NA, so rows with a missing search_label are dropped
# instead of being turned into all-NA rows by logical subsetting.
data_1 <- data_1[data_1$search_label %in% "HCV", ]
dim(data_1)
head(data_1)
## [1] 2542   13
##   search_label     Mapping.ID   source NB_g SVM_g RF_g xg_g nnet_g NB_S NNET_S
## 1          HCV AAALVYBICLMAMA Negative    0     0    0    0      0    0      0
## 2          HCV AAKDPDFZMNYDLR Negative    0     1    1    1      1    0      0
## 3          HCV AAQOQKQBGPPFNS Negative    0     0    1    0      1    0      0
## 4          HCV AAXVEMMRQDVLJB Negative    1     1    1    1      1    1      1
## 5          HCV AAZMHPMNAVEBRE Negative    0     1    1    0      0    0      0
## 6          HCV ABWNNKAYIRRMIV Negative    0     0    1    0      0    0      0
##   SVM_S RF_S xg_S
## 1     0    0    0
## 2     0    0    0
## 3     0    0    0
## 4     1    0    0
## 5     1    0    0
## 6     0    0    0
############################################
# Classifier vote columns are recognised by their name suffix: "_g" columns
# feed `source_general` and "_S" columns feed `source_specific` further down.
# The patterns are anchored with "$" so that a column which merely contains
# "_g" or "_S" somewhere in its name is not picked up as a vote column.
g1 <- grep("_g$", colnames(data_1), value = TRUE)
s1 <- grep("_S$", colnames(data_1), value = TRUE)

# All non-empty subsets of `x`, ordered by subset size (singletons first, the
# full set last). Returns a plain list of character vectors, or NULL when `x`
# is empty.
all_subsets <- function(x) {
  do.call(
    "c",
    lapply(seq_along(x), function(k) utils::combn(x, k, simplify = FALSE))
  )
}

g2 <- all_subsets(g1)
s2 <- all_subsets(s1)
g2
## [[1]]
## [1] "NB_g"
## 
## [[2]]
## [1] "SVM_g"
## 
## [[3]]
## [1] "RF_g"
## 
## [[4]]
## [1] "xg_g"
## 
## [[5]]
## [1] "nnet_g"
## 
## [[6]]
## [1] "NB_g"  "SVM_g"
## 
## [[7]]
## [1] "NB_g" "RF_g"
## 
## [[8]]
## [1] "NB_g" "xg_g"
## 
## [[9]]
## [1] "NB_g"   "nnet_g"
## 
## [[10]]
## [1] "SVM_g" "RF_g" 
## 
## [[11]]
## [1] "SVM_g" "xg_g" 
## 
## [[12]]
## [1] "SVM_g"  "nnet_g"
## 
## [[13]]
## [1] "RF_g" "xg_g"
## 
## [[14]]
## [1] "RF_g"   "nnet_g"
## 
## [[15]]
## [1] "xg_g"   "nnet_g"
## 
## [[16]]
## [1] "NB_g"  "SVM_g" "RF_g" 
## 
## [[17]]
## [1] "NB_g"  "SVM_g" "xg_g" 
## 
## [[18]]
## [1] "NB_g"   "SVM_g"  "nnet_g"
## 
## [[19]]
## [1] "NB_g" "RF_g" "xg_g"
## 
## [[20]]
## [1] "NB_g"   "RF_g"   "nnet_g"
## 
## [[21]]
## [1] "NB_g"   "xg_g"   "nnet_g"
## 
## [[22]]
## [1] "SVM_g" "RF_g"  "xg_g" 
## 
## [[23]]
## [1] "SVM_g"  "RF_g"   "nnet_g"
## 
## [[24]]
## [1] "SVM_g"  "xg_g"   "nnet_g"
## 
## [[25]]
## [1] "RF_g"   "xg_g"   "nnet_g"
## 
## [[26]]
## [1] "NB_g"  "SVM_g" "RF_g"  "xg_g" 
## 
## [[27]]
## [1] "NB_g"   "SVM_g"  "RF_g"   "nnet_g"
## 
## [[28]]
## [1] "NB_g"   "SVM_g"  "xg_g"   "nnet_g"
## 
## [[29]]
## [1] "NB_g"   "RF_g"   "xg_g"   "nnet_g"
## 
## [[30]]
## [1] "SVM_g"  "RF_g"   "xg_g"   "nnet_g"
## 
## [[31]]
## [1] "NB_g"   "SVM_g"  "RF_g"   "xg_g"   "nnet_g"
############################################
# For every pair (subset of "_g" models in g2, subset of "_S" models in s2)
# count the records on which ALL models of both subsets voted 1, split by
# `source`, and compute ppv = Positive / (Negative + Positive).
# Result: `data_list[[i]]` is a data frame with one row per element of s2 and
# the columns Negative, Positive, ppv, source_specific, source_general
# (in that order).
key_cols <- c("search_label", "Mapping.ID", "source")

# Rows of `data` whose votes over `models` sum to length(models), i.e. every
# listed model voted 1 for 0/1 votes. Keeps the key columns plus the vote
# columns. which() drops rows whose vote sum is NA instead of producing
# all-NA rows.
consensus_rows <- function(data, models) {
  hits <- which(rowSums(data[, models, drop = FALSE]) == length(models))
  data[hits, c(key_cols, models), drop = FALSE]
}

# The "_S" selections do not depend on the general subset, so compute them
# once instead of once per element of g2.
specific_hits <- lapply(s2, function(models) consensus_rows(data_1, models))

data_list <- vector("list", length(g2))
for (i in seq_along(g2)) {
  print(i)  # progress indicator
  general_hits <- consensus_rows(data_1, g2[[i]])

  data_list_1 <- lapply(seq_along(s2), function(j) {
    # Records selected by both the general and the specific subset.
    data_4 <- unique(merge(general_hits, specific_hits[[j]], by = key_cols))

    # Fixing the levels guarantees both counts exist (as 0) even when a
    # combination selects no Positive or no Negative record; a bare table()
    # would drop the absent level and break the ppv computation and rbind().
    counts <- table(factor(data_4$source, levels = c("Negative", "Positive")))
    stat <- data.frame(
      Negative = as.numeric(counts[["Negative"]]),
      Positive = as.numeric(counts[["Positive"]]),
      row.names = "Freq"  # same row label the table()-based version produced
    )

    total <- stat$Negative + stat$Positive
    # PPV is undefined when nothing was selected; report NA rather than NaN.
    stat$ppv <- if (total > 0) stat$Positive / total else NA_real_
    stat$source_specific <- paste(s2[[j]], collapse = ", ")
    stat$source_general <- paste(g2[[i]], collapse = ", ")
    stat
  })

  data_list[[i]] <- do.call("rbind", data_list_1)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
# Stack the per-general-subset tables held in data_list into one data frame.
data_5 <- do.call(rbind, data_list)
#View(data_5)
# Number of records (Negative + Positive) behind each PPV value.
data_5[["sum"]] <- data_5[["Negative"]] + data_5[["Positive"]]
dim(data_5)
head(data_5, n = 20)
## [1] 961   6
##        Negative Positive       ppv     source_specific source_general sum
## Freq        131       44 0.2514286                NB_S           NB_g 175
## Freq1       228       90 0.2830189              NNET_S           NB_g 318
## Freq2       197       80 0.2888087               SVM_S           NB_g 277
## Freq3        71       84 0.5419355                RF_S           NB_g 155
## Freq4       103       87 0.4578947                xg_S           NB_g 190
## Freq5        57       35 0.3804348        NB_S, NNET_S           NB_g  92
## Freq6        38        6 0.1363636         NB_S, SVM_S           NB_g  44
## Freq7        21       37 0.6379310          NB_S, RF_S           NB_g  58
## Freq8        21       39 0.6500000          NB_S, xg_S           NB_g  60
## Freq9        77       47 0.3790323       NNET_S, SVM_S           NB_g 124
## Freq10       38       64 0.6274510        NNET_S, RF_S           NB_g 102
## Freq11       54       61 0.5304348        NNET_S, xg_S           NB_g 115
## Freq12       28       46 0.6216216         SVM_S, RF_S           NB_g  74
## Freq13       49       43 0.4673913         SVM_S, xg_S           NB_g  92
## Freq14       34       72 0.6792453          RF_S, xg_S           NB_g 106
## Freq15       14        4 0.2222222 NB_S, NNET_S, SVM_S           NB_g  18
## Freq16       11       35 0.7608696  NB_S, NNET_S, RF_S           NB_g  46
## Freq17       11       35 0.7608696  NB_S, NNET_S, xg_S           NB_g  46
## Freq18       11        4 0.2666667   NB_S, SVM_S, RF_S           NB_g  15
## Freq19       10        4 0.2857143   NB_S, SVM_S, xg_S           NB_g  14
# Write the PPV table into dir_path; the file name is prefixed with today's
# date. dir_path is pasted as-is, so it must end with a path separator.
out_file <- paste0(dir_path, Sys.Date(), "-", "PPV_consensus.csv")
write.csv(data_5, file = out_file, row.names = FALSE, na = "")
###################PLOT
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Reshape the long PPV table into a matrix-like data frame:
# one row per general-model subset, one column per specific-model subset.
# Columns are selected by name rather than by position (3:5) so the reshape
# keeps working if data_5 gains or reorders columns.
ppv_long <- data_5[, c("ppv", "source_specific", "source_general")]
# spread() is kept on purpose: the order of its output is the tie-break order
# for the stable length-based sort below.
data_6 <- spread(ppv_long, key = source_specific, value = ppv)
#View(data_6)
rownames(data_6) <- data_6$source_general
data_6$source_general <- NULL
# Order rows and columns by label length, which puts shorter model
# combinations before longer ones; ties keep their current order.
row_order <- order(nchar(rownames(data_6)))
col_order <- order(nchar(colnames(data_6)))
data_7 <- data_6[row_order, col_order]
head(data_7, 10)
##                  NB_S      RF_S      xg_S     SVM_S    NNET_S NB_S, RF_S
## NB_g        0.2514286 0.5419355 0.4578947 0.2888087 0.2830189  0.6379310
## RF_g        0.2594340 0.4759358 0.3948498 0.2539683 0.2337963  0.5492958
## xg_g        0.2538071 0.4715909 0.3893805 0.2630058 0.2404092  0.5362319
## SVM_g       0.2810458 0.5419355 0.4689266 0.3180212 0.2954545  0.6034483
## nnet_g      0.3054187 0.4832215 0.4093264 0.2289562 0.2294118  0.6000000
## NB_g, RF_g  0.2704403 0.5804196 0.4802260 0.3170732 0.2962963  0.6666667
## NB_g, xg_g  0.2876712 0.5683453 0.4795322 0.3234043 0.3018182  0.6481481
## RF_g, xg_g  0.2793296 0.4940476 0.4131455 0.2875399 0.2493225  0.5692308
## NB_g, SVM_g 0.2962963 0.5955882 0.5194805 0.3303571 0.3201581  0.6862745
## SVM_g, RF_g 0.2885906 0.5490196 0.4715909 0.3272727 0.2983607  0.6034483
##             NB_S, xg_S RF_S, xg_S NB_S, SVM_S SVM_S, RF_S SVM_S, xg_S
## NB_g         0.6500000  0.6792453   0.1363636   0.6216216   0.4673913
## RF_g         0.5526316  0.5853659   0.1403509   0.5164835   0.3813559
## xg_g         0.5125000  0.5882353   0.1346154   0.5232558   0.3771930
## SVM_g        0.6229508  0.6571429   0.1521739   0.6025641   0.4526316
## nnet_g       0.5974026  0.5980392   0.1739130   0.4915254   0.3478261
## NB_g, RF_g   0.6666667  0.7029703   0.1500000   0.6478873   0.4883721
## NB_g, xg_g   0.6607143  0.7040816   0.1714286   0.6470588   0.4823529
## RF_g, xg_g   0.5694444  0.6034483   0.1521739   0.5421687   0.4018692
## NB_g, SVM_g  0.7115385  0.7263158   0.1578947   0.6666667   0.5125000
## SVM_g, RF_g  0.6229508  0.6571429   0.1555556   0.6103896   0.4526316
##             NB_S, NNET_S NNET_S, RF_S NNET_S, xg_S NNET_S, SVM_S
## NB_g           0.3804348    0.6274510    0.5304348     0.3790323
## RF_g           0.3423423    0.5076923    0.4183007     0.3038674
## xg_g           0.3425926    0.5123967    0.4236111     0.3086420
## SVM_g          0.3750000    0.5887850    0.5042017     0.3758865
## nnet_g         0.4000000    0.5208333    0.4308943     0.2481203
## NB_g, RF_g     0.3977273    0.6530612    0.5454545     0.3931624
## NB_g, xg_g     0.4216867    0.6559140    0.5471698     0.3944954
## RF_g, xg_g     0.3700000    0.5344828    0.4420290     0.3311258
## NB_g, SVM_g    0.4216867    0.6774194    0.5858586     0.4166667
## SVM_g, RF_g    0.3789474    0.5943396    0.5042017     0.3785714
##             NB_S, RF_S, xg_S NB_S, SVM_S, RF_S NB_S, SVM_S, xg_S
## NB_g               0.7254902         0.2666667         0.2857143
## RF_g               0.6166667         0.1666667         0.2083333
## xg_g               0.6206897         0.1666667         0.1923077
## SVM_g              0.6730769         0.2222222         0.2631579
## nnet_g             0.6500000         0.2105263         0.2272727
## NB_g, RF_g         0.7500000         0.2666667         0.2857143
## NB_g, xg_g         0.7446809         0.2666667         0.2857143
## RF_g, xg_g         0.6428571         0.1818182         0.2272727
## NB_g, SVM_g        0.7608696         0.3076923         0.3076923
## SVM_g, RF_g        0.6730769         0.2222222         0.2631579
##             SVM_S, RF_S, xg_S NB_S, NNET_S, RF_S NB_S, NNET_S, xg_S
## NB_g                0.6862745          0.7608696          0.7608696
## RF_g                0.5468750          0.6166667          0.6271186
## xg_g                0.5555556          0.6101695          0.5873016
## SVM_g               0.6140351          0.7000000          0.7058824
## nnet_g              0.5217391          0.6666667          0.6440678
## NB_g, RF_g          0.6862745          0.7777778          0.7777778
## NB_g, xg_g          0.6862745          0.7777778          0.7777778
## RF_g, xg_g          0.5645161          0.6428571          0.6491228
## NB_g, SVM_g         0.7000000          0.8139535          0.8333333
## SVM_g, RF_g         0.6140351          0.7000000          0.7058824
##             NNET_S, RF_S, xg_S NB_S, NNET_S, SVM_S NNET_S, SVM_S, RF_S
## NB_g                 0.7051282           0.2222222           0.6521739
## RF_g                 0.5714286           0.1724138           0.4918033
## xg_g                 0.5789474           0.1562500           0.5000000
## SVM_g                0.6506024           0.2083333           0.5882353
## nnet_g               0.5875000           0.2000000           0.3888889
## NB_g, RF_g           0.7142857           0.2222222           0.6666667
## NB_g, xg_g           0.7200000           0.2222222           0.6829268
## RF_g, xg_g           0.5913978           0.1785714           0.5185185
## NB_g, SVM_g          0.7397260           0.2500000           0.6976744
## SVM_g, RF_g          0.6506024           0.2083333           0.5882353
##             NNET_S, SVM_S, xg_S NB_S, SVM_S, RF_S, xg_S
## NB_g                  0.4821429               0.3333333
## RF_g                  0.3766234               0.2105263
## xg_g                  0.3802817               0.2222222
## SVM_g                 0.4426230               0.2666667
## nnet_g                0.3157895               0.2352941
## NB_g, RF_g            0.4905660               0.3333333
## NB_g, xg_g            0.4901961               0.3333333
## RF_g, xg_g            0.3970588               0.2352941
## NB_g, SVM_g           0.5208333               0.3636364
## SVM_g, RF_g           0.4426230               0.2666667
##             NB_S, NNET_S, RF_S, xg_S NB_S, NNET_S, SVM_S, RF_S
## NB_g                       0.7954545                 0.3636364
## RF_g                       0.6666667                 0.2105263
## xg_g                       0.6666667                 0.2000000
## SVM_g                      0.7291667                 0.2857143
## nnet_g                     0.6851852                 0.2352941
## NB_g, RF_g                 0.8139535                 0.3636364
## NB_g, xg_g                 0.8139535                 0.3636364
## RF_g, xg_g                 0.6923077                 0.2222222
## NB_g, SVM_g                0.8333333                 0.4444444
## SVM_g, RF_g                0.7291667                 0.2857143
##             NB_S, NNET_S, SVM_S, xg_S NNET_S, SVM_S, RF_S, xg_S
## NB_g                        0.4000000                 0.6562500
## RF_g                        0.2777778                 0.4772727
## xg_g                        0.2500000                 0.4772727
## SVM_g                       0.3333333                 0.5526316
## nnet_g                      0.2777778                 0.4000000
## NB_g, RF_g                  0.4000000                 0.6562500
## NB_g, xg_g                  0.4000000                 0.6562500
## RF_g, xg_g                  0.2941176                 0.4883721
## NB_g, SVM_g                 0.4444444                 0.6774194
## SVM_g, RF_g                 0.3333333                 0.5526316
##             NB_S, NNET_S, SVM_S, RF_S, xg_S
## NB_g                              0.4000000
## RF_g                              0.2500000
## xg_g                              0.2500000
## SVM_g                             0.3076923
## nnet_g                            0.2500000
## NB_g, RF_g                        0.4000000
## NB_g, xg_g                        0.4000000
## RF_g, xg_g                        0.2666667
## NB_g, SVM_g                       0.4444444
## SVM_g, RF_g                       0.3076923
#View(data_7)
library(pheatmap)
# Heatmap of PPV values: rows are general-model subsets, columns are
# specific-model subsets. Clustering is switched off so both axes keep the
# order data_7 already has. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
pheatmap(
  data_7,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  display_numbers = TRUE,
  fontsize = 5,
  main = "HCV"
)

# Highest and lowest PPV over all combinations; na.rm = TRUE keeps a single
# missing cell from turning the answer into NA.
max(data_7, na.rm = TRUE)
## [1] 0.8333333
min(data_7, na.rm = TRUE)
## [1] 0.1346154