# Load the consensus prediction table and keep only the HCV rows.
# NOTE(review): rm(list = ls()) wipes the caller's whole workspace; it is kept
# here to preserve the script's existing behaviour, but consider dropping it.
rm(list = ls())
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\DATA\\"

# All files below dir_path (searched recursively) whose name matches ".*csv".
dir_path_name <- list.files(
  dir_path,
  pattern = ".*csv",
  full.names = TRUE,
  recursive = TRUE
)

# Match the file name literally (fixed = TRUE) so that "." is not treated as a
# regex wildcard, and fail with a clear message when zero or several files
# match instead of handing read.csv() an invalid path vector.
consensus_file <- grep(
  "data_for_consensus.csv",
  dir_path_name,
  value = TRUE,
  fixed = TRUE
)
if (length(consensus_file) != 1L) {
  stop(
    "Expected exactly one 'data_for_consensus.csv' file under ", dir_path,
    " but found ", length(consensus_file), ".",
    call. = FALSE
  )
}

data_1 <- read.csv(consensus_file, header = TRUE, stringsAsFactors = FALSE)

# %in% never yields NA, so rows with a missing search_label are dropped rather
# than turned into all-NA rows (which `== "HCV"` would produce).
data_1 <- data_1[data_1$search_label %in% "HCV", ]
dim(data_1)
head(data_1)
## [1] 2542 13
## search_label Mapping.ID source NB_g SVM_g RF_g xg_g nnet_g NB_S NNET_S
## 1 HCV AAALVYBICLMAMA Negative 0 0 0 0 0 0 0
## 2 HCV AAKDPDFZMNYDLR Negative 0 1 1 1 1 0 0
## 3 HCV AAQOQKQBGPPFNS Negative 0 0 1 0 1 0 0
## 4 HCV AAXVEMMRQDVLJB Negative 1 1 1 1 1 1 1
## 5 HCV AAZMHPMNAVEBRE Negative 0 1 1 0 0 0 0
## 6 HCV ABWNNKAYIRRMIV Negative 0 0 1 0 0 0 0
## SVM_S RF_S xg_S
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 1 0 0
## 5 1 0 0
## 6 0 0 0
# Names of the prediction columns of the two model families: columns ending in
# "_g" (reported later as `source_general`) and columns ending in "_S"
# (reported later as `source_specific`). The patterns are anchored with "$" so
# that only the suffix is matched, not "_g"/"_S" anywhere inside a name.
g1 <- grep("_g$", colnames(data_1), value = TRUE)
s1 <- grep("_S$", colnames(data_1), value = TRUE)

# Every non-empty subset of `x`, returned as a list of character vectors
# ordered by subset size (all singles first, then pairs, ...).
all_subsets <- function(x) {
  do.call(
    "c",
    lapply(seq_along(x), function(k) utils::combn(x, k, simplify = FALSE))
  )
}

g2 <- all_subsets(g1)
s2 <- all_subsets(s1)
g2
## [[1]]
## [1] "NB_g"
##
## [[2]]
## [1] "SVM_g"
##
## [[3]]
## [1] "RF_g"
##
## [[4]]
## [1] "xg_g"
##
## [[5]]
## [1] "nnet_g"
##
## [[6]]
## [1] "NB_g" "SVM_g"
##
## [[7]]
## [1] "NB_g" "RF_g"
##
## [[8]]
## [1] "NB_g" "xg_g"
##
## [[9]]
## [1] "NB_g" "nnet_g"
##
## [[10]]
## [1] "SVM_g" "RF_g"
##
## [[11]]
## [1] "SVM_g" "xg_g"
##
## [[12]]
## [1] "SVM_g" "nnet_g"
##
## [[13]]
## [1] "RF_g" "xg_g"
##
## [[14]]
## [1] "RF_g" "nnet_g"
##
## [[15]]
## [1] "xg_g" "nnet_g"
##
## [[16]]
## [1] "NB_g" "SVM_g" "RF_g"
##
## [[17]]
## [1] "NB_g" "SVM_g" "xg_g"
##
## [[18]]
## [1] "NB_g" "SVM_g" "nnet_g"
##
## [[19]]
## [1] "NB_g" "RF_g" "xg_g"
##
## [[20]]
## [1] "NB_g" "RF_g" "nnet_g"
##
## [[21]]
## [1] "NB_g" "xg_g" "nnet_g"
##
## [[22]]
## [1] "SVM_g" "RF_g" "xg_g"
##
## [[23]]
## [1] "SVM_g" "RF_g" "nnet_g"
##
## [[24]]
## [1] "SVM_g" "xg_g" "nnet_g"
##
## [[25]]
## [1] "RF_g" "xg_g" "nnet_g"
##
## [[26]]
## [1] "NB_g" "SVM_g" "RF_g" "xg_g"
##
## [[27]]
## [1] "NB_g" "SVM_g" "RF_g" "nnet_g"
##
## [[28]]
## [1] "NB_g" "SVM_g" "xg_g" "nnet_g"
##
## [[29]]
## [1] "NB_g" "RF_g" "xg_g" "nnet_g"
##
## [[30]]
## [1] "SVM_g" "RF_g" "xg_g" "nnet_g"
##
## [[31]]
## [1] "NB_g" "SVM_g" "RF_g" "xg_g" "nnet_g"
# For every combination of "_g" models (g2) crossed with every combination of
# "_S" models (s2): keep the records that ALL selected models predicted as 1,
# count how many of them are labelled "Negative" / "Positive" in `source`, and
# compute ppv = Positive / (Negative + Positive).
# Result: data_list[[i]] is a data frame with one row per s2 combination for
# the i-th g2 combination (columns: Negative, Positive, ppv, source_specific,
# source_general).
key_cols <- c("search_label", "Mapping.ID", "source")

# Unique key rows of data_1 for which every column in `models` equals 1.
# which() drops NA comparisons, so a record with a missing prediction is not
# counted as a consensus hit (and no all-NA rows are produced).
consensus_keys <- function(models) {
  hits <- which(rowSums(data_1[, models, drop = FALSE]) == length(models))
  unique(data_1[hits, key_cols, drop = FALSE])
}

# The "_S" consensus sets do not depend on the outer loop: compute them once.
s_keys <- lapply(s2, consensus_keys)

data_list <- vector("list", length(g2))
for (i in seq_along(g2)) {
  print(i)
  g_keys <- consensus_keys(g2[[i]])
  data_list_1 <- vector("list", length(s2))
  for (j in seq_along(s2)) {
    # Records selected by both the general and the source-specific consensus.
    data_4 <- merge(g_keys, s_keys[[j]], by = key_cols)

    # Fixed factor levels guarantee both counts exist (as 0) even when one
    # class is absent from the consensus set.
    # NOTE(review): assumes `source` only takes the values "Negative" and
    # "Positive"; any other value is ignored here -- confirm against the data.
    counts <- table(factor(data_4$source, levels = c("Negative", "Positive")))
    n_neg <- as.numeric(counts[["Negative"]])
    n_pos <- as.numeric(counts[["Positive"]])
    n_all <- n_neg + n_pos

    data_list_1[[j]] <- data.frame(
      Negative = n_neg,
      Positive = n_pos,
      # PPV is undefined when the consensus set is empty.
      ppv = if (n_all > 0) n_pos / n_all else NA_real_,
      source_specific = paste(s2[[j]], collapse = ", "),
      source_general = paste(g2[[i]], collapse = ", "),
      stringsAsFactors = FALSE
    )
  }
  data_list[[i]] <- do.call("rbind", data_list_1)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
# Stack the per-combination tables held in data_list into one long table and
# add the size of each consensus set (Negative + Positive) as column `sum`.
data_5 <- do.call(rbind, data_list)
data_5[["sum"]] <- with(data_5, Negative + Positive)
dim(data_5)
head(data_5, n = 20)
## [1] 961 6
## Negative Positive ppv source_specific source_general sum
## Freq 131 44 0.2514286 NB_S NB_g 175
## Freq1 228 90 0.2830189 NNET_S NB_g 318
## Freq2 197 80 0.2888087 SVM_S NB_g 277
## Freq3 71 84 0.5419355 RF_S NB_g 155
## Freq4 103 87 0.4578947 xg_S NB_g 190
## Freq5 57 35 0.3804348 NB_S, NNET_S NB_g 92
## Freq6 38 6 0.1363636 NB_S, SVM_S NB_g 44
## Freq7 21 37 0.6379310 NB_S, RF_S NB_g 58
## Freq8 21 39 0.6500000 NB_S, xg_S NB_g 60
## Freq9 77 47 0.3790323 NNET_S, SVM_S NB_g 124
## Freq10 38 64 0.6274510 NNET_S, RF_S NB_g 102
## Freq11 54 61 0.5304348 NNET_S, xg_S NB_g 115
## Freq12 28 46 0.6216216 SVM_S, RF_S NB_g 74
## Freq13 49 43 0.4673913 SVM_S, xg_S NB_g 92
## Freq14 34 72 0.6792453 RF_S, xg_S NB_g 106
## Freq15 14 4 0.2222222 NB_S, NNET_S, SVM_S NB_g 18
## Freq16 11 35 0.7608696 NB_S, NNET_S, RF_S NB_g 46
## Freq17 11 35 0.7608696 NB_S, NNET_S, xg_S NB_g 46
## Freq18 11 4 0.2666667 NB_S, SVM_S, RF_S NB_g 15
## Freq19 10 4 0.2857143 NB_S, SVM_S, xg_S NB_g 14
# Save the long PPV table in dir_path; the file name is prefixed with today's
# date. Missing values are written as empty cells and row names are omitted.
ppv_csv_path <- paste0(dir_path, Sys.Date(), "-", "PPV_consensus.csv")
write.csv(data_5, file = ppv_csv_path, na = "", row.names = FALSE)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Reshape the long table to wide form: one row per `source_general`
# combination, one column per `source_specific` combination, cells = ppv.
# Columns 3:5 of data_5 are ppv, source_specific and source_general.
ppv_long <- data_5[, 3:5]
data_6 <- spread(ppv_long, key = source_specific, value = ppv)

# Move the general-combination label from a column into the row names.
rownames(data_6) <- data_6$source_general
data_6$source_general <- NULL

# Order rows and columns by the character length of their labels, shortest
# first.
row_order <- order(nchar(rownames(data_6)))
col_order <- order(nchar(colnames(data_6)))
data_7 <- data_6[row_order, col_order]
head(data_7, n = 10)
## NB_S RF_S xg_S SVM_S NNET_S NB_S, RF_S
## NB_g 0.2514286 0.5419355 0.4578947 0.2888087 0.2830189 0.6379310
## RF_g 0.2594340 0.4759358 0.3948498 0.2539683 0.2337963 0.5492958
## xg_g 0.2538071 0.4715909 0.3893805 0.2630058 0.2404092 0.5362319
## SVM_g 0.2810458 0.5419355 0.4689266 0.3180212 0.2954545 0.6034483
## nnet_g 0.3054187 0.4832215 0.4093264 0.2289562 0.2294118 0.6000000
## NB_g, RF_g 0.2704403 0.5804196 0.4802260 0.3170732 0.2962963 0.6666667
## NB_g, xg_g 0.2876712 0.5683453 0.4795322 0.3234043 0.3018182 0.6481481
## RF_g, xg_g 0.2793296 0.4940476 0.4131455 0.2875399 0.2493225 0.5692308
## NB_g, SVM_g 0.2962963 0.5955882 0.5194805 0.3303571 0.3201581 0.6862745
## SVM_g, RF_g 0.2885906 0.5490196 0.4715909 0.3272727 0.2983607 0.6034483
## NB_S, xg_S RF_S, xg_S NB_S, SVM_S SVM_S, RF_S SVM_S, xg_S
## NB_g 0.6500000 0.6792453 0.1363636 0.6216216 0.4673913
## RF_g 0.5526316 0.5853659 0.1403509 0.5164835 0.3813559
## xg_g 0.5125000 0.5882353 0.1346154 0.5232558 0.3771930
## SVM_g 0.6229508 0.6571429 0.1521739 0.6025641 0.4526316
## nnet_g 0.5974026 0.5980392 0.1739130 0.4915254 0.3478261
## NB_g, RF_g 0.6666667 0.7029703 0.1500000 0.6478873 0.4883721
## NB_g, xg_g 0.6607143 0.7040816 0.1714286 0.6470588 0.4823529
## RF_g, xg_g 0.5694444 0.6034483 0.1521739 0.5421687 0.4018692
## NB_g, SVM_g 0.7115385 0.7263158 0.1578947 0.6666667 0.5125000
## SVM_g, RF_g 0.6229508 0.6571429 0.1555556 0.6103896 0.4526316
## NB_S, NNET_S NNET_S, RF_S NNET_S, xg_S NNET_S, SVM_S
## NB_g 0.3804348 0.6274510 0.5304348 0.3790323
## RF_g 0.3423423 0.5076923 0.4183007 0.3038674
## xg_g 0.3425926 0.5123967 0.4236111 0.3086420
## SVM_g 0.3750000 0.5887850 0.5042017 0.3758865
## nnet_g 0.4000000 0.5208333 0.4308943 0.2481203
## NB_g, RF_g 0.3977273 0.6530612 0.5454545 0.3931624
## NB_g, xg_g 0.4216867 0.6559140 0.5471698 0.3944954
## RF_g, xg_g 0.3700000 0.5344828 0.4420290 0.3311258
## NB_g, SVM_g 0.4216867 0.6774194 0.5858586 0.4166667
## SVM_g, RF_g 0.3789474 0.5943396 0.5042017 0.3785714
## NB_S, RF_S, xg_S NB_S, SVM_S, RF_S NB_S, SVM_S, xg_S
## NB_g 0.7254902 0.2666667 0.2857143
## RF_g 0.6166667 0.1666667 0.2083333
## xg_g 0.6206897 0.1666667 0.1923077
## SVM_g 0.6730769 0.2222222 0.2631579
## nnet_g 0.6500000 0.2105263 0.2272727
## NB_g, RF_g 0.7500000 0.2666667 0.2857143
## NB_g, xg_g 0.7446809 0.2666667 0.2857143
## RF_g, xg_g 0.6428571 0.1818182 0.2272727
## NB_g, SVM_g 0.7608696 0.3076923 0.3076923
## SVM_g, RF_g 0.6730769 0.2222222 0.2631579
## SVM_S, RF_S, xg_S NB_S, NNET_S, RF_S NB_S, NNET_S, xg_S
## NB_g 0.6862745 0.7608696 0.7608696
## RF_g 0.5468750 0.6166667 0.6271186
## xg_g 0.5555556 0.6101695 0.5873016
## SVM_g 0.6140351 0.7000000 0.7058824
## nnet_g 0.5217391 0.6666667 0.6440678
## NB_g, RF_g 0.6862745 0.7777778 0.7777778
## NB_g, xg_g 0.6862745 0.7777778 0.7777778
## RF_g, xg_g 0.5645161 0.6428571 0.6491228
## NB_g, SVM_g 0.7000000 0.8139535 0.8333333
## SVM_g, RF_g 0.6140351 0.7000000 0.7058824
## NNET_S, RF_S, xg_S NB_S, NNET_S, SVM_S NNET_S, SVM_S, RF_S
## NB_g 0.7051282 0.2222222 0.6521739
## RF_g 0.5714286 0.1724138 0.4918033
## xg_g 0.5789474 0.1562500 0.5000000
## SVM_g 0.6506024 0.2083333 0.5882353
## nnet_g 0.5875000 0.2000000 0.3888889
## NB_g, RF_g 0.7142857 0.2222222 0.6666667
## NB_g, xg_g 0.7200000 0.2222222 0.6829268
## RF_g, xg_g 0.5913978 0.1785714 0.5185185
## NB_g, SVM_g 0.7397260 0.2500000 0.6976744
## SVM_g, RF_g 0.6506024 0.2083333 0.5882353
## NNET_S, SVM_S, xg_S NB_S, SVM_S, RF_S, xg_S
## NB_g 0.4821429 0.3333333
## RF_g 0.3766234 0.2105263
## xg_g 0.3802817 0.2222222
## SVM_g 0.4426230 0.2666667
## nnet_g 0.3157895 0.2352941
## NB_g, RF_g 0.4905660 0.3333333
## NB_g, xg_g 0.4901961 0.3333333
## RF_g, xg_g 0.3970588 0.2352941
## NB_g, SVM_g 0.5208333 0.3636364
## SVM_g, RF_g 0.4426230 0.2666667
## NB_S, NNET_S, RF_S, xg_S NB_S, NNET_S, SVM_S, RF_S
## NB_g 0.7954545 0.3636364
## RF_g 0.6666667 0.2105263
## xg_g 0.6666667 0.2000000
## SVM_g 0.7291667 0.2857143
## nnet_g 0.6851852 0.2352941
## NB_g, RF_g 0.8139535 0.3636364
## NB_g, xg_g 0.8139535 0.3636364
## RF_g, xg_g 0.6923077 0.2222222
## NB_g, SVM_g 0.8333333 0.4444444
## SVM_g, RF_g 0.7291667 0.2857143
## NB_S, NNET_S, SVM_S, xg_S NNET_S, SVM_S, RF_S, xg_S
## NB_g 0.4000000 0.6562500
## RF_g 0.2777778 0.4772727
## xg_g 0.2500000 0.4772727
## SVM_g 0.3333333 0.5526316
## nnet_g 0.2777778 0.4000000
## NB_g, RF_g 0.4000000 0.6562500
## NB_g, xg_g 0.4000000 0.6562500
## RF_g, xg_g 0.2941176 0.4883721
## NB_g, SVM_g 0.4444444 0.6774194
## SVM_g, RF_g 0.3333333 0.5526316
## NB_S, NNET_S, SVM_S, RF_S, xg_S
## NB_g 0.4000000
## RF_g 0.2500000
## xg_g 0.2500000
## SVM_g 0.3076923
## nnet_g 0.2500000
## NB_g, RF_g 0.4000000
## NB_g, xg_g 0.4000000
## RF_g, xg_g 0.2666667
## NB_g, SVM_g 0.4444444
## SVM_g, RF_g 0.3076923
library(pheatmap)
# Heat map of the PPV table (rows: general-model combinations, columns:
# source-specific combinations). Clustering is switched off so the rows and
# columns stay in the order of data_7; each cell shows its value.
pheatmap(
  data_7,
  main = "HCV",
  fontsize = 5,
  display_numbers = TRUE,
  cluster_rows = FALSE,
  cluster_cols = FALSE
)

# Largest PPV over all combinations.
max(data_7)
## [1] 0.8333333
# Smallest PPV over all combinations.
min(data_7)
## [1] 0.1346154