rm(list = ls())
###############################input data
dir_path <- "/Users/xut2/Desktop/bootstrap/"
dir_path_fisher <- dir(paste0(dir_path,"2_fisher exact test/"),pattern = ".*.csv",full.names = T, recursive = T)
#dir_path_fisher #
dir_path_original <- dir(paste0(dir_path,"1_original_data/"),pattern = ".*.csv",full.names = T, recursive = T)
#dir_path_original #
list_name <- unique(gsub("_.*","",gsub(".*.\\/","",dir_path_original)))
#list_name
dir.create(paste0(dir_path,"3_bootstrap_fisher_exact_test_100_times/"))
## Warning in dir.create(paste0(dir_path,
## "3_bootstrap_fisher_exact_test_100_times/")): '/Users/xut2/Desktop/bootstrap/
## 3_bootstrap_fisher_exact_test_100_times' already exists
output_file <- paste0(dir_path,"3_bootstrap_fisher_exact_test_100_times/")
for (n in 1:length(list_name)) {
n = 1
#print(n)
data_1 <- read.csv(grep(list_name[n],dir_path_fisher,value = T,fixed=TRUE),header = T,stringsAsFactors = F)
#print(dim(data_1)) #[1] 108 3
#print(head(data_1))
#print(max(data_1$pvalue))
data_2 <- read.csv(grep(list_name[n],dir_path_original,value = T,fixed=TRUE),header = T,stringsAsFactors = F)
#print(dim(data_2)) #[1] 3913 730
#print(colnames(data_2)[720:730])
#View(data_1)
data_3 <- data_2[,c(intersect(data_1$feature,colnames(data_2)),colnames(data_2)[ncol(data_2)])]
#dim(data_2);dim(data_3)
print(dim(data_3))
print(colnames(data_3)[100:109])
#View(data_3)
object_file_lisT_select <- list()
k = 100 ##########################times of boottrap
for (j in 1:k) {
#j = 1
print(paste0(n,"-",j))
set.seed(2020*j)
data_4 <- data_3
data_4[, ncol(data_4)] <- sample(data_4[,ncol(data_4)],replace = F, size = nrow(data_4))
#dim(data_321);table(data_321$endpoint)
a <- data_4
b <- NULL
for (mm in 1:(ncol(a)-1)) {
#mm=10
#dim(a) #[1] 156 1025
tox_taget <- a[which(a[,mm] == 1 & a[,ncol(a)] == 1), ]
tox_non_taget <- a[which(a[,mm] == 0 & a[,ncol(a)] == 1), ]
non_tox_target <- a[which(a[,mm] == 1 & a[,ncol(a)] == 0), ]
non_tox_non_taget <- a[which(a[,mm] == 0 & a[,ncol(a)] == 0), ]
#dim(tox_taget) #[1] 76 367
#dim(tox_non_taget) #[1] 132 367
#dim(non_tox_target) #[1] 164 367
#dim(non_tox_non_taget) #[1] 380 367
#nrow(tox_taget)
##################################fisher
tox_target <- matrix(c(nrow(tox_taget), nrow(non_tox_target), nrow(tox_non_taget), nrow(non_tox_non_taget)),nrow = 2,
dimnames = list(tox = c("tox", "non_tox"),gene = c("tar", "nontar")))
tox_target_pvalue <- fisher.test(tox_target, alternative = "two.sided")$p.value
b <- c(b,tox_target_pvalue)
}
e <- data.frame(b,colnames(a)[1:(ncol(a)-1)])
colnames(e) <- c("pvalue",colnames(a)[ncol(a)])
head(e)
#f <- e[which(e$pvalue < 0.05),]
e$label <- j
object_file_lisT_select[[j]] <- e
#write.csv(f, paste0(output_file,Sys.Date(),"-",colnames(a)[ncol(a)],".csv"),row.names = F)
}
data_pvalue <- do.call("rbind",object_file_lisT_select)
print(head(data_pvalue))
print(table(data_pvalue$label))
#write.csv(data_pvalue, paste0(output_file,Sys.Date(),"-bootstrap_fisher_exact_test_",gsub(".*.\\/","",dir_path_original[n])),row.names = F)
}
## [1] 50 109
## [1] "ring.hetero_.6._Z_1." "ring.hetero_.6._Z_1_2."
## [3] "ring.hetero_.6._Z_1_3." "ring.hetero_.6._Z_1_4."
## [5] "ring.hetero_.6._Z_generic" "ring.hetero_.6_6._N_isoquinoline"
## [7] "ring.hetero_.6_6._N_quinazoline" "ring.hetero_.6_6._N_quinoline"
## [9] "ring.hetero_.6_6._Z_generic" "CYP3A4.outcome.1"
## [1] "1-1"
## [1] "1-2"
## [1] "1-3"
## [1] "1-4"
## [1] "1-5"
## [1] "1-6"
## [1] "1-7"
## [1] "1-8"
## [1] "1-9"
## [1] "1-10"
## [1] "1-11"
## [1] "1-12"
## [1] "1-13"
## [1] "1-14"
## [1] "1-15"
## [1] "1-16"
## [1] "1-17"
## [1] "1-18"
## [1] "1-19"
## [1] "1-20"
## [1] "1-21"
## [1] "1-22"
## [1] "1-23"
## [1] "1-24"
## [1] "1-25"
## [1] "1-26"
## [1] "1-27"
## [1] "1-28"
## [1] "1-29"
## [1] "1-30"
## [1] "1-31"
## [1] "1-32"
## [1] "1-33"
## [1] "1-34"
## [1] "1-35"
## [1] "1-36"
## [1] "1-37"
## [1] "1-38"
## [1] "1-39"
## [1] "1-40"
## [1] "1-41"
## [1] "1-42"
## [1] "1-43"
## [1] "1-44"
## [1] "1-45"
## [1] "1-46"
## [1] "1-47"
## [1] "1-48"
## [1] "1-49"
## [1] "1-50"
## [1] "1-51"
## [1] "1-52"
## [1] "1-53"
## [1] "1-54"
## [1] "1-55"
## [1] "1-56"
## [1] "1-57"
## [1] "1-58"
## [1] "1-59"
## [1] "1-60"
## [1] "1-61"
## [1] "1-62"
## [1] "1-63"
## [1] "1-64"
## [1] "1-65"
## [1] "1-66"
## [1] "1-67"
## [1] "1-68"
## [1] "1-69"
## [1] "1-70"
## [1] "1-71"
## [1] "1-72"
## [1] "1-73"
## [1] "1-74"
## [1] "1-75"
## [1] "1-76"
## [1] "1-77"
## [1] "1-78"
## [1] "1-79"
## [1] "1-80"
## [1] "1-81"
## [1] "1-82"
## [1] "1-83"
## [1] "1-84"
## [1] "1-85"
## [1] "1-86"
## [1] "1-87"
## [1] "1-88"
## [1] "1-89"
## [1] "1-90"
## [1] "1-91"
## [1] "1-92"
## [1] "1-93"
## [1] "1-94"
## [1] "1-95"
## [1] "1-96"
## [1] "1-97"
## [1] "1-98"
## [1] "1-99"
## [1] "1-100"
## pvalue CYP3A4.outcome.1 label
## 1 0.6497876 bond.C..O.N_carboxamide_.NHR. 1
## 2 1.0000000 bond.C..O.N_carboxamide_.NR2. 1
## 3 0.1399237 bond.C..O.N_carboxamide_generic 1
## 4 1.0000000 bond.C..O.O_carboxylicEster_alkenyl 1
## 5 1.0000000 bond.C.N_imine_N.connect_noZ. 1
## 6 1.0000000 bond.C.N_nitrile 1
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108
############################################################
dir.create(paste0(dir_path,"4_final_results/"))
## Warning in dir.create(paste0(dir_path, "4_final_results/")): '/Users/xut2/
## Desktop/bootstrap/4_final_results' already exists
output_1 <- paste0(dir_path,"4_final_results/")
#stat_list <- list()
#for (i in 1:length(list_name)) {
#i =3
data_ori <- data_1
data_ori$source <- NULL
colnames(data_ori)[1:2] <- c("pvalue-ori","feature")
data_bs <- data_pvalue
colnames(data_bs)[1:2] <- c("pvalue-ran","feature")
#head(data_ori); head(data_bs)
#colnames(data_1);colnames(data_2)
#print(c(dim(data_ori),dim(data_bs)))
data_bs_ori <- merge(data_ori, data_bs, by = "feature", all = T)
print(head(data_bs_ori,2))
## feature pvalue-ori pvalue-ran label
## 1 bond.C..O.N_carboxamide_.NHR. 9.480633e-08 0.6497876 1
## 2 bond.C..O.N_carboxamide_.NHR. 9.480633e-08 0.3108724 61
dim(data_bs_ori) #[1] 10800 5
## [1] 10800 4
#print(i)
data_bs_ori <- data_bs_ori[which(data_bs_ori$`pvalue-ori` < 0.05),]
#dat_0.05 <- length(unique(data_bs_ori$endpoint)) #[1] 43
data_bs_ori$difference <- data_bs_ori$`pvalue-ran`- data_bs_ori$`pvalue-ori`
data_bs_ori$seq[data_bs_ori$difference < 0] <- 1
data_bs_ori$seq[data_bs_ori$difference >= 0] <- 0
table(data_bs_ori$seq)
##
## 0 1
## 10789 11
head(data_bs_ori)
## feature pvalue-ori pvalue-ran label difference seq
## 1 bond.C..O.N_carboxamide_.NHR. 9.480633e-08 0.6497876 1 0.6497875 0
## 2 bond.C..O.N_carboxamide_.NHR. 9.480633e-08 0.3108724 61 0.3108723 0
## 3 bond.C..O.N_carboxamide_.NHR. 9.480633e-08 0.6497876 82 0.6497875 0
## 4 bond.C..O.N_carboxamide_.NHR. 9.480633e-08 0.1625970 100 0.1625969 0
## 5 bond.C..O.N_carboxamide_.NHR. 9.480633e-08 1.0000000 91 0.9999999 0
## 6 bond.C..O.N_carboxamide_.NHR. 9.480633e-08 0.1625970 11 0.1625969 0
#data_bs_ori<- data_bs_ori[which(data_bs_ori$difference > 0),]
#data_bs_ori$label_1 <- 1
data_14 <- aggregate(data_bs_ori["seq"],by = list(feature = data_bs_ori$feature,
`pvalue-ori` = data_bs_ori$`pvalue-ori`),
FUN = sum, na.rm = T)
head(data_14[order(data_14$seq, decreasing = T),]); dim(data_14)
## feature pvalue-ori seq
## 86 bond.CN_amine_ter.N_aliphatic 1.376862e-02 4
## 87 bond.CN_amine_ter.N_generic 1.376862e-02 4
## 81 chain.alkaneLinear_ethyl_C2.H_gt_1. 8.394230e-03 3
## 1 bond.CN_amine_sec.NH_aromatic 1.000000e-20 0
## 2 bond.CX_halide_aromatic.X_generic 1.000000e-20 0
## 3 chain.aromaticAlkane_Ph.C1_acyclic_connect_H_gt_1 1.000000e-20 0
## [1] 108 3
#unique(data_bs_ori$source)
#data_14$label_1 <- 10 - data_14$label_1
#View(data_14)
data_15 <- data_14[which(data_14$seq < 3),]
dim(data_15); head(data_15)
## [1] 105 3
## feature pvalue-ori seq
## 1 bond.CN_amine_sec.NH_aromatic 1e-20 0
## 2 bond.CX_halide_aromatic.X_generic 1e-20 0
## 3 chain.aromaticAlkane_Ph.C1_acyclic_connect_H_gt_1 1e-20 0
## 4 chain.aromaticAlkane_Ph.C1_acyclic_connect_noDblBd 1e-20 0
## 5 chain.aromaticAlkane_Ph.C1_acyclic_generic 1e-20 0
## 6 CYP3A4.outcome 1e-20 0
#View(data_15)
#table(data_15$label_1)
write.csv(data_15,paste0(output_1,Sys.Date(),"-bootstrap_fisher_exact_test_final_result.csv"),row.names = F)
#write.csv(data_bs_ori,paste0(output_1,Sys.Date(),"-detail-",unique(dir_path_name_1)[i]),row.names = F)
#stat_list[[i]] <- data.frame(list_name[i],length(unique(data_15$feature)))
#}
#data_stat <- do.call("rbind",stat_list)
#colnames(data_stat) <- c("source","bs_pvalue_0.05_num")
#write.csv(data_stat,paste0(output_1,Sys.Date(),"-stat_p_0.05_BS.csv"),row.names = F)