bootstrap-fisher-exact-test

rm(list = ls())
###############################input data 
dir_path <- "/Users/xut2/Desktop/bootstrap/"
dir_path_fisher <- dir(paste0(dir_path,"2_fisher exact test/"),pattern = ".*.csv",full.names = T, recursive = T)
#dir_path_fisher #
dir_path_original <- dir(paste0(dir_path,"1_original_data/"),pattern = ".*.csv",full.names = T, recursive = T)
#dir_path_original #
list_name <- unique(gsub("_.*","",gsub(".*.\\/","",dir_path_original)))
#list_name
dir.create(paste0(dir_path,"3_bootstrap_fisher_exact_test_100_times/"))

## Warning in dir.create(paste0(dir_path,
## "3_bootstrap_fisher_exact_test_100_times/")): '/Users/xut2/Desktop/bootstrap/
## 3_bootstrap_fisher_exact_test_100_times' already exists

output_file <- paste0(dir_path,"3_bootstrap_fisher_exact_test_100_times/")
for (n in 1:length(list_name)) {
  n = 1
  #print(n)
  data_1 <- read.csv(grep(list_name[n],dir_path_fisher,value = T,fixed=TRUE),header = T,stringsAsFactors = F)
  #print(dim(data_1)) #[1] 108   3
  #print(head(data_1))
  #print(max(data_1$pvalue))
  data_2 <- read.csv(grep(list_name[n],dir_path_original,value = T,fixed=TRUE),header = T,stringsAsFactors = F)
  #print(dim(data_2)) #[1] 3913  730
  #print(colnames(data_2)[720:730])
  #View(data_1)
  data_3 <- data_2[,c(intersect(data_1$feature,colnames(data_2)),colnames(data_2)[ncol(data_2)])]
  #dim(data_2);dim(data_3)
  print(dim(data_3))
  print(colnames(data_3)[100:109])
  #View(data_3)
  object_file_lisT_select <- list()
  k = 100 ##########################times of boottrap
  for (j in 1:k) {
  #j = 1
  print(paste0(n,"-",j))
  set.seed(2020*j)
  data_4 <- data_3
  data_4[, ncol(data_4)] <- sample(data_4[,ncol(data_4)],replace = F, size = nrow(data_4))
  #dim(data_321);table(data_321$endpoint)
    a <- data_4
    b <- NULL
    for (mm in 1:(ncol(a)-1)) {
      #mm=10
      #dim(a) #[1]  156 1025
      tox_taget <- a[which(a[,mm] == 1 & a[,ncol(a)] == 1), ]
      tox_non_taget <- a[which(a[,mm] == 0 & a[,ncol(a)] == 1), ]
      non_tox_target <- a[which(a[,mm] == 1 & a[,ncol(a)] == 0), ]
      non_tox_non_taget <- a[which(a[,mm] == 0 & a[,ncol(a)] == 0), ]
      #dim(tox_taget) #[1]  76 367
      #dim(tox_non_taget) #[1] 132 367
      #dim(non_tox_target) #[1] 164 367
      #dim(non_tox_non_taget) #[1] 380 367
      #nrow(tox_taget)
      ##################################fisher
      tox_target <- matrix(c(nrow(tox_taget), nrow(non_tox_target), nrow(tox_non_taget), nrow(non_tox_non_taget)),nrow = 2,
                           dimnames = list(tox = c("tox", "non_tox"),gene = c("tar", "nontar")))
      tox_target_pvalue <- fisher.test(tox_target, alternative = "two.sided")$p.value
      b <- c(b,tox_target_pvalue)
    }
    e <- data.frame(b,colnames(a)[1:(ncol(a)-1)])
    colnames(e) <- c("pvalue",colnames(a)[ncol(a)])
    head(e)
      #f <- e[which(e$pvalue < 0.05),]
    e$label <- j
    object_file_lisT_select[[j]] <- e
    #write.csv(f, paste0(output_file,Sys.Date(),"-",colnames(a)[ncol(a)],".csv"),row.names = F)
  }
  data_pvalue <- do.call("rbind",object_file_lisT_select)
  print(head(data_pvalue))
  print(table(data_pvalue$label))
  #write.csv(data_pvalue, paste0(output_file,Sys.Date(),"-bootstrap_fisher_exact_test_",gsub(".*.\\/","",dir_path_original[n])),row.names = F)
}

## [1]  50 109
##  [1] "ring.hetero_.6._Z_1."             "ring.hetero_.6._Z_1_2."          
##  [3] "ring.hetero_.6._Z_1_3."           "ring.hetero_.6._Z_1_4."          
##  [5] "ring.hetero_.6._Z_generic"        "ring.hetero_.6_6._N_isoquinoline"
##  [7] "ring.hetero_.6_6._N_quinazoline"  "ring.hetero_.6_6._N_quinoline"   
##  [9] "ring.hetero_.6_6._Z_generic"      "CYP3A4.outcome.1"                
## [1] "1-1"
## [1] "1-2"
## [1] "1-3"
## [1] "1-4"
## [1] "1-5"
## [1] "1-6"
## [1] "1-7"
## [1] "1-8"
## [1] "1-9"
## [1] "1-10"
## [1] "1-11"
## [1] "1-12"
## [1] "1-13"
## [1] "1-14"
## [1] "1-15"
## [1] "1-16"
## [1] "1-17"
## [1] "1-18"
## [1] "1-19"
## [1] "1-20"
## [1] "1-21"
## [1] "1-22"
## [1] "1-23"
## [1] "1-24"
## [1] "1-25"
## [1] "1-26"
## [1] "1-27"
## [1] "1-28"
## [1] "1-29"
## [1] "1-30"
## [1] "1-31"
## [1] "1-32"
## [1] "1-33"
## [1] "1-34"
## [1] "1-35"
## [1] "1-36"
## [1] "1-37"
## [1] "1-38"
## [1] "1-39"
## [1] "1-40"
## [1] "1-41"
## [1] "1-42"
## [1] "1-43"
## [1] "1-44"
## [1] "1-45"
## [1] "1-46"
## [1] "1-47"
## [1] "1-48"
## [1] "1-49"
## [1] "1-50"
## [1] "1-51"
## [1] "1-52"
## [1] "1-53"
## [1] "1-54"
## [1] "1-55"
## [1] "1-56"
## [1] "1-57"
## [1] "1-58"
## [1] "1-59"
## [1] "1-60"
## [1] "1-61"
## [1] "1-62"
## [1] "1-63"
## [1] "1-64"
## [1] "1-65"
## [1] "1-66"
## [1] "1-67"
## [1] "1-68"
## [1] "1-69"
## [1] "1-70"
## [1] "1-71"
## [1] "1-72"
## [1] "1-73"
## [1] "1-74"
## [1] "1-75"
## [1] "1-76"
## [1] "1-77"
## [1] "1-78"
## [1] "1-79"
## [1] "1-80"
## [1] "1-81"
## [1] "1-82"
## [1] "1-83"
## [1] "1-84"
## [1] "1-85"
## [1] "1-86"
## [1] "1-87"
## [1] "1-88"
## [1] "1-89"
## [1] "1-90"
## [1] "1-91"
## [1] "1-92"
## [1] "1-93"
## [1] "1-94"
## [1] "1-95"
## [1] "1-96"
## [1] "1-97"
## [1] "1-98"
## [1] "1-99"
## [1] "1-100"
##      pvalue                    CYP3A4.outcome.1 label
## 1 0.6497876       bond.C..O.N_carboxamide_.NHR.     1
## 2 1.0000000       bond.C..O.N_carboxamide_.NR2.     1
## 3 0.1399237     bond.C..O.N_carboxamide_generic     1
## 4 1.0000000 bond.C..O.O_carboxylicEster_alkenyl     1
## 5 1.0000000       bond.C.N_imine_N.connect_noZ.     1
## 6 1.0000000                    bond.C.N_nitrile     1
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
## 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
## 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
## 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
## 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
## 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108 108

############################################################
dir.create(paste0(dir_path,"4_final_results/"))

## Warning in dir.create(paste0(dir_path, "4_final_results/")): '/Users/xut2/
## Desktop/bootstrap/4_final_results' already exists

output_1 <- paste0(dir_path,"4_final_results/")
#stat_list <- list()
#for (i in 1:length(list_name)) {
  #i =3
  data_ori <- data_1
  data_ori$source <- NULL
  colnames(data_ori)[1:2] <- c("pvalue-ori","feature")
  data_bs <- data_pvalue
  colnames(data_bs)[1:2] <- c("pvalue-ran","feature")
  #head(data_ori); head(data_bs)
  #colnames(data_1);colnames(data_2)
  #print(c(dim(data_ori),dim(data_bs)))
  data_bs_ori <- merge(data_ori, data_bs, by = "feature", all = T)
  print(head(data_bs_ori,2))

##                         feature   pvalue-ori pvalue-ran label
## 1 bond.C..O.N_carboxamide_.NHR. 9.480633e-08  0.6497876     1
## 2 bond.C..O.N_carboxamide_.NHR. 9.480633e-08  0.3108724    61

  dim(data_bs_ori) #[1] 10800     5

## [1] 10800     4

  #print(i)
  data_bs_ori <- data_bs_ori[which(data_bs_ori$`pvalue-ori` < 0.05),]
  #dat_0.05 <- length(unique(data_bs_ori$endpoint)) #[1] 43
  data_bs_ori$difference <- data_bs_ori$`pvalue-ran`- data_bs_ori$`pvalue-ori`
  data_bs_ori$seq[data_bs_ori$difference < 0] <- 1
  data_bs_ori$seq[data_bs_ori$difference >= 0] <- 0
  table(data_bs_ori$seq)

## 
##     0     1 
## 10789    11

  head(data_bs_ori)

##                         feature   pvalue-ori pvalue-ran label difference seq
## 1 bond.C..O.N_carboxamide_.NHR. 9.480633e-08  0.6497876     1  0.6497875   0
## 2 bond.C..O.N_carboxamide_.NHR. 9.480633e-08  0.3108724    61  0.3108723   0
## 3 bond.C..O.N_carboxamide_.NHR. 9.480633e-08  0.6497876    82  0.6497875   0
## 4 bond.C..O.N_carboxamide_.NHR. 9.480633e-08  0.1625970   100  0.1625969   0
## 5 bond.C..O.N_carboxamide_.NHR. 9.480633e-08  1.0000000    91  0.9999999   0
## 6 bond.C..O.N_carboxamide_.NHR. 9.480633e-08  0.1625970    11  0.1625969   0

  #data_bs_ori<- data_bs_ori[which(data_bs_ori$difference > 0),]
  #data_bs_ori$label_1 <- 1
  data_14 <- aggregate(data_bs_ori["seq"],by = list(feature = data_bs_ori$feature,
                                                   `pvalue-ori` = data_bs_ori$`pvalue-ori`),
                                                   FUN = sum, na.rm = T)
  head(data_14[order(data_14$seq, decreasing = T),]); dim(data_14)

##                                              feature   pvalue-ori seq
## 86                     bond.CN_amine_ter.N_aliphatic 1.376862e-02   4
## 87                       bond.CN_amine_ter.N_generic 1.376862e-02   4
## 81               chain.alkaneLinear_ethyl_C2.H_gt_1. 8.394230e-03   3
## 1                      bond.CN_amine_sec.NH_aromatic 1.000000e-20   0
## 2                  bond.CX_halide_aromatic.X_generic 1.000000e-20   0
## 3  chain.aromaticAlkane_Ph.C1_acyclic_connect_H_gt_1 1.000000e-20   0

## [1] 108   3

  #unique(data_bs_ori$source)
  #data_14$label_1 <- 10 - data_14$label_1
  #View(data_14)
  data_15 <- data_14[which(data_14$seq < 3),]
  dim(data_15); head(data_15)

## [1] 105   3

##                                              feature pvalue-ori seq
## 1                      bond.CN_amine_sec.NH_aromatic      1e-20   0
## 2                  bond.CX_halide_aromatic.X_generic      1e-20   0
## 3  chain.aromaticAlkane_Ph.C1_acyclic_connect_H_gt_1      1e-20   0
## 4 chain.aromaticAlkane_Ph.C1_acyclic_connect_noDblBd      1e-20   0
## 5         chain.aromaticAlkane_Ph.C1_acyclic_generic      1e-20   0
## 6                                     CYP3A4.outcome      1e-20   0

  #View(data_15)
  #table(data_15$label_1)
  write.csv(data_15,paste0(output_1,Sys.Date(),"-bootstrap_fisher_exact_test_final_result.csv"),row.names = F)
  #write.csv(data_bs_ori,paste0(output_1,Sys.Date(),"-detail-",unique(dir_path_name_1)[i]),row.names = F)
  #stat_list[[i]] <- data.frame(list_name[i],length(unique(data_15$feature)))
#}
#data_stat <- do.call("rbind",stat_list)
#colnames(data_stat) <- c("source","bs_pvalue_0.05_num")
#write.csv(data_stat,paste0(output_1,Sys.Date(),"-stat_p_0.05_BS.csv"),row.names = F)

bootstrap-fisher-exact-test_100-times.R

xut2

2022-09-04