rm(list = ls())
###############################input data 
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\tox21_data\\"
dir_path_name <- list.files(pattern = ".*zip",dir_path,full.names = T, recursive = F)
#dir_path_name
data_list_1 <- list()
for (i in 1:length(unique(dir_path_name))) {
  #i = 1
  #print(list.files(pattern = ".*zip",dir_path,full.names = F, recursive = F)[i])
  #print(unzip(dir_path_name[i], list = TRUE))
  print(i)
  data_1 <- unzip(dir_path_name[i], list = TRUE)
  class(data_1)
  data_1$source <- list.files(pattern = ".*zip",dir_path,full.names = F, recursive = F)[i]
  data_1$label <- i
  data_list_1[[i]] <- data_1
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 51
## [1] 52
## [1] 53
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 61
## [1] 62
## [1] 63
## [1] 64
## [1] 65
## [1] 66
## [1] 67
## [1] 68
## [1] 69
data_2 <- do.call("rbind", data_list_1)
head(data_2)
##                           Name   Length                Date
## 1             tox21-ahr-p1.txt 40878234 2019-03-21 11:04:00
## 2         tox21-ahr-p1.slp.doc    51712 2019-03-21 11:04:00
## 3 tox21-ahr-p1.description.txt     4788 2019-03-21 11:04:00
## 4 tox21-ahr-p1.aggregrated.txt  4071382 2019-03-21 11:04:00
## 5     tox21-ap1-agonist-p1.txt 75708679 2019-03-21 11:05:00
## 6 tox21-ap1-agonist-p1.slp.doc    54784 2019-03-21 11:05:00
##                     source label
## 1         tox21-ahr-p1.zip     1
## 2         tox21-ahr-p1.zip     1
## 3         tox21-ahr-p1.zip     1
## 4         tox21-ahr-p1.zip     1
## 5 tox21-ap1-agonist-p1.zip     2
## 6 tox21-ap1-agonist-p1.zip     2
write.csv(data_2, paste0(dir_path,Sys.Date(),"-","all_file_in_zip.csv"),row.names = FALSE)
##################
list_1 <- c(1:c(length(unique(dir_path_name)) - 1))[!1:c(length(unique(dir_path_name)) - 1) %in% c(51,53,61,62)]
data_list_2 <- list()
for (i in list_1 ) {
 #i = 2
print(i)
data_3 <- read.delim2(unz(dir_path_name[i],grep("aggregrated.txt", unzip(dir_path_name[i], list = TRUE)$Name, value = T)),header = F,quote="",stringsAsFactors = F)
data_4 <- data_3[, -as.numeric(which.max(colSums(is.na(data_3))))]
colnames(data_4) <- data_4[1,]
data_4 <- data_4[-1, ]
data_5 <- data_4[, c("SAMPLE_ID", "PROTOCOL_NAME", "SAMPLE_DATA_TYPE", "CURVE_RANK", "PURITY_RATING")]
data_list_2[[i]] <- data_5
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 52
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 63
## [1] 64
## [1] 65
## [1] 66
## [1] 67
## [1] 68
data_6 <- do.call("rbind",data_list_2 )
dim(data_6) #[1] 2007353       5
## [1] 2007353       5
table(data_6$SAMPLE_DATA_TYPE)
## 
##        100        653        657   activity    agonist antagonist    control 
##      10496      10496      10496     561719      29001      29001     322602 
##  flor 0 hr flor 16 hr flor 24 hr flor 32 hr flor 40 hr  flor 8 hr   glo 0 hr 
##      19334      19334      19334      19334      19334      19334      19334 
##  glo 16 hr  glo 24 hr  glo 32 hr  glo 40 hr   glo 8 hr      ratio     signal 
##      19334      19334      19334      19334      19334       9667     322602 
##  viability 
##     469265
unique(data_6$SAMPLE_DATA_TYPE)
##  [1] "activity"   "viability"  "signal"     "control"    "657"       
##  [6] "100"        "653"        "agonist"    "antagonist" "ratio"     
## [11] "flor 32 hr" "flor 40 hr" "flor 8 hr"  "glo 0 hr"   "glo 16 hr" 
## [16] "flor 0 hr"  "flor 16 hr" "flor 24 hr" "glo 24 hr"  "glo 32 hr" 
## [21] "glo 40 hr"  "glo 8 hr"
remove_cat <- c("activity","signal", "control", "agonist", "antagonist", "ratio")
data_7 <- data_6[!data_6$SAMPLE_DATA_TYPE %in% remove_cat, ]
unique(data_7$SAMPLE_DATA_TYPE)
##  [1] "viability"  "657"        "100"        "653"        "flor 32 hr"
##  [6] "flor 40 hr" "flor 8 hr"  "glo 0 hr"   "glo 16 hr"  "flor 0 hr" 
## [11] "flor 16 hr" "flor 24 hr" "glo 24 hr"  "glo 32 hr"  "glo 40 hr" 
## [16] "glo 8 hr"
dim(data_7) #[1] 732761      5
## [1] 732761      5
str(data_7)
## 'data.frame':    732761 obs. of  5 variables:
##  $ SAMPLE_ID       : chr  "NCGC00015096-07" "NCGC00015096-12" "NCGC00015099-05" "NCGC00015100-02" ...
##  $ PROTOCOL_NAME   : chr  "tox21-ahr-p1" "tox21-ahr-p1" "tox21-ahr-p1" "tox21-ahr-p1" ...
##  $ SAMPLE_DATA_TYPE: chr  "viability" "viability" "viability" "viability" ...
##  $ CURVE_RANK      : chr  "-2" "-2" "0" "0" ...
##  $ PURITY_RATING   : chr  "A" "A" "A" "A" ...
data_7$CURVE_RANK <- as.numeric(data_7$CURVE_RANK)
min(data_7$CURVE_RANK) #[1] -9
## [1] -9
max(data_7$CURVE_RANK) #[1] 9
## [1] 9
length(unique(data_7$SAMPLE_ID)) #[1] 13128
## [1] 13128
unique(data_7$PURITY_RATING)
##  [1] "A"   "Ac"  "Bc"  "F"   "Fc"  "Fns" "D"   "Z"   "C"   ""    "B"   "I"  
## [13] "M"   "FNS" "Cc"  "W"   "FC"  "ND"  "AC"  "CC"  "BC"
dim(data_7) #[1] 732761      5
## [1] 732761      5
data_7$PURITY_RATING <- toupper(data_7$PURITY_RATING)
qc <- c("A" , "AC", "BC","B", "C")
data_8 <- data_7[data_7$PURITY_RATING %in% qc,]
dim(data_8) #[1] 546297      5
## [1] 546297      5
unique(data_8$SAMPLE_DATA_TYPE)
##  [1] "viability"  "657"        "100"        "653"        "flor 32 hr"
##  [6] "flor 40 hr" "flor 8 hr"  "glo 0 hr"   "glo 16 hr"  "flor 0 hr" 
## [11] "flor 16 hr" "flor 24 hr" "glo 24 hr"  "glo 32 hr"  "glo 40 hr" 
## [16] "glo 8 hr"
unique(data_8$PURITY_RATING)
## [1] "A"  "AC" "BC" "C"  "B"
write.csv(data_8, paste0(dir_path,Sys.Date(),"-","viability_data_tox21.csv"),row.names = FALSE)
##################
data_8$PURITY_RATING <- NULL
head(data_8)
##             SAMPLE_ID PROTOCOL_NAME SAMPLE_DATA_TYPE CURVE_RANK
## 10498 NCGC00015096-07  tox21-ahr-p1        viability         -2
## 10499 NCGC00015096-12  tox21-ahr-p1        viability         -2
## 10500 NCGC00015099-05  tox21-ahr-p1        viability          0
## 10501 NCGC00015100-02  tox21-ahr-p1        viability          0
## 10502 NCGC00015116-10  tox21-ahr-p1        viability          0
## 10503 NCGC00015121-08  tox21-ahr-p1        viability          0
data_9 <- aggregate(data_8[4], list(SAMPLE_ID = data_8$SAMPLE_ID, 
                                     PROTOCOL_NAME = data_8$PROTOCOL_NAME,
                                     SAMPLE_DATA_TYPE = data_8$SAMPLE_DATA_TYPE),
                    FUN = mean, na.rm = T)

dim(data_9) #[1] 546297      4
## [1] 546297      4
head(data_9)
##         SAMPLE_ID PROTOCOL_NAME SAMPLE_DATA_TYPE CURVE_RANK
## 1 NCGC00013034-01 tox21-dt40-p1              100   -5.00000
## 2 NCGC00013042-01 tox21-dt40-p1              100   -9.00000
## 3 NCGC00013051-01 tox21-dt40-p1              100   -3.66667
## 4 NCGC00013051-05 tox21-dt40-p1              100   -9.00000
## 5 NCGC00013067-01 tox21-dt40-p1              100   -2.00000
## 6 NCGC00013109-01 tox21-dt40-p1              100    0.00000
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.0
data_10 <- spread(data_9, PROTOCOL_NAME, CURVE_RANK)
dim(data_10) #[1] 119726     53
## [1] 119726     53
length(unique(data_9$PROTOCOL_NAME)) #[1] 51
## [1] 51
##################################################
str(data_10)
## 'data.frame':    119726 obs. of  53 variables:
##  $ SAMPLE_ID                         : chr  "NCGC00013034-01" "NCGC00013034-01" "NCGC00013034-01" "NCGC00013034-01" ...
##  $ SAMPLE_DATA_TYPE                  : chr  "100" "653" "657" "viability" ...
##  $ tox21-ahr-p1                      : num  NA NA NA 0 NA NA NA -3 NA NA ...
##  $ tox21-ap1-agonist-p1              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-ar-bla-antagonist-p1        : num  NA NA NA 0.667 NA ...
##  $ tox21-ar-mda-kb2-luc-agonist-p3   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-ar-mda-kb2-luc-antagonist-p1: num  NA NA NA 0 NA NA NA -6 NA NA ...
##  $ tox21-ar-mda-kb2-luc-antagonist-p2: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-are-bla-p1                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-aromatase-p1                : num  NA NA NA 0 NA ...
##  $ tox21-car-agonist-p1              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-car-antagonist-p1           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-casp3-cho-p1                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-casp3-hepg2-p1              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-dt40-p1                     : num  -5 -4 -2.33 NA -9 ...
##  $ tox21-elg1-luc-agonist-p1         : num  NA NA NA 0 NA NA NA 0 NA NA ...
##  $ tox21-er-bla-antagonist-p1        : num  NA NA NA 0 NA NA NA 0 NA NA ...
##  $ tox21-er-luc-bg1-4e2-agonist-p4   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-er-luc-bg1-4e2-antagonist-p1: num  NA NA NA 0 NA NA NA 0 NA NA ...
##  $ tox21-er-luc-bg1-4e2-antagonist-p2: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-erb-bla-antagonist-p1       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-erb-bla-p1                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-err-p1                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-esre-bla-p1                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-fxr-bla-agonist-p2          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-fxr-bla-antagonist-p1       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-gh3-tre-antagonist-p1       : num  NA NA NA -1.33 NA ...
##  $ tox21-gr-hela-bla-antagonist-p1   : num  NA NA NA -1.33 NA ...
##  $ tox21-h2ax-cho-p2                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-hdac-p1                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-hre-bla-agonist-p1          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-hse-bla-p1                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-mitotox-p1                  : num  NA NA NA 0 NA NA NA 0 NA NA ...
##  $ tox21-nfkb-bla-agonist-p1         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-p53-bla-p1                  : num  NA NA NA -0.5 NA NA NA 0 NA NA ...
##  $ tox21-pgc-err-p1                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-ppard-bla-agonist-p1        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-ppard-bla-antagonist-p1     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-pparg-bla-antagonist-p1     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-pr-bla-agonist-p1           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-pr-bla-antagonist-p1        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-pxr-p1                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-rar-antagonist-p2           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-ror-cho-antagonist-p1       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-rt-viability-hek293-p1      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-rt-viability-hepg2-p1       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-rxr-bla-agonist-p1          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-sbe-bla-agonist-p1          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-sbe-bla-antagonist-p1       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-shh-3t3-gli3-agonist-p1     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-shh-3t3-gli3-antagonist-p1  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-vdr-bla-agonist-p1          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tox21-vdr-bla-antagonist-p1       : num  NA NA NA NA NA NA NA NA NA NA ...
colnames(data_10)
##  [1] "SAMPLE_ID"                          "SAMPLE_DATA_TYPE"                  
##  [3] "tox21-ahr-p1"                       "tox21-ap1-agonist-p1"              
##  [5] "tox21-ar-bla-antagonist-p1"         "tox21-ar-mda-kb2-luc-agonist-p3"   
##  [7] "tox21-ar-mda-kb2-luc-antagonist-p1" "tox21-ar-mda-kb2-luc-antagonist-p2"
##  [9] "tox21-are-bla-p1"                   "tox21-aromatase-p1"                
## [11] "tox21-car-agonist-p1"               "tox21-car-antagonist-p1"           
## [13] "tox21-casp3-cho-p1"                 "tox21-casp3-hepg2-p1"              
## [15] "tox21-dt40-p1"                      "tox21-elg1-luc-agonist-p1"         
## [17] "tox21-er-bla-antagonist-p1"         "tox21-er-luc-bg1-4e2-agonist-p4"   
## [19] "tox21-er-luc-bg1-4e2-antagonist-p1" "tox21-er-luc-bg1-4e2-antagonist-p2"
## [21] "tox21-erb-bla-antagonist-p1"        "tox21-erb-bla-p1"                  
## [23] "tox21-err-p1"                       "tox21-esre-bla-p1"                 
## [25] "tox21-fxr-bla-agonist-p2"           "tox21-fxr-bla-antagonist-p1"       
## [27] "tox21-gh3-tre-antagonist-p1"        "tox21-gr-hela-bla-antagonist-p1"   
## [29] "tox21-h2ax-cho-p2"                  "tox21-hdac-p1"                     
## [31] "tox21-hre-bla-agonist-p1"           "tox21-hse-bla-p1"                  
## [33] "tox21-mitotox-p1"                   "tox21-nfkb-bla-agonist-p1"         
## [35] "tox21-p53-bla-p1"                   "tox21-pgc-err-p1"                  
## [37] "tox21-ppard-bla-agonist-p1"         "tox21-ppard-bla-antagonist-p1"     
## [39] "tox21-pparg-bla-antagonist-p1"      "tox21-pr-bla-agonist-p1"           
## [41] "tox21-pr-bla-antagonist-p1"         "tox21-pxr-p1"                      
## [43] "tox21-rar-antagonist-p2"            "tox21-ror-cho-antagonist-p1"       
## [45] "tox21-rt-viability-hek293-p1"       "tox21-rt-viability-hepg2-p1"       
## [47] "tox21-rxr-bla-agonist-p1"           "tox21-sbe-bla-agonist-p1"          
## [49] "tox21-sbe-bla-antagonist-p1"        "tox21-shh-3t3-gli3-agonist-p1"     
## [51] "tox21-shh-3t3-gli3-antagonist-p1"   "tox21-vdr-bla-agonist-p1"          
## [53] "tox21-vdr-bla-antagonist-p1"
as.numeric(data_10[4,3:ncol(data_10)])
##  [1]  0.00000       NA  0.66667       NA  0.00000       NA       NA  0.00000
##  [9]       NA       NA       NA       NA       NA  0.00000  0.00000       NA
## [17]  0.00000       NA       NA       NA       NA       NA       NA       NA
## [25] -1.33333 -1.33333       NA       NA       NA       NA  0.00000       NA
## [33] -0.50000       NA       NA       NA       NA       NA       NA       NA
## [41]       NA       NA       NA       NA       NA       NA       NA       NA
## [49]       NA       NA       NA
#View(data_10)
rowSums(data_10[4,3:ncol(data_10)] < -1, na.rm = T) < 1
##     4 
## FALSE
data_11 <- data_10[rowSums(data_10[,3:ncol(data_10)] < -1, na.rm = T) < 1, ]
dim(data_11)
## [1] 95162    53
#View(data_11)
length(unique(data_11$SAMPLE_ID)) #9030
## [1] 9030
write.csv(data_11, paste0(dir_path,Sys.Date(),"-","viability_data_tox21_without_toxicity.csv"),row.names = FALSE)