rm(list = ls())
dir_path <- "C:\\Users\\liyix\\OneDrive\\Desktop\\tox21_data\\"
dir_path_name <- list.files(pattern = ".*zip",dir_path,full.names = T, recursive = F)
data_list_1 <- list()
for (i in 1:length(unique(dir_path_name))) {
print(i)
data_1 <- unzip(dir_path_name[i], list = TRUE)
class(data_1)
data_1$source <- list.files(pattern = ".*zip",dir_path,full.names = F, recursive = F)[i]
data_1$label <- i
data_list_1[[i]] <- data_1
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 51
## [1] 52
## [1] 53
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 61
## [1] 62
## [1] 63
## [1] 64
## [1] 65
## [1] 66
## [1] 67
## [1] 68
## [1] 69
data_2 <- do.call("rbind", data_list_1)
head(data_2)
## Name Length Date
## 1 tox21-ahr-p1.txt 40878234 2019-03-21 11:04:00
## 2 tox21-ahr-p1.slp.doc 51712 2019-03-21 11:04:00
## 3 tox21-ahr-p1.description.txt 4788 2019-03-21 11:04:00
## 4 tox21-ahr-p1.aggregrated.txt 4071382 2019-03-21 11:04:00
## 5 tox21-ap1-agonist-p1.txt 75708679 2019-03-21 11:05:00
## 6 tox21-ap1-agonist-p1.slp.doc 54784 2019-03-21 11:05:00
## source label
## 1 tox21-ahr-p1.zip 1
## 2 tox21-ahr-p1.zip 1
## 3 tox21-ahr-p1.zip 1
## 4 tox21-ahr-p1.zip 1
## 5 tox21-ap1-agonist-p1.zip 2
## 6 tox21-ap1-agonist-p1.zip 2
write.csv(data_2, paste0(dir_path,Sys.Date(),"-","all_file_in_zip.csv"),row.names = FALSE)
list_1 <- c(1:c(length(unique(dir_path_name)) - 1))[!1:c(length(unique(dir_path_name)) - 1) %in% c(51,53,61,62)]
data_list_2 <- list()
for (i in list_1 ) {
print(i)
data_3 <- read.delim2(unz(dir_path_name[i],grep("aggregrated.txt", unzip(dir_path_name[i], list = TRUE)$Name, value = T)),header = F,quote="",stringsAsFactors = F)
data_4 <- data_3[, -as.numeric(which.max(colSums(is.na(data_3))))]
colnames(data_4) <- data_4[1,]
data_4 <- data_4[-1, ]
data_5 <- data_4[, c("SAMPLE_ID", "PROTOCOL_NAME", "SAMPLE_DATA_TYPE", "CURVE_RANK", "PURITY_RATING")]
data_list_2[[i]] <- data_5
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
## [1] 45
## [1] 46
## [1] 47
## [1] 48
## [1] 49
## [1] 50
## [1] 52
## [1] 54
## [1] 55
## [1] 56
## [1] 57
## [1] 58
## [1] 59
## [1] 60
## [1] 63
## [1] 64
## [1] 65
## [1] 66
## [1] 67
## [1] 68
data_6 <- do.call("rbind",data_list_2 )
dim(data_6)
## [1] 2007353 5
table(data_6$SAMPLE_DATA_TYPE)
##
## 100 653 657 activity agonist antagonist control
## 10496 10496 10496 561719 29001 29001 322602
## flor 0 hr flor 16 hr flor 24 hr flor 32 hr flor 40 hr flor 8 hr glo 0 hr
## 19334 19334 19334 19334 19334 19334 19334
## glo 16 hr glo 24 hr glo 32 hr glo 40 hr glo 8 hr ratio signal
## 19334 19334 19334 19334 19334 9667 322602
## viability
## 469265
unique(data_6$SAMPLE_DATA_TYPE)
## [1] "activity" "viability" "signal" "control" "657"
## [6] "100" "653" "agonist" "antagonist" "ratio"
## [11] "flor 32 hr" "flor 40 hr" "flor 8 hr" "glo 0 hr" "glo 16 hr"
## [16] "flor 0 hr" "flor 16 hr" "flor 24 hr" "glo 24 hr" "glo 32 hr"
## [21] "glo 40 hr" "glo 8 hr"
remove_cat <- c("activity","signal", "control", "agonist", "antagonist", "ratio")
data_7 <- data_6[!data_6$SAMPLE_DATA_TYPE %in% remove_cat, ]
unique(data_7$SAMPLE_DATA_TYPE)
## [1] "viability" "657" "100" "653" "flor 32 hr"
## [6] "flor 40 hr" "flor 8 hr" "glo 0 hr" "glo 16 hr" "flor 0 hr"
## [11] "flor 16 hr" "flor 24 hr" "glo 24 hr" "glo 32 hr" "glo 40 hr"
## [16] "glo 8 hr"
dim(data_7)
## [1] 732761 5
str(data_7)
## 'data.frame': 732761 obs. of 5 variables:
## $ SAMPLE_ID : chr "NCGC00015096-07" "NCGC00015096-12" "NCGC00015099-05" "NCGC00015100-02" ...
## $ PROTOCOL_NAME : chr "tox21-ahr-p1" "tox21-ahr-p1" "tox21-ahr-p1" "tox21-ahr-p1" ...
## $ SAMPLE_DATA_TYPE: chr "viability" "viability" "viability" "viability" ...
## $ CURVE_RANK : chr "-2" "-2" "0" "0" ...
## $ PURITY_RATING : chr "A" "A" "A" "A" ...
data_7$CURVE_RANK <- as.numeric(data_7$CURVE_RANK)
min(data_7$CURVE_RANK)
## [1] -9
max(data_7$CURVE_RANK)
## [1] 9
length(unique(data_7$SAMPLE_ID))
## [1] 13128
unique(data_7$PURITY_RATING)
## [1] "A" "Ac" "Bc" "F" "Fc" "Fns" "D" "Z" "C" "" "B" "I"
## [13] "M" "FNS" "Cc" "W" "FC" "ND" "AC" "CC" "BC"
dim(data_7)
## [1] 732761 5
data_7$PURITY_RATING <- toupper(data_7$PURITY_RATING)
qc <- c("A" , "AC", "BC","B", "C")
data_8 <- data_7[data_7$PURITY_RATING %in% qc,]
dim(data_8)
## [1] 546297 5
unique(data_8$SAMPLE_DATA_TYPE)
## [1] "viability" "657" "100" "653" "flor 32 hr"
## [6] "flor 40 hr" "flor 8 hr" "glo 0 hr" "glo 16 hr" "flor 0 hr"
## [11] "flor 16 hr" "flor 24 hr" "glo 24 hr" "glo 32 hr" "glo 40 hr"
## [16] "glo 8 hr"
unique(data_8$PURITY_RATING)
## [1] "A" "AC" "BC" "C" "B"
write.csv(data_8, paste0(dir_path,Sys.Date(),"-","viability_data_tox21.csv"),row.names = FALSE)
data_8$PURITY_RATING <- NULL
head(data_8)
## SAMPLE_ID PROTOCOL_NAME SAMPLE_DATA_TYPE CURVE_RANK
## 10498 NCGC00015096-07 tox21-ahr-p1 viability -2
## 10499 NCGC00015096-12 tox21-ahr-p1 viability -2
## 10500 NCGC00015099-05 tox21-ahr-p1 viability 0
## 10501 NCGC00015100-02 tox21-ahr-p1 viability 0
## 10502 NCGC00015116-10 tox21-ahr-p1 viability 0
## 10503 NCGC00015121-08 tox21-ahr-p1 viability 0
data_9 <- aggregate(data_8[4], list(SAMPLE_ID = data_8$SAMPLE_ID,
PROTOCOL_NAME = data_8$PROTOCOL_NAME,
SAMPLE_DATA_TYPE = data_8$SAMPLE_DATA_TYPE),
FUN = mean, na.rm = T)
dim(data_9)
## [1] 546297 4
head(data_9)
## SAMPLE_ID PROTOCOL_NAME SAMPLE_DATA_TYPE CURVE_RANK
## 1 NCGC00013034-01 tox21-dt40-p1 100 -5.00000
## 2 NCGC00013042-01 tox21-dt40-p1 100 -9.00000
## 3 NCGC00013051-01 tox21-dt40-p1 100 -3.66667
## 4 NCGC00013051-05 tox21-dt40-p1 100 -9.00000
## 5 NCGC00013067-01 tox21-dt40-p1 100 -2.00000
## 6 NCGC00013109-01 tox21-dt40-p1 100 0.00000
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.0
data_10 <- spread(data_9, PROTOCOL_NAME, CURVE_RANK)
dim(data_10)
## [1] 119726 53
length(unique(data_9$PROTOCOL_NAME))
## [1] 51
str(data_10)
## 'data.frame': 119726 obs. of 53 variables:
## $ SAMPLE_ID : chr "NCGC00013034-01" "NCGC00013034-01" "NCGC00013034-01" "NCGC00013034-01" ...
## $ SAMPLE_DATA_TYPE : chr "100" "653" "657" "viability" ...
## $ tox21-ahr-p1 : num NA NA NA 0 NA NA NA -3 NA NA ...
## $ tox21-ap1-agonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-ar-bla-antagonist-p1 : num NA NA NA 0.667 NA ...
## $ tox21-ar-mda-kb2-luc-agonist-p3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-ar-mda-kb2-luc-antagonist-p1: num NA NA NA 0 NA NA NA -6 NA NA ...
## $ tox21-ar-mda-kb2-luc-antagonist-p2: num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-are-bla-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-aromatase-p1 : num NA NA NA 0 NA ...
## $ tox21-car-agonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-car-antagonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-casp3-cho-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-casp3-hepg2-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-dt40-p1 : num -5 -4 -2.33 NA -9 ...
## $ tox21-elg1-luc-agonist-p1 : num NA NA NA 0 NA NA NA 0 NA NA ...
## $ tox21-er-bla-antagonist-p1 : num NA NA NA 0 NA NA NA 0 NA NA ...
## $ tox21-er-luc-bg1-4e2-agonist-p4 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-er-luc-bg1-4e2-antagonist-p1: num NA NA NA 0 NA NA NA 0 NA NA ...
## $ tox21-er-luc-bg1-4e2-antagonist-p2: num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-erb-bla-antagonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-erb-bla-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-err-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-esre-bla-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-fxr-bla-agonist-p2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-fxr-bla-antagonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-gh3-tre-antagonist-p1 : num NA NA NA -1.33 NA ...
## $ tox21-gr-hela-bla-antagonist-p1 : num NA NA NA -1.33 NA ...
## $ tox21-h2ax-cho-p2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-hdac-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-hre-bla-agonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-hse-bla-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-mitotox-p1 : num NA NA NA 0 NA NA NA 0 NA NA ...
## $ tox21-nfkb-bla-agonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-p53-bla-p1 : num NA NA NA -0.5 NA NA NA 0 NA NA ...
## $ tox21-pgc-err-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-ppard-bla-agonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-ppard-bla-antagonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-pparg-bla-antagonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-pr-bla-agonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-pr-bla-antagonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-pxr-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-rar-antagonist-p2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-ror-cho-antagonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-rt-viability-hek293-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-rt-viability-hepg2-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-rxr-bla-agonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-sbe-bla-agonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-sbe-bla-antagonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-shh-3t3-gli3-agonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-shh-3t3-gli3-antagonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-vdr-bla-agonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ tox21-vdr-bla-antagonist-p1 : num NA NA NA NA NA NA NA NA NA NA ...
colnames(data_10)
## [1] "SAMPLE_ID" "SAMPLE_DATA_TYPE"
## [3] "tox21-ahr-p1" "tox21-ap1-agonist-p1"
## [5] "tox21-ar-bla-antagonist-p1" "tox21-ar-mda-kb2-luc-agonist-p3"
## [7] "tox21-ar-mda-kb2-luc-antagonist-p1" "tox21-ar-mda-kb2-luc-antagonist-p2"
## [9] "tox21-are-bla-p1" "tox21-aromatase-p1"
## [11] "tox21-car-agonist-p1" "tox21-car-antagonist-p1"
## [13] "tox21-casp3-cho-p1" "tox21-casp3-hepg2-p1"
## [15] "tox21-dt40-p1" "tox21-elg1-luc-agonist-p1"
## [17] "tox21-er-bla-antagonist-p1" "tox21-er-luc-bg1-4e2-agonist-p4"
## [19] "tox21-er-luc-bg1-4e2-antagonist-p1" "tox21-er-luc-bg1-4e2-antagonist-p2"
## [21] "tox21-erb-bla-antagonist-p1" "tox21-erb-bla-p1"
## [23] "tox21-err-p1" "tox21-esre-bla-p1"
## [25] "tox21-fxr-bla-agonist-p2" "tox21-fxr-bla-antagonist-p1"
## [27] "tox21-gh3-tre-antagonist-p1" "tox21-gr-hela-bla-antagonist-p1"
## [29] "tox21-h2ax-cho-p2" "tox21-hdac-p1"
## [31] "tox21-hre-bla-agonist-p1" "tox21-hse-bla-p1"
## [33] "tox21-mitotox-p1" "tox21-nfkb-bla-agonist-p1"
## [35] "tox21-p53-bla-p1" "tox21-pgc-err-p1"
## [37] "tox21-ppard-bla-agonist-p1" "tox21-ppard-bla-antagonist-p1"
## [39] "tox21-pparg-bla-antagonist-p1" "tox21-pr-bla-agonist-p1"
## [41] "tox21-pr-bla-antagonist-p1" "tox21-pxr-p1"
## [43] "tox21-rar-antagonist-p2" "tox21-ror-cho-antagonist-p1"
## [45] "tox21-rt-viability-hek293-p1" "tox21-rt-viability-hepg2-p1"
## [47] "tox21-rxr-bla-agonist-p1" "tox21-sbe-bla-agonist-p1"
## [49] "tox21-sbe-bla-antagonist-p1" "tox21-shh-3t3-gli3-agonist-p1"
## [51] "tox21-shh-3t3-gli3-antagonist-p1" "tox21-vdr-bla-agonist-p1"
## [53] "tox21-vdr-bla-antagonist-p1"
as.numeric(data_10[4,3:ncol(data_10)])
## [1] 0.00000 NA 0.66667 NA 0.00000 NA NA 0.00000
## [9] NA NA NA NA NA 0.00000 0.00000 NA
## [17] 0.00000 NA NA NA NA NA NA NA
## [25] -1.33333 -1.33333 NA NA NA NA 0.00000 NA
## [33] -0.50000 NA NA NA NA NA NA NA
## [41] NA NA NA NA NA NA NA NA
## [49] NA NA NA
rowSums(data_10[4,3:ncol(data_10)] < -1, na.rm = T) < 1
## 4
## FALSE
data_11 <- data_10[rowSums(data_10[,3:ncol(data_10)] < -1, na.rm = T) < 1, ]
dim(data_11)
## [1] 95162 53
length(unique(data_11$SAMPLE_ID))
## [1] 9030
write.csv(data_11, paste0(dir_path,Sys.Date(),"-","viability_data_tox21_without_toxicity.csv"),row.names = FALSE)