###########################################################
### Build Data for Comparative Toxicogenomics Database

# Link to many toxicogenomic databases:
# http://alttox.org/resource-center/databases/

# Link to CTD:
# nar.oxfordjournals.org/content/41/D1/D1104.full
# http://ctdbase.org/
#install.packages("devtools")
library(utils)
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.1 (2020-08-26 16:20:06 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.24.0 (2020-08-26 16:11:58 UTC) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
## 
##     throw
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## The following objects are masked from 'package:base':
## 
##     attach, detach, load, save
## R.utils v2.10.1 (2020-08-26 22:50:31 UTC) successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
## 
##     timestamp
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
##     warnings
#library(devtools)

# Download
# Fetch the gzipped chemical-gene interaction report from CTD into a temp file.
url <- "http://ctdbase.org/reports/CTD_chem_gene_ixns.csv.gz"
file.gz <- tempfile(fileext = ".gz")
# The report holds millions of rows, and R's default 60 s "timeout" option is
# often too short for a transfer of that size. Raise the limit for this
# download only and restore the previous value afterwards, even on error.
# base:: is spelled out because R.utils masks getOption().
old_timeout <- options(timeout = max(600, base::getOption("timeout")))
# mode = "wb" keeps the archive byte-exact on Windows.
status <- tryCatch(
  utils::download.file(url, file.gz, mode = "wb"),
  finally = options(old_timeout)
)
# download.file() reports some failures only through a non-zero return code.
if (status != 0) {
  stop("Download of ", url, " failed with status ", status, call. = FALSE)
}

# Unzip
# Decompress the archive into a second temp file that read.csv() can open.
# NOTE(review): gunzip() is called with its defaults, which per the R.utils
# docs also remove the source .gz after decompression - confirm if it is needed.
file.csv <- tempfile(fileext = ".csv")
R.utils::gunzip(filename = file.gz, destname = file.csv)

# Read
# The file opens with a banner of "#"-prefixed lines whose length changes
# between CTD releases: a fixed skip of 28 left one "#" line behind, which was
# parsed as a bogus first data row (the sample head() output pasted below was
# captured with that fixed skip). Count the leading run of "#" lines instead
# and skip exactly that many.
banner <- readLines(file.csv, n = 200L)
# cumprod() turns to 0 at the first non-"#" line, so only the leading run counts.
n_banner <- sum(cumprod(startsWith(banner, "#")))
ctd <- read.csv(file.csv, skip = n_banner, header = FALSE,
                stringsAsFactors = FALSE)
# The column header lives inside the skipped banner, so set the names by hand.
colnames(ctd) <-
  c("ChemicalName", "ChemicalID", "CasRN", "GeneSymbol", "GeneID", "GeneForms",
    "Organism", "OrganismID", "Interaction", "InteractionActions", "PubMedIDs")
head(ctd)
##   ChemicalName ChemicalID CasRN GeneSymbol GeneID GeneForms     Organism
## 1            #                                 NA                       
## 2     10074-G5    C534883               AR    367   protein Homo sapiens
## 3     10074-G5    C534883               AR    367   protein Homo sapiens
## 4     10074-G5    C534883               AR    367   protein Homo sapiens
## 5     10074-G5    C534883               AR    367   protein Homo sapiens
## 6     10074-G5    C534883            EPHB2   2048   protein Homo sapiens
##   OrganismID
## 1         NA
## 2       9606
## 3       9606
## 4       9606
## 5       9606
## 6       9606
##                                                                                                  Interaction
## 1                                                                                                           
## 2                  10074-G5 affects the reaction [MYC protein results in increased expression of AR protein]
## 3 10074-G5 inhibits the reaction [EPHB2 protein modified form results in increased expression of AR protein]
## 4                                                     10074-G5 results in decreased expression of AR protein
## 5                                    10074-G5 results in decreased expression of AR protein alternative form
## 6 10074-G5 inhibits the reaction [EPHB2 protein modified form results in increased expression of AR protein]
##                        InteractionActions PubMedIDs
## 1                                                  
## 2   affects^reaction|increases^expression  32184358
## 3 decreases^reaction|increases^expression  32184358
## 4                    decreases^expression  32184358
## 5                    decreases^expression  32184358
## 6 decreases^reaction|increases^expression  32184358
# Remove NAs
# Drop every row whose GeneID is NA, e.g. the leftover "#" banner row that
# appears as row 1 in the head() output above.
ctd <- ctd[!is.na(ctd$GeneID), ]
# Renumber the rows 1..n. NULL resets to automatic row names and, unlike
# 1:nrow(ctd), is also valid when the filter leaves no rows at all.
rownames(ctd) <- NULL
dim(ctd)
## [1] 2103174      11
# Save
# Show the class of the object that is about to be written.
class(ctd)
## [1] "data.frame"
# Write a tab-separated file, named after today's date, into the current
# working directory; row names are omitted.
out_put <- paste0(Sys.Date(), "-ctd_chem_gene.txt")
write.table(ctd, out_put, row.names = FALSE, sep = "\t")