###########################################################
### Build Data for Comparative Toxicogenomics Database
# Link to many toxicogenomic databases:
# http://alttox.org/resource-center/databases/
# Link to CTD:
# http://nar.oxfordjournals.org/content/41/D1/D1104.full
# http://ctdbase.org/
#install.packages("devtools")
library(utils)
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.1 (2020-08-26 16:20:06 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.24.0 (2020-08-26 16:11:58 UTC) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
##
## throw
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, load, save
## R.utils v2.10.1 (2020-08-26 22:50:31 UTC) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
## warnings
#library(devtools)
# Download
# Fetch the gzipped chemical-gene interaction report into a temporary file.
url <- "http://ctdbase.org/reports/CTD_chem_gene_ixns.csv.gz"
file.gz <- tempfile(fileext = ".gz")
# mode = "wb": the archive is binary, so it must never be written through
# text-mode newline translation (matters on Windows).
utils::download.file(url, file.gz, mode = "wb")
# Unzip
# gunzip() writes the decompressed CSV to `file.csv`.
file.csv <- tempfile(fileext = ".csv")
R.utils::gunzip(filename = file.gz, destname = file.csv)
# Read
# The file begins with a run of "#"-prefixed banner lines whose length is not
# fixed: a hard-coded `skip = 28` let a lone "#" line through as the first
# data row. Count the leading "#" lines and skip exactly that many instead.
banner <- readLines(file.csv, n = 1000L)
# cumprod() turns to 0 at the first non-"#" line, so the sum is the length of
# the leading run only (a "#" appearing later in the data is not counted).
n_banner <- sum(cumprod(startsWith(banner, "#")))
ctd <- read.csv(
  file.csv,
  skip = n_banner,
  header = FALSE,
  stringsAsFactors = FALSE
)
# The column header sits inside the skipped banner, so names are set by hand.
colnames(ctd) <- c(
  "ChemicalName", "ChemicalID", "CasRN", "GeneSymbol", "GeneID", "GeneForms",
  "Organism", "OrganismID", "Interaction", "InteractionActions", "PubMedIDs"
)
head(ctd)
## ChemicalName ChemicalID CasRN GeneSymbol GeneID GeneForms Organism
## 1 # NA
## 2 10074-G5 C534883 AR 367 protein Homo sapiens
## 3 10074-G5 C534883 AR 367 protein Homo sapiens
## 4 10074-G5 C534883 AR 367 protein Homo sapiens
## 5 10074-G5 C534883 AR 367 protein Homo sapiens
## 6 10074-G5 C534883 EPHB2 2048 protein Homo sapiens
## OrganismID
## 1 NA
## 2 9606
## 3 9606
## 4 9606
## 5 9606
## 6 9606
## Interaction
## 1
## 2 10074-G5 affects the reaction [MYC protein results in increased expression of AR protein]
## 3 10074-G5 inhibits the reaction [EPHB2 protein modified form results in increased expression of AR protein]
## 4 10074-G5 results in decreased expression of AR protein
## 5 10074-G5 results in decreased expression of AR protein alternative form
## 6 10074-G5 inhibits the reaction [EPHB2 protein modified form results in increased expression of AR protein]
## InteractionActions PubMedIDs
## 1
## 2 affects^reaction|increases^expression 32184358
## 3 decreases^reaction|increases^expression 32184358
## 4 decreases^expression 32184358
## 5 decreases^expression 32184358
## 6 decreases^reaction|increases^expression 32184358
# Remove NAs
# Drop rows whose GeneID is NA (e.g. a leftover banner line parsed as data).
ctd <- ctd[!is.na(ctd$GeneID), ]
# Renumber rows after filtering. seq_len() is used instead of 1:nrow(ctd),
# which would yield c(1, 0) and fail if the filter left zero rows.
rownames(ctd) <- seq_len(nrow(ctd))
dim(ctd)
## [1] 2103174 11
# Save
class(ctd)
## [1] "data.frame"
# Write a tab-separated file named with today's date into the working
# directory; row names are omitted.
out_put <- paste0(Sys.Date(), "-ctd_chem_gene.txt")
write.table(ctd, out_put, row.names = FALSE, sep = "\t")