- KEGG APIを使ってデータを取得し, ID対応表の作成や配列の取得を行う.
- ncbi gene2refseqやPubChemデータベースに対しても検索を行い. ID対応表を作成.
- ID対応表はそのままRでデータフレームに変換する.
- ID対応表はトランスクリプトームデータやメタボロームデータの解析に用いる.
wget -q -O - http://rest.kegg.jp/list/organism
-qでダウンロードの詳細情報を入れないようにする.-O -内容を標準出力に出す.# ダウンロード
org <- system("wget -q -O - http://rest.kegg.jp/list/organism", intern = T)
# データフレームに変換
org_dat <- data.frame(T_number = sapply(strsplit(org, "\t"), "[", 1),
organism_code = sapply(strsplit(org, "\t"), "[", 2),
species = sapply(strsplit(org, "\t"), "[", 3),
stringsAsFactors = F
)| T_number | organism_code | species |
|---|---|---|
| T01001 | hsa | Homo sapiens (human) |
| T01005 | ptr | Pan troglodytes (chimpanzee) |
| T02283 | pps | Pan paniscus (bonobo) |
| T02442 | ggo | Gorilla gorilla gorilla (western lowland gorilla) |
| T01416 | pon | Pongo abelii (Sumatran orangutan) |
| T03265 | nle | Nomascus leucogenys (northern white-cheeked gibbon) |
wget -q -O - http://rest.kegg.jp/list/pathway/cge # cgeは生物種コードwget -q -O - http://rest.kegg.jp/list/pathway # 全生物種の場合
# ダウンロード
pathway <- system("wget -q -O - http://rest.kegg.jp/list/pathway/cge", intern = T)
# データフレームに変換
map_dat <- data.frame(map = sapply(strsplit(pathway, "\t"), "[", 1),
name = sapply(strsplit(pathway, "\t"), "[", 2),
stringsAsFactors = F) | map | name |
|---|---|
| path:cge00010 | Glycolysis / Gluconeogenesis - Cricetulus griseus (Chinese hamster) |
| path:cge00020 | Citrate cycle (TCA cycle) - Cricetulus griseus (Chinese hamster) |
| path:cge00030 | Pentose phosphate pathway - Cricetulus griseus (Chinese hamster) |
wget -q -O - http://rest.kegg.jp/list/cge# ダウンロード
gene <- system("wget -q -O - http://rest.kegg.jp/list/cge", intern = T)
# データフレームに変換
gi_cge <-
data.frame(gene = sapply(strsplit(gene, "\t"), "[", 1),
description = sapply(strsplit(gene, "\t"), "[", 2),
stringsAsFactors = F
)| gene | description |
|---|---|
| cge:100682525 | Tp53; cellular tumor antigen p53 |
| cge:100682526 | Tuba1c; tubulin alpha-1C chain |
| cge:100682527 | Tuba1a; tubulin alpha-1A chain |
| cge:100682528 | Tuba1b; tubulin alpha-1B chain |
| cge:100682529 | Mgat1; alpha-1,3-mannosyl-glycoprotein 2-beta-N-acetylglucosaminyltransferase |
| cge:100682531 | Kif23; kinesin-like protein KIF23 |
wget -q -O - http://rest.kegg.jp/get/cge:100765387/ntseq # 塩基配列wget -q -O - http://rest.kegg.jp/get/cge:100765387/aaseq # アミノ酸配列+ で繋げてcge:100682525+cge:100682526+cge:100682527のようにしてもよい.
+で繋げて実行する.# idを10づつに分割してアミノ酸配列を取得する.
gi <- gi_cge$gene[1:29]
# id chunk per 10 seq
chunk <- function(x, n){
nchnk <- ceiling(length(x)/n)
split(x, cut(seq_along(x), nchnk, labels = FALSE))
}
chnk <- chunk(x = gi, n = 10)
# 10 seqづつwget
out_fna <- "~/pub/fasta/cge_29.fna"
fna <- file(out_fna, "w") # ファイルコネクションを書き込みモードで開く
# fasta配列を取得し, wireLinesで順次書き出し
for(i in seq_along(chnk)){
gids_chnk <- paste(chnk[[i]], collapse = "+")
com <- paste0("wget -q -O - http://rest.kegg.jp/get/", gids_chnk, "/ntseq")
fst <- system(com, wait = T, intern = T)
writeLines(fst, fna)
}
close(fna) # ファイルコネクション閉じるwget -q -O - http://rest.kegg.jp/link/pathway/cge# ダウンロード
genemap <- system("wget -q -O - http://rest.kegg.jp/link/pathway/cge", intern = T)
# データフレームに変換
map_cge <-
data.frame(gene = sapply(strsplit(genemap, "\t"), "[", 1),
map = sub("path:","",sapply(strsplit(genemap, "\t"), "[", 2)),
stringsAsFactors = F
)
# ユニークなGENE IDにして MAP IDを";"で結合
maps_cge <- map_cge %>% group_by(gene) %>% summarise(maps = paste(map, collapse = ";"))| gene | map |
|---|---|
| cge:100689064 | cge00010 |
| cge:100689106 | cge00010 |
| cge:100689437 | cge00010 |
| cge:100689467 | cge00010 |
| cge:100689468 | cge00010 |
| cge:100736557 | cge00010 |
| gene | maps |
|---|---|
| cge:100682525 | cge01522; cge01524; cge04010; cge04071; cge04110; cge04115; cge04137; cge04151; cge04210; cge04211; cge04216; cge04218; cge04310; cge04722; cge04919; cge05014; cge05016; cge05160; cge05161; cge05162; cge05163; cge05165; cge05166; cge05167; cge05168; cge05169; cge05200; cge05202; cge05203; cge05205; cge05206; cge05210; cge05212; cge05213; cge05214; cge05215; cge05216; cge05217; cge05218; cge05219; cge05220; cge05222; cge05223; cge05224; cge05225; cge05226; cge05230; cge05418 |
| cge:100682526 | cge04145; cge04210; cge04530; cge04540 |
| cge:100682527 | cge04145; cge04210; cge04530; cge04540 |
| cge:100682528 | cge04145; cge04210; cge04530; cge04540 |
| cge:100682529 | cge00510; cge01100 |
| cge:100682531 | cge05206 |
wget -q -O - http://rest.kegg.jp/link/ec/cge# ダウンロード
ec_cge <- system("wget -q -O - http://rest.kegg.jp/link/ec/cge", intern = T)
# データフレームに変換
ec_cge <- data.frame(gene = sapply(strsplit(ec_cge, "\t"), "[", 1),
ec = sapply(strsplit(ec_cge, "\t"), "[", 2),
stringsAsFactors = F
)
# 複数のenzyme codeが付いているものは";"で結合する.
ecs_cge <- ec_cge %>% group_by(gene) %>% summarise(ecs = paste(ec, collapse = "; "))| gene | ec |
|---|---|
| cge:100773215 | ec:2.3.2.23 |
| cge:100771631 | ec:2.7.11.1 |
| cge:100751045 | ec:1.16.1.6 |
| cge:100751045 | ec:2.5.1.151 |
| cge:100754178 | ec:3.6.4.12 |
| cge:100773468 | ec:3.2.2.6 |
| cge:100773468 | ec:2.4.99.20 |
| cge:100770686 | ec:3.2.1.17 |
| cge:100758766 | ec:3.1.3.48 |
| cge:100759427 | ec:3.6.4.13 |
| gene | ecs |
|---|---|
| cge:100682529 | ec:2.4.1.101 |
| cge:100682532 | ec:3.2.1.130 |
| cge:100682534 | ec:3.4.21.75 |
| cge:100682535 | ec:7.6.2.2 |
| cge:100682536 | ec:7.6.2.2 |
| cge:100682537 | ec:7.6.2.2 |
| cge:100684971 | ec:3.4.24.86 |
| cge:100689016 | ec:2.7.7.6 |
| cge:100689019 | ec:1.1.1.184; ec:1.1.1.189; ec:1.1.1.197 |
| cge:100689020 | ec:1.1.1.184; ec:1.1.1.189; ec:1.1.1.197 |
wget -q -O - http://rest.kegg.jp/conv/ncbi-proteinid/cge# ダウンロード
refp <- system("wget -q -O - http://rest.kegg.jp/conv/ncbi-proteinid/cge", intern = T)
# データフレームに変換
refp_cge <- data.frame(gene = sapply(strsplit(refp, "\t"), "[", 1),
ref.protein = sub("ncbi-proteinid:","",
sapply(strsplit(refp, "\t"), "[", 2)),
stringsAsFactors = F)
# length(unique(refp_cge$gene)) == nrow(refp_cge)| gene | ref.protein |
|---|---|
| cge:100682525 | NP_001230905 |
| cge:100682526 | NP_001230906 |
| cge:100682527 | NP_001230907 |
| cge:100682528 | NP_001230908 |
| cge:100682529 | NP_001230909 |
| cge:100682531 | NP_001230910 |
wget -q -O - http://rest.kegg.jp/conv/uniprot/cge# ダウンロード
unip <- system("wget -q -O - http://rest.kegg.jp/conv/uniprot/cge", intern = T)
# データフレームに変換
unip_cge <- data.frame(gene = sapply(strsplit(unip, "\t"), "[", 1),
upid = sapply(strsplit(unip, "\t"), "[", 2),
stringsAsFactors = F)
# 複数のUniProt IDが付いているものは";"で結合する.
unips_cge <- unip_cge %>% group_by(gene) %>% summarise(upids = paste(upid, collapse = "; "))| gene | upid |
|---|---|
| cge:100682525 | up:O09185 |
| cge:100682526 | up:P68365 |
| cge:100682527 | up:P68362 |
| cge:100682528 | up:P68361 |
| cge:100682529 | up:Q924C0 |
| cge:100682531 | up:Q60442 |
| cge:100682532 | up:A7RDN7 |
| cge:100682533 | up:Q9WVM7 |
| cge:100682533 | up:G3GVA8 |
| cge:100682534 | up:Q60426 |
| gene | upids |
|---|---|
| cge:100682525 | up:O09185 |
| cge:100682526 | up:P68365 |
| cge:100682527 | up:P68362 |
| cge:100682528 | up:P68361 |
| cge:100682529 | up:Q924C0 |
| cge:100682531 | up:Q60442 |
| cge:100682532 | up:A7RDN7 |
| cge:100682533 | up:Q9WVM7; up:G3GVA8 |
| cge:100682534 | up:Q60426 |
| cge:100682535 | up:P23174 |
# ダウンロード
ncbi <- system("wget -q -O - http://rest.kegg.jp/conv/ncbi-geneid/cge", intern = T)
# データフレームに変換
ncbi_cge <- data.frame(gene = sapply(strsplit(ncbi, "\t"), "[", 1),
ncbi_gi = sub("ncbi-geneid:", "", sapply(strsplit(ncbi, "\t"), "[", 2)),
stringsAsFactors = F)| gene | ncbi_gi |
|---|---|
| cge:100682525 | 100682525 |
| cge:100682526 | 100682526 |
| cge:100682527 | 100682527 |
| cge:100682528 | 100682528 |
| cge:100682529 | 100682529 |
| cge:100682531 | 100682531 |
wget -q -O - http://rest.kegg.jp/link/ko/cge# ダウンロード
ko <- system("wget -q -O - http://rest.kegg.jp/link/ko/cge", intern = T)
# データフレームに変換
ko_cge <- data.frame(gene = sapply(strsplit(ko, "\t"), "[", 1),
ko = sub("ko:","", sapply(strsplit(ko, "\t"), "[", 2)),
stringsAsFactors = F)
# 複数のKO IDが付いているものは";"で結合する.
kos_cge <- ko_cge %>% group_by(gene) %>% summarise(kos = paste(ko, collapse = ";"))
# kos_cge$kos[duplicated(kos_cge$kos)]
# kos_cge$gene[duplicated(kos_cge$gene)]
# ko_cge[ko_cge$gene == "cge:100770162",] # K00504, K18200| gene | kos |
|---|---|
| cge:100682525 | K04451 |
| cge:100682526 | K07374 |
| cge:100682527 | K07374 |
| cge:100682528 | K07374 |
| cge:100682529 | K00726 |
| cge:100682531 | K17387 |
readr::read_delimかdata.table::freadで読み込み.# # ファイルをダウンロード(サイズが大きいのでダウンロードしてから読み込む)
# download.file(url = "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz",
# destfile = "~/db/ncbi_gene/gene2refseq.gz",
# method = "wget")
# #
# g2ref <- data.table::fread("gunzip -c ~/db/ncbi_gene/gene2refseq.gz", sep = "\t", header = T ) # 24586020
# g2ref <- readr::read_delim("~/db/ncbi_gene/gene2refseq.gz", delim = "\t",
# col_type=paste(rep("c",16), collapse = ""),
# col_names=T)
# g2ref.cge <-subset(g2ref, g2ref$assembly=="Reference CriGri_1.0 Primary Assembly") # 45522
# write.table(g2ref.cge, "~/db/ncbi_gene/g2ref.cge", quote = F, sep="\t", col.names = T, row.names = F)
# 1列目の列名が#tax_idになっているので注意.
g2ref <- readr::read_delim("~/db/ncbi_gene/g2ref.cge", delim = "\t",
col_type = paste(rep("c",16), collapse = ""),
col_names = T)
names(g2ref)[1] <- sub("^#", "", names(g2ref)[1])
# 指定列取り出し
# "GeneID" "RNA_nucleotide_accession.version" "Symbol" # CriGri_1.0のみ(45522行)
g2ref.dat <- g2ref[,c(2,4,16)]
# RefIdを";"でつなぐ, 重複行を除去(28447)
refn <- tapply(g2ref.dat$RNA_nucleotide_accession.version, g2ref.dat$GeneID,
function(x){paste(x, collapse = ";")}) %>%
as.data.frame() %>%
tibble::rownames_to_column() %>%
setNames(., c("ncbi_gi", "ref.nuc"))
# ncbi_cgeとマージ
ncbi_ref <- left_join(ncbi_cge, refn, by = "ncbi_gi")| gene | ncbi_gi | ref.nuc |
|---|---|---|
| cge:100682525 | 100682525 | NM_001243976.1 |
| cge:100682526 | 100682526 | NM_001243977.1 |
| cge:100682527 | 100682527 | NA |
| cge:100682528 | 100682528 | NM_001243979.1 |
| cge:100682529 | 100682529 | XM_007644559.2; XM_007644560.2; XM_007644561.2; NM_001243980.1 |
| cge:100682531 | 100682531 | NM_001243981.1 |
dfs <- list(gi_cge, ecs_cge, ncbi_ref, refp_cge, unips_cge, kos_cge, maps_cge)
cge.kegg.ids <-
Reduce(function(x,y){merge(x,y, all.x = T, by = "gene")}, dfs)| gene | description | ecs | ncbi_gi | ref.nuc | ref.protein | upids | kos | maps |
|---|---|---|---|---|---|---|---|---|
| cge:100682526 | Tuba1c; tubulin alpha-1C chain | NA | 100682526 | NM_001243977.1 | NP_001230906 | up:P68365 | K07374 | cge04145; cge04210; cge04530; cge04540 |
| cge:100682527 | Tuba1a; tubulin alpha-1A chain | NA | 100682527 | NA | NP_001230907 | up:P68362 | K07374 | cge04145; cge04210; cge04530; cge04540 |
| cge:100682528 | Tuba1b; tubulin alpha-1B chain | NA | 100682528 | NM_001243979.1 | NP_001230908 | up:P68361 | K07374 | cge04145; cge04210; cge04530; cge04540 |
| cge:100682529 | Mgat1; alpha-1,3-mannosyl-glycoprotein 2-beta-N-acetylglucosaminyltransferase | ec:2.4.1.101 | 100682529 | XM_007644559.2; XM_007644560.2; XM_007644561.2; NM_001243980.1 | NP_001230909 | up:Q924C0 | K00726 | cge00510; cge01100 |
| cge:100682531 | Kif23; kinesin-like protein KIF23 | NA | 100682531 | NM_001243981.1 | NP_001230910 | up:Q60442 | K17387 | cge05206 |
wget -q -O - http://rest.kegg.jp/list/cpdwget -q -O - http://rest.kegg.jp/link/pathway/cpd# list/cpd
cpd <- system("wget -q -O - http://rest.kegg.jp/list/cpd", intern = T, wait = T)
cpdat <- data.frame(cid = sub("cpd:", "", sapply(strsplit(cpd, "\t"), "[", 1)),
description = sapply(strsplit(cpd, "\t"), "[", 2),
stringsAsFactors = F)
# link/pathway/cpd
cpdmap <- system("wget -q -O - http://rest.kegg.jp/link/pathway/cpd", intern = T, wait = T)
cpd_map <- data.frame(cid = sub("cpd:", "", sapply(strsplit(cpdmap, "\t"), "[", 1)),
map = sub("path:", "", sapply(strsplit(cpdmap, "\t"), "[", 2)),
stringsAsFactors = F)
# 代謝マップ上に載っている化合物
cpd_maps <- cpd_map %>%
group_by(cid) %>%
summarise(maps = paste(map, collapse = ";")) %>%
left_join(., cpdat, by = "cid")| cid | description |
|---|---|
| C00001 | H2O; Water |
| C00002 | ATP; Adenosine 5’-triphosphate |
| C00003 | NAD+; NAD; Nicotinamide adenine dinucleotide; DPN; Diphosphopyridine nucleotide; Nadide; beta-NAD+ |
| C00004 | NADH; DPNH; Reduced nicotinamide adenine dinucleotide |
| cid | map |
|---|---|
| C00022 | map00010 |
| C00024 | map00010 |
| C00031 | map00010 |
| C00033 | map00010 |
| cid | maps | description |
|---|---|---|
| C00001 | map00190; map00195; map00710; map01100; map01120; map04918; map04924; map04962; map04964; map04966; map04970; map04971; map04972; map04976; map05014 | H2O; Water |
| C00002 | map00190; map00195; map00230; map00231; map00908; map01100; map01110; map01130; map03070; map04020; map04066; map04080; map04142; map04217; map04611; map04621; map04714; map04721; map04742; map04750; map04911; map04917; map04924; map04930; map05012; map05133; map05410 | ATP; Adenosine 5’-triphosphate |
| C00003 | map00190; map00730; map00760; map00983; map01100; map04152; map04211; map04212; map04925; map04977 | NAD+; NAD; Nicotinamide adenine dinucleotide; DPN; Diphosphopyridine nucleotide; Nadide; beta-NAD+ |
| C00004 | map00190; map01100; map04020; map04714; map04925 | NADH; DPNH; Reduced nicotinamide adenine dinucleotide |
wget -q -O - http://rest.kegg.jp/conv/pubchem/cpd# KEGG COMPOUND ID と PubChem SID
cpdpub <- system("wget -q -O - http://rest.kegg.jp/conv/pubchem/cpd", intern = T, wait = T)
cpd_pub <- data.frame(
cid = sub("cpd:", "", sapply(strsplit(cpdpub, "\t"), "[", 1)),
pubchem_SID = sub("pubchem:", "", sapply(strsplit(cpdpub, "\t"), "[", 2)),
stringsAsFactors = F
)
# PubChem SIDとPubChem CID (PubChem Identifier Exchange Service からDL)
scid <- readr::read_delim("./pubchem_sid_cid.txt", "\t", col_types = "cc")
# データをマージ
keg_pubc <- dplyr::left_join(cpd_pub, scid, by = "pubchem_SID")
# KEGG COMPOUND IDの冗長性を確認
kegg_cpdup <- keg_pubc[unlist(rsko::dup(keg_pubc$pubchem_CID)),] %>%
left_join(., cpdat, by = "cid")| cid | pubchem_SID | pubchem_CID |
|---|---|---|
| C00001 | 3303 | 962 |
| C00002 | 3304 | 5957 |
| C00003 | 3305 | 5893 |
| C00004 | 3306 | 439153 |
| C00005 | 3307 | 5884 |
| C00006 | 3308 | 5886 |
| cid | pubchem_SID | pubchem_CID | description |
|---|---|---|---|
| C02061 | 5150 | 10219885 | Plastoquinone |
| C16694 | 51091016 | 10219885 | Plastoquinone-1 |
| C07911 | 10113 | 10297 | Phenylpropanolamine; (+-)-Phenylpropanolamine; (R,S)-(+-)-alpha-(1-Aminoethyl)benzenemethanol; (+-)-Norephedrine |
| C16719 | 96023243 | 10297 | (-)-Norephedrine; l-Phenylpropanolamine |
| C12452 | 582842 | 103008 | Acetyltropine; 3-Acetyltropine |
| C12453 | 582843 | 103008 | Acetylpseudotropine |
# REACTION idと説明
rn <- system("wget -q -O - http://rest.kegg.jp/list/rn/", intern = T, wait = T)
rndat <- data.frame(RID = sapply(strsplit(rn, "\t"), "[", 1),
Description = sapply(strsplit(rn, "\t"), "[", 2),
stringsAsFactors = F)| RID | Description |
|---|---|
| rn:R00001 | polyphosphate polyphosphohydrolase; Polyphosphate + n H2O <=> (n+1) Oligophosphate |
| rn:R00002 | Reduced ferredoxin:dinitrogen oxidoreductase (ATP-hydrolysing); 16 ATP + 16 H2O + 8 Reduced ferredoxin <=> 8 e- + 16 Orthophosphate + 16 ADP + 8 Oxidized ferredoxin |
| rn:R00004 | diphosphate phosphohydrolase; pyrophosphate phosphohydrolase; Diphosphate + H2O <=> 2 Orthophosphate |
wget -q -O - http://rest.kegg.jp/list/pathway/wget -q -O - http://rest.kegg.jp/link/rn/rn00010+rn00020# 全てのpathway idを取得する 570 pathways
pi <- system("wget -q -O - http://rest.kegg.jp/list/pathway/", intern = T, wait = T)
# idの変換
pw_dat <- do.call(rbind, strsplit(pi, "\t")) %>%
data.frame(., stringsAsFactors = F) %>%
setNames(., c("map_id","description")) %>%
mutate(map_id = sub("path:map", "rn", map_id))
# 10 idずづ結合
chunk <- function(x, n){
nchnk <- ceiling(seq_along(x)/n)
split(x, nchnk)
}
rnids <- lapply(chunk(pw_dat$map, 10), paste, collapse = "+")
# map idと reaction_idとの対応(reactionがないmapにはNA)
l_rct_id <- lapply(seq_along(rnids), function(i){
com <- paste0("wget -q -O - http://rest.kegg.jp/link/rn/", rnids[[i]])
system(com, intern = T, wait = T)
})
map_rids <- do.call(c, l_rct_id) %>%
strsplit(., "\t") %>%
{data.frame(do.call(rbind, .))} %>%
setNames(., c("map_id", "Rids")) %>%
mutate(map_id = sub("path:", "", map_id),
Rids = sub("rn:", "", Rids)) %>%
group_by(map_id) %>%
summarise(Rids = paste(Rids, collapse = "; ")) %>%
right_join(pw_dat, ., by = "map_id")| map_id | description | Rids |
|---|---|---|
| rn00010 | Glycolysis / Gluconeogenesis | R00014; R00200; R00229; R00235; R00341; R00431; R00658; R00703; R00710; R00711; R00726; R00746; R00754; R00755; R00947; R00959; R01015; R01058; R01061; R01063; R01070; R01196; R01512; R01516; R01518; R01600; R01602; R01662; R01786; R01788; R02073; R02187; R02189; R02569; R02738; R02739; R02740; R03270; R03321; R04394; R04779; R04780; R05132; R05133; R05134; R05198; R07159; R07618; R09084; R09085; R09086; R09127; R09479; R09532; R10860 |
| rn00020 | Citrate cycle (TCA cycle) | R00014; R00268; R00341; R00342; R00344; R00351; R00352; R00361; R00405; R00431; R00432; R00621; R00709; R00726; R00727; R01082; R01196; R01197; R01325; R01899; R01900; R02164; R02569; R02570; R03270; R03316; R07618; R10343 |
| rn00030 | Pentose phosphate pathway | R00305; R01049; R01051; R01056; R01057; R01058; R01066; R01070; R01519; R01520; R01521; R01522; R01528; R01529; R01538; R01541; R01544; R01621; R01641; R01737; R01739; R01741; R01827; R01830; R02032; R02034; R02035; R02036; R02073; R02658; R02736; R02739; R02740; R02749; R02750; R04779; R04780; R05338; R05605; R06620; R06836; R06837; R07147; R08570; R08571; R08572; R09084; R09780; R10221; R10324; R10407; R10408; R10615; R10860; R10907 |
wget -q -O - http://rest.kegg.jp/get/R00001ENTRY, NAME, DEFINITION, EQUATION, REMARK, COMMENT, RCLASS, ENZYME, PATHWAY, MODULE, ORTHOLOGY, REFERENCE;で結合する# ユニークなreaction id
# rn <- system("wget -q -O - http://rest.kegg.jp/list/rn/", intern = T, wait = T)
urid <- sub("rn:", "", sapply(strsplit(rn, "\t"), "[", 1))
# KEGG REACTIONのダウンロード(一部, 10づつ50エントリー)
chunk <- function(x, n){
nchnk <- ceiling(seq_along(x)/n)
split(x, nchnk)
}
urid_chnk <- lapply(chunk(urid[1:50], 10), paste, collapse = "+")
l_react <- vector("list", length(urid_chnk))
for (i in seq_along(urid_chnk)) {
com <- paste0("wget -q -O - http://rest.kegg.jp/get/", urid_chnk[[i]])
l_react[[i]] <- system(com, intern = T, wait = T)
}
# ベクトルの要素を特定の要素を基に分割
vsplt <- function(x, splt){
pos <- grep(splt, x)
splt_x <- split(x, findInterval(seq_along(x), pos))
res <- lapply(splt_x, function(x) x[!x %in% splt])
res[!sapply(res, identical, character(0))]
}
react <- vsplt(unlist(l_react), "///")
# check
react[[1]]## [1] "ENTRY R00001 Reaction"
## [2] "NAME polyphosphate polyphosphohydrolase"
## [3] "DEFINITION Polyphosphate + n H2O <=> (n+1) Oligophosphate"
## [4] "EQUATION C00404 + n C00001 <=> (n+1) C02174"
## [5] "ENZYME 3.6.1.10"
## [6] "DBLINKS RHEA: 22455"
# # tagの種類
# taglist <- lapply(react, function(x) {v <- sapply(strsplit(x, " "), "[", 1); v[v!=""]})
# tags <- unique(unlist(taglist))
tags <- c("ENTRY", "NAME", "DEFINITION", "EQUATION", "ENZYME", "DBLINKS", "COMMENT",
"RCLASS", "PATHWAY", "ORTHOLOGY", "MODULE", "REMARK", "REFERENCE")
# 各タグの情報を入れるベクトル
ENTRY <- character(); NAME <- character(); DEFINITION <- character();
EQUATION <- character(); REMARK <- character(); DBLINKS <- character();
COMMENT <- character(); RCLASS <- character(); ENZYME <- character();
PATHWAY <- character(); MODULE <- character(); ORTHOLOGY <- character();
REFERENCE <- character()
# 各レコードを処理
for (j in 1:length(react)) {
ent <- react[[j]]
# 指定タグが何行目にあるか (どのタグが複数行あるか)
ent_tags <- sapply(strsplit(ent, "[ ]"), "[", 1)
extag <- ent_tags[ent_tags != ""] # そのエントリ持っているタグの種類
extagst <- which(ent_tags != "") # それぞれのタグの開始行
extaglen <- diff(c(extagst, length(ent) + 1)) # それぞれのタグの記載行の長さ
extaged <- extagst + extaglen - 1 # それぞれのタグの終了行
# 記載無しのタグはNAを代入
n.tag <- match(tags, extag)
if (any(is.na(n.tag))) {
eval(parse(text = paste0(tags[is.na(n.tag)] , "[[j]] <- NA;")))
}
# 各タグごとに文字列を編集
for (i in 1:length(extaglen)) {
lines <- ent[extagst[i]:extaged[i]]
lines <- sub(paste0(extag[i], "[ ]+"), "", lines, fixed = F)
lines <- gsub("^[ ]+|[ ]+$","", lines)
# タグ別の編集
if (extag[i] == "ENTRY") {
Val <- unlist(strsplit(lines, split = "[ ]+"))[1]
} else if (extag[i] == "NAME") {
Val <- paste(lines, collapse = ";")
} else if (extag[i] == "DEFINITION") {
Val <- paste(lines, collapse = ";")
} else if (extag[i] == "EQUATION") {
Val <- paste(lines, collapse = ";")
} else if (extag[i] == "REMARK") {
Val <- paste(lines, collapse = ";")
} else if (extag[i] == "DBLINKS") {
Val <- paste(lines, collapse = ";")
} else if (extag[i] == "COMMENT") {
Val <- paste(lines, collapse = ";")
} else if (extag[i] == "RCLASS") {
Val <- paste(lines, collapse = ";")
} else if (extag[i] == "ENZYME") {
Val <- paste(lines, collapse = ";")
} else if (extag[i] == "PATHWAY") {
Val <- paste(lines, collapse = ";")
} else if (extag[i] == "MODULE") {
Val <- paste(lines, collapse = ";")
}else if (extag[i] == "ORTHOLOGY") {
Val <- paste(lines, collapse = ";")
}else if (extag[i] == "REFERENCE") {
Val <- paste(lines, collapse = " ")
}
# タグリストに代入
eval(parse(text = paste0(extag[i], "[[j]] <- Val")))
}
}
# データフレームに変換
react_dat <- data.frame(
cbind(ENTRY, NAME, DEFINITION, EQUATION,
REMARK, COMMENT, RCLASS, ENZYME,
PATHWAY, MODULE,ORTHOLOGY, DBLINKS, REFERENCE),
stringsAsFactors = F
)| ENTRY | NAME | DEFINITION | EQUATION | REMARK | COMMENT | RCLASS | ENZYME | PATHWAY | MODULE | ORTHOLOGY | DBLINKS | REFERENCE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| R00001 | polyphosphate polyphosphohydrolase | Polyphosphate + n H2O <=> (n+1) Oligophosphate | C00404 + n C00001 <=> (n+1) C02174 | NA | NA | NA | 3.6.1.10 | NA | NA | NA | RHEA: 22455 | NA |
| R00002 | Reduced ferredoxin:dinitrogen oxidoreductase (ATP-hydrolysing) | 16 ATP + 16 H2O + 8 Reduced ferredoxin <=> 8 e- + 16 Orthophosphate + 16 ADP + 8 Oxidized ferredoxin | 16 C00002 + 16 C00001 + 8 C00138 <=> 8 C05359 + 16 C00009 + 16 C00008 + 8 C00139 | NA | a part of multi-step reaction (see R05185, R00002+R00067+R00153+R02802+R04782) | RC00002 C00002_C00008 | 1.18.6.1 | NA | NA | NA | NA | NA |
| R00004 | diphosphate phosphohydrolase;;pyrophosphate phosphohydrolase | Diphosphate + H2O <=> 2 Orthophosphate | C00013 + C00001 <=> 2 C00009 | NA | NA | NA | 3.6.1.1 | NA | NA | NA | RHEA: 24579 | NA |
| R00005 | urea-1-carboxylate amidohydrolase | Urea-1-carboxylate + H2O <=> 2 CO2 + 2 Ammonia | C01010 + C00001 <=> 2 C00011 + 2 C00014 | NA | The yeast enzyme (but not that from green algae) also catalyses the;reaction of EC 6.3.4.6 urea carboxylase, thus bringing about the;hydrolysis of urea to CO2 and NH3 in the presence of ATP and;bicarbonate.;R00774 (6.3.4.6) | RC02756 C00011_C01010 | 3.5.1.54 | rn00220 Arginine biosynthesis;rn00791 Atrazine degradation;rn01100 Metabolic pathways;rn01120 Microbial metabolism in diverse environments | NA | K01457 allophanate hydrolase [EC:3.5.1.54];K14541 urea carboxylase / allophanate hydrolase [EC:6.3.4.6 3.5.1.54] | RHEA: 19032 | NA |
(n-1) C00001, C00046(n+m), 2 C01330(in), 2 C01330(out)# 酵素反応式を取り出す
options(stringsAsFactors = F)
# 左辺を分解
Lt <- react_dat %$%
{setNames(strsplit(sapply(strsplit(EQUATION, " <=> "), "[", 1), " \\+ "), ENTRY)} %>%
reshape2::melt(.) %>%
mutate_if(is.factor, as.character) %>%
setNames(., c("Cid", "Rid"))
Lc <- lapply(strsplit(Lt$Cid, " "), function(x){
if (length(x) != 1) {
gsub("\\(in\\)", "", x)
} else if (length(x) == 1) {
if (grepl("\\(.*\\)$", x) & !grepl("\\(in\\)$|\\(out\\)", x)) {
ai <- gsub("\\(|\\)", "", stringr::str_extract(x, "\\(.*\\)$"))
xi <- gsub("\\(.*\\)", "", x)
c(ai, xi)
} else if (grepl("\\(in\\)$", x)) {
xi <- gsub("\\(in\\)", "", x)
c(1, xi)
} else {
c(1, x)
}
}
}) %>%
{data.frame(do.call(rbind, .), stringsAsFactors = F)} %>%
setNames(., c("n", "Cid")) %>%
data.frame(Rid = Lt$Rid, ., equation = "left", stringsAsFactors = F)
# 右辺を分解
Rt <- react_dat %$%
{setNames(strsplit(sapply(strsplit(EQUATION, " <=> "), "[", 2), " \\+ "), ENTRY)} %>%
reshape2::melt(.) %>%
mutate_if(is.factor, as.character) %>%
setNames(., c("Cid", "Rid"))
Rc <- lapply(strsplit(Rt$Cid, " "), function(x){
if (length(x) != 1) {
gsub("\\(in\\)|\\(out\\)", "", x)
} else if (length(x) == 1) {
if (grepl("\\(.*\\)$", x) & !grepl("\\(in\\)$|\\(out\\)", x)) {
ai <- gsub("\\(|\\)", "", stringr::str_extract(x, "\\(.*\\)$"))
xi <- gsub("\\(.*\\)", "", x)
c(ai, xi)
} else if (grepl("\\(in\\)|\\(out\\)$", x)) {
xi <- gsub("\\(in\\)|\\(out\\)", "", x)
c(1, xi)
} else {
c(1, x)
}
}
}) %>%
{data.frame(do.call(rbind, .))} %>%
setNames(., c("n", "Cid")) %>%
data.frame(Rid = Rt$Rid, ., equation = "right", stringsAsFactors = F) | Rid | n | Cid | equation |
|---|---|---|---|
| R00001 | 1 | C00404 | left |
| R00001 | n | C00001 | left |
| R00002 | 16 | C00002 | left |
| R00002 | 16 | C00001 | left |
| R00002 | 8 | C00138 | left |
| R00004 | 1 | C00013 | left |
| Rid | n | Cid | equation |
|---|---|---|---|
| R00001 | (n+1) | C02174 | right |
| R00002 | 8 | C05359 | right |
| R00002 | 16 | C00009 | right |
| R00002 | 16 | C00008 | right |
| R00002 | 8 | C00139 | right |
| R00004 | 2 | C00009 | right |
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] stringr_1.4.0 magrittr_1.5 dplyr_0.8.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 rstudioapi_0.10 xml2_1.2.0
## [4] knitr_1.23 hms_0.4.2 munsell_0.5.0
## [7] rvest_0.3.4 tidyselect_0.2.5 viridisLite_0.3.0
## [10] colorspace_1.4-1 R6_2.4.0 rlang_0.4.0
## [13] rsko_0.1.0 highr_0.8 httr_1.4.0
## [16] plyr_1.8.4 tools_3.5.2 webshot_0.5.1
## [19] xfun_0.8 htmltools_0.3.6 yaml_2.2.0
## [22] assertthat_0.2.1 digest_0.6.19 tibble_2.1.3
## [25] crayon_1.3.4 kableExtra_1.1.0 reshape2_1.4.3
## [28] purrr_0.3.2 readr_1.3.1 glue_1.3.1
## [31] evaluate_0.14 rmarkdown_1.13 stringi_1.4.3
## [34] compiler_3.5.2 pillar_1.4.2 scales_1.0.0
## [37] pkgconfig_2.0.2