#BiocManager::install("KEGGREST")
library(KEGGREST)
listDatabases()
## [1] "pathway" "brite" "module" "ko" "genome" "vg"
## [7] "ag" "compound" "glycan" "reaction" "rclass" "enzyme"
## [13] "disease" "drug" "dgroup" "environ" "genes" "ligand"
## [19] "kegg"
####You can obtain the list of organisms available in KEGG with the keggList() function:
org <- keggList("organism")
head(org)
## T.number organism species
## [1,] "T01001" "hsa" "Homo sapiens (human)"
## [2,] "T01005" "ptr" "Pan troglodytes (chimpanzee)"
## [3,] "T02283" "pps" "Pan paniscus (bonobo)"
## [4,] "T02442" "ggo" "Gorilla gorilla gorilla (western lowland gorilla)"
## [5,] "T01416" "pon" "Pongo abelii (Sumatran orangutan)"
## [6,] "T03265" "nle" "Nomascus leucogenys (northern white-cheeked gibbon)"
## phylogeny
## [1,] "Eukaryotes;Animals;Vertebrates;Mammals"
## [2,] "Eukaryotes;Animals;Vertebrates;Mammals"
## [3,] "Eukaryotes;Animals;Vertebrates;Mammals"
## [4,] "Eukaryotes;Animals;Vertebrates;Mammals"
## [5,] "Eukaryotes;Animals;Vertebrates;Mammals"
## [6,] "Eukaryotes;Animals;Vertebrates;Mammals"
###Therefore, the complete list of entities that can be queried with KEGGREST can be obtained as follows:
queryables <- c(listDatabases(), org[,1], org[,2])
class(queryables)
## [1] "character"
length(queryables)
## [1] 14167
queryables[1:10]
## [1] "pathway" "brite" "module" "ko" "genome" "vg"
## [7] "ag" "compound" "glycan" "reaction"
###You could also ask for every entry in the “hsa” (Homo sapiens) database as follows:
hsa <- keggList("hsa")
class(keggList("hsa"))
## [1] "character"
length(keggList("hsa"))
## [1] 22208
hsa[1:10]
## hsa:100526771
## "SMIM35, TMPRSS4-AS1; small integral membrane protein 35"
## hsa:100302174
## "MIR1307, MIRN1307, hsa-mir-1307, mir-1307; microRNA 1307"
## hsa:100423038
## "MIR466, hsa-mir-466; microRNA 466"
## hsa:100616498
## "MIR378E, mir-378e; microRNA 378e"
## hsa:442913
## "MIR376C, MIR368, MIRN368, MIRN376C, hsa-mir-368, miRNA368, mir-376c; microRNA 376c"
## hsa:100302179
## "MIR1270, MIR1270-1, MIR1270-2, MIRN1270, hsa-mir-1270, hsa-mir-1270-1, mir-1270; microRNA 1270"
## hsa:664616
## "MIR487B, MIRN487B, mir-487b; microRNA 487b"
## hsa:643862
## "SPDYE10P; speedy/RINGO cell cycle regulator family member E10, pseudogene"
## hsa:574461
## "MIR520E, MIRN520E; microRNA 520e"
## hsa:100616159
## "MIR4779; microRNA 4779"
hsa_1 <- data.frame(hsa)
dim(hsa_1)
## [1] 22208 1
head(hsa_1)
## hsa
## hsa:100526771 SMIM35, TMPRSS4-AS1; small integral membrane protein 35
## hsa:100302174 MIR1307, MIRN1307, hsa-mir-1307, mir-1307; microRNA 1307
## hsa:100423038 MIR466, hsa-mir-466; microRNA 466
## hsa:100616498 MIR378E, mir-378e; microRNA 378e
## hsa:442913 MIR376C, MIR368, MIRN368, MIRN376C, hsa-mir-368, miRNA368, mir-376c; microRNA 376c
## hsa:100302179 MIR1270, MIR1270-1, MIR1270-2, MIRN1270, hsa-mir-1270, hsa-mir-1270-1, mir-1270; microRNA 1270
#View(hsa_1)
###Get specific entries with keggGet()
#Once you have a list of specific KEGG identifiers, use keggGet() to get more information about them.
#Here we look up a human gene and an E. coli O157 gene:
query <- keggGet(c("hsa:10458", "ece:Z5100"))
#query
length(query)
## [1] 2
keggGet(c("hsa:43"))
## [[1]]
## [[1]]$ENTRY
## CDS
## "43"
##
## [[1]]$NAME
## [1] "ACHE, ACEE, ARACHE, N-ACHE, YT"
##
## [[1]]$DEFINITION
## [1] "(RefSeq) acetylcholinesterase (Cartwright blood group)"
##
## [[1]]$ORTHOLOGY
## K01049
## "acetylcholinesterase [EC:3.1.1.7]"
##
## [[1]]$ORGANISM
## hsa
## "Homo sapiens (human)"
##
## [[1]]$PATHWAY
## hsa00564 hsa04725
## "Glycerophospholipid metabolism" "Cholinergic synapse"
##
## [[1]]$DRUG_TARGET
## [1] "Acotiamide hydrochloride: D08838<JP>"
## [2] "Ambenonium (DG00990): D01001<JP>"
## [3] "Demecarium: D00667"
## [4] "Dimetacrine: D02565"
## [5] "Distigmine (DG00989): D01228<JP>"
## [6] "Donepezil (DG00983): D00670<JP/US> D07869"
## [7] "Ecothiopate (DG01131): D02193<US>"
## [8] "Edrophonium: D00994<JP>"
## [9] "Fluostigmine: D00043"
## [10] "Galantamine (DG00985): D02173<JP/US> D04292"
## [11] "Gallamine: D02292"
## [12] "Icopezil maleate: D03751"
## [13] "Ipidacrine: D09750"
## [14] "Itopride (DG01395): D02729<JP> D08094"
## [15] "Ladostigil tartrate: D03239"
## [16] "Latrepirdine (DG01379): D09917 D09918"
## [17] "Neostigmine (DG00987): D00995<JP> D00998<JP/US> D08261"
## [18] "Obidoxime: D05215"
## [19] "Paraoxon: D10529"
## [20] "Physostigmine (DG01132): D00196 D02418 D03826"
## [21] "Pralidoxime (DG01153): D00469<US> D01572<JP> D05590"
## [22] "Pyridostigmine (DG00988): D00487<JP/US>"
## [23] "Quilostigmine: D03823"
## [24] "Rivastigmine (DG00984): D02558<US> D03822<JP/US>"
## [25] "Suronacrine maleate: D05981"
## [26] "Tacrine (DG00982): D02068 D08555"
## [27] "Velnacrine maleate: D06288"
## [28] "Zanapezil fumarate: D09835"
## [29] "Zifrosilone: D06365"
##
## [[1]]$BRITE
## [1] "KEGG Orthology (KO) [BR:hsa00001]"
## [2] " 09100 Metabolism"
## [3] " 09103 Lipid metabolism"
## [4] " 00564 Glycerophospholipid metabolism"
## [5] " 43 (ACHE)"
## [6] " 09150 Organismal Systems"
## [7] " 09156 Nervous system"
## [8] " 04725 Cholinergic synapse"
## [9] " 43 (ACHE)"
## [10] " 09180 Brite Hierarchies"
## [11] " 09183 Protein families: signaling and cellular processes"
## [12] " 00537 Glycosylphosphatidylinositol (GPI)-anchored proteins [BR:hsa00537]"
## [13] " 43 (ACHE)"
## [14] "Enzymes [BR:hsa01000]"
## [15] " 3. Hydrolases"
## [16] " 3.1 Acting on ester bonds"
## [17] " 3.1.1 Carboxylic-ester hydrolases"
## [18] " 3.1.1.7 acetylcholinesterase"
## [19] " 43 (ACHE)"
## [20] "Glycosylphosphatidylinositol (GPI)-anchored proteins [BR:hsa00537]"
## [21] " Enzymes"
## [22] " 43 (ACHE)"
##
## [[1]]$POSITION
## [1] "7q22.1"
##
## [[1]]$MOTIF
## [1] "Pfam: COesterase AChE_tetra Abhydrolase_3"
##
## [[1]]$DBLINKS
## [1] "NCBI-GeneID: 43" "NCBI-ProteinID: NP_000656"
## [3] "OMIM: 100740" "HGNC: 108"
## [5] "Ensembl: ENSG00000087085" "Vega: OTTHUMG00000157033"
## [7] "Pharos: P22303(Tclin)" "UniProt: P22303"
##
## [[1]]$STRUCTURE
## [1] "PDB: 4M0E 6O69 6O5V 5HF5 4EY4 5HF9 5HFA 6CQZ 6CQT 6CQW 6O4X"
## [2] " 5HF6 4M0F 4EY5 6CQU 6O4W 6O50 4EY7 1VZJ 6CQX 5FPQ 4EY6"
## [3] " 6NEA 6O66 6CQY 6CQV 4EY8 1B41 6O5R 6O5S 5HF8 4PQE 1F8U"
## [4] " 2X8B 6F25 4BDT 6O52 3LII"
##
## [[1]]$AASEQ
## AAStringSet object of length 1:
## width seq
## [1] 614 MRPPQCLLHTPSLASPLLLLLLWLLGGGVGAEGR...RQWKAEFHRWSSYMVHWKNQFDHYSKQDRCSDL
##
## [[1]]$NTSEQ
## DNAStringSet object of length 1:
## width seq
## [1] 1845 ATGAGGCCCCCGCAGTGTCTGCTGCACACGCCTT...TACAGCAAGCAGGATCGCTGCTCAGACCTGTGA
###Behind the scenes, KEGGREST downloaded and parsed a KEGG flat file, which you can now explore:
names(query[[1]])
## [1] "ENTRY" "NAME" "DEFINITION" "ORTHOLOGY" "ORGANISM"
## [6] "PATHWAY" "NETWORK" "BRITE" "POSITION" "MOTIF"
## [11] "DBLINKS" "STRUCTURE" "AASEQ" "NTSEQ"
query[[1]]$ENTRY
## CDS
## "10458"
query[[1]]$DBLINKS
## [1] "NCBI-GeneID: 10458" "NCBI-ProteinID: NP_059345"
## [3] "OMIM: 605475" "HGNC: 947"
## [5] "Ensembl: ENSG00000175866" "Vega: OTTHUMG00000177698"
## [7] "Pharos: Q9UQB8(Tbio)" "UniProt: Q9UQB8"
###keggGet() can also return amino acid sequences as AAStringSet objects (from the Biostrings package):
keggGet(c("hsa:10458", "ece:Z5100"), "aaseq") ## retrieves amino acid sequences
## AAStringSet object of length 2:
## width seq names
## [1] 552 MSLSRSEEMHRLTENVYKTIMEQ...DLSAQGPEGREHGDGSARTLAGR hsa:10458 K05627 ...
## [2] 248 MLNGISNAASTLGRQLVGIASRV...SGLPPLAQALKDHLAAYEQSKKG ece:Z5100 K12786 ...
###or DNAStringSet objects if option is ntseq:
keggGet(c("hsa:10458", "ece:Z5100"), "ntseq") ## retrieves nucleotide sequences
## DNAStringSet object of length 2:
## width seq names
## [1] 1659 ATGTCTCTGTCTCGCTCAGAGGA...CCCGCACCCTGGCTGGAAGATGA hsa:10458 K05627 ...
## [2] 747 ATGCTTAATGGAATTAGTAACGC...ATGAGCAATCGAAGAAAGGGTAA ece:Z5100 K12786 ...
###keggGet() can also return images:
png <- keggGet("hsa05130", "image")
t <- tempfile()
library(png)
writePNG(png, t)
if (interactive()) browseURL(t)
###Search by keywords with keggFind()
##You can search for two separate keywords (“shiga” and “toxin” in this case):
head(keggFind("genes", c("shiga", "toxin")))
## ece:Z1464
## "stx2A; shiga-like toxin II A subunit encoded by bacteriophage BP-933W"
## ece:Z1465
## "stx2B; shiga-like toxin II B subunit encoded by bacteriophage BP-933W"
## ece:Z3343
## "stx1B; shiga-like toxin 1 subunit B encoded within prophage CP-933V"
## ece:Z3344
## "stx1A; shiga-like toxin 1 subunit A encoded within prophage CP-933V"
## ecs:ECs1205
## "Shiga toxin 2 subunit A"
## ecs:ECs1206
## "Shiga toxin 2 subunit B"
###Or search for the two words together:
head(keggFind("genes", "shiga toxin"))
## ece:Z1464
## "stx2A; shiga-like toxin II A subunit encoded by bacteriophage BP-933W"
## ece:Z1465
## "stx2B; shiga-like toxin II B subunit encoded by bacteriophage BP-933W"
## ece:Z3343
## "stx1B; shiga-like toxin 1 subunit B encoded within prophage CP-933V"
## ece:Z3344
## "stx1A; shiga-like toxin 1 subunit A encoded within prophage CP-933V"
## ecs:ECs1205
## "Shiga toxin 2 subunit A"
## ecs:ECs1206
## "Shiga toxin 2 subunit B"
###Search for a chemical formula:
head(keggFind("compound", "C7H10O5", "formula"))
## cpd:C00493 cpd:C04236 cpd:C16588 cpd:C17696 cpd:C18307 cpd:C18312
## "C7H10O5" "C7H10O5" "C7H10O5" "C7H10O5" "C7H10O5" "C7H10O5"
###Search for a chemical formula containing “O5” and “C7”:
head(keggFind("compound", "O5C7", "formula"))
## cpd:C00493 cpd:C00624 cpd:C01215 cpd:C01424 cpd:C02123 cpd:C02236
## "C7H10O5" "C7H11NO5" "C7H9NO5" "C7H6O5" "C7H12O5" "C7H6O5S"
###You can search for compounds with a particular exact mass:
keggFind("compound", 174.05, "exact_mass")
## cpd:C00493 cpd:C04236 cpd:C16588 cpd:C17696 cpd:C18307 cpd:C18312
## "174.052823" "174.052823" "174.052823" "174.052823" "174.052823" "174.052823"
## cpd:C21281
## "174.052823"
###Integer ranges can be used to find compounds by molecular weight:
head(keggFind("compound", 300:310, "mol_weight"))
## cpd:C00051 cpd:C00200 cpd:C00219 cpd:C00239 cpd:C00270 cpd:C00357
## "307.32348" "306.33696" "304.46688" "307.197122" "309.26986" "301.187702"
###Convert identifiers with keggConv()
###You can either specify fully qualified identifiers:
keggConv("ncbi-proteinid", c("hsa:10458", "ece:Z5100"))
## hsa:10458 ece:Z5100
## "ncbi-proteinid:NP_059345" "ncbi-proteinid:AAG58814"
###or get the mapping for an entire species:
head(keggConv("eco", "ncbi-geneid"))
## ncbi-geneid:944742 ncbi-geneid:945803 ncbi-geneid:947498 ncbi-geneid:945198
## "eco:b0001" "eco:b0002" "eco:b0003" "eco:b0004"
## ncbi-geneid:944747 ncbi-geneid:944749
## "eco:b0005" "eco:b0006"
###Reversing the arguments does the opposite mapping:
head(keggConv("ncbi-geneid", "eco"))
## eco:b0001 eco:b0002 eco:b0003
## "ncbi-geneid:944742" "ncbi-geneid:945803" "ncbi-geneid:947498"
## eco:b0004 eco:b0005 eco:b0006
## "ncbi-geneid:945198" "ncbi-geneid:944747" "ncbi-geneid:944749"
###Link across databases with keggLink()
### for example get.pathways.by.genes(), can be replaced with the keggLink() function. Here we query all pathways for human:
head(keggLink("pathway", "hsa"))
## hsa:10327 hsa:124 hsa:125 hsa:126 hsa:127
## "path:hsa00010" "path:hsa00010" "path:hsa00010" "path:hsa00010" "path:hsa00010"
## hsa:128
## "path:hsa00010"
###but you can also specify one or more genes (from multiple species):
keggLink("pathway", c("hsa:10458", "ece:Z5100"))
## hsa:10458 hsa:10458 hsa:10458 hsa:10458 ece:Z5100
## "path:hsa04520" "path:hsa04810" "path:hsa05130" "path:hsa05135" "path:ece05130"
##ref https://www.bioconductor.org/packages/release/bioc/vignettes/KEGGREST/inst/doc/KEGGREST-vignette.html