#BiocManager::install("KEGGREST")
library(KEGGREST)
listDatabases()
##  [1] "pathway"  "brite"    "module"   "ko"       "genome"   "vg"      
##  [7] "ag"       "compound" "glycan"   "reaction" "rclass"   "enzyme"  
## [13] "disease"  "drug"     "dgroup"   "environ"  "genes"    "ligand"  
## [19] "kegg"
####You can obtain the list of organisms available in KEGG with the keggList() function:
org <- keggList("organism")
head(org)
##      T.number organism species                                              
## [1,] "T01001" "hsa"    "Homo sapiens (human)"                               
## [2,] "T01005" "ptr"    "Pan troglodytes (chimpanzee)"                       
## [3,] "T02283" "pps"    "Pan paniscus (bonobo)"                              
## [4,] "T02442" "ggo"    "Gorilla gorilla gorilla (western lowland gorilla)"  
## [5,] "T01416" "pon"    "Pongo abelii (Sumatran orangutan)"                  
## [6,] "T03265" "nle"    "Nomascus leucogenys (northern white-cheeked gibbon)"
##      phylogeny                               
## [1,] "Eukaryotes;Animals;Vertebrates;Mammals"
## [2,] "Eukaryotes;Animals;Vertebrates;Mammals"
## [3,] "Eukaryotes;Animals;Vertebrates;Mammals"
## [4,] "Eukaryotes;Animals;Vertebrates;Mammals"
## [5,] "Eukaryotes;Animals;Vertebrates;Mammals"
## [6,] "Eukaryotes;Animals;Vertebrates;Mammals"
###Therefore, the complete list of entities that can be queried with KEGGREST can be obtained as follows:
queryables <- c(listDatabases(), org[,1], org[,2])
class(queryables)
## [1] "character"
length(queryables)
## [1] 14167
queryables[1:10]
##  [1] "pathway"  "brite"    "module"   "ko"       "genome"   "vg"      
##  [7] "ag"       "compound" "glycan"   "reaction"
###You could also ask for every entry in the “hsa” (Homo sapiens) database as follows:
hsa <- keggList("hsa")
class(keggList("hsa"))
## [1] "character"
length(keggList("hsa"))
## [1] 22208
hsa[1:10]
##                                                                                    hsa:100526771 
##                                        "SMIM35, TMPRSS4-AS1; small integral membrane protein 35" 
##                                                                                    hsa:100302174 
##                                       "MIR1307, MIRN1307, hsa-mir-1307, mir-1307; microRNA 1307" 
##                                                                                    hsa:100423038 
##                                                              "MIR466, hsa-mir-466; microRNA 466" 
##                                                                                    hsa:100616498 
##                                                               "MIR378E, mir-378e; microRNA 378e" 
##                                                                                       hsa:442913 
##             "MIR376C, MIR368, MIRN368, MIRN376C, hsa-mir-368, miRNA368, mir-376c; microRNA 376c" 
##                                                                                    hsa:100302179 
## "MIR1270, MIR1270-1, MIR1270-2, MIRN1270, hsa-mir-1270, hsa-mir-1270-1, mir-1270; microRNA 1270" 
##                                                                                       hsa:664616 
##                                                     "MIR487B, MIRN487B, mir-487b; microRNA 487b" 
##                                                                                       hsa:643862 
##                      "SPDYE10P; speedy/RINGO cell cycle regulator family member E10, pseudogene" 
##                                                                                       hsa:574461 
##                                                               "MIR520E, MIRN520E; microRNA 520e" 
##                                                                                    hsa:100616159 
##                                                                         "MIR4779; microRNA 4779"
hsa_1 <- data.frame(hsa)
dim(hsa_1)
## [1] 22208     1
head(hsa_1)
##                                                                                                          hsa
## hsa:100526771                                        SMIM35, TMPRSS4-AS1; small integral membrane protein 35
## hsa:100302174                                       MIR1307, MIRN1307, hsa-mir-1307, mir-1307; microRNA 1307
## hsa:100423038                                                              MIR466, hsa-mir-466; microRNA 466
## hsa:100616498                                                               MIR378E, mir-378e; microRNA 378e
## hsa:442913                MIR376C, MIR368, MIRN368, MIRN376C, hsa-mir-368, miRNA368, mir-376c; microRNA 376c
## hsa:100302179 MIR1270, MIR1270-1, MIR1270-2, MIRN1270, hsa-mir-1270, hsa-mir-1270-1, mir-1270; microRNA 1270
#View(hsa_1)
###Get specific entries with keggGet()
#Once you have a list of specific KEGG identifiers, use keggGet() to get more information about them.
#Here we look up a human gene and an E. coli O157 gene:
query <- keggGet(c("hsa:10458", "ece:Z5100"))
#query
length(query)
## [1] 2
keggGet(c("hsa:43"))
## [[1]]
## [[1]]$ENTRY
##  CDS 
## "43" 
## 
## [[1]]$NAME
## [1] "ACHE, ACEE, ARACHE, N-ACHE, YT"
## 
## [[1]]$DEFINITION
## [1] "(RefSeq) acetylcholinesterase (Cartwright blood group)"
## 
## [[1]]$ORTHOLOGY
##                              K01049 
## "acetylcholinesterase [EC:3.1.1.7]" 
## 
## [[1]]$ORGANISM
##                    hsa 
## "Homo sapiens (human)" 
## 
## [[1]]$PATHWAY
##                         hsa00564                         hsa04725 
## "Glycerophospholipid metabolism"            "Cholinergic synapse" 
## 
## [[1]]$DRUG_TARGET
##  [1] "Acotiamide hydrochloride: D08838<JP>"                  
##  [2] "Ambenonium (DG00990): D01001<JP>"                      
##  [3] "Demecarium: D00667"                                    
##  [4] "Dimetacrine: D02565"                                   
##  [5] "Distigmine (DG00989): D01228<JP>"                      
##  [6] "Donepezil (DG00983): D00670<JP/US> D07869"             
##  [7] "Ecothiopate (DG01131): D02193<US>"                     
##  [8] "Edrophonium: D00994<JP>"                               
##  [9] "Fluostigmine: D00043"                                  
## [10] "Galantamine (DG00985): D02173<JP/US> D04292"           
## [11] "Gallamine: D02292"                                     
## [12] "Icopezil maleate: D03751"                              
## [13] "Ipidacrine: D09750"                                    
## [14] "Itopride (DG01395): D02729<JP> D08094"                 
## [15] "Ladostigil tartrate: D03239"                           
## [16] "Latrepirdine (DG01379): D09917 D09918"                 
## [17] "Neostigmine (DG00987): D00995<JP> D00998<JP/US> D08261"
## [18] "Obidoxime: D05215"                                     
## [19] "Paraoxon: D10529"                                      
## [20] "Physostigmine (DG01132): D00196 D02418 D03826"         
## [21] "Pralidoxime (DG01153): D00469<US> D01572<JP> D05590"   
## [22] "Pyridostigmine (DG00988): D00487<JP/US>"               
## [23] "Quilostigmine: D03823"                                 
## [24] "Rivastigmine (DG00984): D02558<US> D03822<JP/US>"      
## [25] "Suronacrine maleate: D05981"                           
## [26] "Tacrine (DG00982): D02068 D08555"                      
## [27] "Velnacrine maleate: D06288"                            
## [28] "Zanapezil fumarate: D09835"                            
## [29] "Zifrosilone: D06365"                                   
## 
## [[1]]$BRITE
##  [1] "KEGG Orthology (KO) [BR:hsa00001]"                                          
##  [2] " 09100 Metabolism"                                                          
##  [3] "  09103 Lipid metabolism"                                                   
##  [4] "   00564 Glycerophospholipid metabolism"                                    
##  [5] "    43 (ACHE)"                                                              
##  [6] " 09150 Organismal Systems"                                                  
##  [7] "  09156 Nervous system"                                                     
##  [8] "   04725 Cholinergic synapse"                                               
##  [9] "    43 (ACHE)"                                                              
## [10] " 09180 Brite Hierarchies"                                                   
## [11] "  09183 Protein families: signaling and cellular processes"                 
## [12] "   00537 Glycosylphosphatidylinositol (GPI)-anchored proteins [BR:hsa00537]"
## [13] "    43 (ACHE)"                                                              
## [14] "Enzymes [BR:hsa01000]"                                                      
## [15] " 3. Hydrolases"                                                             
## [16] "  3.1  Acting on ester bonds"                                               
## [17] "   3.1.1  Carboxylic-ester hydrolases"                                      
## [18] "    3.1.1.7  acetylcholinesterase"                                          
## [19] "     43 (ACHE)"                                                             
## [20] "Glycosylphosphatidylinositol (GPI)-anchored proteins [BR:hsa00537]"         
## [21] " Enzymes"                                                                   
## [22] "  43 (ACHE)"                                                                
## 
## [[1]]$POSITION
## [1] "7q22.1"
## 
## [[1]]$MOTIF
## [1] "Pfam: COesterase AChE_tetra Abhydrolase_3"
## 
## [[1]]$DBLINKS
## [1] "NCBI-GeneID: 43"           "NCBI-ProteinID: NP_000656"
## [3] "OMIM: 100740"              "HGNC: 108"                
## [5] "Ensembl: ENSG00000087085"  "Vega: OTTHUMG00000157033" 
## [7] "Pharos: P22303(Tclin)"     "UniProt: P22303"          
## 
## [[1]]$STRUCTURE
## [1] "PDB: 4M0E 6O69 6O5V 5HF5 4EY4 5HF9 5HFA 6CQZ 6CQT 6CQW 6O4X"
## [2] "     5HF6 4M0F 4EY5 6CQU 6O4W 6O50 4EY7 1VZJ 6CQX 5FPQ 4EY6"
## [3] "     6NEA 6O66 6CQY 6CQV 4EY8 1B41 6O5R 6O5S 5HF8 4PQE 1F8U"
## [4] "     2X8B 6F25 4BDT 6O52 3LII"                              
## 
## [[1]]$AASEQ
## AAStringSet object of length 1:
##     width seq
## [1]   614 MRPPQCLLHTPSLASPLLLLLLWLLGGGVGAEGR...RQWKAEFHRWSSYMVHWKNQFDHYSKQDRCSDL
## 
## [[1]]$NTSEQ
## DNAStringSet object of length 1:
##     width seq
## [1]  1845 ATGAGGCCCCCGCAGTGTCTGCTGCACACGCCTT...TACAGCAAGCAGGATCGCTGCTCAGACCTGTGA
###Behind the scenes, KEGGREST downloaded and parsed a KEGG flat file, which you can now explore:
names(query[[1]])
##  [1] "ENTRY"      "NAME"       "DEFINITION" "ORTHOLOGY"  "ORGANISM"  
##  [6] "PATHWAY"    "NETWORK"    "BRITE"      "POSITION"   "MOTIF"     
## [11] "DBLINKS"    "STRUCTURE"  "AASEQ"      "NTSEQ"
query[[1]]$ENTRY
##     CDS 
## "10458"
query[[1]]$DBLINKS
## [1] "NCBI-GeneID: 10458"        "NCBI-ProteinID: NP_059345"
## [3] "OMIM: 605475"              "HGNC: 947"                
## [5] "Ensembl: ENSG00000175866"  "Vega: OTTHUMG00000177698" 
## [7] "Pharos: Q9UQB8(Tbio)"      "UniProt: Q9UQB8"
###keggGet() can also return amino acid sequences as AAStringSet objects (from the Biostrings package):
keggGet(c("hsa:10458", "ece:Z5100"), "aaseq") ## retrieves amino acid sequences
## AAStringSet object of length 2:
##     width seq                                               names               
## [1]   552 MSLSRSEEMHRLTENVYKTIMEQ...DLSAQGPEGREHGDGSARTLAGR hsa:10458 K05627 ...
## [2]   248 MLNGISNAASTLGRQLVGIASRV...SGLPPLAQALKDHLAAYEQSKKG ece:Z5100 K12786 ...
###or DNAStringSet objects if option is ntseq:
keggGet(c("hsa:10458", "ece:Z5100"), "ntseq") ## retrieves nucleotide sequences
## DNAStringSet object of length 2:
##     width seq                                               names               
## [1]  1659 ATGTCTCTGTCTCGCTCAGAGGA...CCCGCACCCTGGCTGGAAGATGA hsa:10458 K05627 ...
## [2]   747 ATGCTTAATGGAATTAGTAACGC...ATGAGCAATCGAAGAAAGGGTAA ece:Z5100 K12786 ...
###keggGet() can also return images:
png <- keggGet("hsa05130", "image") 
t <- tempfile()
library(png)
writePNG(png, t)
if (interactive()) browseURL(t)
###Search by keywords with keggFind()
##You can search for two separate keywords (“shiga” and “toxin” in this case):
head(keggFind("genes", c("shiga", "toxin")))
##                                                               ece:Z1464 
## "stx2A; shiga-like toxin II A subunit encoded by bacteriophage BP-933W" 
##                                                               ece:Z1465 
## "stx2B; shiga-like toxin II B subunit encoded by bacteriophage BP-933W" 
##                                                               ece:Z3343 
##   "stx1B; shiga-like toxin 1 subunit B encoded within prophage CP-933V" 
##                                                               ece:Z3344 
##   "stx1A; shiga-like toxin 1 subunit A encoded within prophage CP-933V" 
##                                                             ecs:ECs1205 
##                                               "Shiga toxin 2 subunit A" 
##                                                             ecs:ECs1206 
##                                               "Shiga toxin 2 subunit B"
###Or search for the two words together:
head(keggFind("genes", "shiga toxin"))
##                                                               ece:Z1464 
## "stx2A; shiga-like toxin II A subunit encoded by bacteriophage BP-933W" 
##                                                               ece:Z1465 
## "stx2B; shiga-like toxin II B subunit encoded by bacteriophage BP-933W" 
##                                                               ece:Z3343 
##   "stx1B; shiga-like toxin 1 subunit B encoded within prophage CP-933V" 
##                                                               ece:Z3344 
##   "stx1A; shiga-like toxin 1 subunit A encoded within prophage CP-933V" 
##                                                             ecs:ECs1205 
##                                               "Shiga toxin 2 subunit A" 
##                                                             ecs:ECs1206 
##                                               "Shiga toxin 2 subunit B"
###Search for a chemical formula:
head(keggFind("compound", "C7H10O5", "formula"))
## cpd:C00493 cpd:C04236 cpd:C16588 cpd:C17696 cpd:C18307 cpd:C18312 
##  "C7H10O5"  "C7H10O5"  "C7H10O5"  "C7H10O5"  "C7H10O5"  "C7H10O5"
###Search for a chemical formula containing “O5” and “C7”:
head(keggFind("compound", "O5C7", "formula"))
## cpd:C00493 cpd:C00624 cpd:C01215 cpd:C01424 cpd:C02123 cpd:C02236 
##  "C7H10O5" "C7H11NO5"  "C7H9NO5"   "C7H6O5"  "C7H12O5"  "C7H6O5S"
###You can search for compounds with a particular exact mass:
keggFind("compound", 174.05, "exact_mass")
##   cpd:C00493   cpd:C04236   cpd:C16588   cpd:C17696   cpd:C18307   cpd:C18312 
## "174.052823" "174.052823" "174.052823" "174.052823" "174.052823" "174.052823" 
##   cpd:C21281 
## "174.052823"
###Integer ranges can be used to find compounds by molecular weight:
head(keggFind("compound", 300:310, "mol_weight"))
##   cpd:C00051   cpd:C00200   cpd:C00219   cpd:C00239   cpd:C00270   cpd:C00357 
##  "307.32348"  "306.33696"  "304.46688" "307.197122"  "309.26986" "301.187702"
###Convert identifiers with keggConv()
###You can either specify fully qualified identifiers:
keggConv("ncbi-proteinid", c("hsa:10458", "ece:Z5100"))
##                  hsa:10458                  ece:Z5100 
## "ncbi-proteinid:NP_059345"  "ncbi-proteinid:AAG58814"
###or get the mapping for an entire species:
head(keggConv("eco", "ncbi-geneid"))
## ncbi-geneid:944742 ncbi-geneid:945803 ncbi-geneid:947498 ncbi-geneid:945198 
##        "eco:b0001"        "eco:b0002"        "eco:b0003"        "eco:b0004" 
## ncbi-geneid:944747 ncbi-geneid:944749 
##        "eco:b0005"        "eco:b0006"
###Reversing the arguments does the opposite mapping:
head(keggConv("ncbi-geneid", "eco"))
##            eco:b0001            eco:b0002            eco:b0003 
## "ncbi-geneid:944742" "ncbi-geneid:945803" "ncbi-geneid:947498" 
##            eco:b0004            eco:b0005            eco:b0006 
## "ncbi-geneid:945198" "ncbi-geneid:944747" "ncbi-geneid:944749"
###Link across databases with keggLink()
### for example get.pathways.by.genes(), can be replaced with the keggLink() function. Here we query all pathways for human:
head(keggLink("pathway", "hsa"))
##       hsa:10327         hsa:124         hsa:125         hsa:126         hsa:127 
## "path:hsa00010" "path:hsa00010" "path:hsa00010" "path:hsa00010" "path:hsa00010" 
##         hsa:128 
## "path:hsa00010"
###but you can also specify one or more genes (from multiple species):
keggLink("pathway", c("hsa:10458", "ece:Z5100"))
##       hsa:10458       hsa:10458       hsa:10458       hsa:10458       ece:Z5100 
## "path:hsa04520" "path:hsa04810" "path:hsa05130" "path:hsa05135" "path:ece05130"
##ref https://www.bioconductor.org/packages/release/bioc/vignettes/KEGGREST/inst/doc/KEGGREST-vignette.html