# Package Install
if (!requireNamespace("BiocManager", quietly = TRUE)){
install.packages("BiocManager")
}
pkgs <- c("knitr", "rmdformats", "biomaRt", "AnnotationHub",
"meshr", "MeSHDbi", "readxl", "tagcloud", "RColorBrewer")
for(i in seq_along(pkgs)){
if (!requireNamespace(pkgs[i], quietly = TRUE)){
BiocManager::install(pkgs[i], suppressUpdates=TRUE, force=TRUE)
}
}
# Package Loading
library("knitr")
library("rmdformats")
library("biomaRt")
library("AnnotationHub")
library("meshr")
library("MeSHDbi")
library("readxl")
library("tagcloud")
library("RColorBrewer")
Medical Subject Headings, also known as MeSH, is a comprehensive collection of biological annotations annually updated by the National Library of Medicine. The collection includes more than 27,000 biological terms for many species, including crops and farm animals (Tsuyuzaki et al. 2015; Morota et al. 2015, 2016; Beissinger and Morota, 2017). The MeSH software packages (Tsuyuzaki et al. 2015) have been successfully applied to swine, cattle, horses, chicken, and turkey (Morota et al. 2015, 2016; Júnior et al. 2017; Abdalla et al. 2020). These studies demonstrated the utility of MeSH in improving the biological interpretation of gene sets and understanding the genetic architecture of complex traits in farm animals. The R code below shows how to perform MeSH enrichment analysis in Bioconductor 3.14 or later. The major change made in Bioconductor 3.14 is shown in Figure 1.
We first create a vector of background genes.
## Access to biomaRt
mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "btaurus_gene_ensembl",
host = "uswest.ensembl.org")
univ.geneID <- getBM(attributes = c("ensembl_gene_id", "entrezgene_id", "hgnc_symbol"),
mart = mart) # 27962
## Remove genes with no corresponding Entrez Gene ID
univ.geneID2 <- univ.geneID[!is.na(univ.geneID[, 2]), ] # 21122
## Remove duplicated Entrez Gene ID
univ.geneID3 <- univ.geneID2[!duplicated(univ.geneID2[, 2]), ] # 20946
Note that the above code can be easily switched to any species by changing the “btaurus_gene_ensembl” parameter in the useMart function.
This tutorial will use data from an RNA-seq study by Li et al. (2015). The authors reported a list of differentially expressed genes in the ruminal wall of grass-fed and grain-fed Angus cattle. Please download the file S3 Table from the journal website.
## Read data
my.geneID <- read_excel("pone.0116437.s004.xls", col_names = TRUE, sheet = 1, na = "NA") # 342
## Change the 1st column name to 'ensembl_gene_id'.
colnames(my.geneID)[1] <- "ensembl_gene_id"
## Merge two files
my.geneID2 <- merge(my.geneID, univ.geneID3, by = "ensembl_gene_id") # 307
## Remove duplicated Entrez Gene ID
my.geneID3 <- my.geneID2[!duplicated(my.geneID2$entrezgene), ] # 307
MeSH terms are designated by articles indexed in PubMed by the National Library of Medicine and can be directly mapped to genes . There are currently 19 categories.
We will pay particular attention to the following five MeSH categories. A (Anatomy), B (Organisms), C (Diseases), D (Chemicals and Drugs), G (Phenomena and Process).
The usage of the MeSH enrichment analysis framework has changed in Bioconductor 3.14. Species-specific annotation packages are no longer distributed. Instead, the MeSH annotation data are now stored in a cloud accessible via the AnnotationHub package. , This change ensures data reproducibility (e.g., the user can specify versions of the data, such as “v002”).
The first step is to download MeSH annotated genes for Bos taurus from the cloud. We will first download MeSH annotations using the AnnotationHub package, which returns the metadata of all the data stored in the AnnotationHub server.
ah <- AnnotationHub()
dbfile1 <- query(ah, c("MeSHDb", "MeSH.db", "v002"))[[1]]
MeSH.db <- MeSHDbi::MeSHDb(dbfile1)
dbfile2 <- query(ah, c("MeSHDb", "Bos taurus", "v002"))[[1]]
MeSH.Bta.eg.db <- MeSHDbi::MeSHDb(dbfile2) # Converting SQLite file into MESHDb object
MeSH.Bta.eg.db
[1] "##### class ####"
[1] "MeSHDb"
attr(,"package")
[1] "MeSHDbi"
[1] "##### connection #####"
<SQLiteConnection>
Path: /Users/Sabrina/Library/Caches/org.R-project.R/R/AnnotationHub/45e727a850fa_104593
Extensions: TRUE
[1] "##### sqlite file #####"
AH97847
"/Users/Sabrina/Library/Caches/org.R-project.R/R/AnnotationHub/45e727a850fa_104593"
Note that the above code can be easily switched to any species by changing the “Bos taurus” parameter in the query function.
MeSH enrichment analysis can be performed using the meshr package. Let’s perform MeSH enrichment analysis for Chemicals and Drugs category by setting category=“D”.
# Chemicals and Drugs
meshParams.D <- new("MeSHHyperGParams",
geneIds = my.geneID3[, 6],
universeGeneIds = univ.geneID3[, 2],
annotation = "MeSH.Bta.eg.db",# Match between MeSH IDs and Entrez Gene IDs
meshdb = "MeSH.db",
category = "D",
database = "gene2pubmed", # data from PubMed
pvalueCutoff = 0.5, pAdjust = "none")
meshR.D <- meshHyperGTest(meshParams.D)
head(summary(meshR.D)) # Check your results
MESHID Pvalue OddsRatio ExpCount Count Size
40144 D019765 3.118305e-06 Inf 0.04397021 3 3
40145 D019765 3.118305e-06 Inf 0.04397021 3 3
40146 D019765 3.118305e-06 Inf 0.04397021 3 3
40147 D019765 3.118305e-06 Inf 0.04397021 3 3
30966 D015815 5.163257e-06 12.01608 0.68886661 7 47
30967 D015815 5.163257e-06 12.01608 0.68886661 7 47
MESHTERM GENEID SOURCEID
40144 Chondroitin ABC Lyase 282654 11322944
40145 Chondroitin ABC Lyase 280760 11322944
40146 Chondroitin ABC Lyase 280733 12354766
40147 Chondroitin ABC Lyase 280760 12601001
30966 Cell Adhesion Molecules 505775 30385793
30967 Cell Adhesion Molecules 281941 3512556
The user can easily change a MeSH category by changing the category parameter. Let’s perform enrichment analysis for Diseases category by changing “D” for “C”.
meshParams.C <- new("MeSHHyperGParams", geneIds = my.geneID3[, 6], universeGeneIds = univ.geneID3[,
2], annotation = "MeSH.Bta.eg.db", meshdb = "MeSH.db", category = "C", database = "gene2pubmed",
pvalueCutoff = 0.5, pAdjust = "none")
meshR.C <- meshHyperGTest(meshParams.C)
head(summary(meshR.C)) # Check your results
MESHID Pvalue OddsRatio ExpCount Count Size
2358 D006566 9.26359e-05 20.94542 0.2491645 4 17
2359 D006566 9.26359e-05 20.94542 0.2491645 4 17
2360 D006566 9.26359e-05 20.94542 0.2491645 4 17
2361 D006566 9.26359e-05 20.94542 0.2491645 4 17
2362 D006566 9.26359e-05 20.94542 0.2491645 4 17
2363 D006566 9.26359e-05 20.94542 0.2491645 4 17
MESHTERM GENEID SOURCEID
2358 Herpesviridae Infections 281496 22823939
2359 Herpesviridae Infections 504519 24657787
2360 Herpesviridae Infections 513497 19540267
2361 Herpesviridae Infections 281871 22160940
2362 Herpesviridae Infections 511951 22542975
2363 Herpesviridae Infections 281535 27473967
MeSH enrichment analysis for Anatomy (category = “A”).
meshParams.A <- new("MeSHHyperGParams",
geneIds = my.geneID3[, 6],
universeGeneIds = univ.geneID3[, 2],
annotation = "MeSH.Bta.eg.db",
meshdb = "MeSH.db",
category = "A",
database = "gene2pubmed",
pvalueCutoff = 0.5, pAdjust = "none")
meshR.A <- meshHyperGTest(meshParams.A)
head(summary(meshR.A)) # Check your results
MESHID Pvalue OddsRatio ExpCount Count Size MESHTERM GENEID
17365 D014998 6.907907e-08 2.452987 239.3592 275 16331 Y Chromosome 522449
17366 D014998 6.907907e-08 2.452987 239.3592 275 16331 Y Chromosome 513057
17367 D014998 6.907907e-08 2.452987 239.3592 275 16331 Y Chromosome 509031
17368 D014998 6.907907e-08 2.452987 239.3592 275 16331 Y Chromosome 509022
17369 D014998 6.907907e-08 2.452987 239.3592 275 16331 Y Chromosome 533250
17370 D014998 6.907907e-08 2.452987 239.3592 275 16331 Y Chromosome 528429
SOURCEID
17365 19393038
17366 19393038
17367 19393038
17368 19393038
17369 19393038
17370 19393038
MeSH enrichment analysis for Phenomena and Processes (category = “G”).
meshParams.G <- new("MeSHHyperGParams", geneIds = my.geneID3[, 6], universeGeneIds = univ.geneID3[,
2], annotation = "MeSH.Bta.eg.db", meshdb = "MeSH.db", category = "G", database = "gene2pubmed",
pvalueCutoff = 0.5, pAdjust = "none")
meshR.G <- meshHyperGTest(meshParams.G)
head(summary(meshR.G)) # Check your results
MESHID Pvalue OddsRatio ExpCount Count Size MESHTERM GENEID
122349 D016678 3.675469e-08 2.550414 241.0887 277 16449 Genome 515523
122350 D016678 3.675469e-08 2.550414 241.0887 277 16449 Genome 103157237
122351 D016678 3.675469e-08 2.550414 241.0887 277 16449 Genome 103753117
122352 D016678 3.675469e-08 2.550414 241.0887 277 16449 Genome 540050
122353 D016678 3.675469e-08 2.550414 241.0887 277 16449 Genome 507332
122354 D016678 3.675469e-08 2.550414 241.0887 277 16449 Genome 104977453
SOURCEID
122349 19393038
122350 23851991
122351 23122059
122352 19393038
122353 19393038
122354 22100599
The tagcloud function from the tagcloud package creates a word cloud. In our case, the size of the MeSH terms is proportional to their extent of enrichment.
meshR <- meshR.D # Use your meshR.D results from the Mesh-ORA analysis
meshR.summary <- summary(meshR)[!duplicated(summary(meshR)[, 7]), ] # Extract unique elements
tags <- strmultline(meshR.summary$MESHTERM)
weights <- -log(meshR.summary$Pvalue)
or <- meshR.summary$OddsRatio
colors <- smoothPalette(or, max = 4)
tagcloud(tags, weights = weights, col = colors)
# Here we are using the RColorBrewer package to include colors in the word
# cloud.
colors <- smoothPalette(weights, pal = brewer.pal(11, "Spectral"))
tagcloud(tags, weights = weights, col = colors, order = "size")
Note: This is a suggested color palette. The user is free to choose any color palette. More options can be found here.
# Diseases
meshR <- meshR.C
meshR.summary <- summary(meshR)[!duplicated(summary(meshR)[, 7]), ]
tags <- strmultline(meshR.summary$MESHTERM)
weights <- -log(meshR.summary$Pvalue)
or <- meshR.summary$OddsRatio
colors <- smoothPalette(or, max = 4)
tagcloud(tags, weights = weights, col = colors)
colors <- smoothPalette(weights, pal = brewer.pal(11, "Spectral"))
tagcloud(tags, weights = weights, col = colors, order = "size")
# Anatomy
meshR <- meshR.A
meshR.summary <- summary(meshR)[!duplicated(summary(meshR)[, 7]), ]
tags <- strmultline(meshR.summary$MESHTERM)
weights <- -log(meshR.summary$Pvalue)
or <- meshR.summary$OddsRatio
colors <- smoothPalette(or, max = 4)
tagcloud(tags, weights = weights, col = colors)
colors <- smoothPalette(weights, pal = brewer.pal(11, "Spectral"))
tagcloud(tags, weights = weights, col = colors, order = "size")
# Phenomena and Processes
meshR <- meshR.G
meshR.summary <- summary(meshR)[!duplicated(summary(meshR)[, 7]), ]
tags <- strmultline(meshR.summary$MESHTERM)
weights <- -log(meshR.summary$Pvalue)
or <- meshR.summary$OddsRatio
colors <- smoothPalette(or, max = 4)
tagcloud(tags, weights = weights, col = colors)
colors <- smoothPalette(weights, pal = brewer.pal(11, "Spectral"))
tagcloud(tags, weights = weights, col = colors, order = "size")
MeSH annotations for other species can be easily downloaded by providing the species names. The following code downloads the MeSH annotation databases for Swine and Chicken.
dbfile3 <- query(ah, c("MeSHDb", "Sus scrofa", "v002"))[[1]]
MeSH.Ssc.eg.db <- MeSHDbi::MeSHDb(dbfile3)
MeSH.Ssc.eg.db
[1] "##### class ####"
[1] "MeSHDb"
attr(,"package")
[1] "MeSHDbi"
[1] "##### connection #####"
<SQLiteConnection>
Path: /Users/Sabrina/Library/Caches/org.R-project.R/R/AnnotationHub/45e715230222_104648
Extensions: TRUE
[1] "##### sqlite file #####"
AH97902
"/Users/Sabrina/Library/Caches/org.R-project.R/R/AnnotationHub/45e715230222_104648"
dbfile4 <- query(ah, c("MeSHDb", "Gallus gallus", "v002"))[[1]]
MeSH.Gga.eg.db <- MeSHDbi::MeSHDb(dbfile4)
MeSH.Gga.eg.db
[1] "##### class ####"
[1] "MeSHDb"
attr(,"package")
[1] "MeSHDbi"
[1] "##### connection #####"
<SQLiteConnection>
Path: /Users/Sabrina/Library/Caches/org.R-project.R/R/AnnotationHub/45e73fa47689_104619
Extensions: TRUE
[1] "##### sqlite file #####"
AH97873
"/Users/Sabrina/Library/Caches/org.R-project.R/R/AnnotationHub/45e73fa47689_104619"
The MeSH database is updated frequently to reflect the findings from newly published papers in animal genetics, and we provide the database biannually along with Bioconductor updates. MeSH enrichment analysis may help to provide new insights into the biological interpretation of genes influencing complex traits in livestock species. The above R code is also available at https://sabrinaam.github.io/.