#Introduction to the msigdbr package
#Overview
#The msigdbr R package provides Molecular Signatures Database (MSigDB) gene sets for GSEA
# Load the package.
# The workspace is intentionally not cleared with rm(list = ls()): that call
# deletes the caller's objects when this script is sourced, yet it leaves
# attached packages and options in place, so it does not give a clean session.
# Restart R instead when a fresh session is needed.
library(msigdbr)
# Check the available species.
# NOTE(review): newer msigdbr releases may prefer msigdbr_species() -- confirm
# against the installed version before switching.
msigdbr_show_species()
## [1] "Bos taurus" "Caenorhabditis elegans"
## [3] "Canis lupus familiaris" "Danio rerio"
## [5] "Drosophila melanogaster" "Gallus gallus"
## [7] "Homo sapiens" "Mus musculus"
## [9] "Rattus norvegicus" "Saccharomyces cerevisiae"
## [11] "Sus scrofa"
# Retrieve human genes for all gene sets in the database.
# category and subcategory are passed as NULL, i.e. no collection filter is
# requested; the result is one row per (gene set, gene) pair.
m_df <- msigdbr(species = "Homo sapiens", category = NULL, subcategory = NULL)
# Number of rows and columns in the full table.
dim(m_df)
## [1] 3306263 9
# Preview the first rows.
head(m_df)
## # A tibble: 6 x 9
## gs_id gs_name gs_cat gs_subcat human_gene_symb~ species_name entrez_gene
## <chr> <chr> <chr> <chr> <chr> <chr> <int>
## 1 M126~ AAACCA~ C3 MIR:MIR_~ ABCC4 Homo sapiens 10257
## 2 M126~ AAACCA~ C3 MIR:MIR_~ ABRAXAS2 Homo sapiens 23172
## 3 M126~ AAACCA~ C3 MIR:MIR_~ ACTN4 Homo sapiens 81
## 4 M126~ AAACCA~ C3 MIR:MIR_~ ACVR1 Homo sapiens 90
## 5 M126~ AAACCA~ C3 MIR:MIR_~ ADAM9 Homo sapiens 8754
## 6 M126~ AAACCA~ C3 MIR:MIR_~ ADAMTS5 Homo sapiens 11096
## # ... with 2 more variables: gene_symbol <chr>, sources <chr>
# Distinct collection codes (gs_cat column) present in the table.
unique(m_df[["gs_cat"]])
## [1] "C3" "C2" "C6" "C4" "C1" "C5" "C7" "H"
# Distinct sub-collection labels (gs_subcat column) present in the table.
unique(m_df[["gs_subcat"]])
## [1] "MIR:MIR_Legacy" "TFT:TFT_Legacy" "CGP" "TFT:GTRD"
## [5] "" "CP:BIOCARTA" "CGN" "MF"
## [9] "BP" "CC" "CP:KEGG" "MIR:MIRDB"
## [13] "CM" "CP" "CP:PID" "CP:REACTOME"
# Transcription factor target (TFT) gene sets: keep rows whose sub-collection
# is either of the two TFT labels. %in% never returns NA (unlike `==`), so a
# missing gs_subcat cannot leak into the row index.
tf_1 <- m_df[m_df$gs_subcat %in% c("TFT:TFT_Legacy", "TFT:GTRD"), ]
dim(tf_1)
## [1] 437343 9
# TFT gene sets that contain CYP1A1. The symbol is matched exactly rather than
# with grep(), which treats the pattern as a regular expression and would also
# return any symbol that merely contains "CYP1A1".
tf_1[tf_1$gene_symbol %in% "CYP1A1", ]
## # A tibble: 19 x 9
## gs_id gs_name gs_cat gs_subcat human_gene_symb~ species_name entrez_gene
## <chr> <chr> <chr> <chr> <chr> <chr> <int>
## 1 M9986 AHR_01 C3 TFT:TFT_~ CYP1A1 Homo sapiens 1543
## 2 M173~ AHR_Q5 C3 TFT:TFT_~ CYP1A1 Homo sapiens 1543
## 3 M298~ AHRR_T~ C3 TFT:GTRD CYP1A1 Homo sapiens 1543
## 4 M5866 HNF4_0~ C3 TFT:TFT_~ CYP1A1 Homo sapiens 1543
## 5 M300~ KLF7_T~ C3 TFT:GTRD CYP1A1 Homo sapiens 1543
## 6 M4998 MYOD_Q~ C3 TFT:TFT_~ CYP1A1 Homo sapiens 1543
## 7 M182~ MZF1_02 C3 TFT:TFT_~ CYP1A1 Homo sapiens 1543
## 8 M8746 RGAANN~ C3 TFT:TFT_~ CYP1A1 Homo sapiens 1543
## 9 M301~ RYBP_T~ C3 TFT:GTRD CYP1A1 Homo sapiens 1543
## 10 M7521 TATAAA~ C3 TFT:TFT_~ CYP1A1 Homo sapiens 1543
## 11 M113~ TGACCT~ C3 TFT:TFT_~ CYP1A1 Homo sapiens 1543
## 12 M302~ ZBTB24~ C3 TFT:GTRD CYP1A1 Homo sapiens 1543
## 13 M302~ ZFP69B~ C3 TFT:GTRD CYP1A1 Homo sapiens 1543
## 14 M302~ ZFP91_~ C3 TFT:GTRD CYP1A1 Homo sapiens 1543
## 15 M302~ ZNF205~ C3 TFT:GTRD CYP1A1 Homo sapiens 1543
## 16 M303~ ZNF595~ C3 TFT:GTRD CYP1A1 Homo sapiens 1543
## 17 M303~ ZNF768~ C3 TFT:GTRD CYP1A1 Homo sapiens 1543
## 18 M303~ ZNF784~ C3 TFT:GTRD CYP1A1 Homo sapiens 1543
## 19 M304~ ZSCAN2~ C3 TFT:GTRD CYP1A1 Homo sapiens 1543
## # ... with 2 more variables: gene_symbol <chr>, sources <chr>
# microRNA target (MIR) gene sets: keep rows whose sub-collection is either of
# the two MIR labels. %in% never returns NA (unlike `==`), so a missing
# gs_subcat cannot leak into the row index.
mir <- m_df[m_df$gs_subcat %in% c("MIR:MIR_Legacy", "MIR:MIRDB"), ]
dim(mir)
## [1] 406373 9
# Preview the first rows.
head(mir)
## # A tibble: 6 x 9
## gs_id gs_name gs_cat gs_subcat human_gene_symb~ species_name entrez_gene
## <chr> <chr> <chr> <chr> <chr> <chr> <int>
## 1 M126~ AAACCA~ C3 MIR:MIR_~ ABCC4 Homo sapiens 10257
## 2 M126~ AAACCA~ C3 MIR:MIR_~ ABRAXAS2 Homo sapiens 23172
## 3 M126~ AAACCA~ C3 MIR:MIR_~ ACTN4 Homo sapiens 81
## 4 M126~ AAACCA~ C3 MIR:MIR_~ ACVR1 Homo sapiens 90
## 5 M126~ AAACCA~ C3 MIR:MIR_~ ADAM9 Homo sapiens 8754
## 6 M126~ AAACCA~ C3 MIR:MIR_~ ADAMTS5 Homo sapiens 11096
## # ... with 2 more variables: gene_symbol <chr>, sources <chr>
# MIR gene sets that contain ACHE. The symbol is matched exactly rather than
# with grep(), which would also return any symbol containing "ACHE".
mir[mir$gene_symbol %in% "ACHE", ]
## # A tibble: 9 x 9
## gs_id gs_name gs_cat gs_subcat human_gene_symb~ species_name entrez_gene
## <chr> <chr> <chr> <chr> <chr> <chr> <int>
## 1 M1622 AGCTCC~ C3 MIR:MIR_~ ACHE Homo sapiens 43
## 2 M131~ CCTGTG~ C3 MIR:MIR_~ ACHE Homo sapiens 43
## 3 M195~ CGCTGC~ C3 MIR:MIR_~ ACHE Homo sapiens 43
## 4 M308~ MIR125~ C3 MIR:MIRDB ACHE Homo sapiens 43
## 5 M308~ MIR125~ C3 MIR:MIRDB ACHE Homo sapiens 43
## 6 M309~ MIR194~ C3 MIR:MIRDB ACHE Homo sapiens 43
## 7 M327~ MIR3615 C3 MIR:MIRDB ACHE Homo sapiens 43
## 8 M306~ MIR4319 C3 MIR:MIRDB ACHE Homo sapiens 43
## 9 M317~ MIR7976 C3 MIR:MIRDB ACHE Homo sapiens 43
## # ... with 2 more variables: gene_symbol <chr>, sources <chr>
# All genes in the MIR608 gene set. The set name is matched exactly; a
# grep("MIR608", ...) search also matches any longer name that starts with or
# contains "MIR608" (e.g. a hypothetical "MIR6080").
# NOTE(review): the pasted output below came from the substring search, so its
# row count may differ from what this exact match returns -- re-run to confirm.
mir[mir$gs_name %in% "MIR608", ]
## # A tibble: 920 x 9
## gs_id gs_name gs_cat gs_subcat human_gene_symb~ species_name entrez_gene
## <chr> <chr> <chr> <chr> <chr> <chr> <int>
## 1 M309~ MIR608 C3 MIR:MIRDB A1CF Homo sapiens 29974
## 2 M309~ MIR608 C3 MIR:MIRDB AAK1 Homo sapiens 22848
## 3 M309~ MIR608 C3 MIR:MIRDB ABCC12 Homo sapiens 94160
## 4 M309~ MIR608 C3 MIR:MIRDB AC020929.1 Homo sapiens 57731
## 5 M309~ MIR608 C3 MIR:MIRDB ACTR1A Homo sapiens 10121
## 6 M309~ MIR608 C3 MIR:MIRDB AIPL1 Homo sapiens 23746
## 7 M309~ MIR608 C3 MIR:MIRDB AK2 Homo sapiens 204
## 8 M309~ MIR608 C3 MIR:MIRDB ANKRD52 Homo sapiens 283373
## 9 M309~ MIR608 C3 MIR:MIRDB ARRB1 Homo sapiens 408
## 10 M309~ MIR608 C3 MIR:MIRDB ASIC1 Homo sapiens 41
## # ... with 910 more rows, and 2 more variables: gene_symbol <chr>,
## # sources <chr>
###########
#category: the MSigDB collection abbreviation (H, C1, C2, C3, C4, C5, C6, C7).
#Sub-collections are listed indented under their parent collection.
#H: hallmark gene sets
#C1: positional gene sets
#C2: curated gene sets
#  CGP: chemical and genetic perturbations
#  CP: canonical pathways
#C3: regulatory target gene sets
#  MIR: microRNA targets
#  TFT: transcription factor targets
#C4: computational gene sets
#  CGN: cancer gene neighborhoods
#  CM: cancer modules
#C5: GO gene sets
#  BP: GO biological process
#  CC: GO cellular component
#  MF: GO molecular function
#C6: oncogenic signatures
#C7: immunologic signatures
#References
#https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp
#EXAMPLE
#Nusinow, David P., et al. "Quantitative proteomics of the cancer cell line encyclopedia." Cell 180.2 (2020): 387-402.
#The set we collectively refer to as "pathways" was a combination of the curated gene sets (c2) and hallmark (h) gene sets from MSigDB.
#GO annotations were also taken from MSigDB (c5) or Bioconductor's org.Hs.eg.db annotations package.
#Transcription factor binding targets were also from MSigDB (the c3 TFT set).
#REF
#https://cran.r-project.org/web/packages/msigdbr/vignettes/msigdbr-intro.html