#Introduction to the msigdbr package
#Overview
#The msigdbr R package provides Molecular Signatures Database (MSigDB) gene sets for GSEA
# NOTE: the workspace is deliberately not cleared with rm(list = ls()).
# Wiping the caller's global environment is a destructive side effect, and
# nothing below depends on it; restart R if a clean session is wanted.
# Load the package that ships the MSigDB gene sets.
library(msigdbr)
# Check the available species.
msigdbr_show_species()
##  [1] "Bos taurus"               "Caenorhabditis elegans"  
##  [3] "Canis lupus familiaris"   "Danio rerio"             
##  [5] "Drosophila melanogaster"  "Gallus gallus"           
##  [7] "Homo sapiens"             "Mus musculus"            
##  [9] "Rattus norvegicus"        "Saccharomyces cerevisiae"
## [11] "Sus scrofa"
# Retrieve human genes for all gene sets in the database.
# No collection filter is passed: the filter arguments default to NULL, which
# returns every collection. Leaving them unset (rather than passing NULL
# explicitly) behaves the same and avoids naming arguments that recent
# msigdbr releases have renamed -- NOTE(review): confirm against the
# installed version if a filter is added later.
m_df <- msigdbr(species = "Homo sapiens")
# Number of rows (one per gene / gene-set pair) and columns.
dim(m_df)
## [1] 3306263       9
# Preview the first rows.
head(m_df)
## # A tibble: 6 x 9
##   gs_id gs_name gs_cat gs_subcat human_gene_symb~ species_name entrez_gene
##   <chr> <chr>   <chr>  <chr>     <chr>            <chr>              <int>
## 1 M126~ AAACCA~ C3     MIR:MIR_~ ABCC4            Homo sapiens       10257
## 2 M126~ AAACCA~ C3     MIR:MIR_~ ABRAXAS2         Homo sapiens       23172
## 3 M126~ AAACCA~ C3     MIR:MIR_~ ACTN4            Homo sapiens          81
## 4 M126~ AAACCA~ C3     MIR:MIR_~ ACVR1            Homo sapiens          90
## 5 M126~ AAACCA~ C3     MIR:MIR_~ ADAM9            Homo sapiens        8754
## 6 M126~ AAACCA~ C3     MIR:MIR_~ ADAMTS5          Homo sapiens       11096
## # ... with 2 more variables: gene_symbol <chr>, sources <chr>
# Distinct collection codes present in the table.
unique(m_df[["gs_cat"]])
## [1] "C3" "C2" "C6" "C4" "C1" "C5" "C7" "H"
# Distinct sub-collection labels; the empty string in the output below
# presumably marks gene sets that have no sub-collection.
unique(m_df[["gs_subcat"]])
##  [1] "MIR:MIR_Legacy" "TFT:TFT_Legacy" "CGP"            "TFT:GTRD"      
##  [5] ""               "CP:BIOCARTA"    "CGN"            "MF"            
##  [9] "BP"             "CC"             "CP:KEGG"        "MIR:MIRDB"     
## [13] "CM"             "CP"             "CP:PID"         "CP:REACTOME"
# Keep only the transcription-factor-target (TFT) gene sets.
# `%in%` replaces the chained `==` tests: it never returns NA, so a missing
# gs_subcat cannot leak into the result as an all-NA row.
tf_1 <- m_df[m_df$gs_subcat %in% c("TFT:TFT_Legacy", "TFT:GTRD"), ]
dim(tf_1)
## [1] 437343      9
# List every TFT gene set that contains CYP1A1.
# The symbol is matched exactly; grep() performs an unanchored regex search
# and would also return any gene whose symbol merely contains "CYP1A1".
tf_1[tf_1$gene_symbol %in% "CYP1A1", ]
## # A tibble: 19 x 9
##    gs_id gs_name gs_cat gs_subcat human_gene_symb~ species_name entrez_gene
##    <chr> <chr>   <chr>  <chr>     <chr>            <chr>              <int>
##  1 M9986 AHR_01  C3     TFT:TFT_~ CYP1A1           Homo sapiens        1543
##  2 M173~ AHR_Q5  C3     TFT:TFT_~ CYP1A1           Homo sapiens        1543
##  3 M298~ AHRR_T~ C3     TFT:GTRD  CYP1A1           Homo sapiens        1543
##  4 M5866 HNF4_0~ C3     TFT:TFT_~ CYP1A1           Homo sapiens        1543
##  5 M300~ KLF7_T~ C3     TFT:GTRD  CYP1A1           Homo sapiens        1543
##  6 M4998 MYOD_Q~ C3     TFT:TFT_~ CYP1A1           Homo sapiens        1543
##  7 M182~ MZF1_02 C3     TFT:TFT_~ CYP1A1           Homo sapiens        1543
##  8 M8746 RGAANN~ C3     TFT:TFT_~ CYP1A1           Homo sapiens        1543
##  9 M301~ RYBP_T~ C3     TFT:GTRD  CYP1A1           Homo sapiens        1543
## 10 M7521 TATAAA~ C3     TFT:TFT_~ CYP1A1           Homo sapiens        1543
## 11 M113~ TGACCT~ C3     TFT:TFT_~ CYP1A1           Homo sapiens        1543
## 12 M302~ ZBTB24~ C3     TFT:GTRD  CYP1A1           Homo sapiens        1543
## 13 M302~ ZFP69B~ C3     TFT:GTRD  CYP1A1           Homo sapiens        1543
## 14 M302~ ZFP91_~ C3     TFT:GTRD  CYP1A1           Homo sapiens        1543
## 15 M302~ ZNF205~ C3     TFT:GTRD  CYP1A1           Homo sapiens        1543
## 16 M303~ ZNF595~ C3     TFT:GTRD  CYP1A1           Homo sapiens        1543
## 17 M303~ ZNF768~ C3     TFT:GTRD  CYP1A1           Homo sapiens        1543
## 18 M303~ ZNF784~ C3     TFT:GTRD  CYP1A1           Homo sapiens        1543
## 19 M304~ ZSCAN2~ C3     TFT:GTRD  CYP1A1           Homo sapiens        1543
## # ... with 2 more variables: gene_symbol <chr>, sources <chr>
# Keep only the microRNA-target (MIR) gene sets.
# `%in%` replaces the chained `==` tests: it never returns NA, so a missing
# gs_subcat cannot leak into the result as an all-NA row.
mir <- m_df[m_df$gs_subcat %in% c("MIR:MIR_Legacy", "MIR:MIRDB"), ]
dim(mir)
## [1] 406373      9
# Preview the first rows of the microRNA subset.
head(mir)
## # A tibble: 6 x 9
##   gs_id gs_name gs_cat gs_subcat human_gene_symb~ species_name entrez_gene
##   <chr> <chr>   <chr>  <chr>     <chr>            <chr>              <int>
## 1 M126~ AAACCA~ C3     MIR:MIR_~ ABCC4            Homo sapiens       10257
## 2 M126~ AAACCA~ C3     MIR:MIR_~ ABRAXAS2         Homo sapiens       23172
## 3 M126~ AAACCA~ C3     MIR:MIR_~ ACTN4            Homo sapiens          81
## 4 M126~ AAACCA~ C3     MIR:MIR_~ ACVR1            Homo sapiens          90
## 5 M126~ AAACCA~ C3     MIR:MIR_~ ADAM9            Homo sapiens        8754
## 6 M126~ AAACCA~ C3     MIR:MIR_~ ADAMTS5          Homo sapiens       11096
## # ... with 2 more variables: gene_symbol <chr>, sources <chr>
# List every microRNA gene set that contains ACHE.
# The symbol is matched exactly; grep() performs an unanchored regex search
# and would also return any gene whose symbol merely contains "ACHE".
mir[mir$gene_symbol %in% "ACHE", ]
## # A tibble: 9 x 9
##   gs_id gs_name gs_cat gs_subcat human_gene_symb~ species_name entrez_gene
##   <chr> <chr>   <chr>  <chr>     <chr>            <chr>              <int>
## 1 M1622 AGCTCC~ C3     MIR:MIR_~ ACHE             Homo sapiens          43
## 2 M131~ CCTGTG~ C3     MIR:MIR_~ ACHE             Homo sapiens          43
## 3 M195~ CGCTGC~ C3     MIR:MIR_~ ACHE             Homo sapiens          43
## 4 M308~ MIR125~ C3     MIR:MIRDB ACHE             Homo sapiens          43
## 5 M308~ MIR125~ C3     MIR:MIRDB ACHE             Homo sapiens          43
## 6 M309~ MIR194~ C3     MIR:MIRDB ACHE             Homo sapiens          43
## 7 M327~ MIR3615 C3     MIR:MIRDB ACHE             Homo sapiens          43
## 8 M306~ MIR4319 C3     MIR:MIRDB ACHE             Homo sapiens          43
## 9 M317~ MIR7976 C3     MIR:MIRDB ACHE             Homo sapiens          43
## # ... with 2 more variables: gene_symbol <chr>, sources <chr>
# Show the rows of every microRNA gene set whose name contains "MIR608".
# The pattern is an unanchored regular expression, so any longer set name
# that includes this text is matched as well.
mir[grepl("MIR608", mir$gs_name), ]
## # A tibble: 920 x 9
##    gs_id gs_name gs_cat gs_subcat human_gene_symb~ species_name entrez_gene
##    <chr> <chr>   <chr>  <chr>     <chr>            <chr>              <int>
##  1 M309~ MIR608  C3     MIR:MIRDB A1CF             Homo sapiens       29974
##  2 M309~ MIR608  C3     MIR:MIRDB AAK1             Homo sapiens       22848
##  3 M309~ MIR608  C3     MIR:MIRDB ABCC12           Homo sapiens       94160
##  4 M309~ MIR608  C3     MIR:MIRDB AC020929.1       Homo sapiens       57731
##  5 M309~ MIR608  C3     MIR:MIRDB ACTR1A           Homo sapiens       10121
##  6 M309~ MIR608  C3     MIR:MIRDB AIPL1            Homo sapiens       23746
##  7 M309~ MIR608  C3     MIR:MIRDB AK2              Homo sapiens         204
##  8 M309~ MIR608  C3     MIR:MIRDB ANKRD52          Homo sapiens      283373
##  9 M309~ MIR608  C3     MIR:MIRDB ARRB1            Homo sapiens         408
## 10 M309~ MIR608  C3     MIR:MIRDB ASIC1            Homo sapiens          41
## # ... with 910 more rows, and 2 more variables: gene_symbol <chr>,
## #   sources <chr>
###########
#category: MSigDB collection abbreviation, such as H, C1, C2, C3, C4, C5, C6, C7.
#(Indented entries are sub-collections of the collection listed above them.)
#H: hallmark gene sets
#C1: positional gene sets
#C2: curated gene sets
#  CGP: chemical and genetic perturbations
#  CP: canonical pathways
#C3: regulatory target gene sets
#  MIR: microRNA targets
#  TFT: transcription factor targets
#C4: computational gene sets
#  CGN: cancer gene neighborhoods
#  CM: cancer modules
#C5: GO gene sets
#  BP: GO biological process
#  CC: GO cellular component
#  MF: GO molecular function
#C6: oncogenic signatures
#C7: immunologic signatures
#References
#https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp

#EXAMPLE
#Nusinow, David P., et al. "Quantitative proteomics of the cancer cell line encyclopedia." Cell 180.2 (2020): 387-402.
#The set we collectively refer to as "pathways" was a combination of the curated gene sets (c2) and hallmark (h) gene sets from MSigDB.
#GO annotations were also taken from MSigDB (c5) or BioConductor’s org.Hs.eg.db annotations package. 
#Transcription factor binding targets were also from MSigDB (the c3 TFT set). 

#REF
#https://cran.r-project.org/web/packages/msigdbr/vignettes/msigdbr-intro.html