1 knitr Options

knitr::opts_chunk$set(cache = TRUE)
knitr::opts_chunk$set(fig.align = "center")
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(results = "hold")

2 R Libraries

library(curatedMetagenomicData)
library(magrittr)
library(DT)
library(dplyr)
library(ggplot2)
library(tibble)

3 DT Options

options(DT.options = list(dom = 'Bfrtip', buttons = c('copy', 'csv', 'excel')))

4 Combined Metadata

4.1 Data Structure

combined_metadata[1:1000, 1:4] %>%
  datatable(rownames = FALSE, extensions = 'Buttons')

4.2 Metadata Variables

combined_metadata %>%
  colnames()
##  [1] "dataset_name"                      
##  [2] "sampleID"                          
##  [3] "subjectID"                         
##  [4] "body_site"                         
##  [5] "antibiotics_current_use"           
##  [6] "study_condition"                   
##  [7] "disease"                           
##  [8] "age"                               
##  [9] "infant_age"                        
## [10] "age_category"                      
## [11] "gender"                            
## [12] "country"                           
## [13] "non_westernized"                   
## [14] "sequencing_platform"               
## [15] "DNA_extraction_kit"                
## [16] "PMID"                              
## [17] "number_reads"                      
## [18] "number_bases"                      
## [19] "minimum_read_length"               
## [20] "median_read_length"                
## [21] "pregnant"                          
## [22] "lactating"                         
## [23] "NCBI_accession"                    
## [24] "BMI"                               
## [25] "antibiotics_family"                
## [26] "momeducat"                         
## [27] "alcohol"                           
## [28] "flg-genotype"                      
## [29] "disease_subtype"                   
## [30] "hdl"                               
## [31] "triglycerides"                     
## [32] "hba1c"                             
## [33] "ldl"                               
## [34] "tnm"                               
## [35] "body_subsite"                      
## [36] "visit_number"                      
## [37] "days_from_first_collection"        
## [38] "c-peptide"                         
## [39] "family"                            
## [40] "cholesterol"                       
## [41] "glucose"                           
## [42] "mumps"                             
## [43] "adiponectin"                       
## [44] "insulin(cat)"                      
## [45] "fgf-19"                            
## [46] "hscrp"                             
## [47] "leptin"                            
## [48] "glutamate_decarboxylase_2_antibody"
## [49] "creatinine"                        
## [50] "il-1"                              
## [51] "cd163"                             
## [52] "glp-1"                             
## [53] "hitchip_probe_class"               
## [54] "hitchip_probe_number"              
## [55] "protein_intake"                    
## [56] "days_after_onset"                  
## [57] "stec_count"                        
## [58] "shigatoxin_2_elisa"                
## [59] "stool_texture"                     
## [60] "ferm_milk_prod_consumer"           
## [61] "mgs_richness"                      
## [62] "location"                          
## [63] "dyastolic_p"                       
## [64] "systolic_p"                        
## [65] "prothrombin_time"                  
## [66] "creatine"                          
## [67] "inr"                               
## [68] "ctp"                               
## [69] "albumine"                          
## [70] "bilubirin"                         
## [71] "smoker"                            
## [72] "ever_smoker"                       
## [73] "birth_control_pil"                 
## [74] "hla_drb12"                         
## [75] "hla_dqa12"                         
## [76] "hla_dqa11"                         
## [77] "hla_drb11"                         
## [78] "start_solidfood"                   
## [79] "ajcc"                              
## [80] "fobt"

5 Mega-Analysis

5.1 select Function

not_all_na <- function(x) {
    all(!is.na(x))
}

5.2 BMI Metadata

bmi_metadata <- 
  combined_metadata %>%
  filter(!is.na(BMI)) %>%
  filter(body_site == "stool") %>%
  select_if(not_all_na)

bmi_metadata %>%
  select(dataset_name, sampleID, BMI) %>%
  datatable(rownames = FALSE)

5.3 BMI Distribution

bmi_metadata %>%
  ggplot(aes(BMI)) +
  geom_density() +
  theme_minimal()

5.4 Download Data

bmi_datasets <-
  unique(bmi_metadata$dataset_name) %>%
  paste0(".metaphlan_bugs_list.stool") %>%
  curatedMetagenomicData(dryrun = FALSE)

bmi_datasets[1]
bmi_datasets[[1]]
## List of length 1
## names(1): FengQ_2015.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1547 features, 154 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: SID31766 SID31537 ... SID531663 (154 total)
##   varLabels: subjectID body_site ... NCBI_accession (23 total)
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
##   pubMedIds: 25758642 
## Annotation:

5.5 Specify Clades

beaumont_taxa <- c("k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales",
                   "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae",
                   "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae",
                   "k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Ruminococcus")

5.6 data.frame from ExpressionSet Objects

eset_df <- function(eset_obj, eset_name) {
    eset_name <- strsplit(eset_name, "\\.")[[1]][1]
    
    exprs_df <-
        eset_obj[featureNames(eset_obj) %in% beaumont_taxa, ] %>%
        exprs() %>%
        t() %>%
        data.frame() %>%
        rownames_to_column(var = "SampleID")
    
    bmi_vec <- eset_obj$BMI
    
    data.frame(eset_name, exprs_df, bmi_vec)
}

5.7 Make a Meta data.frame

dataset_names <- names(bmi_datasets)

meta_df <-
    mapply(eset_df, bmi_datasets, dataset_names, SIMPLIFY = FALSE) %>%
    Reduce(rbind, .) %>%
    set_colnames(c("Study", "SampleID", "o__Clostridiales", "f__Lachnospiraceae", "f__Ruminococcaceae", "g__Ruminococcus", "BMI")) %>%
    filter(!is.na(BMI))

meta_df %>%
  datatable(rownames = FALSE)

5.8 Function to Extract Linear Models

model_df <- function(lm_obj, model_name) {
  estimate <- 
    coef(lm_obj) %>%
    data.frame() %>%
    extract(2, )
  
  lower_ci <- 
    confint(lm_obj) %>%
    data.frame() %>%
    extract(2, 1)
  
  upper_ci <- 
    confint(lm_obj) %>%
    data.frame() %>%
    extract(2, 2)
  
  data.frame(model_name, estimate, lower_ci, upper_ci)
}

5.9 Linear Models

o__Clostridiales <- 
  meta_df %$%
  lm(BMI ~ o__Clostridiales) %>%
  model_df("o__Clostridiales")

f__Lachnospiraceae <- 
  meta_df %$%
  lm(BMI ~ f__Lachnospiraceae) %>%
  model_df("f__Lachnospiraceae")

f__Ruminococcaceae <- 
  meta_df %$%
  lm(BMI ~ f__Ruminococcaceae) %>%
  model_df("f__Ruminococcaceae")

g__Ruminococcus <- 
  meta_df %$%
  lm(BMI ~ g__Ruminococcus) %>%
  model_df("g__Ruminococcus")

box_plot_order <-
  c("o__Clostridiales", "f__Lachnospiraceae", "f__Ruminococcaceae",
    "g__Ruminococcus") %>%
  rev()

box_plot_df <-
  list(o__Clostridiales, f__Lachnospiraceae, f__Ruminococcaceae, 
       g__Ruminococcus) %>%
  Reduce(full_join, .) %>%
  mutate(model_name = ordered(model_name, levels = box_plot_order))

5.10 Box Plot

box_plot_df %>%
  ggplot(aes(model_name, estimate)) +
  geom_point() +
  geom_pointrange(aes(ymin = lower_ci, ymax = upper_ci)) +
  geom_hline(aes(yintercept = 0), color = "gray", linetype = "dashed") +
  coord_flip() +
  theme_minimal() +
  theme(axis.title.y = element_blank())

6 Meta-Analysis

o__Clostridiales <- 
  meta_df %$%
  lm(BMI ~ o__Clostridiales + Study) %>%
  model_df("o__Clostridiales")

f__Lachnospiraceae <- 
  meta_df %$%
  lm(BMI ~ f__Lachnospiraceae + Study) %>%
  model_df("f__Lachnospiraceae")

f__Ruminococcaceae <- 
  meta_df %$%
  lm(BMI ~ f__Ruminococcaceae + Study) %>%
  model_df("f__Ruminococcaceae")

g__Ruminococcus <- 
  meta_df %$%
  lm(BMI ~ g__Ruminococcus + Study) %>%
  model_df("g__Ruminococcus")

box_plot_df <-
  list(o__Clostridiales, f__Lachnospiraceae, f__Ruminococcaceae, 
       g__Ruminococcus) %>%
  Reduce(full_join, .) %>%
  mutate(model_name = ordered(model_name, levels = box_plot_order))

6.1 Box Plot

box_plot_df %>%
  ggplot(aes(model_name, estimate)) +
  geom_point() +
  geom_pointrange(aes(ymin = lower_ci, ymax = upper_ci)) +
  geom_hline(aes(yintercept = 0), color = "gray", linetype = "dashed") +
  coord_flip() +
  theme_minimal() +
  theme(axis.title.y = element_blank())