This protocol is borrowed from the Metacoder documentatiion available here.

The protocol has been suitably modified to run on taxonomic abundance data from the TS Dentistry metagnomic samples. The taxonomic abundance data was generated (and filtered to include taxa which contribute > 0.1 percent of total counts) by in-house python scripts.

Load the metacoder package and read in the taxonomy abundance data and sample data

library(metacoder)

## Loading required package: taxa

## This is metacoder verison 0.3.4.9002 (development version)

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.0     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
## x ggplot2::map_data() masks taxa::map_data()

library(taxa)

data = read.csv("test-filt.txt", header=T, sep="\t")
data = as_tibble(data)
data$newID = as.character(data$newID)
data$index = as.character(data$index)
print(data)

## # A tibble: 78 x 12
##    newID  index       X31D  X32Da  X34D X35Da  X36Da    C16    C18    C19    C22
##    <chr>  <chr>      <dbl>  <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
##  1 0_322… p__Actin… 3.03e5 9.57e5  7972 16472 3.48e6 126105 3.18e5 2.80e2  53454
##  2 1_207… p__Chord… 1.59e6 1.13e6  8745  6251 1.66e6 980275 2.51e6 1.31e6 857087
##  3 2_9597 p__Chord… 2.19e4 1.79e4   112   202 2.39e4  18539 4.82e4 1.16e4  21330
##  4 4_1301 p__Firmi… 3.41e5 2.13e6  4812 47523 4.84e6 539271 4.56e5 3.12e3 202700
##  5 5_314… p__Chord… 1.06e6 5.76e5  5828  6706 8.75e5 741295 1.52e6 6.79e5 565341
##  6 6_437… p__Actin… 3.71e5 7.38e4   536   376 4.60e5   4342 3.10e1 5.50e1   6633
##  7 7_281… p__Bacte… 2.37e4 5.43e4   143   121 3.95e4   5473 1.35e2 6.00e0  20843
##  8 8_1654 p__Actin… 1.59e5 3.83e6   443   727 1.57e5 274778 8.00e3 7.26e2  12220
##  9 14_11… p__Actin… 2.05e5 7.25e3   250    41 2.44e4  12392 9.00e0 4.50e1   8423
## 10 16_29… p__Firmi… 4.77e5 1.40e6  1670  3028 7.72e5 375318 6.45e4 4.88e2  70410
## # … with 68 more rows, and 1 more variable: C23 <dbl>

samples = read.csv("test-samples.txt", header=T, sep="\t")
samples = as_tibble(samples)
samples$sample_id = as.character(samples$sample_id)
samples$type = as.character(samples$type)
print(samples)

## # A tibble: 10 x 2
##    sample_id type 
##    <chr>     <chr>
##  1 X31D      A    
##  2 X32Da     A    
##  3 X34D      A    
##  4 X35Da     A    
##  5 X36Da     A    
##  6 C16       B    
##  7 C18       B    
##  8 C19       B    
##  9 C22       B    
## 10 C23       B

Create a taxonomy object from the taxonomy abundance data and calculate relative abundance for various taxonomic groups at different levels of taxonomy

obj <- parse_tax_data(data,class_cols = "index",class_sep = ";", class_regex = "^(.+)__(.+)$", class_key = c(tax_rank = "info",tax_name = "taxon_name"))

print(obj)

## <Taxmap>
##   191 taxa: ab. Actinobacteria ... hj. Neisseria sp. HMSC064F04
##   191 edges: NA->ab, NA->ac, NA->ad ... cz->hh, dv->hi, dv->hj
##   2 data sets:
##     tax_data:
##       # A tibble: 78 x 13
##         taxon_id newID  index    X31D  X32Da  X34D X35Da  X36Da    C16
##         <chr>    <chr>  <chr>   <dbl>  <dbl> <dbl> <dbl>  <dbl>  <dbl>
##       1 ek       0_322… p__Ac… 3.03e5 9.57e5  7972 16472 3.48e6 126105
##       2 el       1_207… p__Ch… 1.59e6 1.13e6  8745  6251 1.66e6 980275
##       3 em       2_9597 p__Ch… 2.19e4 1.79e4   112   202 2.39e4  18539
##       # … with 75 more rows, and 4 more variables: C18 <dbl>,
##       #   C19 <dbl>, C22 <dbl>, C23 <dbl>
##     class_data:
##       # A tibble: 468 x 5
##         taxon_id input_index tax_rank tax_name       regex_match      
##         <chr>          <int> <chr>    <chr>          <chr>            
##       1 ab                 1 p        Actinobacteria p__Actinobacteria
##       2 ai                 1 c        Actinobacteria c__Actinobacteria
##       3 ax                 1 o        Micrococcales  o__Micrococcales 
##       # … with 465 more rows
##   0 functions:

#Filter loww counts
obj$data$tax_data <- zero_low_counts(obj, dataset = "tax_data", min_count = 3)

## Warning: Use of "dataset" is depreciated. Use "data" instead.

## No `cols` specified, so using all numeric columns:
##    X31D, X32Da, X34D, X35Da, X36Da, C16, C18, C19, C22, C23

## Zeroing 8 of 780 counts less than 3.

#Convert counts to proportions
obj$data$tax_data <- calc_obs_props(obj, "tax_data")

## No `cols` specified, so using all numeric columns:
##    X31D, X32Da, X34D, X35Da, X36Da, C16, C18, C19, C22, C23

## Calculating proportions from counts for 10 columns for 78 observations.

#Calculate per Taxon abundance
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data",cols = samples$sample_id)

## Summing per-taxon counts from 10 columns for 191 taxa

obj$data$tax_occ <- calc_n_samples(obj, "tax_abund", groups = samples$type, cols = samples$sample_id)

## Calculating number of samples with a value greater than 0 for 10 columns in 2 groups for 191 observations

Generate a heat tree for the taxonomic classes in the samples data

# heat tree for taxa in samples of type A
heat_tree(obj,node_label = taxon_names,node_size = n_obs,node_color = A,node_size_axis_label = "OTU count",node_color_axis_label = "Samples with reads",layout = "davidson-harel",initial_layout = "reingold-tilford")

## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.

## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.

# heat tree for taxa in samples of type B
heat_tree(obj,node_label = taxon_names,node_size = n_obs,node_color = B,node_size_axis_label = "OTU count",node_color_axis_label = "Samples with reads",layout = "davidson-harel",initial_layout = "reingold-tilford")

## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.

## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.

Statiscal testing for differences in groups of samples

library(vegan)

## Loading required package: permute

## Loading required package: lattice

## This is vegan 2.5-7

library(ggplot2)
library(agricolae)

## Registered S3 methods overwritten by 'klaR':
##   method      from 
##   predict.rda vegan
##   print.rda   vegan
##   plot.rda    vegan

#Calculate Inverse Simpson diversity metric for the samples
samples$inv_simp <- diversity(obj$data$tax_data[, samples$sample_id],index = "invsimpson",MARGIN = 2)

#Plot means for sample groups
ggplot(samples, aes(x = type, y = inv_simp)) + geom_boxplot()

#ANOVA for means of samples grouped on type
anova_result <- aov(inv_simp ~ type, samples)
summary(anova_result)

##             Df Sum Sq Mean Sq F value Pr(>F)  
## type         1  61.61   61.61   6.377 0.0355 *
## Residuals    8  77.29    9.66                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#A Tukey’s Honest Significant Difference (HSD) test can compare each site to every other and tell us which are significantly different. Although base R has a Tukey’s HSD function called TukeyHSD, we will use one from the package agricolae since it supplies grouping codes that are useful for graphing.
tukey_result <- HSD.test(anova_result, "type", group = TRUE)
print(tukey_result)

## $statistics
##    MSerror Df     Mean       CV      MSD
##   9.661202  8 10.30766 30.15472 4.533208
## 
## $parameters
##    test name.t ntr StudentizedRange alpha
##   Tukey   type   2         3.261182  0.05
## 
## $means
##    inv_simp      std r      Min      Max      Q25       Q50       Q75
## A 12.789838 3.019422 5 9.617950 17.10099 11.04401 11.578375 14.607864
## B  7.825492 3.194604 5 5.242423 13.05084  5.38352  7.068761  8.381917
## 
## $comparison
## NULL
## 
## $groups
##    inv_simp groups
## A 12.789838      a
## B  7.825492      b
## 
## attr(,"class")
## [1] "group"

#We are interested in the $groups table that says which sites are different. With a little tweaking, we can add this data to the graph we made. Lets add some nicer text as well.
group_data <- tukey_result$groups[order(rownames(tukey_result$groups)),]
ggplot(samples, aes(x = type, y = inv_simp)) + geom_text(data = data.frame(),aes(x = rownames(group_data), y = max(samples$inv_simp) + 1, label = group_data$groups), col = 'black', size = 10) + geom_boxplot() + ggtitle("Alpha diversity of Sample types") + xlab("type") + ylab("Inverse Simpson Index")

Comparing two treatments/groups

Usually we are interested in how groups of samples compare. For example, we might want to know which taxa differ between the nose and throat, or between men and women. The function compare_groups facilitates these comparisons:

#Get a table of differentially abundant taxa in the "type" groups of samples
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund",cols = samples$sample_id,groups = samples$type)

## Warning: Use of "dataset" is depreciated. Use "data" instead.

## Warning in wilcox.test.default(abund_1, abund_2): cannot compute exact p-value
## with ties

print(obj$data$diff_table)

## # A tibble: 191 x 7
##    taxon_id treatment_1 treatment_2 log2_median_ratio median_diff mean_diff
##    <chr>    <chr>       <chr>                   <dbl>       <dbl>     <dbl>
##  1 ab       A           B                       1.96      0.142     0.189  
##  2 ac       A           B                      -2.34     -0.526    -0.407  
##  3 ad       A           B                       1.41      0.151     0.170  
##  4 ae       A           B                       3.06      0.0346    0.0357 
##  5 af       A           B                       4.84      0.0971    0.0787 
##  6 ag       A           B                      -0.645    -0.0450   -0.0633 
##  7 ah       A           B                      -1.04     -0.00164  -0.00230
##  8 ai       A           B                       1.96      0.142     0.181  
##  9 aj       A           B                      -2.32     -0.511    -0.401  
## 10 ak       A           B                       0.964     0.0713    0.151  
## # … with 181 more rows, and 1 more variable: wilcox_p_value <dbl>

#For each taxon, a Wilcoxon Rank Sum test was used to test for differences between the median abundances of samples in each treatment. We can use this information to create what we call a differential heat tree, which indicates which taxa are more abundant in each treatment:

heat_tree(obj,node_label = taxon_names,node_size = n_obs,node_color = log2_median_ratio,node_color_interval = c(-2, 2),node_color_range = c("cyan", "gray", "tan"),node_size_axis_label = "OTU count",node_color_axis_label = "Log 2 ratio of median proportions",layout = "davidson-harel",initial_layout = "reingold-tilford")

## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.

## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.

Displaying statistically significantly different taxa

In this case, taxa colored tan are more abundant in type A and those colored blue are more abundant in type B. Note that we have not taken into account statistical significance when showing this, so lets do that. First, we need to correct for multiple comparisons:

# adjust p-values for multiple test correction
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method = "fdr")

#check the range of the p-values
range(obj$data$diff_table$wilcox_p_value, finite = TRUE)

## [1] 0.06450523 0.84126984

#if there still were some significant differences, we could set any difference that is not significant to zero and repeat the last heat_tree command doing something like (in this case I am setting the cutoff at 0.1, as there is nothing < 0.05):
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.1] <- 0

# repeat the heat_tree command with these new log_median_ratio values
heat_tree(obj,node_label = taxon_names,node_size = n_obs,node_color = log2_median_ratio,node_color_interval = c(-2, 2),node_color_range = c("cyan", "gray", "tan"),node_size_axis_label = "OTU count",node_color_axis_label = "Log 2 ratio of median proportions",layout = "davidson-harel",initial_layout = "reingold-tilford")