This protocol is borrowed from the Metacoder documentatiion available here.
The protocol has been suitably modified to run on taxonomic abundance data from the TS Dentistry metagnomic samples. The taxonomic abundance data was generated (and filtered to include taxa which contribute > 0.1 percent of total counts) by in-house python scripts.
library(metacoder)
## Loading required package: taxa
## This is metacoder verison 0.3.4.9002 (development version)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x ggplot2::map_data() masks taxa::map_data()
library(taxa)
data = read.csv("test-filt.txt", header=T, sep="\t")
data = as_tibble(data)
data$newID = as.character(data$newID)
data$index = as.character(data$index)
print(data)
## # A tibble: 78 x 12
## newID index X31D X32Da X34D X35Da X36Da C16 C18 C19 C22
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0_322… p__Actin… 3.03e5 9.57e5 7972 16472 3.48e6 126105 3.18e5 2.80e2 53454
## 2 1_207… p__Chord… 1.59e6 1.13e6 8745 6251 1.66e6 980275 2.51e6 1.31e6 857087
## 3 2_9597 p__Chord… 2.19e4 1.79e4 112 202 2.39e4 18539 4.82e4 1.16e4 21330
## 4 4_1301 p__Firmi… 3.41e5 2.13e6 4812 47523 4.84e6 539271 4.56e5 3.12e3 202700
## 5 5_314… p__Chord… 1.06e6 5.76e5 5828 6706 8.75e5 741295 1.52e6 6.79e5 565341
## 6 6_437… p__Actin… 3.71e5 7.38e4 536 376 4.60e5 4342 3.10e1 5.50e1 6633
## 7 7_281… p__Bacte… 2.37e4 5.43e4 143 121 3.95e4 5473 1.35e2 6.00e0 20843
## 8 8_1654 p__Actin… 1.59e5 3.83e6 443 727 1.57e5 274778 8.00e3 7.26e2 12220
## 9 14_11… p__Actin… 2.05e5 7.25e3 250 41 2.44e4 12392 9.00e0 4.50e1 8423
## 10 16_29… p__Firmi… 4.77e5 1.40e6 1670 3028 7.72e5 375318 6.45e4 4.88e2 70410
## # … with 68 more rows, and 1 more variable: C23 <dbl>
samples = read.csv("test-samples.txt", header=T, sep="\t")
samples = as_tibble(samples)
samples$sample_id = as.character(samples$sample_id)
samples$type = as.character(samples$type)
print(samples)
## # A tibble: 10 x 2
## sample_id type
## <chr> <chr>
## 1 X31D A
## 2 X32Da A
## 3 X34D A
## 4 X35Da A
## 5 X36Da A
## 6 C16 B
## 7 C18 B
## 8 C19 B
## 9 C22 B
## 10 C23 B
obj <- parse_tax_data(data,class_cols = "index",class_sep = ";", class_regex = "^(.+)__(.+)$", class_key = c(tax_rank = "info",tax_name = "taxon_name"))
print(obj)
## <Taxmap>
## 191 taxa: ab. Actinobacteria ... hj. Neisseria sp. HMSC064F04
## 191 edges: NA->ab, NA->ac, NA->ad ... cz->hh, dv->hi, dv->hj
## 2 data sets:
## tax_data:
## # A tibble: 78 x 13
## taxon_id newID index X31D X32Da X34D X35Da X36Da C16
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ek 0_322… p__Ac… 3.03e5 9.57e5 7972 16472 3.48e6 126105
## 2 el 1_207… p__Ch… 1.59e6 1.13e6 8745 6251 1.66e6 980275
## 3 em 2_9597 p__Ch… 2.19e4 1.79e4 112 202 2.39e4 18539
## # … with 75 more rows, and 4 more variables: C18 <dbl>,
## # C19 <dbl>, C22 <dbl>, C23 <dbl>
## class_data:
## # A tibble: 468 x 5
## taxon_id input_index tax_rank tax_name regex_match
## <chr> <int> <chr> <chr> <chr>
## 1 ab 1 p Actinobacteria p__Actinobacteria
## 2 ai 1 c Actinobacteria c__Actinobacteria
## 3 ax 1 o Micrococcales o__Micrococcales
## # … with 465 more rows
## 0 functions:
#Filter loww counts
obj$data$tax_data <- zero_low_counts(obj, dataset = "tax_data", min_count = 3)
## Warning: Use of "dataset" is depreciated. Use "data" instead.
## No `cols` specified, so using all numeric columns:
## X31D, X32Da, X34D, X35Da, X36Da, C16, C18, C19, C22, C23
## Zeroing 8 of 780 counts less than 3.
#Convert counts to proportions
obj$data$tax_data <- calc_obs_props(obj, "tax_data")
## No `cols` specified, so using all numeric columns:
## X31D, X32Da, X34D, X35Da, X36Da, C16, C18, C19, C22, C23
## Calculating proportions from counts for 10 columns for 78 observations.
#Calculate per Taxon abundance
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data",cols = samples$sample_id)
## Summing per-taxon counts from 10 columns for 191 taxa
obj$data$tax_occ <- calc_n_samples(obj, "tax_abund", groups = samples$type, cols = samples$sample_id)
## Calculating number of samples with a value greater than 0 for 10 columns in 2 groups for 191 observations
# heat tree for taxa in samples of type A
heat_tree(obj,node_label = taxon_names,node_size = n_obs,node_color = A,node_size_axis_label = "OTU count",node_color_axis_label = "Samples with reads",layout = "davidson-harel",initial_layout = "reingold-tilford")
## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.
## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.
# heat tree for taxa in samples of type B
heat_tree(obj,node_label = taxon_names,node_size = n_obs,node_color = B,node_size_axis_label = "OTU count",node_color_axis_label = "Samples with reads",layout = "davidson-harel",initial_layout = "reingold-tilford")
## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.
## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
library(ggplot2)
library(agricolae)
## Registered S3 methods overwritten by 'klaR':
## method from
## predict.rda vegan
## print.rda vegan
## plot.rda vegan
#Calculate Inverse Simpson diversity metric for the samples
samples$inv_simp <- diversity(obj$data$tax_data[, samples$sample_id],index = "invsimpson",MARGIN = 2)
#Plot means for sample groups
ggplot(samples, aes(x = type, y = inv_simp)) + geom_boxplot()
#ANOVA for means of samples grouped on type
anova_result <- aov(inv_simp ~ type, samples)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## type 1 61.61 61.61 6.377 0.0355 *
## Residuals 8 77.29 9.66
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#A Tukey’s Honest Significant Difference (HSD) test can compare each site to every other and tell us which are significantly different. Although base R has a Tukey’s HSD function called TukeyHSD, we will use one from the package agricolae since it supplies grouping codes that are useful for graphing.
tukey_result <- HSD.test(anova_result, "type", group = TRUE)
print(tukey_result)
## $statistics
## MSerror Df Mean CV MSD
## 9.661202 8 10.30766 30.15472 4.533208
##
## $parameters
## test name.t ntr StudentizedRange alpha
## Tukey type 2 3.261182 0.05
##
## $means
## inv_simp std r Min Max Q25 Q50 Q75
## A 12.789838 3.019422 5 9.617950 17.10099 11.04401 11.578375 14.607864
## B 7.825492 3.194604 5 5.242423 13.05084 5.38352 7.068761 8.381917
##
## $comparison
## NULL
##
## $groups
## inv_simp groups
## A 12.789838 a
## B 7.825492 b
##
## attr(,"class")
## [1] "group"
#We are interested in the $groups table that says which sites are different. With a little tweaking, we can add this data to the graph we made. Lets add some nicer text as well.
group_data <- tukey_result$groups[order(rownames(tukey_result$groups)),]
ggplot(samples, aes(x = type, y = inv_simp)) + geom_text(data = data.frame(),aes(x = rownames(group_data), y = max(samples$inv_simp) + 1, label = group_data$groups), col = 'black', size = 10) + geom_boxplot() + ggtitle("Alpha diversity of Sample types") + xlab("type") + ylab("Inverse Simpson Index")
Usually we are interested in how groups of samples compare. For example, we might want to know which taxa differ between the nose and throat, or between men and women. The function compare_groups facilitates these comparisons:
#Get a table of differentially abundant taxa in the "type" groups of samples
obj$data$diff_table <- compare_groups(obj, dataset = "tax_abund",cols = samples$sample_id,groups = samples$type)
## Warning: Use of "dataset" is depreciated. Use "data" instead.
## Warning in wilcox.test.default(abund_1, abund_2): cannot compute exact p-value
## with ties
print(obj$data$diff_table)
## # A tibble: 191 x 7
## taxon_id treatment_1 treatment_2 log2_median_ratio median_diff mean_diff
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 ab A B 1.96 0.142 0.189
## 2 ac A B -2.34 -0.526 -0.407
## 3 ad A B 1.41 0.151 0.170
## 4 ae A B 3.06 0.0346 0.0357
## 5 af A B 4.84 0.0971 0.0787
## 6 ag A B -0.645 -0.0450 -0.0633
## 7 ah A B -1.04 -0.00164 -0.00230
## 8 ai A B 1.96 0.142 0.181
## 9 aj A B -2.32 -0.511 -0.401
## 10 ak A B 0.964 0.0713 0.151
## # … with 181 more rows, and 1 more variable: wilcox_p_value <dbl>
#For each taxon, a Wilcoxon Rank Sum test was used to test for differences between the median abundances of samples in each treatment. We can use this information to create what we call a differential heat tree, which indicates which taxa are more abundant in each treatment:
heat_tree(obj,node_label = taxon_names,node_size = n_obs,node_color = log2_median_ratio,node_color_interval = c(-2, 2),node_color_range = c("cyan", "gray", "tan"),node_size_axis_label = "OTU count",node_color_axis_label = "Log 2 ratio of median proportions",layout = "davidson-harel",initial_layout = "reingold-tilford")
## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.
## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.
In this case, taxa colored tan are more abundant in type A and those colored blue are more abundant in type B. Note that we have not taken into account statistical significance when showing this, so lets do that. First, we need to correct for multiple comparisons:
# adjust p-values for multiple test correction
obj$data$diff_table$wilcox_p_value <- p.adjust(obj$data$diff_table$wilcox_p_value, method = "fdr")
#check the range of the p-values
range(obj$data$diff_table$wilcox_p_value, finite = TRUE)
## [1] 0.06450523 0.84126984
#if there still were some significant differences, we could set any difference that is not significant to zero and repeat the last heat_tree command doing something like (in this case I am setting the cutoff at 0.1, as there is nothing < 0.05):
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.1] <- 0
# repeat the heat_tree command with these new log_median_ratio values
heat_tree(obj,node_label = taxon_names,node_size = n_obs,node_color = log2_median_ratio,node_color_interval = c(-2, 2),node_color_range = c("cyan", "gray", "tan"),node_size_axis_label = "OTU count",node_color_axis_label = "Log 2 ratio of median proportions",layout = "davidson-harel",initial_layout = "reingold-tilford")
## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.
## Warning in FUN(X[[i]], ...): Could not apply layout 'davidson-harel' to
## subgraph. Using 'fruchterman-reingold' instead.