library(dplyr)  ## for data wrangling - %>% function
library(reshape2)  ##melt and cast data
library(tidyr)  # 'separate' function
library(readxl)  #read xlsx files into r on mac computer
library(vegan)  # dissimilarity matrix, permanova functions
library(tidyverse)
library(stringr)
library(ggplot2)  # plotting
library(magrittr)
library(cowplot)
library(formatR)

Logic

map6 <- read.csv("data/MapDJ.csv", header = TRUE, col.names = c("SampleID", 
    "BarcodeSequence", "LinkerPrimerSequence", "Origin", "Sterility", "Plate", 
    "Description"))

Background: Denise add text:

Q1. Does the sterility of rearing conditions alter slug-associated microbial communities ?

Outputs:

1. Determine the richness of OTUs identified from each treatment

2. NMDS of unweighted unifrac or jaccard dissimilarity. PERMANOVA with PERMDISP.

Prepare data

Split taxonomy column into multiple variables

# bacteria otu
B16S <- read.csv("data/FilteredTable.csv", header = TRUE, stringsAsFactors = FALSE)

# split taxa groupings into new columns
source("slug_functions.R")
B16S <- split_taxa(B16S)

Read in UniFrac distances

UniFrac is a measure of B-diversity that uses phylogenetic information to compare community samples. Use with multivariate methods like principal coordinates (PCoA), NMDS, etc to explain differences among communities. Measured as the amount of evolutionary history that is unique to each group - the fraction of branch length in a phylogenetic tree associated with ONLY the focal sample in pairwise comparison. With sequence data UniFrac values can be invfluencd by the number of sequences/sample and can use sequence jackknifing (Lozupone et al 2011. ISME).
Use to compare phylogenetic lineages between groups, to cluster samples.

unifrac <- read.table("data/unweighted_unifrac_dm.txt")
unifrac_wt <- read.table("data/weighted_unifrac_dm.txt")

Use heatmap to visualize sample dissimilarity based on UniFrac

View(B16S)

Prep data for multivariate anaylsis

otu.taxa <- read.csv("data/OTU_taxa_id.csv")  # otu ids
otu.comm <- read.csv("data/OTU_community_clean.csv")
## the samples in the unifrac dist and community need to be in the same order
## need to order community based on unifraq

# make new df based on unifrac, matched to same order as unifrac match the
# order to colnames(unifrac), rewrite over same name
otu.comm <- otu.comm[match(colnames(unifrac), otu.comm$SampleID), ]
# this says - for the community df, order the rows to match the order of the
# column names in unifrac, that match to SampleID

# reorder mapping data to community data
grps <- map6[match(colnames(unifrac), map6$SampleID), ]
# now that all the data (community, distances, grouping variables) are
# ordered the same, we can use permanova etc

Structure of grps

str(grps)
## 'data.frame':    30 obs. of  7 variables:
##  $ SampleID            : Factor w/ 30 levels "ALN1","ALN2",..: 29 7 27 21 28 11 9 6 22 20 ...
##  $ BarcodeSequence     : logi  NA NA NA NA NA NA ...
##  $ LinkerPrimerSequence: logi  NA NA NA NA NA NA ...
##  $ Origin              : Factor w/ 2 levels "lab","wild": 2 2 2 1 2 2 1 2 1 2 ...
##  $ Sterility           : Factor w/ 2 levels "N","S": 1 2 1 2 1 2 1 2 2 2 ...
##  $ Plate               : Factor w/ 4 levels "A","B","C","na": 4 4 4 4 4 4 2 4 4 4 ...
##  $ Description         : Factor w/ 8 levels "LRNA","LRNB",..: 5 6 5 4 5 7 2 6 4 8 ...

Permanova

Permanova tests for differences in composition among groups
Reminder - permanova is always based on pairwise distances/dissimilarities.

set.seed(304)
# unifrac distances
str(grps)  #look at data types to indetify error source
## 'data.frame':    30 obs. of  7 variables:
##  $ SampleID            : Factor w/ 30 levels "ALN1","ALN2",..: 29 7 27 21 28 11 9 6 22 20 ...
##  $ BarcodeSequence     : logi  NA NA NA NA NA NA ...
##  $ LinkerPrimerSequence: logi  NA NA NA NA NA NA ...
##  $ Origin              : Factor w/ 2 levels "lab","wild": 2 2 2 1 2 2 1 2 1 2 ...
##  $ Sterility           : Factor w/ 2 levels "N","S": 1 2 1 2 1 2 1 2 2 2 ...
##  $ Plate               : Factor w/ 4 levels "A","B","C","na": 4 4 4 4 4 4 2 4 4 4 ...
##  $ Description         : Factor w/ 8 levels "LRNA","LRNB",..: 5 6 5 4 5 7 2 6 4 8 ...
# Elevation is integer - change to factor

slug.uni <- adonis2(unifrac ~ Sterility + Origin, data = grps, permutations = 1000)
slug.uni
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = unifrac ~ Sterility + Origin, data = grps, permutations = 1000)
##           Df SumOfSqs      R2      F   Pr(>F)   
## Sterility  1   0.3663 0.05254 1.5873 0.014985 * 
## Origin     1   0.3746 0.05373 1.6231 0.006993 **
## Residual  27   6.2308 0.89373                   
## Total     29   6.9717 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
slug.uni2 <- adonis2(unifrac ~ Sterility + Origin + Plate, data = grps, permutations = 1000)
slug.uni2
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = unifrac ~ Sterility + Origin + Plate, data = grps, permutations = 1000)
##           Df SumOfSqs      R2      F   Pr(>F)   
## Sterility  1   0.3663 0.05254 1.6507 0.010989 * 
## Origin     1   0.3746 0.05373 1.6879 0.007992 **
## Plate      3   0.9050 0.12980 1.3593 0.001998 **
## Residual  24   5.3259 0.76393                   
## Total     29   6.9717 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# unifrac distances with month nested in year
ad.uniNest <- adonis2(unifrac ~ Sterility/Plate + Origin, data = grps, permutations = 1000)
ad.uniNest
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = unifrac ~ Sterility/Plate + Origin, data = grps, permutations = 1000)
##                 Df SumOfSqs      R2      F   Pr(>F)   
## Sterility        1   0.3663 0.05254 1.6507 0.007992 **
## Origin           1   0.3746 0.05373 1.6879 0.010989 * 
## Sterility:Plate  3   0.9050 0.12980 1.3593 0.003996 **
## Residual        24   5.3259 0.76393                   
## Total           29   6.9717 1.00000                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# unifrac distances weighted by relative abundances
ad.uniwt <- adonis2(unifrac_wt ~ Sterility + Origin, data = grps, permutations = 1000)
ad.uniwt
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 1000
## 
## adonis2(formula = unifrac_wt ~ Sterility + Origin, data = grps, permutations = 1000)
##           Df SumOfSqs      R2      F   Pr(>F)    
## Sterility  1  0.50368 0.19092 7.3463 0.000999 ***
## Origin     1  0.28325 0.10737 4.1312 0.003996 ** 
## Residual  27  1.85120 0.70171                    
## Total     29  2.63813 1.00000                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Sterility and Origin and Plate are significant in all tests.

Manipulate code for test of interest

NMDS

The NMDS often pairs well with the PERMANOVA because both are more robust to properties of community data. To gain proper fit may need to try multiple dissimilarity methods or tranformations.

## Run 0 stress 0.1405096 
## Run 1 stress 0.1403842 
## ... New best solution
## ... Procrustes: rmse 0.08214802  max resid 0.3980253 
## Run 2 stress 0.1547363 
## Run 3 stress 0.1403772 
## ... New best solution
## ... Procrustes: rmse 0.001148822  max resid 0.004577874 
## ... Similar to previous best
## Run 4 stress 0.1536064 
## Run 5 stress 0.1414833 
## Run 6 stress 0.1576283 
## Run 7 stress 0.1414833 
## Run 8 stress 0.1510142 
## Run 9 stress 0.1405093 
## ... Procrustes: rmse 0.08214842  max resid 0.4010473 
## Run 10 stress 0.1527337 
## Run 11 stress 0.1485077 
## Run 12 stress 0.1406034 
## ... Procrustes: rmse 0.01288115  max resid 0.03868609 
## Run 13 stress 0.1581357 
## Run 14 stress 0.1522202 
## Run 15 stress 0.144763 
## Run 16 stress 0.1574926 
## Run 17 stress 0.1530941 
## Run 18 stress 0.1495525 
## Run 19 stress 0.1458837 
## Run 20 stress 0.1531132 
## *** Solution reached
## Run 0 stress 0.08534348 
## Run 1 stress 0.08534199 
## ... New best solution
## ... Procrustes: rmse 0.004667868  max resid 0.0207152 
## Run 2 stress 0.08533844 
## ... New best solution
## ... Procrustes: rmse 0.003984785  max resid 0.01769576 
## Run 3 stress 0.08533989 
## ... Procrustes: rmse 0.000181527  max resid 0.000778532 
## ... Similar to previous best
## Run 4 stress 0.08533765 
## ... New best solution
## ... Procrustes: rmse 0.003340559  max resid 0.01482692 
## Run 5 stress 0.08533465 
## ... New best solution
## ... Procrustes: rmse 0.002517323  max resid 0.01118061 
## Run 6 stress 0.1240923 
## Run 7 stress 0.1053119 
## Run 8 stress 0.08529699 
## ... New best solution
## ... Procrustes: rmse 0.003851164  max resid 0.01360264 
## Run 9 stress 0.08533951 
## ... Procrustes: rmse 0.005811479  max resid 0.02099685 
## Run 10 stress 0.124088 
## Run 11 stress 0.08533971 
## ... Procrustes: rmse 0.003445163  max resid 0.01357549 
## Run 12 stress 0.08529326 
## ... New best solution
## ... Procrustes: rmse 0.0005073533  max resid 0.002245137 
## ... Similar to previous best
## Run 13 stress 0.1053128 
## Run 14 stress 0.1240884 
## Run 15 stress 0.0852929 
## ... New best solution
## ... Procrustes: rmse 6.815135e-05  max resid 0.0002975731 
## ... Similar to previous best
## Run 16 stress 0.08534383 
## ... Procrustes: rmse 0.005813618  max resid 0.02098805 
## Run 17 stress 0.1053122 
## Run 18 stress 0.08534019 
## ... Procrustes: rmse 0.005436154  max resid 0.01891636 
## Run 19 stress 0.1377937 
## Run 20 stress 0.145839 
## *** Solution reached

Plot ordination

ggplot(nmds.uni, aes(NMDS1, NMDS2)) + geom_point(aes(color = Sterility, shape = Origin))
## Error in ggplot(nmds.uni, aes(NMDS1, NMDS2)): object 'nmds.uni' not found

Manipulate plot code to make figure as desired

Separates well by elevation/site, within each elevation the months are grouped together for the most part.

## edit shape scale - the + is hard to see
## https://ggplot2.tidyverse.org/reference/scale_shape.html

# Show a list of available shapes
df_shapes <- data.frame(shape = 0:25)
ggplot(df_shapes, aes(0, 0, shape = shape)) + geom_point(aes(shape = shape), 
    size = 5, fill = "red") + scale_shape_identity() + facet_wrap(~shape) + 
    theme_void()

# shapes in red let you manipulate the color & fill (rather than just color)
# color then corresponds to the shape outline, and fill to the inside let's
# use the fill to indicate the Year, shape for the Month, and color for
# Elevation
# let's use the fill to indicate the Year, shape for the Month

# color = Sterility
ggplot(nmds.uni, aes(NMDS1, NMDS2))+
  geom_point(aes(color=Sterility), size=3)+
  theme_bw()
## Error in ggplot(nmds.uni, aes(NMDS1, NMDS2)): object 'nmds.uni' not found
# fill = Sterility
ggplot(nmds.uni, aes(NMDS1, NMDS2))+
  geom_point(aes(color=Sterility, fill=Origin), size=3)+
  scale_color_gradient2(low='turquoise', mid='yellow', high='red', midpoint=1500)+
  theme_bw()
## Error in ggplot(nmds.uni, aes(NMDS1, NMDS2)): object 'nmds.uni' not found
# note - this is the same as before even with fill = Year added because we are using a shape that cannot be filled

# fill = Year, shape = 21
ggplot(nmds.uni, aes(NMDS1, NMDS2))+
  geom_point(aes(color=Sterility, fill=Origin), shape = 21,color='black', size=3)+
  scale_color_gradient2(low='turquoise', mid='yellow', high='red', midpoint=1500)+
  theme_bw()
## Error in ggplot(nmds.uni, aes(NMDS1, NMDS2)): object 'nmds.uni' not found
# shape = 21 is the filled circle

# shape = Month
ggplot(nmds.uni, aes(NMDS1, NMDS2))+
  geom_point(aes(color=Sterility, fill=Origin, shape=Plate),color='black', size=3)+
  scale_color_gradient2(low='turquoise', mid='yellow', high='red', midpoint=1500)+
  scale_shape_manual(values=c(21, 22, 24))+ # the values correspond to the shapes above
  theme_bw()
## Error in ggplot(nmds.uni, aes(NMDS1, NMDS2)): object 'nmds.uni' not found
# and to change the color of the fill:
ggplot(nmds.uni, aes(NMDS1, NMDS2))+
  geom_point(aes(color=Sterility, fill=Origin), shape = 21,color='black', size=3)+
  scale_color_gradient2(low='turquoise', mid='yellow', high='red', midpoint=1500)+
  scale_shape_manual(values=c(21, 22, 24))+ # the values correspond to the shapes above
  scale_fill_manual(values=c('black','white'))+
  theme_bw()
## Error in ggplot(nmds.uni, aes(NMDS1, NMDS2)): object 'nmds.uni' not found
# lets add the stress to the plot
mds.uni$stress #the stress
## [1] 0.1403772
ggplot(nmds.uni, aes(NMDS1, NMDS2))+
  geom_point(aes(color=Sterility, fill=Plate, shape=Origin),color='black', size=3)+
  scale_color_gradient2(low='turquoise', mid='yellow', high='red', midpoint=1500)+
  scale_shape_manual(values=c(21, 22, 24))+ # the values correspond to the shapes above
  scale_fill_manual(values=c('black','white'))+
  theme_bw()+
  annotate('text', x=0.3, -0.25, label=paste0('Stress = ',round(mds.uni$stress,2)))
## Error in ggplot(nmds.uni, aes(NMDS1, NMDS2)): object 'nmds.uni' not found

NMDS vs PCoA

NMDS is an unconstrained ordination method to visualize multivariate data in fewer dimensions. Depending on the properties of your data and your questions, different methods of ordination may be appropriate. PCoA (or MDS) is a metric version of NMDS, meaning that PCoA is a Euclidean representation of dissimilarities. So, like NMDS PCoA uses dissimilarities, but it is euclidean rather than rank-order which is used to preserve distances hierarchically. NMDS and distance-based ordinations/tests are often easier to use with community data because 0’s and 1’s, rare species are common and do not configure in euclidean space.

In general, ordination & multivaraite methods are defined by (1) whether or not using dissimilarity, (2) whether or not euclidean, (3) test for differences vs explaining differences …

PCoA, like PCA, returns a set of orthogonal axes with associated eignevalues that measure their importance. PCoA and PCA should produce the same results if the PCA is calculated on a covariance matrix with scaling of 1.

plot the pcoa

Q2. If differences exist, are there particular taxa that contribute disproportionately to those differences?

Note: Can’t run any analyses that rely on relative abundance data (e.g., simper), Address using absence-presence data

Outputs:

1. Indicator species analysis to identify taxon-habitat association patterns.

This procedure identifies OTUs as indicator species independently from their abundance in the total data set. ####Steps (from https://www.nature.com/articles/ismej2015238):

  1. single- and doubleton OTUs are removed as they hold little indicator informationusing the multipatt function (number of permutations=9999) implemented in the indicspecies R package (De Cáceres et al., 2010).
  2. To account for multiple testing, P-values were corrected by calculating false discovery rates (q-values) with the q-value function implemented in the BiocLite R package (Dabney and Storney, 2014). Indicator OTUs with q<0.05 were considered significant.
  3. Indicator taxa were represented in bipartite networks by using the edge-weighted spring embedded algorithm layout implemented in Cytoscape v.3.0.2 (Shannon et al., 2003) where point biserial correlation values, a measure of species–habitat association, were used to weight the edges between nodes constituting the habitats and indicator OTUs (Hartmann et al., 2015).
  4. We further mapped these indicator OTUs on taxonomic networks generated in Cytoscape v.3.0.2 to investigate potential taxa–habitat associations (Hartmann et al., 2015). I
  5. Indicator OTUs classified at the genus level were displayed in a taxonomic tree generated in iTOL (Letunic and Bork, 2011) together with the positive point biserial correlation values associated with each habitat.
  6. To find patterns of co-occurrence among OTUs characteristic of the habitats studied, we analysed correlations among the relative abundances of all indicator OTUs (co-correlations) by calculating Spearman rank correlation values (Spearman, 1904) in R with the function corr.test implemented in the package psych. Multiple testing was accounted for by correcting P-values with a false discovery rates of 5% (q-value<0.05).
  7. Bacterial and fungal indicator OTUs that were significantly co-correlated were displayed in networks with the edge-weighted spring embedded algorithm layout implemented in Cytoscape where Spearman correlation values were used to weight the edges between the nodes representing the OTUs (Hartmann et al., 2015).”

Q3. Can the observed variation in XXX be explained by origin? By sterility?

Outputs:

1. Multiple regression on distance matrix