knitr::opts_chunk$set(echo = TRUE)

Lab set up/loading in data

### Abundance ###
# This is just copied from Lab 2
# Simply the number of sequences in the sample
# Because of sampling and sequencing biases this is less useful than one would hope.
abundance = function(x){
  return(sum(x,na.rm=TRUE))
}

### Group OTU table by Specified Taxonomic Rank ###
# This function takes an otu table and turns it into an abundance table for
# a different rank of taxonomy.
# OTU table must be only abundance data (no metadata!) and has OTUs as columns
#   and samples as rows
# The taxonomy table should have each OTU as a seperate row and 7 columns for the
#   different taxonomic ranks
# Ranks are as follows:
# 1: Kingdom, 2: Phylum, 3: Class, 4: Order, 5: Family, 6: Genus, 7: Species
group_by_rank = function(otu_table,taxonomy,metadata,rank){
  n_sample = nrow(otu_table)
  otu_table = cbind(t(otu_table),(taxonomy))
  if (rank>1){
    otu_table$taxa = apply(otu_table[,(n_sample+1):(n_sample+rank)],1,paste,collapse="")
  }else{
    otu_table$taxa = otu_table[,n_sample+1]
  }
  otu_table = group_by(otu_table,taxa)
  grouped_table = summarise_at(otu_table,1:n_sample,sum)
  taxa_names = grouped_table$taxa
  grouped_table = grouped_table[,-1]
  grouped_table = t(grouped_table)
  colnames(grouped_table) = taxa_names
  taxa_cols = 1:ncol(grouped_table)
  grouped_table = cbind(grouped_table,metadata)
  taxa_rel_abundance = colSums(grouped_table[,taxa_cols]/rowSums(grouped_table[,taxa_cols],na.rm=T),na.rm=T)
  grouped_table = group_by(grouped_table,SampleID)
  grouped_table = pivot_longer(grouped_table,names_to="taxa",values_to="abundance",cols=all_of(taxa_cols))
  grouped_table$taxa = factor(grouped_table$taxa,levels=names(sort(taxa_rel_abundance,decreasing=TRUE)))
  return(grouped_table)
}



#######################
### Start Code Here ###
#######################

# load packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)

# Set working directory
setwd("C:/Users/siobh/OneDrive - The University Of British Columbia/TA/Biol 403 - Parfrey 2020/403 lab scripts")


# Read data set
seagrass_otu = read.table("Seagrass_2015_otu_metadata_clean.txt",header=TRUE,sep="\t")
seagrass_metadata = read.table("Seagrass_2015_all_metadata.txt",header=TRUE,sep="\t")
seagrass_taxonomy = read.table("Seagrass_2015_taxonomy.txt",header=TRUE,sep="\t")

# combine otu table and metadata into one data frame but take out the two duplicate metadata columns
seagrass = inner_join(seagrass_metadata,seagrass_otu[,-(2:3)],by="SampleID")
otu_cols = (ncol(seagrass_metadata)+1):ncol(seagrass)

# First calculate the OTU total abundance of each sample (we will need this shortly)
seagrass$total_abundance = apply(seagrass[,otu_cols],1,abundance)

# use pivot_longer to create a separate row for each otu in each sample
seagrass_long = pivot_longer(seagrass,names_to="OTU",values_to="abundance",cols=all_of(otu_cols))

# create a new column for relative abundance and plot this instead
seagrass_long$relative_abundance = seagrass_long$abundance/seagrass_long$total_abundance

# calculate the total relative abundance of each OTU
otu_rel_abundance = colSums(seagrass[,otu_cols]/seagrass$total_abundance)

# reorder the OTUs so the most abundant OTUs will appear as the top bar
seagrass_long$OTU = factor(seagrass_long$OTU,levels=names(sort(otu_rel_abundance,decreasing=TRUE)))

Including Plots

You can also embed plots, for example:

# This is exactly the same plot as above, just copied here
ggplot(seagrass_long,aes(x=SampleID,y=relative_abundance,fill=OTU)) +
  geom_bar(stat="identity",position="stack") +
  theme(legend.position = "none",
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

# This plot is still very hard to look at because all the colors blend together.
# Let's group samples based on host type to try and find any patterns.
# You can play with any categorical variable by changing the column name in the facet_grid function
# (don't delete the tilda!)
ggplot(seagrass_long,aes(x=SampleID,y=relative_abundance,fill=OTU)) +
  geom_bar(stat="identity",position="stack") +
  facet_grid(~host_type,scales = "free_x",space="free") +
  xlab("Samples") +
  ylab("Relative Abundance") +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

# You likely won't want to plot all the samples at once.
# For example, we can plot just he host associated ones and then look at samples from different lakes
seagrass_long_zostera = filter(seagrass_long,host_type=="zostera")
ggplot(seagrass_long_zostera,aes(x=SampleID,y=relative_abundance,fill=OTU)) +
  geom_bar(stat="identity",position="stack") +
  facet_grid(~region,scales = "free_x",space="free") +
  xlab("Samples") +
  ylab("Relative Abundance") +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

# Or water associates samples
seagrass_long_water = filter(seagrass_long,host_type=="water")
ggplot(seagrass_long_water,aes(x=SampleID,y=relative_abundance,fill=OTU)) +
  geom_bar(stat="identity",position="stack") +
  facet_grid(~region,scales = "free_x",space="free") +
  xlab("Samples") +
  ylab("Relative Abundance") +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

# If the titles on top aren’t fitting you can rotate them 90 degress by adding # strip.text=element_text(angle=90) into the theme function at the end # of the ggplot lines

These plots are still very hard to read because there are just far too many OTUs.

We can perhaps make this more interesting by coloring by a different taxonomic level so there are fewer colors.

We will use a function included at the top of the script to do this.

This next bit of code will convert your OTU table into a species table, or genus table, or whatever

taxa rank you choose.

This can be a bit slow if you have a large dataset.

# Set the rank you want to filter by
# 1: Kingdom, 2: Phylum, 3: Class, 4: Order, 5: Family, 6: Genus, 7: Species
seagrass_phyla = group_by_rank(otu_table = seagrass[,otu_cols],
                                 taxonomy  = seagrass_taxonomy,
                                 metadata  = seagrass[,-otu_cols],
                                 rank      = 2)

# Calculate relative abundances
seagrass_phyla$relative_abundance = seagrass_phyla$abundance/seagrass_phyla$total_abundance

# Now we can plot our new data frame seagrass_phyla the same way we did before
# A couple other stylistic changes were added to the theme at the bottom.
ggplot(seagrass_phyla,aes(x=SampleID,y=relative_abundance,fill=taxa)) +
  geom_bar(stat="identity",position="stack") +
  facet_grid(~host_type,scales = "free_x",space="free") +
  guides(fill=guide_legend(title="Phylum")) +             # Change the legend title
  xlab("Samples") +
  ylab("Relative Abundance") +
  labs(fill="Phylum") +
  theme_minimal() +
  theme(legend.position = "right",
        axis.text.x=element_blank(),
        axis.title.x = element_text(margin=margin(t=-15)),
        panel.grid.major = element_blank())

# If the titles on top aren't fitting you can rotate them 90 degress by adding
# strip.text=element_text(angle=90) into the theme function at the end
# of the ggplot lines



# These plots are still very hard to read because there are just far too many OTUs.
# We can perhaps make this more interesting by coloring by a different taxonomic level so there are fewer colors.
# We will use a function included at the top of the script to do this.

# This next bit of code will convert your OTU table into a species table, or genus table, or whatever
# taxa rank you choose. 
# This can be a bit slow if you have a large dataset.



# When dealing with a reasonable number of groups (maybe <25?) it is always better to chose your own colors
# GGplot's default colors are pretty, but they run together making the bars hard to differentiate
# Find how many colors we need
length(unique(seagrass_phyla$taxa))
## [1] 16
# I find the best way to get n distinct colors is from https://medialab.github.io/iwanthue/
# Just set the number of colors to the number from the above command and cope the "HEX json" output
# But you'll have to change the square brackets to parentheses
# though you have to do this manually. (even better check the colorblind friendly options!)
# These ones are from the fancy palette
my_colors = c("#ffb1bb",
              "#10c3c8",
              "#ffc58a",
              "#81bbff",
              "#ebfaac",
              "#50d1ff",
              "#c1bd72",
              "#49e4fd",
              "#d99f7f",
              "#afffc8",
              "#ffd2ff",
              "#82b888",
              "#a3a9d5",
              "#b4ffed",
              "#d39dac",
              "#c6efff")
# Feel free to generate your colors however you wish or chose them manually if you want even more control!
# Here is a list of all the colors R knows by name http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf

ggplot(seagrass_phyla,aes(x=SampleID,y=relative_abundance,fill=taxa)) +
  geom_bar(stat="identity",position="stack") +
  facet_grid(~host_type,scales = "free_x",space="free") +
  scale_fill_manual(values=my_colors) +
  guides(fill=guide_legend(title="Phylum")) +
  xlab("Samples") +
  ylab("Relative Abundance") +
  labs(fill="Phylum") +
  theme_minimal() +
  theme(legend.position = "right",
        axis.text.x=element_blank(),
        axis.title.x = element_text(margin=margin(t=-15)),
        panel.grid.major = element_blank())