knitr::opts_chunk$set(echo = TRUE)
Lab set up/loading in data
### Abundance ###
# This is just copied from Lab 2
# Simply the number of sequences in the sample
# Because of sampling and sequencing biases this is less useful than one would hope.
abundance = function(x){
return(sum(x,na.rm=TRUE))
}
### Group OTU table by Specified Taxonomic Rank ###
# This function takes an otu table and turns it into an abundance table for
# a different rank of taxonomy.
# OTU table must be only abundance data (no metadata!) and has OTUs as columns
# and samples as rows
# The taxonomy table should have each OTU as a seperate row and 7 columns for the
# different taxonomic ranks
# Ranks are as follows:
# 1: Kingdom, 2: Phylum, 3: Class, 4: Order, 5: Family, 6: Genus, 7: Species
group_by_rank = function(otu_table,taxonomy,metadata,rank){
n_sample = nrow(otu_table)
otu_table = cbind(t(otu_table),(taxonomy))
if (rank>1){
otu_table$taxa = apply(otu_table[,(n_sample+1):(n_sample+rank)],1,paste,collapse="")
}else{
otu_table$taxa = otu_table[,n_sample+1]
}
otu_table = group_by(otu_table,taxa)
grouped_table = summarise_at(otu_table,1:n_sample,sum)
taxa_names = grouped_table$taxa
grouped_table = grouped_table[,-1]
grouped_table = t(grouped_table)
colnames(grouped_table) = taxa_names
taxa_cols = 1:ncol(grouped_table)
grouped_table = cbind(grouped_table,metadata)
taxa_rel_abundance = colSums(grouped_table[,taxa_cols]/rowSums(grouped_table[,taxa_cols],na.rm=T),na.rm=T)
grouped_table = group_by(grouped_table,SampleID)
grouped_table = pivot_longer(grouped_table,names_to="taxa",values_to="abundance",cols=all_of(taxa_cols))
grouped_table$taxa = factor(grouped_table$taxa,levels=names(sort(taxa_rel_abundance,decreasing=TRUE)))
return(grouped_table)
}
#######################
### Start Code Here ###
#######################
# load packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
# Set working directory
setwd("C:/Users/siobh/OneDrive - The University Of British Columbia/TA/Biol 403 - Parfrey 2020/403 lab scripts")
# Read data set
seagrass_otu = read.table("Seagrass_2015_otu_metadata_clean.txt",header=TRUE,sep="\t")
seagrass_metadata = read.table("Seagrass_2015_all_metadata.txt",header=TRUE,sep="\t")
seagrass_taxonomy = read.table("Seagrass_2015_taxonomy.txt",header=TRUE,sep="\t")
# combine otu table and metadata into one data frame but take out the two duplicate metadata columns
seagrass = inner_join(seagrass_metadata,seagrass_otu[,-(2:3)],by="SampleID")
otu_cols = (ncol(seagrass_metadata)+1):ncol(seagrass)
# First calculate the OTU total abundance of each sample (we will need this shortly)
seagrass$total_abundance = apply(seagrass[,otu_cols],1,abundance)
# use pivot_longer to create a separate row for each otu in each sample
seagrass_long = pivot_longer(seagrass,names_to="OTU",values_to="abundance",cols=all_of(otu_cols))
# create a new column for relative abundance and plot this instead
seagrass_long$relative_abundance = seagrass_long$abundance/seagrass_long$total_abundance
# calculate the total relative abundance of each OTU
otu_rel_abundance = colSums(seagrass[,otu_cols]/seagrass$total_abundance)
# reorder the OTUs so the most abundant OTUs will appear as the top bar
seagrass_long$OTU = factor(seagrass_long$OTU,levels=names(sort(otu_rel_abundance,decreasing=TRUE)))
You can also embed plots, for example:
# This is exactly the same plot as above, just copied here
ggplot(seagrass_long,aes(x=SampleID,y=relative_abundance,fill=OTU)) +
geom_bar(stat="identity",position="stack") +
theme(legend.position = "none",
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
# This plot is still very hard to look at because all the colors blend together.
# Let's group samples based on host type to try and find any patterns.
# You can play with any categorical variable by changing the column name in the facet_grid function
# (don't delete the tilda!)
ggplot(seagrass_long,aes(x=SampleID,y=relative_abundance,fill=OTU)) +
geom_bar(stat="identity",position="stack") +
facet_grid(~host_type,scales = "free_x",space="free") +
xlab("Samples") +
ylab("Relative Abundance") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
# You likely won't want to plot all the samples at once.
# For example, we can plot just he host associated ones and then look at samples from different lakes
seagrass_long_zostera = filter(seagrass_long,host_type=="zostera")
ggplot(seagrass_long_zostera,aes(x=SampleID,y=relative_abundance,fill=OTU)) +
geom_bar(stat="identity",position="stack") +
facet_grid(~region,scales = "free_x",space="free") +
xlab("Samples") +
ylab("Relative Abundance") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
# Or water associates samples
seagrass_long_water = filter(seagrass_long,host_type=="water")
ggplot(seagrass_long_water,aes(x=SampleID,y=relative_abundance,fill=OTU)) +
geom_bar(stat="identity",position="stack") +
facet_grid(~region,scales = "free_x",space="free") +
xlab("Samples") +
ylab("Relative Abundance") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
# If the titles on top aren’t fitting you can rotate them 90 degress by adding # strip.text=element_text(angle=90) into the theme function at the end # of the ggplot lines
# Set the rank you want to filter by
# 1: Kingdom, 2: Phylum, 3: Class, 4: Order, 5: Family, 6: Genus, 7: Species
seagrass_phyla = group_by_rank(otu_table = seagrass[,otu_cols],
taxonomy = seagrass_taxonomy,
metadata = seagrass[,-otu_cols],
rank = 2)
# Calculate relative abundances
seagrass_phyla$relative_abundance = seagrass_phyla$abundance/seagrass_phyla$total_abundance
# Now we can plot our new data frame seagrass_phyla the same way we did before
# A couple other stylistic changes were added to the theme at the bottom.
ggplot(seagrass_phyla,aes(x=SampleID,y=relative_abundance,fill=taxa)) +
geom_bar(stat="identity",position="stack") +
facet_grid(~host_type,scales = "free_x",space="free") +
guides(fill=guide_legend(title="Phylum")) + # Change the legend title
xlab("Samples") +
ylab("Relative Abundance") +
labs(fill="Phylum") +
theme_minimal() +
theme(legend.position = "right",
axis.text.x=element_blank(),
axis.title.x = element_text(margin=margin(t=-15)),
panel.grid.major = element_blank())
# If the titles on top aren't fitting you can rotate them 90 degress by adding
# strip.text=element_text(angle=90) into the theme function at the end
# of the ggplot lines
# These plots are still very hard to read because there are just far too many OTUs.
# We can perhaps make this more interesting by coloring by a different taxonomic level so there are fewer colors.
# We will use a function included at the top of the script to do this.
# This next bit of code will convert your OTU table into a species table, or genus table, or whatever
# taxa rank you choose.
# This can be a bit slow if you have a large dataset.
# When dealing with a reasonable number of groups (maybe <25?) it is always better to chose your own colors
# GGplot's default colors are pretty, but they run together making the bars hard to differentiate
# Find how many colors we need
length(unique(seagrass_phyla$taxa))
## [1] 16
# I find the best way to get n distinct colors is from https://medialab.github.io/iwanthue/
# Just set the number of colors to the number from the above command and cope the "HEX json" output
# But you'll have to change the square brackets to parentheses
# though you have to do this manually. (even better check the colorblind friendly options!)
# These ones are from the fancy palette
my_colors = c("#ffb1bb",
"#10c3c8",
"#ffc58a",
"#81bbff",
"#ebfaac",
"#50d1ff",
"#c1bd72",
"#49e4fd",
"#d99f7f",
"#afffc8",
"#ffd2ff",
"#82b888",
"#a3a9d5",
"#b4ffed",
"#d39dac",
"#c6efff")
# Feel free to generate your colors however you wish or chose them manually if you want even more control!
# Here is a list of all the colors R knows by name http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ggplot(seagrass_phyla,aes(x=SampleID,y=relative_abundance,fill=taxa)) +
geom_bar(stat="identity",position="stack") +
facet_grid(~host_type,scales = "free_x",space="free") +
scale_fill_manual(values=my_colors) +
guides(fill=guide_legend(title="Phylum")) +
xlab("Samples") +
ylab("Relative Abundance") +
labs(fill="Phylum") +
theme_minimal() +
theme(legend.position = "right",
axis.text.x=element_blank(),
axis.title.x = element_text(margin=margin(t=-15)),
panel.grid.major = element_blank())