Phyloseq Tutorial

Section 1 Loading data/data manipulations

Load the packages we will be using

library(phyloseq)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.2.0     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.3.1
## ✔ readr   1.3.1     ✔ forcats 0.4.0

## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(csv)

We will start by loading the metadata into R

sdata <- as.csv("/home/lgschaer/boatbiome/phyloseq_input/boatbiome_sampledata.csv", row.names = 1, header = TRUE, sep = ",", check.names = TRUE, stringsAsFactors = TRUE)
head(sdata)

Filter data to remove blanks and only include the samples we are using. We will make two versions of the sample data.

sdata2 will have a “SampleID” column that we can use to join it to the sequencing table to allow us to filter the sequencing table as well.

sdata2 <- sdata %>% 
  rownames_to_column(var = "SampleID") %>%
  filter(Sample_Project == "Boat_Microbiome") %>%
  filter(datatype == "water"|datatype == "swab"|datatype == "bilge"|datatype == "dock"|datatype == "air")%>%
  as_tibble()
head(sdata2)

sdate3 will have a rownames column without the “SampleID” header, which is the format it needs to be in to make a phyloseq object.

sdata3 <- sdata2 %>%
  column_to_rownames(var = "SampleID")
head(sdata3)

Load the sequence table (ASV table which we made in dada2)

sequence_table <- readRDS("/home/lgschaer/boatbiome/phyloseq_input/seqtab.rds")    
sequence_table[1:3,1:3]

##          GTGCCAGCAGCCGCGGTAAGACAGAGGATGCAAGCGTTATCCGGAATGATTGGGCGTAAAGCGTCTGTAGGTGGCTTTTCAAGTCCGCCGTCAAATTCCAGGGCTCAACCCTGGACAGGCGGTAGAAACTACCAAGCTGGAGTACGGTAGGGGCAGAGGGAATTTCCGGTGGAGCGGTGAAATGCGTTGAGATCGGAAAGAACACCAACGGCGAAAGCACTCTGCTGGGCCGACACTGACACTGAGAGACGAAAGCTAGGGGAGCAAATGGGATTAGATACCCCAGTAGTCCTAGCCGTAAACGATGGATACTAAGTGCTGTGCGTATCGACCCGCGCAGTGCTGTAGCTAACGCGTTAAGTATCCCGCCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGAAATTGACGG
## 061902-b                                                                                                                                                                                                                                                                                                                                                                                                                             0
## 061903-a                                                                                                                                                                                                                                                                                                                                                                                                                             0
## 061903-b                                                                                                                                                                                                                                                                                                                                                                                                                             0
##          GTGCCAGCAGCCGCGGTAAGACAGAGGATGCAAGCGTTATCCGGAATGATTGGGCGTAAAGCGTCTGTAGGTGGCTTTTCAAGTCCGCCGTCAAATTCCAGGGCTCAACCCTGGACAGGCGGTAGAAACTACCAAGCTGGAGTACGGTAGGGGCAGAGGGAATTTCCGGTGGAGCGGTGAAATGCGTTGAGATCGGAAAGAACACCAACGGCGAAAGCACTCTGCTGGGCCGACACTGACACTGAGAGACGAAAGCTAGGGGAGCAAATGGGATTAGATACCCCAGTAGTCCTAGCCGTAAACGATGGATACTAAGTGCTGTGCGTATCGACCCGCGCAGTGCTGTAGCTAACGCGTTAAGTATCCCGCCTGGGGAGTACGTTCGCAAGAATGAAACTCAAATAAATTGACGG
## 061902-b                                                                                                                                                                                                                                                                                                                                                                                                                             0
## 061903-a                                                                                                                                                                                                                                                                                                                                                                                                                             0
## 061903-b                                                                                                                                                                                                                                                                                                                                                                                                                             0
##          GTGCCAGCAGCCGCGGTAATACATAGGGTGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGTGGTTCGTCACGTCGGATGTGAAAATCTGGGGCTTAACCCCAGACCTGCATTCGATACGGGCGAGCTAGAGTGTGGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCAATGGCGAAGGCAGGTCTCTGGGCCATTACTGACACTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGGTGGGCACTAGTTGTGGGAACCTTCCACGGTTTCTGCGACGCAGCTAACGCATTAAGTGCCCCGCCTGGGGAGTACGATCGCAAGATTAAAACTCAAAGAAATTGACGG
## 061902-b                                                                                                                                                                                                                                                                                                                                                                                                                            48
## 061903-a                                                                                                                                                                                                                                                                                                                                                                                                                             0
## 061903-b                                                                                                                                                                                                                                                                                                                                                                                                                            56

Now we will make some changes to the sequence table to get it in the right format to use in phyloseq. We will view a small portion of the sequence table after each change to make sure it worked.

Remove the column names

colnames(sequence_table) <- NULL
sequence_table[1:5,1:5]

##             [,1] [,2] [,3] [,4] [,5]
## 061902-b       0    0   48   47    0
## 061903-a       0    0    0    0    0
## 061903-b       0    0   56   70    0
## 0619B-b        0    0  110    0    0
## 0619bilge-a    0    0    0    0    0

Change format to data frame

sequence_table <- as.data.frame(sequence_table)
class(sequence_table)

## [1] "data.frame"

Add header to “SampleID” column

sequence_table <- rownames_to_column(sequence_table, var = "SampleID")
sequence_table[1:5,1:5]

Change sample names on sequence table to match sample names in metadata

sequence_table <- sequence_table %>%
  mutate(SampleID = str_replace_all(SampleID, "-", "_"))
sequence_table[1:4,1:4]

Add metadata to sequence table, this will allow us to filter out blanks/samples we do not want to include in our analysis

seq_table <- sdata2 %>% 
  left_join(sequence_table, by = "SampleID")
seq_table[1:5,1:5]

filter samples from the sequence table to only include what we want to use

#subset to only include sampletypes we want
subset_all <- seq_table %>%
  subset(Sample_Project=="Boat_Microbiome")%>%
  filter(datatype == "water"|datatype == "swab"|datatype == "bilge"|datatype == "dock"|datatype == "air")
subset_all[1:5,1:5]

Remove metadata columns from the sequence table and change the “SampleID” column to row names

#subset to only include sampletypes we want
subset_all <- seq_table %>%
  column_to_rownames(var = "SampleID")%>%
  select(-c(Sample_Project, sampletype, date, filtertype, datatype))
subset_all[1:5,1:5]

Remove column names from sequence table

colnames(subset_all) <- NULL
subset_all[1:5,1:5]

Load data for taxa table:

taxa <- readRDS("/home/lgschaer/boatbiome/phyloseq_input/taxa.rds")
taxa[1:5,1:5]

##      Kingdom    Phylum           Class            Order       
## [1,] "Bacteria" "Cyanobacteria"  "Chloroplast"    NA          
## [2,] "Bacteria" "Cyanobacteria"  "Chloroplast"    NA          
## [3,] "Bacteria" "Actinobacteria" "Actinobacteria" "Frankiales"
## [4,] "Bacteria" "Actinobacteria" "Actinobacteria" "Frankiales"
## [5,] "Bacteria" "Cyanobacteria"  "Chloroplast"    NA          
##      Family           
## [1,] NA               
## [2,] NA               
## [3,] "Sporichthyaceae"
## [4,] "Sporichthyaceae"
## [5,] NA

Section 2 Making a phyloseq object

To make the phyloseq object, we need sample data, a sequence table and a taxa table

#make phyloseq object
samdata = sample_data(sdata3)
seqtab = otu_table(subset_all, taxa_are_rows = FALSE)
taxtab = tax_table(taxa)

Combine all items into a phyloseq object

phyloseq_object_all = phyloseq(otu_table(seqtab), tax_table(taxtab), sample_data(samdata))
phyloseq_object_all

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 33191 taxa and 217 samples ]
## sample_data() Sample Data:       [ 217 samples by 5 sample variables ]
## tax_table()   Taxonomy Table:    [ 33191 taxa by 6 taxonomic ranks ]

Section 3: Normalize the data

Delete samples with a mean of less than 1000

samplesover1000_all <- subset_samples(phyloseq_object_all, sample_sums(phyloseq_object_all) > 1000)

Check if there are ASVs with no counts and how many there are

any(taxa_sums(samplesover1000_all) == 0)

## [1] TRUE

sum(taxa_sums(samplesover1000_all) == 0)

## [1] 809

If there are ASVs with no counts, we should remove them

prune_samplesover1000_all <- prune_taxa(taxa_sums(samplesover1000_all) > 0, samplesover1000_all)
any(taxa_sums(prune_samplesover1000_all) == 0)

## [1] FALSE

Set seed for reproducible data

set.seed(81)

Rarefy data

rarefy_samplesover1000_all <- rarefy_even_depth(prune_samplesover1000_all, rngseed= 81, sample.size = min(sample_sums(prune_samplesover1000_all)))

## `set.seed(81)` was used to initialize repeatable random subsampling.

## Please record this for your records so others can reproduce.

## Try `set.seed(81); .Random.seed` for the full vector

## ...

## 15694OTUs were removed because they are no longer 
## present in any sample after random subsampling

## ...

Change phyloseq object to a more memorable name

bbps <- rarefy_samplesover1000_all

bbps

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 16688 taxa and 178 samples ]
## sample_data() Sample Data:       [ 178 samples by 5 sample variables ]
## tax_table()   Taxonomy Table:    [ 16688 taxa by 6 taxonomic ranks ]

Section 3: plotting

Assign each category a color, define sample types and create sample labels.

sample_colors <- c("air" = "lightgoldenrod", "bilge" = "darkorange", "boatback" = "maroon", "boatrear" = "indianred2", "boatside" = "red3", "dock" = "tan4", "water" = "lightblue")

sample_types <- c("bilge", "boatback", "boatside", "boatrear", "water", "air", "dock")

sample_labels <- c("bilge" ="Bilge", "water" = "Water", "boatback" = "Boat\nBack", "boatside" = "Boat\nSide", "boatrear" = "Boat\nRear", "dock" = "Dock", "air" = "Air")

Create violin plot of alpha diversity

bbps %>%                                                              #phyloseq object
  plot_richness(
    x = "sampletype",                                                #compare diversity of datatype
    measures = c("Observed", "Shannon")) +                           #choose diversity measures
  geom_violin(aes(fill = sampletype), show.legend = FALSE)+          #make violin plot, set fill aes to sampletype
  geom_boxplot(width=0.1) +                                          #add boxplot, set width
  theme_classic()+                                                   #change theme to classic
  xlab(NULL)+                                                        #no label on x-axis
  theme(axis.text.y.left = element_text(size = 20),                  #adjust y-axis text
        axis.text.x = element_text(size = 20, hjust = 0.5, vjust = 1, angle = 0),           #adjust x-axis label position
        axis.title.y = element_text(size = 20))+                     #adjust y-axis title
  theme(strip.text = element_text(face = "bold", size = 25))+        #adjust headings
  scale_fill_manual(values = sample_colors)+   #set fill colors
  scale_x_discrete(                                                  #change x-axis labels
    breaks = sample_types, 
    labels = sample_labels)+                   
  ggtitle("Alpha Diversity") +                                       #add title
  theme(plot.title=element_text(size = 25, face = "bold", hjust = 0.5)) #change title size, face and position

Making a PCoA plot

Make an ordination

#ordination
all_pcoa <- ordinate(
  physeq = bbps, 
  method = "PCoA", 
  distance = "bray"
)

Plot

plot_ordination(
  physeq = bbps,                                                         #phyloseq object
  ordination = all_pcoa)+                                                #ordination
  geom_point(aes(fill = sampletype, shape = filtertype), size = 3) +     #sets fill color to sampletype
  scale_shape_manual(values = c(21, 22, 23))+
  scale_fill_manual(values = sample_colors) +
  theme_classic() +                                                      #changes theme, removes grey background
  theme(                             
    legend.text = element_text(size = 20),                               #changes legend size
    legend.title = element_blank(),                                      #removes legend title
    legend.background = element_rect(fill = "white", color = "black"))+  #adds black boarder around legend
  theme(axis.text.y.left = element_text(size = 20),
        axis.text.x = element_text(size = 20),
        axis.title.x = element_text(size = 20),
        axis.title.y = element_text(size = 20))+
  guides(fill = guide_legend(override.aes = list(shape = 21)))           #fills legend points based on the fill command

Making a taxa plot

Filter out eukaryotes and mitochondria

justbacteria <- bbps %>%
  subset_taxa(
    Kingdom == "Bacteria" &                   #only bacteria
      Family  != "mitochondria" &             #filter out mitochondria
      Class   != "Chloroplast"                #filter out chloroplasts
  )
justbacteria

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 13247 taxa and 178 samples ]
## sample_data() Sample Data:       [ 178 samples by 5 sample variables ]
## tax_table()   Taxonomy Table:    [ 13247 taxa by 6 taxonomic ranks ]

Convert phyloseq object into a data frame with relative abundance counts

phylumabundance <- justbacteria %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt() %>%                                         # Melt to long format
  arrange(Phylum) 
head(phylumabundance)

Summarize the most abundant taxa, filter to only include taxa with > 1% abundance

all <- phylumabundance %>%
  select(Phylum, sampletype, Abundance, date) %>%
  group_by(Phylum, sampletype, date) %>%
  summarize(
    avg_abundance = mean(Abundance)
  ) %>%
  filter(avg_abundance > 0.01)
head(all)

Save phylum colors

phylum_colors <- c(
  "#CBD588", "#5F7FC7", "orange","#DA5724", "#508578", "#CD9BCD",
  "#AD6F3B", "#673770","#D14285", "#652926", "#C84248", 
  "#8569D5", "#5E738F","#D1A33D", "#8A7C64", "#599861"
)

Create taxa plot!

ggplot(all)+
  geom_col(mapping = aes(x = sampletype, y = avg_abundance, fill = Phylum), position = "fill", show.legend = TRUE)+
  ylab("Proportion of Community") +
  scale_fill_manual(values = phylum_colors) +
  xlab(NULL)+
  theme_minimal()+
  theme(axis.text.y.left = element_text(size = 20),
        axis.text.x = element_text(size = 20, angle = 90, vjust = 0.5, hjust = 1),
        axis.title.y = element_text(size = 20),
        title = element_text(size = 25))

Phyloseq Tutorial

Laura Schaerer

7/26/2019

Section 1 Loading data/data manipulations

Section 2 Making a phyloseq object

Section 3: Normalize the data

Section 3: plotting