Load the packages we will be using
library(phyloseq)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.3.1
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(csv)
We will start by loading the metadata into R
sdata <- as.csv("/home/lgschaer/boatbiome/phyloseq_input/boatbiome_sampledata.csv", row.names = 1, header = TRUE, sep = ",", check.names = TRUE, stringsAsFactors = TRUE)
head(sdata)
Filter data to remove blanks and only include the samples we are using. We will make two versions of the sample data.
sdata2 will have a “SampleID” column that we can use to join it to the sequencing table to allow us to filter the sequencing table as well.
sdata2 <- sdata %>%
rownames_to_column(var = "SampleID") %>%
filter(Sample_Project == "Boat_Microbiome") %>%
filter(datatype == "water"|datatype == "swab"|datatype == "bilge"|datatype == "dock"|datatype == "air")%>%
as_tibble()
head(sdata2)
sdate3 will have a rownames column without the “SampleID” header, which is the format it needs to be in to make a phyloseq object.
sdata3 <- sdata2 %>%
column_to_rownames(var = "SampleID")
head(sdata3)
Load the sequence table (ASV table which we made in dada2)
sequence_table <- readRDS("/home/lgschaer/boatbiome/phyloseq_input/seqtab.rds")
sequence_table[1:3,1:3]
## GTGCCAGCAGCCGCGGTAAGACAGAGGATGCAAGCGTTATCCGGAATGATTGGGCGTAAAGCGTCTGTAGGTGGCTTTTCAAGTCCGCCGTCAAATTCCAGGGCTCAACCCTGGACAGGCGGTAGAAACTACCAAGCTGGAGTACGGTAGGGGCAGAGGGAATTTCCGGTGGAGCGGTGAAATGCGTTGAGATCGGAAAGAACACCAACGGCGAAAGCACTCTGCTGGGCCGACACTGACACTGAGAGACGAAAGCTAGGGGAGCAAATGGGATTAGATACCCCAGTAGTCCTAGCCGTAAACGATGGATACTAAGTGCTGTGCGTATCGACCCGCGCAGTGCTGTAGCTAACGCGTTAAGTATCCCGCCTGGGGAGTACGTTCGCAAGAATGAAACTCAAAGAAATTGACGG
## 061902-b 0
## 061903-a 0
## 061903-b 0
## GTGCCAGCAGCCGCGGTAAGACAGAGGATGCAAGCGTTATCCGGAATGATTGGGCGTAAAGCGTCTGTAGGTGGCTTTTCAAGTCCGCCGTCAAATTCCAGGGCTCAACCCTGGACAGGCGGTAGAAACTACCAAGCTGGAGTACGGTAGGGGCAGAGGGAATTTCCGGTGGAGCGGTGAAATGCGTTGAGATCGGAAAGAACACCAACGGCGAAAGCACTCTGCTGGGCCGACACTGACACTGAGAGACGAAAGCTAGGGGAGCAAATGGGATTAGATACCCCAGTAGTCCTAGCCGTAAACGATGGATACTAAGTGCTGTGCGTATCGACCCGCGCAGTGCTGTAGCTAACGCGTTAAGTATCCCGCCTGGGGAGTACGTTCGCAAGAATGAAACTCAAATAAATTGACGG
## 061902-b 0
## 061903-a 0
## 061903-b 0
## GTGCCAGCAGCCGCGGTAATACATAGGGTGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGTGGTTCGTCACGTCGGATGTGAAAATCTGGGGCTTAACCCCAGACCTGCATTCGATACGGGCGAGCTAGAGTGTGGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCAATGGCGAAGGCAGGTCTCTGGGCCATTACTGACACTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGGTGGGCACTAGTTGTGGGAACCTTCCACGGTTTCTGCGACGCAGCTAACGCATTAAGTGCCCCGCCTGGGGAGTACGATCGCAAGATTAAAACTCAAAGAAATTGACGG
## 061902-b 48
## 061903-a 0
## 061903-b 56
Now we will make some changes to the sequence table to get it in the right format to use in phyloseq. We will view a small portion of the sequence table after each change to make sure it worked.
colnames(sequence_table) <- NULL
sequence_table[1:5,1:5]
## [,1] [,2] [,3] [,4] [,5]
## 061902-b 0 0 48 47 0
## 061903-a 0 0 0 0 0
## 061903-b 0 0 56 70 0
## 0619B-b 0 0 110 0 0
## 0619bilge-a 0 0 0 0 0
sequence_table <- as.data.frame(sequence_table)
class(sequence_table)
## [1] "data.frame"
sequence_table <- rownames_to_column(sequence_table, var = "SampleID")
sequence_table[1:5,1:5]
sequence_table <- sequence_table %>%
mutate(SampleID = str_replace_all(SampleID, "-", "_"))
sequence_table[1:4,1:4]
seq_table <- sdata2 %>%
left_join(sequence_table, by = "SampleID")
seq_table[1:5,1:5]
#subset to only include sampletypes we want
subset_all <- seq_table %>%
subset(Sample_Project=="Boat_Microbiome")%>%
filter(datatype == "water"|datatype == "swab"|datatype == "bilge"|datatype == "dock"|datatype == "air")
subset_all[1:5,1:5]
#subset to only include sampletypes we want
subset_all <- seq_table %>%
column_to_rownames(var = "SampleID")%>%
select(-c(Sample_Project, sampletype, date, filtertype, datatype))
subset_all[1:5,1:5]
colnames(subset_all) <- NULL
subset_all[1:5,1:5]
Load data for taxa table:
taxa <- readRDS("/home/lgschaer/boatbiome/phyloseq_input/taxa.rds")
taxa[1:5,1:5]
## Kingdom Phylum Class Order
## [1,] "Bacteria" "Cyanobacteria" "Chloroplast" NA
## [2,] "Bacteria" "Cyanobacteria" "Chloroplast" NA
## [3,] "Bacteria" "Actinobacteria" "Actinobacteria" "Frankiales"
## [4,] "Bacteria" "Actinobacteria" "Actinobacteria" "Frankiales"
## [5,] "Bacteria" "Cyanobacteria" "Chloroplast" NA
## Family
## [1,] NA
## [2,] NA
## [3,] "Sporichthyaceae"
## [4,] "Sporichthyaceae"
## [5,] NA
To make the phyloseq object, we need sample data, a sequence table and a taxa table
#make phyloseq object
samdata = sample_data(sdata3)
seqtab = otu_table(subset_all, taxa_are_rows = FALSE)
taxtab = tax_table(taxa)
Combine all items into a phyloseq object
phyloseq_object_all = phyloseq(otu_table(seqtab), tax_table(taxtab), sample_data(samdata))
phyloseq_object_all
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 33191 taxa and 217 samples ]
## sample_data() Sample Data: [ 217 samples by 5 sample variables ]
## tax_table() Taxonomy Table: [ 33191 taxa by 6 taxonomic ranks ]
Delete samples with a mean of less than 1000
samplesover1000_all <- subset_samples(phyloseq_object_all, sample_sums(phyloseq_object_all) > 1000)
Check if there are ASVs with no counts and how many there are
any(taxa_sums(samplesover1000_all) == 0)
## [1] TRUE
sum(taxa_sums(samplesover1000_all) == 0)
## [1] 809
If there are ASVs with no counts, we should remove them
prune_samplesover1000_all <- prune_taxa(taxa_sums(samplesover1000_all) > 0, samplesover1000_all)
any(taxa_sums(prune_samplesover1000_all) == 0)
## [1] FALSE
Set seed for reproducible data
set.seed(81)
Rarefy data
rarefy_samplesover1000_all <- rarefy_even_depth(prune_samplesover1000_all, rngseed= 81, sample.size = min(sample_sums(prune_samplesover1000_all)))
## `set.seed(81)` was used to initialize repeatable random subsampling.
## Please record this for your records so others can reproduce.
## Try `set.seed(81); .Random.seed` for the full vector
## ...
## 15694OTUs were removed because they are no longer
## present in any sample after random subsampling
## ...
Change phyloseq object to a more memorable name
bbps <- rarefy_samplesover1000_all
bbps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 16688 taxa and 178 samples ]
## sample_data() Sample Data: [ 178 samples by 5 sample variables ]
## tax_table() Taxonomy Table: [ 16688 taxa by 6 taxonomic ranks ]
Assign each category a color, define sample types and create sample labels.
sample_colors <- c("air" = "lightgoldenrod", "bilge" = "darkorange", "boatback" = "maroon", "boatrear" = "indianred2", "boatside" = "red3", "dock" = "tan4", "water" = "lightblue")
sample_types <- c("bilge", "boatback", "boatside", "boatrear", "water", "air", "dock")
sample_labels <- c("bilge" ="Bilge", "water" = "Water", "boatback" = "Boat\nBack", "boatside" = "Boat\nSide", "boatrear" = "Boat\nRear", "dock" = "Dock", "air" = "Air")
Create violin plot of alpha diversity
bbps %>% #phyloseq object
plot_richness(
x = "sampletype", #compare diversity of datatype
measures = c("Observed", "Shannon")) + #choose diversity measures
geom_violin(aes(fill = sampletype), show.legend = FALSE)+ #make violin plot, set fill aes to sampletype
geom_boxplot(width=0.1) + #add boxplot, set width
theme_classic()+ #change theme to classic
xlab(NULL)+ #no label on x-axis
theme(axis.text.y.left = element_text(size = 20), #adjust y-axis text
axis.text.x = element_text(size = 20, hjust = 0.5, vjust = 1, angle = 0), #adjust x-axis label position
axis.title.y = element_text(size = 20))+ #adjust y-axis title
theme(strip.text = element_text(face = "bold", size = 25))+ #adjust headings
scale_fill_manual(values = sample_colors)+ #set fill colors
scale_x_discrete( #change x-axis labels
breaks = sample_types,
labels = sample_labels)+
ggtitle("Alpha Diversity") + #add title
theme(plot.title=element_text(size = 25, face = "bold", hjust = 0.5)) #change title size, face and position
Making a PCoA plot
#ordination
all_pcoa <- ordinate(
physeq = bbps,
method = "PCoA",
distance = "bray"
)
plot_ordination(
physeq = bbps, #phyloseq object
ordination = all_pcoa)+ #ordination
geom_point(aes(fill = sampletype, shape = filtertype), size = 3) + #sets fill color to sampletype
scale_shape_manual(values = c(21, 22, 23))+
scale_fill_manual(values = sample_colors) +
theme_classic() + #changes theme, removes grey background
theme(
legend.text = element_text(size = 20), #changes legend size
legend.title = element_blank(), #removes legend title
legend.background = element_rect(fill = "white", color = "black"))+ #adds black boarder around legend
theme(axis.text.y.left = element_text(size = 20),
axis.text.x = element_text(size = 20),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20))+
guides(fill = guide_legend(override.aes = list(shape = 21))) #fills legend points based on the fill command
Making a taxa plot
justbacteria <- bbps %>%
subset_taxa(
Kingdom == "Bacteria" & #only bacteria
Family != "mitochondria" & #filter out mitochondria
Class != "Chloroplast" #filter out chloroplasts
)
justbacteria
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 13247 taxa and 178 samples ]
## sample_data() Sample Data: [ 178 samples by 5 sample variables ]
## tax_table() Taxonomy Table: [ 13247 taxa by 6 taxonomic ranks ]
phylumabundance <- justbacteria %>%
tax_glom(taxrank = "Phylum") %>% # agglomerate at phylum level
transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
psmelt() %>% # Melt to long format
arrange(Phylum)
head(phylumabundance)
all <- phylumabundance %>%
select(Phylum, sampletype, Abundance, date) %>%
group_by(Phylum, sampletype, date) %>%
summarize(
avg_abundance = mean(Abundance)
) %>%
filter(avg_abundance > 0.01)
head(all)
phylum_colors <- c(
"#CBD588", "#5F7FC7", "orange","#DA5724", "#508578", "#CD9BCD",
"#AD6F3B", "#673770","#D14285", "#652926", "#C84248",
"#8569D5", "#5E738F","#D1A33D", "#8A7C64", "#599861"
)
ggplot(all)+
geom_col(mapping = aes(x = sampletype, y = avg_abundance, fill = Phylum), position = "fill", show.legend = TRUE)+
ylab("Proportion of Community") +
scale_fill_manual(values = phylum_colors) +
xlab(NULL)+
theme_minimal()+
theme(axis.text.y.left = element_text(size = 20),
axis.text.x = element_text(size = 20, angle = 90, vjust = 0.5, hjust = 1),
axis.title.y = element_text(size = 20),
title = element_text(size = 25))