The goal of this project is to examine the microbial composition of samples retrieved from the sedimental layer of a thermal vent located in the Guaymas Basin off of the Gulf of California using R and R studio. Dataset was retrieved and used with the permission from Dr. Gustavo Ramirez and can be accessed from the follow paper:
Teske, A., McKay, L.J., Ravelo, A.C. et al. Characteristics and Evolution of sill-driven off-axis hydrothermalism in Guaymas Basin – the Ringvent site. Sci Rep 9, 13847 (2019). https://doi.org/10.1038/s41598-019-50200-5
library(ggplot2)
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-6
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(scales)
library(grid)
library(reshape2)
library(parallel)
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
##
## mutate
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(phyloseq)
These packages are available on the Comphrensive R Archive Network (CRAN) and contain the functions and codes necessary to execute the calculation, visualization and analysis of the data.
setwd("C:/Users/kcsas/Desktop/Bioinformatics project")
A working directory is the default file path in which any files are imported or exported into while using R.
Now we will organize our datasets
bact(1).shared contains the number of times an OTU is observed across the 8 samples bact(1).taxonomy contains the number of total reads for each OTU as well as Taxonomic Ranks bactMeta(1).csv contains additional information about the 8 samples (Sample ID, Methane concentration, Sulfate concentration etc.)
Our goal is to merge these files together so they will follow a single path
sharedfile="bact(1).shared"
taxfile="bact(1).taxonomy"
write.csv(taxfile, file="arch.taxonomy")
sharedfile
## [1] "bact(1).shared"
taxfile
## [1] "bact(1).taxonomy"
mothur_data <- import_mothur(mothur_shared_file = sharedfile, mothur_constaxonomy_file = taxfile)
map= read.csv("bactMeta(1).csv")
map <- sample_data(map)
map
## SampleID SampleOrder Core Depth_cm Methane_mM Del13_CH4 DIC_mM
## sa1 bac_P3_4_v4v5 P03_4 P03 NA NA NA NA
## sa2 bac_P6_3_v4v5 P06_3 P06 102 0.8100 -85.40 37.96
## sa3 bac_P6_4_v4v5 P06_4 P06 152 6.8100 -87.20 49.04
## sa4 bac_P10_2_v4v5 P10_2 P10 2 0.0077 -64.77 4.17
## sa5 bac_P10_4_v4v5 P10_4 P10 101 0.0103 -68.24 4.23
## sa6 bac_P11_2_v4v5 P11_2 P11 52 0.1809 -62.62 2.65
## sa7 bac_P12_4_v4v5 P12_4 P12 101 0.1500 -77.78 18.65
## sa8 bac_P13_4_v4v5 P13_4 P13 NA NA NA NA
## Del13_DIC Sulfate_mM Del34_SO42. Sulfide_uM
## sa1 NA NA NA NA
## sa2 -16.37 1.538 77.5 4151.00
## sa3 -12.72 0.039 23.6 2818.00
## sa4 -8.37 26.241 23.2 27.50
## sa5 -9.29 25.175 24.9 48.75
## sa6 -2.62 26.561 21.7 1.50
## sa7 -17.81 0.071 16.8 3848.00
## sa8 NA NA NA NA
rownames(map) <- map$SampleID
moth_merge <- merge_phyloseq(mothur_data, map)
moth_merge
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 54147 taxa and 8 samples ]
## sample_data() Sample Data: [ 8 samples by 11 sample variables ]
## tax_table() Taxonomy Table: [ 54147 taxa by 6 taxonomic ranks ]
Data from the 3 files are now merged together under “moth_merge”
We can check the taxonomic rankings of the dataset
colnames(tax_table(moth_merge))
## [1] "Rank1" "Rank2" "Rank3" "Rank4" "Rank5" "Rank6"
Here we see that the taxonomic ranks are labled “Rank1” “Rank2” etc. We can rename these labels to reflect the proper taxonomic ranks
colnames(tax_table(moth_merge)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")
colnames(tax_table(moth_merge))
## [1] "Kingdom" "Phylum" "Class" "Order" "Family" "Genus"
Now that we organized the data, We would like to know what phyla of bacteria are most represented in the dataset. Before we can begin to address that, we need to pre-process the dataset. To account for contamination or sequencing errors that may have led to the presence of new OTUsin the dataset, we can perform prune function that would exclude any OTUs that have less than 500 reads in the dataset.
moth_merge_pruned = prune_samples(sample_sums(moth_merge)>500, moth_merge)
We can assess the number the OTUs in the dataset after it has been merged and pruned through the following functions
sample_sums(moth_merge_pruned)
## bac_P10_2_v4v5 bac_P10_4_v4v5 bac_P11_2_v4v5 bac_P12_4_v4v5 bac_P13_4_v4v5
## 82729 131296 147265 135368 147373
## bac_P3_4_v4v5 bac_P6_3_v4v5 bac_P6_4_v4v5
## 139489 136960 118954
king<- moth_merge_pruned%>%
subset_taxa(Kingdom == "Bacteria")
king
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 54147 taxa and 8 samples ]
## sample_data() Sample Data: [ 8 samples by 11 sample variables ]
## tax_table() Taxonomy Table: [ 54147 taxa by 6 taxonomic ranks ]
sample_sum_df <- data.frame(sum = sample_sums(king))
sample_sum_df
## sum
## bac_P10_2_v4v5 82729
## bac_P10_4_v4v5 131296
## bac_P11_2_v4v5 147265
## bac_P12_4_v4v5 135368
## bac_P13_4_v4v5 147373
## bac_P3_4_v4v5 139489
## bac_P6_3_v4v5 136960
## bac_P6_4_v4v5 118954
smin <- min(sample_sums(king))
smean <- mean(sample_sums(king))
smax <- max(sample_sums(king))
ssum <- sum(sample_sums(king))
ssd <- sd(sample_sums(king))
smin
## [1] 82729
smax
## [1] 147373
smean
## [1] 129929.2
ssum
## [1] 1039434
ssd
## [1] 21130.21
To assess the number of each taxonomic rank in the dataset. Use the following codes
tax_glom(moth_merge_pruned, taxrank = "Kingdom")
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 1 taxa and 8 samples ]
## sample_data() Sample Data: [ 8 samples by 11 sample variables ]
## tax_table() Taxonomy Table: [ 1 taxa by 6 taxonomic ranks ]
tax_glom(moth_merge_pruned, taxrank = "Phylum")
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 53 taxa and 8 samples ]
## sample_data() Sample Data: [ 8 samples by 11 sample variables ]
## tax_table() Taxonomy Table: [ 53 taxa by 6 taxonomic ranks ]
tax_glom(moth_merge_pruned, taxrank = "Class")
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 143 taxa and 8 samples ]
## sample_data() Sample Data: [ 8 samples by 11 sample variables ]
## tax_table() Taxonomy Table: [ 143 taxa by 6 taxonomic ranks ]
tax_glom(moth_merge_pruned, taxrank = "Order")
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 316 taxa and 8 samples ]
## sample_data() Sample Data: [ 8 samples by 11 sample variables ]
## tax_table() Taxonomy Table: [ 316 taxa by 6 taxonomic ranks ]
tax_glom(moth_merge_pruned, taxrank = "Family")
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 468 taxa and 8 samples ]
## sample_data() Sample Data: [ 8 samples by 11 sample variables ]
## tax_table() Taxonomy Table: [ 468 taxa by 6 taxonomic ranks ]
tax_glom(moth_merge_pruned, taxrank = "Genus")
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 751 taxa and 8 samples ]
## sample_data() Sample Data: [ 8 samples by 11 sample variables ]
## tax_table() Taxonomy Table: [ 751 taxa by 6 taxonomic ranks ]
A visual representation of the overall abundance of sequences in each sample
plot_bar(moth_merge_pruned, fill="Phylum")
Another way to visualize the distribution of OTUs in the 8 sampkes can be viewed here:
readsumsdf = data.frame(nreads = sort(taxa_sums(moth_merge_pruned), TRUE), sorted = 1:ntaxa(moth_merge_pruned),
type = "OTUs")
readsumsdf = rbind(readsumsdf, data.frame(nreads = sort(sample_sums(moth_merge_pruned),
TRUE), sorted = 1:nsamples(moth_merge_pruned), type = "SampleID"))
title = "Total number of reads"
p = ggplot(readsumsdf, aes(x = sorted, y = nreads)) + geom_bar(stat = "identity")
p + ggtitle(title) + scale_y_log10() + facet_wrap(~type, 1, scales = "free")
To assess the relative abundance, we need to transform each sample to have the same sample sum. We will do this by performing a random subsampling (rarefy) our dataset so that each sample will only contain 5000 random reads selected from each sample. Before using the rarefy function, we must first define a random number seed (set.seed function) to ensure reproducibility of results.
set.seed(1000)
moth_merge_prunedR = rarefy_even_depth(moth_merge_pruned, sample.size = 5000)
## You set `rngseed` to FALSE. Make sure you've set & recorded
## the random seed of your session for reproducibility.
## See `?set.seed`
## ...
## 47983OTUs were removed because they are no longer
## present in any sample after random subsampling
## ...
We can visualize the transformation of each sample to 5000 reads by the following functions:
par(mfrow = c(1, 2))
title = "Sum of reads for each sample, moth_merge_prunedR"
plot(sort(sample_sums(moth_merge_prunedR), TRUE), type = "h", main = title, ylab = "reads",
ylim = c(0, 10000))
readsumsdf = data.frame(nreads = sort(taxa_sums(moth_merge_prunedR), TRUE), sorted = 1:ntaxa(moth_merge_prunedR),
type = "OTUs")
readsumsdf = rbind(readsumsdf, data.frame(nreads = sort(sample_sums(moth_merge_prunedR),
TRUE), sorted = 1:nsamples(moth_merge_prunedR), type = "SampleID"))
title = "Total number of reads"
p = ggplot(readsumsdf, aes(x = sorted, y = nreads)) + geom_bar(stat = "identity")
p + ggtitle(title) + scale_y_log10() + facet_wrap(~type, 1, scales = "free")
Now that we scaled our samples to have equal amounts of reads we can begin to assess for the most represented OTUs.
We will create a separate “rank” that would recognize the Top 100 most represented OTUs in the rarefied moth_merge_pruned dataset (moth_merge_prunedR)
topNotus = names(sort(taxa_sums(moth_merge_prunedR), TRUE)[1:100])
taxtabN = cbind(tax_table(moth_merge_prunedR), Top100Phylum = NA)
taxtabN[topNotus, "Top100Phylum"] <- as(tax_table(moth_merge_prunedR)[topNotus, "Phylum"],
"character")
tax_table(moth_merge_prunedR) <- tax_table(taxtabN)
This plot contains the abundance of all the OTUs in the moth_merge_prunedR dataset
title = "Subsample Abundance"
plot_bar(moth_merge_prunedR, "SampleID", fill = "Phylum", title = title) + coord_flip()
This plot is modified to show the percentage of OTUs represented in the subsampled dataset.
moth_merge_prunedRP = transform_sample_counts(moth_merge_prunedR, function(x) 100 * x/sum(x))
title = "Subsample Percentage"
plot_bar(moth_merge_prunedRP, "SampleID", fill = "Phylum", title = title) + coord_flip() +
ylab("Percentage of Sequences")
We will prune the subsampled dataset so that only the top 100 OTUs are represented on the bar plot.
moth_merge_pruned_top = prune_taxa(topNotus, moth_merge_prunedRP)
title = "Top OTUs in subsample"
plot_bar(moth_merge_pruned_top, "SampleID", fill = "Top100Phylum", title = title) + coord_flip() +
ylab("Percentage of Sequences") + ylim(0, 100)
We can get a closer look at the abundance of particular bacteria. Here we are looking for the percentage of Proteobacteria in the dataset
proteo <- subset_taxa(moth_merge_prunedRP, Phylum=="Proteobacteria")
title = " Proteobacteria only"
plot_bar(proteo, "SampleID", fill = "Class", title = title) + coord_flip() +
ylab("Percentage of Sequences")
Another example using Actinobacteria
actino <- subset_taxa(moth_merge_prunedRP, Phylum=="Actinobacteria")
title = " Actinobacteria only"
plot_bar(actino, "SampleID", fill = "Class", title = title) + coord_flip() +
ylab("Percentage of Sequences")
References
phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data. Paul J. McMurdie and Susan Holmes (2013) PLoS ONE 8(4):e61217.