Call in necessary packages
library(dada2)
## Loading required package: Rcpp
library(phyloseq); packageVersion("phyloseq")
## [1] '1.42.0'
library(Biostrings); packageVersion("Biostrings")
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:phyloseq':
##
## distance
## Loading required package: XVector
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## [1] '2.66.0'
library(ggplot2); packageVersion("ggplot2")
## [1] '3.4.0'
library(tidyverse) ; packageVersion("tidyverse") # 1.3.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ✔ purrr 0.3.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse() masks Biostrings::collapse(), IRanges::collapse()
## ✖ dplyr::combine() masks BiocGenerics::combine()
## ✖ purrr::compact() masks XVector::compact()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ dplyr::slice() masks XVector::slice(), IRanges::slice()
## [1] '1.3.2'
library(insect)
##
## Attaching package: 'insect'
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## The following object is masked from 'package:IRanges':
##
## trim
##
## The following object is masked from 'package:S4Vectors':
##
## expand
##
## The following object is masked from 'package:dada2':
##
## rc
library(vegan) ; packageVersion("vegan") # 2.6.4
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
## [1] '2.6.4'
Setting your path; this is where most of your raw input files live
# first go to the GitHub URL https://github.com/psalib/Praccomp2022-final-project.git and download or clone the data folder within the repo, then stick it on your desktop (or set your own working directory and stick it there)
path <- "~/Desktop/Praccomp2022-final-project-main/data"
list.files(path) # you should see 8 fastq files (4 mock communities, each with 1 forward (R1) and 1 reverse (R2) file) and 2 reference files; the filtered12s/ folder is created by the filtering step below
## [1] "filtered12s"
## [2] "GC-EC-8535-JH-mock1-100-12S_S394_L001_R1_001.fastq"
## [3] "GC-EC-8535-JH-mock1-100-12S_S394_L001_R2_001.fastq"
## [4] "GC-EC-8535-JH-mock2-100-12S_S395_L001_R1_001.fastq"
## [5] "GC-EC-8535-JH-mock2-100-12S_S395_L001_R2_001.fastq"
## [6] "GC-EC-8535-JH-mock3-100-12S_S396_L001_R1_001.fastq"
## [7] "GC-EC-8535-JH-mock3-100-12S_S396_L001_R2_001.fastq"
## [8] "GC-EC-8535-JH-mock4-100-12S_S397_L001_R1_001.fastq"
## [9] "GC-EC-8535-JH-mock4-100-12S_S397_L001_R2_001.fastq"
## [10] "references.12s.miya.dada.species.v251.fasta"
## [11] "references.12s.miya.dada.taxonomy.v251.fasta"
sorting read files into separate forward and reverse objects
# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
fnFs <- sort(list.files(path, pattern="_R1_001.fastq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2_001.fastq", full.names = TRUE))
# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
# To check that the name extraction worked as intended and that sample names are still unique
list(sample.names)
## [[1]]
## [1] "GC-EC-8535-JH-mock1-100-12S" "GC-EC-8535-JH-mock2-100-12S"
## [3] "GC-EC-8535-JH-mock3-100-12S" "GC-EC-8535-JH-mock4-100-12S"
Plotting the quality profiles of each fastq file
# in order to determine where to truncate your sequences, follow the read quality profile and truncate where reads start to consistently dip below a Q score of 30
plotQualityProfile(fnFs[1:4]) # forward reads
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the dada2 package.
## Please report the issue at <https://github.com/benjjneb/dada2/issues>.
plotQualityProfile(fnRs[1:4]) # reverse reads
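If per-sample panels get unwieldy, plotQualityProfile() also accepts an aggregate argument; an optional one-liner to summarize all forward reads in a single panel:
plotQualityProfile(fnFs, aggregate = TRUE) # overlays the quality profiles of all forward reads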
Create a subdirectory in your path where samples that pass the filter will be stored
# Place filtered files in the filtered12s/ subdirectory; this creates vectors of filtered file paths to be easily used down the line
filtFs <- file.path(path, "filtered12s", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered12s", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
filtering and trimming for quality
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
truncLen=c(200,220), #based on the quality profiles
maxN=0, # default setting, no ambiguous base calls
maxEE=c(2,2), # no more than 2 expected erroneous base calls
truncQ=2, # truncates each read at the first base with a quality score of 2 or lower
rm.phix=TRUE, # removes reads matching the phiX bacteriophage genome (a sequencing control)
compress=TRUE, multithread=TRUE) # On Windows set multithread=FALSE
head(out) # gives us a table with reads in and reads out to help determine if you need to relax/tighten trimming parameters
## reads.in reads.out
## GC-EC-8535-JH-mock1-100-12S_S394_L001_R1_001.fastq 17752 12931
## GC-EC-8535-JH-mock2-100-12S_S395_L001_R1_001.fastq 17698 12749
## GC-EC-8535-JH-mock3-100-12S_S396_L001_R1_001.fastq 14250 10376
## GC-EC-8535-JH-mock4-100-12S_S397_L001_R1_001.fastq 12826 9315
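As an optional extra check, the per-sample retention rate can be computed straight from this table; fractions well below about one half would suggest revisiting truncLen or maxEE.
out[, "reads.out"] / out[, "reads.in"] # fraction of reads surviving the filter, per sample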
generating an error model
errF <- learnErrors(filtFs, multithread=TRUE)
## 9074200 total bases in 45371 reads from 4 samples will be used for learning the error rates.
errR <- learnErrors(filtRs, multithread=TRUE)
## 9981620 total bases in 45371 reads from 4 samples will be used for learning the error rates.
plotErrors(errF, nominalQ=TRUE)
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
# red line is expected based on quality score
# black line is error estimate
# black dots are the observed errors
# you want to see the black line fit the trend of the black dots; if not, think about redoing the trim and filter step
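The same diagnostic is worth repeating for the reverse-read error model (optional, output not shown here):
plotErrors(errR, nominalQ=TRUE) # same check as above, for the reverse reads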
visualize how our trimming and filtering changed read quality
plotQualityProfile(filtFs[1:4]) # forward filtered reads
plotQualityProfile(filtRs[1:4]) # reverse filtered reads
# you should no longer see quality dip below Q30 in the filtered reads
Inferring Amplicon Sequence Variants
# here we create data matrices using the filtered reads and learned errors to infer unique sequence variants
dadaFs <- dada(filtFs, err=errF, pool = "pseudo" ,multithread=TRUE)
## Sample 1 - 12931 reads in 7223 unique sequences.
## Sample 2 - 12749 reads in 7143 unique sequences.
## Sample 3 - 10376 reads in 5916 unique sequences.
## Sample 4 - 9315 reads in 5380 unique sequences.
##
## selfConsist step 2
dadaRs <- dada(filtRs, err=errR, pool = "pseudo", multithread=TRUE)
## Sample 1 - 12931 reads in 7177 unique sequences.
## Sample 2 - 12749 reads in 7183 unique sequences.
## Sample 3 - 10376 reads in 5984 unique sequences.
## Sample 4 - 9315 reads in 5237 unique sequences.
##
## selfConsist step 2
# pooling helps resolve low abundance variants
dadaFs[[1]] # tells us how many sequence variants were inferred
## dada-class: object describing DADA2 denoising results
## 23 sequence variants were inferred from 7223 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
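To see the variant counts for every sample at once rather than printing one dada object at a time, an optional one-liner:
sapply(dadaFs, function(x) length(getSequences(x))) # number of inferred variants per sample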
merging forward and reverse reads
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
## 11660 paired-reads (in 17 unique pairings) successfully merged out of 11847 (in 48 pairings) input.
## 11523 paired-reads (in 22 unique pairings) successfully merged out of 11675 (in 58 pairings) input.
## 9527 paired-reads (in 18 unique pairings) successfully merged out of 9662 (in 39 pairings) input.
## 8403 paired-reads (in 16 unique pairings) successfully merged out of 8520 (in 33 pairings) input.
# Inspect the merger data.frame from the first sample
head(mergers[[1]]) # tells us how many merged reads we have produced
## sequence
## 1 ACAGTCGGTAAAACTCGTGCCAGCCACCGCGGTTATACGAGAGACCCAAGTTGATAGCCACCGGCGTAAAGCGTGGCTAAGTTAAAAGTTAAACTAAAGTCGAACATCTTCAAGACTGTTATACGTAACCGAAGACAGGAAGTTCAGCCACGAAAGTGACTTTATTTGATCTGAGCCCACGAAAGCTAGGGGACAAACTGGGATTAGATACCCCACTATG
## 2 TACAGTCGGTAAAACTCGTGCCAGCCACCGCGGTTATACGAGAGGCCCAAGTTGATATCCGCCGGCGTAAAGAGTGGTTAAGAGAACCCAATACTAAAGCCGAACACTCTCAAAGCTGTTATATGCACCCGAGAGCAAGAAGCCCTCTCACGAAAGTAGCTTTAACTTTTCTGACTCCACGTAAGCTGAGGAACAAACTGGGATTAGATACCCCACTATG
## 3 TACAGTCGGTAAAACTCGTGCCAGCCACCGCGGTTATACGAGAGGCTCAAGTTGACAAATTACGGCGTAAAGCGTGGTTAAGAGTATTTCAAAATAAAGTCGAATGCTTTCAAAGCTGTTATACGCACCCGAAAGTAAGAAGTCCAACTACGAAAGTAACTTTACACATTCTGACCCCACGAAAGCTAGGCCACAAACTGGGATTAGATACCCCACTATG
## 4 GTCGGTAAAACTCGTGCCAGCCACCGCGGTTAAACGAGAGGCCCTAGTTAATAATACACGGCGTAAAGGGTGGTTAAGGGAAGCACAATAATAAAGCCAAATGGCCCTTTGGCTGTCATACGCTTCTAGGTGTCCGAAGCCCAATATACGAAAGTAGCTTTAATAAAGCCCACCTGACCCCACGAAAGCTGAGAAACAAACTGGGATTAGATACCCCACTATG
## 5 CAGTCGGTAAAACTCGTGCCAGCCACCGCGGTTATACGAAAGACCCAAGTTGATAATCACCGGCGTAAAGAGTGGTTAAGATAAACCCCCAAACTAAAGCCGAACATCTTCAAAGCTGTTATACGCACACGAAGAAAAGAAACCCAACCACGAAAGTGGCTTTATAATATCTGACCCCACGAAAGCTATGTCACAAACTGGGATTAGATACCCCACTATG
## 6 TACAGTCGGTAAAACTCGTGCCAGCCACCGCGGTTATACGAGTGACCCAAGTTGAAAGTTACCGGCGTAAAGAGTGGTTAGGGAAATAATAAACTAAAGCCGAACACCCTCTAGGCTGTTATACGCTTCTGAAGGTACGAAGCCCCACTACGAAAGTGGCTTTAATACACCTGAACCCACGACAACTAAGACACAAACTGGGATTAGATACCCCACTATG
## abundance forward reverse nmatch nmismatch nindel prefer accept
## 1 2973 1 1 200 0 0 1 TRUE
## 2 2037 2 2 200 0 0 1 TRUE
## 3 1911 3 3 200 0 0 2 TRUE
## 4 1010 4 4 197 0 0 2 TRUE
## 5 992 5 5 200 0 0 1 TRUE
## 6 843 6 6 200 0 0 2 TRUE
Creating a sequence table
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
## [1] 4 31
# creates a table that indicates the number of reads belonging to each ASV sequence in each community
# Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab)))
##
## 220 223 224 225
## 27 2 1 1
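If clearly off-target lengths had shown up here, the table could be restricted to the expected amplicon range before chimera removal; a sketch, where the 219:226 window is just illustrative for this 12S fragment (seqtab.lenfilt is a name we introduce):
seqtab.lenfilt <- seqtab[, nchar(colnames(seqtab)) %in% 219:226] # keep only ASVs of expected length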
removing chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=FALSE, verbose=TRUE)
## Identified 18 bimeras out of 31 input sequences.
dim(seqtab.nochim)
## [1] 4 13
sum(seqtab.nochim)/sum(seqtab) ## this is the proportion of reads kept after removing chimeras
## [1] 0.9831683
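Retention can also be checked per sample rather than overall (optional):
rowSums(seqtab.nochim) / rowSums(seqtab) # per-sample proportion of reads kept after chimera removal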
tracking reads through our pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
head(track)
## input filtered denoisedF denoisedR merged nonchim
## GC-EC-8535-JH-mock1-100-12S 17752 12931 11930 12179 11660 11475
## GC-EC-8535-JH-mock2-100-12S 17698 12749 11778 11954 11523 11265
## GC-EC-8535-JH-mock3-100-12S 14250 10376 9732 9889 9527 9389
## GC-EC-8535-JH-mock4-100-12S 12826 9315 8582 8742 8403 8292
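An optional companion view: expressing each step as a percentage of input reads makes it easier to spot where losses concentrate.
round(sweep(track, 1, track[, "input"], "/") * 100, 1) # each column as a percentage of input reads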
making ASV tables
asv_seqs <- colnames(seqtab.nochim)
asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")
for (i in 1:dim(seqtab.nochim)[2]) {
asv_headers[i] <- paste(">ASV", i, sep="_")
}
# making and writing out a fasta of our final ASV seqs:
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, "ASVs.fa")
# count table:
asv_tab <- t(seqtab.nochim)
row.names(asv_tab) <- sub(">", "", asv_headers)
write.table(asv_tab, "ASVs_counts.tsv", sep="\t", quote=FALSE, col.names=NA) # writes the table to your current working directory
taxa <- assignTaxonomy(seqtab.nochim,"~/Desktop/Praccomp2022-final-project-main/data/references.12s.miya.dada.taxonomy.v251.fasta", multithread=TRUE)
taxa <- addSpecies(taxa, "~/Desktop/Praccomp2022-final-project-main/data/references.12s.miya.dada.species.v251.fasta")
# here we compare our sample reads against reference databases to assign taxonomy
taxa.print <- taxa
rownames(taxa.print) <- NULL # Removing sequence rownames for display only
head(taxa.print)
## Kingdom Phylum Class Order Family
## [1,] "Chordata" "Actinopterygii" "Scorpaeniformes" "Agonidae" "Agonus"
## [2,] "Chordata" "Actinopterygii" "Atheriniformes" "Atherinidae" "Atherina"
## [3,] "Chordata" "Actinopterygii" "Lophiiformes" NA NA
## [4,] "Chordata" "Actinopterygii" "Cypriniformes" "Cyprinidae" "Rutilus"
## [5,] "Chordata" "Actinopterygii" "Osmeriformes" "Osmeridae" "Osmerus"
## [6,] "Chordata" "Actinopterygii" "Perciformes" "Labridae" "Labrus"
## Genus Species
## [1,] "Agonus cataphractus" NA
## [2,] "Atherina boyeri" NA
## [3,] NA NA
## [4,] "Rutilus rutilus" NA
## [5,] "Osmerus eperlanus" NA
## [6,] "Labrus bergylta" NA
rownames(asv_tab) <- NULL
asv_taxa <- cbind(taxa.print, asv_tab) #combines our taxa table and the asv table
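It may be convenient to write this combined table out alongside the other ASV files (an optional step; the file name is just a suggestion):
write.table(asv_taxa, "ASVs_taxonomy_counts.tsv", sep="\t", quote=FALSE, row.names=FALSE) # writes to the working directory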
## read in the example seqtab.nochim ASV table (this data() call is carried over from the insect vignette; our seqtab.nochim already exists in the workspace, so the warning below is harmless)
data(seqtab.nochim)
## Warning in data(seqtab.nochim): data set 'seqtab.nochim' not found
## get sequences from table column names
y <- char2dna(colnames(seqtab.nochim))
## name the sequences sequentially
names(y) <- paste0("ASV", seq_along(y))
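If you also want these insect-formatted sequences on disk, the insect package includes writeFASTA() for DNAbin objects (an optional step; the file name is ours):
writeFASTA(y, "ASVs_insect.fa") # writes the named sequences to a fasta file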
creating objects to use in our analysis
samples.out <- rownames(seqtab.nochim)
# the parsing below is carried over from the DADA2 tutorial, where sample names encode subject, gender, and day (e.g. "F3D141"); our mock-community names do not, so these variables are placeholders here
subject <- sapply(strsplit(samples.out, "D"), `[`, 1)
gender <- substr(subject,1,1)
subject <- substr(subject,2,999)
day <- as.integer(sapply(strsplit(samples.out, "D"), `[`, 2))
df <- data.frame(Subject=subject)
rownames(df) <- samples.out
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE),
tax_table(taxa))
dna <- Biostrings::DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps <- merge_phyloseq(ps, dna)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))
ps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 13 taxa and 4 samples ]
## tax_table() Taxonomy Table: [ 13 taxa by 7 taxonomic ranks ]
## refseq() DNAStringSet: [ 13 reference sequences ]
theme_set(theme_bw())
This ordination step is commented out due to some issues; a likely cause and a sketch of a fix follow below
# Transform data to proportions as appropriate for Bray-Curtis distances
# ps.prop <- transform_sample_counts(ps, function(otu) otu/sum(otu))
# ord.nmds.bray <- ordinate(ps.prop, method="NMDS", distance="bray")
# plot_ordination(ps.prop, ord.nmds.bray, color="Samples", title="Bray NMDS")
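A likely culprit: ps was built without a sample_data component, so plot_ordination() has no "Samples" variable to color by. A minimal sketch of one possible fix, where sampledf and ps.meta are names we introduce; note too that NMDS on only four samples is unlikely to be stable, so the ordination itself may still warn or fail.
sampledf <- data.frame(Samples = samples.out, row.names = samples.out) # minimal metadata: one column of sample names
ps.meta <- merge_phyloseq(ps, sample_data(sampledf)) # attach it so plots can color by "Samples"
# the commented block above could then be retried with ps.meta in place of ps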
Shannon diversity estimation
plot_richness(ps, measures = "shannon")
## Warning in estimate_richness(physeq, split = TRUE, measures = measures): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
##
## We recommended that you find the un-trimmed data and retry.
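As an optional cross-check on the plotted values, the Shannon index can be computed directly with vegan on the ASV counts (rows are samples because we built the otu_table with taxa_are_rows=FALSE):
vegan::diversity(as(otu_table(ps), "matrix"), index = "shannon") # per-sample Shannon diversity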
abundance bar graph
data("ps")
## Warning in data("ps"): data set 'ps' not found
gp.ch = subset_taxa(ps, Phylum == "Actinopterygii")
plot_bar(gp.ch, fill="Genus")
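Raw counts can make bars hard to compare across samples; an optional variant plots relative abundances instead (gp.rel is a name we introduce):
gp.rel <- transform_sample_counts(gp.ch, function(x) x / sum(x)) # convert counts to within-sample proportions
plot_bar(gp.rel, fill="Genus")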