Loading the necessary packages

library(dada2)
## Loading required package: Rcpp
library(phyloseq); packageVersion("phyloseq")
## [1] '1.42.0'
library(Biostrings); packageVersion("Biostrings")
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:phyloseq':
## 
##     distance
## Loading required package: XVector
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## [1] '2.66.0'
library(ggplot2); packageVersion("ggplot2")
## [1] '3.4.0'
library(tidyverse); packageVersion("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ✔ purrr   0.3.5      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse()   masks Biostrings::collapse(), IRanges::collapse()
## ✖ dplyr::combine()    masks BiocGenerics::combine()
## ✖ purrr::compact()    masks XVector::compact()
## ✖ dplyr::desc()       masks IRanges::desc()
## ✖ tidyr::expand()     masks S4Vectors::expand()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::first()      masks S4Vectors::first()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()     masks IRanges::reduce()
## ✖ dplyr::rename()     masks S4Vectors::rename()
## ✖ dplyr::slice()      masks XVector::slice(), IRanges::slice()
## [1] '1.3.2'
library(insect)
## 
## Attaching package: 'insect'
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following object is masked from 'package:IRanges':
## 
##     trim
## 
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## 
## The following object is masked from 'package:dada2':
## 
##     rc
library(vegan); packageVersion("vegan")
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
## [1] '2.6.4'

Setting your path; this is where most of your raw input files live

# first clone or download the repo from https://github.com/psalib/Praccomp2022-final-project.git and put it on your desktop (or set your own working directory and put it there)
path <- "~/Desktop/Praccomp2022-final-project-main/data"
list.files(path)  # you should see 8 fastq files (4 mock communities, with 1 forward (R1) and 1 reverse (R2) file each) and 2 reference files
##  [1] "filtered12s"                                       
##  [2] "GC-EC-8535-JH-mock1-100-12S_S394_L001_R1_001.fastq"
##  [3] "GC-EC-8535-JH-mock1-100-12S_S394_L001_R2_001.fastq"
##  [4] "GC-EC-8535-JH-mock2-100-12S_S395_L001_R1_001.fastq"
##  [5] "GC-EC-8535-JH-mock2-100-12S_S395_L001_R2_001.fastq"
##  [6] "GC-EC-8535-JH-mock3-100-12S_S396_L001_R1_001.fastq"
##  [7] "GC-EC-8535-JH-mock3-100-12S_S396_L001_R2_001.fastq"
##  [8] "GC-EC-8535-JH-mock4-100-12S_S397_L001_R1_001.fastq"
##  [9] "GC-EC-8535-JH-mock4-100-12S_S397_L001_R2_001.fastq"
## [10] "references.12s.miya.dada.species.v251.fasta"       
## [11] "references.12s.miya.dada.taxonomy.v251.fasta"

sorting read files into separate forward and reverse objects

# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
fnFs <- sort(list.files(path, pattern="_R1_001.fastq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2_001.fastq", full.names = TRUE))
# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
# To check that the sample names were extracted as intended and are still unique
list(sample.names)
## [[1]]
## [1] "GC-EC-8535-JH-mock1-100-12S" "GC-EC-8535-JH-mock2-100-12S"
## [3] "GC-EC-8535-JH-mock3-100-12S" "GC-EC-8535-JH-mock4-100-12S"
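
A quick programmatic check, in case eyeballing the list isn't enough:

# stop early if the forward/reverse file counts differ or any sample names collide
stopifnot(length(fnFs) == length(fnRs), !anyDuplicated(sample.names))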

Plotting the quality profiles of each fastq file

# to decide where to truncate your reads, follow each quality profile until the quality scores start to consistently dip below Q30
plotQualityProfile(fnFs[1:4])  # forward reads
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the dada2 package.
##   Please report the issue at <https://github.com/benjjneb/dada2/issues>.

plotQualityProfile(fnRs[1:4])  # reverse reads 
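
If the per-sample panels get crowded, plotQualityProfile() can also combine all files into a single profile via its aggregate argument:

# one aggregated quality profile across all forward reads
plotQualityProfile(fnFs, aggregate = TRUE)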

Create a subdirectory in your path where reads that pass the filter will be stored

# Place filtered files in the filtered12s/ subdirectory; this creates vectors of file paths, named by sample, for easy use down the line
filtFs <- file.path(path, "filtered12s", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered12s", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names

filtering and trimming for quality

out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, 
              truncLen=c(200,220), #based on the quality profiles 
              maxN=0, # default setting, no ambiguous base calls
              maxEE=c(2,2), # no more than 2 expected erroneous base calls
              truncQ=2, # truncates each read at the first base with a quality score of 2 or lower
              rm.phix=TRUE, # removes reads matching the PhiX bacteriophage genome used as a sequencing control
              compress=TRUE, multithread=TRUE) # On Windows set multithread=FALSE
head(out) # gives us a table with reads in and reads out to help determine if you need to relax/tighten the trimming parameters
##                                                    reads.in reads.out
## GC-EC-8535-JH-mock1-100-12S_S394_L001_R1_001.fastq    17752     12931
## GC-EC-8535-JH-mock2-100-12S_S395_L001_R1_001.fastq    17698     12749
## GC-EC-8535-JH-mock3-100-12S_S396_L001_R1_001.fastq    14250     10376
## GC-EC-8535-JH-mock4-100-12S_S397_L001_R1_001.fastq    12826      9315
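
Before relaxing or tightening anything, it can help to look at the fraction of reads retained per sample:

# fraction of input reads surviving the filter, per sample
round(out[, "reads.out"] / out[, "reads.in"], 3)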

generating an error model

errF <- learnErrors(filtFs, multithread=TRUE)
## 9074200 total bases in 45371 reads from 4 samples will be used for learning the error rates.
errR <- learnErrors(filtRs, multithread=TRUE)
## 9981620 total bases in 45371 reads from 4 samples will be used for learning the error rates.
plotErrors(errF, nominalQ=TRUE)
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

# red line is expected based on quality score
# black line is error estimate 
# black dots are the observed errors
# you want the black line to fit the trend of the black dots; if it doesn't, consider redoing the trimming and filtering step
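
The same sanity check applies to the reverse-read error model:

# inspect the reverse-read error estimates the same way
plotErrors(errR, nominalQ=TRUE)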

visualize how our trimming and filtering changed read quality

plotQualityProfile(filtFs[1:4])  # forward filtered reads

plotQualityProfile(filtRs[1:4]) # reverse filtered reads 

# you should now see few, if any, regions dipping below Q30

Inferring Amplicon Sequence Variants

# here we apply the core denoising algorithm to the filtered reads, using the learned error rates to infer unique sequence variants
dadaFs <- dada(filtFs, err=errF, pool = "pseudo" ,multithread=TRUE)
## Sample 1 - 12931 reads in 7223 unique sequences.
## Sample 2 - 12749 reads in 7143 unique sequences.
## Sample 3 - 10376 reads in 5916 unique sequences.
## Sample 4 - 9315 reads in 5380 unique sequences.
## 
##    selfConsist step 2
dadaRs <- dada(filtRs, err=errR, pool = "pseudo", multithread=TRUE)
## Sample 1 - 12931 reads in 7177 unique sequences.
## Sample 2 - 12749 reads in 7183 unique sequences.
## Sample 3 - 10376 reads in 5984 unique sequences.
## Sample 4 - 9315 reads in 5237 unique sequences.
## 
##    selfConsist step 2
# pooling helps resolve low abundance variants 
dadaFs[[1]] # tells us how many sequence variants were inferred for the first sample
## dada-class: object describing DADA2 denoising results
## 23 sequence variants were inferred from 7223 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
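
To see the variant counts for all samples at once (getSequences() also accepts dada-class objects):

# number of inferred sequence variants per sample, forward reads
sapply(dadaFs, function(x) length(getSequences(x)))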

merging forward and reverse reads

mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
## 11660 paired-reads (in 17 unique pairings) successfully merged out of 11847 (in 48 pairings) input.
## 11523 paired-reads (in 22 unique pairings) successfully merged out of 11675 (in 58 pairings) input.
## 9527 paired-reads (in 18 unique pairings) successfully merged out of 9662 (in 39 pairings) input.
## 8403 paired-reads (in 16 unique pairings) successfully merged out of 8520 (in 33 pairings) input.
# Inspect the merger data.frame from the first sample
head(mergers[[1]]) # tells us how many merged reads we have produced 
##                                                                                                                                                                                                                          sequence
## 1    ACAGTCGGTAAAACTCGTGCCAGCCACCGCGGTTATACGAGAGACCCAAGTTGATAGCCACCGGCGTAAAGCGTGGCTAAGTTAAAAGTTAAACTAAAGTCGAACATCTTCAAGACTGTTATACGTAACCGAAGACAGGAAGTTCAGCCACGAAAGTGACTTTATTTGATCTGAGCCCACGAAAGCTAGGGGACAAACTGGGATTAGATACCCCACTATG
## 2    TACAGTCGGTAAAACTCGTGCCAGCCACCGCGGTTATACGAGAGGCCCAAGTTGATATCCGCCGGCGTAAAGAGTGGTTAAGAGAACCCAATACTAAAGCCGAACACTCTCAAAGCTGTTATATGCACCCGAGAGCAAGAAGCCCTCTCACGAAAGTAGCTTTAACTTTTCTGACTCCACGTAAGCTGAGGAACAAACTGGGATTAGATACCCCACTATG
## 3    TACAGTCGGTAAAACTCGTGCCAGCCACCGCGGTTATACGAGAGGCTCAAGTTGACAAATTACGGCGTAAAGCGTGGTTAAGAGTATTTCAAAATAAAGTCGAATGCTTTCAAAGCTGTTATACGCACCCGAAAGTAAGAAGTCCAACTACGAAAGTAACTTTACACATTCTGACCCCACGAAAGCTAGGCCACAAACTGGGATTAGATACCCCACTATG
## 4 GTCGGTAAAACTCGTGCCAGCCACCGCGGTTAAACGAGAGGCCCTAGTTAATAATACACGGCGTAAAGGGTGGTTAAGGGAAGCACAATAATAAAGCCAAATGGCCCTTTGGCTGTCATACGCTTCTAGGTGTCCGAAGCCCAATATACGAAAGTAGCTTTAATAAAGCCCACCTGACCCCACGAAAGCTGAGAAACAAACTGGGATTAGATACCCCACTATG
## 5    CAGTCGGTAAAACTCGTGCCAGCCACCGCGGTTATACGAAAGACCCAAGTTGATAATCACCGGCGTAAAGAGTGGTTAAGATAAACCCCCAAACTAAAGCCGAACATCTTCAAAGCTGTTATACGCACACGAAGAAAAGAAACCCAACCACGAAAGTGGCTTTATAATATCTGACCCCACGAAAGCTATGTCACAAACTGGGATTAGATACCCCACTATG
## 6    TACAGTCGGTAAAACTCGTGCCAGCCACCGCGGTTATACGAGTGACCCAAGTTGAAAGTTACCGGCGTAAAGAGTGGTTAGGGAAATAATAAACTAAAGCCGAACACCCTCTAGGCTGTTATACGCTTCTGAAGGTACGAAGCCCCACTACGAAAGTGGCTTTAATACACCTGAACCCACGACAACTAAGACACAAACTGGGATTAGATACCCCACTATG
##   abundance forward reverse nmatch nmismatch nindel prefer accept
## 1      2973       1       1    200         0      0      1   TRUE
## 2      2037       2       2    200         0      0      1   TRUE
## 3      1911       3       3    200         0      0      2   TRUE
## 4      1010       4       4    197         0      0      2   TRUE
## 5       992       5       5    200         0      0      1   TRUE
## 6       843       6       6    200         0      0      2   TRUE

Creating a sequence table

seqtab <- makeSequenceTable(mergers)
dim(seqtab)
## [1]  4 31
# creates a table giving the number of reads belonging to each ASV sequence in each community
# Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab)))
## 
## 220 223 224 225 
##  27   2   1   1
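
If unexpected lengths had shown up here, the table could be trimmed to the expected amplicon range before chimera removal. A minimal sketch; the 219:226 window is illustrative, not a recommendation for this dataset:

# hypothetical: keep only ASVs whose length falls in the expected amplicon window
seqtab.trimmed <- seqtab[, nchar(colnames(seqtab)) %in% 219:226, drop = FALSE]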

removing chimeras

seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=FALSE, verbose=TRUE)
## Identified 18 bimeras out of 31 input sequences.
dim(seqtab.nochim)
## [1]  4 13
sum(seqtab.nochim)/sum(seqtab) ## this is the proportion of reads kept after removing chimeras
## [1] 0.9831683

tracking reads through our pipeline

getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
head(track)
##                             input filtered denoisedF denoisedR merged nonchim
## GC-EC-8535-JH-mock1-100-12S 17752    12931     11930     12179  11660   11475
## GC-EC-8535-JH-mock2-100-12S 17698    12749     11778     11954  11523   11265
## GC-EC-8535-JH-mock3-100-12S 14250    10376      9732      9889   9527    9389
## GC-EC-8535-JH-mock4-100-12S 12826     9315      8582      8742   8403    8292
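
The same table expressed as the percentage of input reads surviving each step, which makes losses easier to spot:

# percent of input reads remaining after each step, per sample
round(sweep(track, 1, track[, 1], "/") * 100, 1)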

making ASV tables

asv_seqs <- colnames(seqtab.nochim)
asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")

for (i in 1:dim(seqtab.nochim)[2]) {
  asv_headers[i] <- paste(">ASV", i, sep="_")
}
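
The loop is fine at this scale; for reference, an equivalent vectorized one-liner:

# same headers without the loop
asv_headers <- paste0(">ASV_", seq_len(ncol(seqtab.nochim)))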

# making and writing out a fasta of our final ASV seqs:
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, "ASVs.fa")

# count table:
asv_tab <- t(seqtab.nochim)
row.names(asv_tab) <- sub(">", "", asv_headers)
write.table(asv_tab, "ASVs_counts.tsv", sep="\t", quote=FALSE, col.names=NA) # writes the table to your current working directory

taxonomy via the dada2 taxonomy assignment method

taxa <- assignTaxonomy(seqtab.nochim, file.path(path, "references.12s.miya.dada.taxonomy.v251.fasta"), multithread=TRUE)
taxa <- addSpecies(taxa, file.path(path, "references.12s.miya.dada.species.v251.fasta"))

# here we compare our ASV sequences against the reference databases to assign taxonomy
taxa.print <- taxa 
rownames(taxa.print) <- NULL # Removing sequence rownames for display only
head(taxa.print)
##      Kingdom    Phylum           Class             Order         Family    
## [1,] "Chordata" "Actinopterygii" "Scorpaeniformes" "Agonidae"    "Agonus"  
## [2,] "Chordata" "Actinopterygii" "Atheriniformes"  "Atherinidae" "Atherina"
## [3,] "Chordata" "Actinopterygii" "Lophiiformes"    NA            NA        
## [4,] "Chordata" "Actinopterygii" "Cypriniformes"   "Cyprinidae"  "Rutilus" 
## [5,] "Chordata" "Actinopterygii" "Osmeriformes"    "Osmeridae"   "Osmerus" 
## [6,] "Chordata" "Actinopterygii" "Perciformes"     "Labridae"    "Labrus"  
##      Genus                 Species
## [1,] "Agonus cataphractus" NA     
## [2,] "Atherina boyeri"     NA     
## [3,] NA                    NA     
## [4,] "Rutilus rutilus"     NA     
## [5,] "Osmerus eperlanus"   NA     
## [6,] "Labrus bergylta"     NA
rownames(asv_tab) <- NULL

asv_taxa <- cbind(taxa.print, asv_tab) #combines our taxa table and the asv table 
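
This combined table can be written out the same way as the counts; a minimal sketch (the filename is illustrative):

# hypothetical filename: save the combined taxonomy + count table
write.table(asv_taxa, "ASVs_taxa_counts.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
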
## seqtab.nochim is already in our workspace, so the data() call shown in the
## insect examples is unnecessary here
## get sequences from the table's column names as a DNAbin object
y <- char2dna(colnames(seqtab.nochim))
## name the sequences sequentially
names(y) <- paste0("ASV", seq_along(y))
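
To keep these named sequences on disk, insect's writeFASTA() can write a DNAbin object to a fasta file (a sketch; the filename is illustrative):

# hypothetical filename: save the named ASV sequences from the insect workflow
writeFASTA(y, "ASVs_insect.fa")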

analysis

creating objects to use in our analysis

samples.out <- rownames(seqtab.nochim)
# the subject/gender/day parsing below is carried over from the DADA2 tutorial;
# the mock sample names have no "D"-delimited day field, so only Subject is kept
subject <- sapply(strsplit(samples.out, "D"), `[`, 1)
gender <- substr(subject, 1, 1)
subject <- substr(subject, 2, 999)
day <- as.integer(sapply(strsplit(samples.out, "D"), `[`, 2))
df <- data.frame(Subject=subject)
rownames(df) <- samples.out


ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), 
               tax_table(taxa))

dna <- Biostrings::DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps <- merge_phyloseq(ps, dna)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))
ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 13 taxa and 4 samples ]
## tax_table()   Taxonomy Table:    [ 13 taxa by 7 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 13 reference sequences ]
theme_set(theme_bw())

This ordination block is commented out: the phyloseq object has no sample_data, so the color = "Samples" mapping has nothing to refer to, and NMDS is of limited use with only four samples

# Transform data to proportions as appropriate for Bray-Curtis distances
# ps.prop <- transform_sample_counts(ps, function(otu) otu/sum(otu))
# ord.nmds.bray <- ordinate(ps.prop, method="NMDS", distance="bray")
# plot_ordination(ps.prop, ord.nmds.bray, color="Samples", title="Bray NMDS")
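
A likely fix before retrying the ordination is to attach sample metadata to the phyloseq object; a minimal, hypothetical sketch (the Sample column name is an assumption, not part of the original pipeline):

# hypothetical: attach minimal sample metadata so plots can color or facet by sample
sampledf <- data.frame(Sample = samples.out, row.names = samples.out)
ps <- merge_phyloseq(ps, sample_data(sampledf))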

shannon diversity estimation

plot_richness(ps, measures = "shannon")
## Warning in estimate_richness(physeq, split = TRUE, measures = measures): The data you have provided does not have
## any singletons. This is highly suspicious. Results of richness
## estimates (for example) are probably unreliable, or wrong, if you have already
## trimmed low-abundance taxa from the data.
## 
## We recommended that you find the un-trimmed data and retry.
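
As a cross-check that bypasses estimate_richness(), vegan can compute Shannon diversity directly on the ASV count matrix (samples are rows in seqtab.nochim):

# Shannon diversity per sample, computed directly with vegan
diversity(seqtab.nochim, index = "shannon")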

abundance bar graph

data("ps")
## Warning in data("ps"): data set 'ps' not found
gp.ch <- subset_taxa(ps, Phylum == "Actinopterygii")
plot_bar(gp.ch, fill="Genus")
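
Since the four mock communities differ in read depth, a relative-abundance version of the same plot can be easier to compare across samples; a minimal sketch:

# convert counts to within-sample proportions, then re-plot
gp.rel <- transform_sample_counts(gp.ch, function(x) x / sum(x))
plot_bar(gp.rel, fill = "Genus")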