load packages

library(BiocManager)
library(ShortRead)
library("dada2")

Inspect read quality

Get sample names

# Forward and reverse fastq filenames have the format:
cutFs <- sort(list.files(path.cut, pattern = "_R1_001.fastq.gz", full.names = TRUE))
cutRs <- sort(list.files(path.cut, pattern = "_R2_001.fastq.gz", full.names = TRUE))

# Extract sample names, assuming filenames have format:
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.names <- unname(sapply(cutFs, get.sample.name))
head(sample.names)
plotQualityProfile(cutFs[1:2])
#reverse read quality
plotQualityProfile(cutRs[1:2])
filtFs <- file.path(path.cut, "filtered", basename(cutFs))
filtRs <- file.path(path.cut, "filtered", basename(cutRs))
# get sample names to see which files match
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.namesF <- unname(sapply(cutFs, get.sample.name))
sample.namesR <- unname(sapply(cutRs, get.sample.name))

Check for differences between lists and remove files that don’t have a matching pair

difsR <- setdiff(sample.namesR,sample.namesF)
difsR
difsF <- setdiff(sample.namesF,sample.namesR)
difsF
#only run this chunk if all the files are going through
grep("2019APRP12VeA-COI-P1",filtFs)
grep("2019JULP22VeA-COI-P1",filtFs)
grep("2019APRP22Ve-COI-48samples",filtRs)

#remove non-matched files
indicesF <- c(35,51)
filtFs <- filtFs[-indicesF]

indicesR <- c(36)
filtRs <- filtRs[-indicesR]

#do the same for cut files
grep("2019APRP12VeA-COI-P1",cutFs)
grep("2019JULP22VeA-COI-P1",cutFs)
grep("2019APRP22Ve-COI-48samples",cutRs)

cutFs <- cutFs[-indicesF]
cutRs <- cutRs[-indicesR]

filter reads using quality thresholds

maxN: allows zero Ns max EE: set to 5 for forward and reverse truncQ: set to 2 minlen: minimum length is 100 reads rm.phix:remove phix

out <- filterAndTrim(cutFs, filtFs, cutRs, filtRs, maxN = 0, maxEE = c(5, 5), 
    truncQ = 2, minLen = 100, rm.phix = TRUE, compress = TRUE, multithread = TRUE) 
head(out)

Check how many samples went through filtering

cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt

ls | wc -l

cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt/filtered

ls | wc -l

learn the error rates

errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)

plot the errors

plotErrors(errF, nominalQ=TRUE)
#looks okay! good to proceed
plotErrors(errR, nominalQ=TRUE)
#looks okay! good to proceed