load packages
library(BiocManager)
library(ShortRead)
library("dada2")
Get sample names
# Forward and reverse fastq filenames have the format:
cutFs <- sort(list.files(path.cut, pattern = "_R1_001.fastq.gz", full.names = TRUE))
cutRs <- sort(list.files(path.cut, pattern = "_R2_001.fastq.gz", full.names = TRUE))
# Extract sample names, assuming filenames have format:
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.names <- unname(sapply(cutFs, get.sample.name))
head(sample.names)
plotQualityProfile(cutFs[1:2])
#reverse read quality
plotQualityProfile(cutRs[1:2])
filtFs <- file.path(path.cut, "filtered", basename(cutFs))
filtRs <- file.path(path.cut, "filtered", basename(cutRs))
# get sample names to see which files match
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.namesF <- unname(sapply(cutFs, get.sample.name))
sample.namesR <- unname(sapply(cutRs, get.sample.name))
Check for differences between lists and remove files that don’t have a matching pair
difsR <- setdiff(sample.namesR,sample.namesF)
difsR
difsF <- setdiff(sample.namesF,sample.namesR)
difsF
#only run this chunk if all the files are going through
grep("2019APRP12VeA-COI-P1",filtFs)
grep("2019JULP22VeA-COI-P1",filtFs)
grep("2019APRP22Ve-COI-48samples",filtRs)
#remove non-matched files
indicesF <- c(35,51)
filtFs <- filtFs[-indicesF]
indicesR <- c(36)
filtRs <- filtRs[-indicesR]
#do the same for cut files
grep("2019APRP12VeA-COI-P1",cutFs)
grep("2019JULP22VeA-COI-P1",cutFs)
grep("2019APRP22Ve-COI-48samples",cutRs)
cutFs <- cutFs[-indicesF]
cutRs <- cutRs[-indicesR]
maxN: allows zero Ns max EE: set to 5 for forward and reverse truncQ: set to 2 minlen: minimum length is 100 reads rm.phix:remove phix
out <- filterAndTrim(cutFs, filtFs, cutRs, filtRs, maxN = 0, maxEE = c(5, 5),
truncQ = 2, minLen = 100, rm.phix = TRUE, compress = TRUE, multithread = TRUE)
head(out)
Check how many samples went through filtering
cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt
ls | wc -l
cd /Users/hailaschultz/GitHub/haila-ZoopMetabarcoding/data/fastq-files/cutadapt/filtered
ls | wc -l
learn the error rates
errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)
plot the errors
plotErrors(errF, nominalQ=TRUE)
#looks okay! good to proceed
plotErrors(errR, nominalQ=TRUE)
#looks okay! good to proceed