Konza_synoptic_ITS1_bioinformatics

Setup

It loads the package before it begins or else it gets the error again

library(dada2)
packageVersion("dada2")

## [1] '1.26.0'

library(ShortRead)
packageVersion("ShortRead")

## [1] '1.56.1'

library(Biostrings)
packageVersion("Biostrings")

## [1] '2.66.0'

library(phyloseq)
library(vegan)
library(ggplot2)
library(tidyr)
library(dplyr)
library(pheatmap)
library(GUniFrac)
#library(metagMisc)
#library(raster)
#library(pals)
#library(RColorBrewer)
#library(ragg)
library(ggpubr)

Going to drop some citations for key packages here: McMurdie PJ, Holmes S (2013). “phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data.” PLoS ONE, 8(4), e61217. http://dx.plos.org/10.1371/journal.pone.0061217.

Now the our libraries are loaded, ## Set Theme Setting themes determines how our figures will look, readability and publishability. This can be modified later, but here is the starting point:

#Set theme
theme_set(theme_bw() + theme(
              plot.title = element_blank(),
              axis.text.x = element_text(size=10, color="black"),
              axis.text.y = element_text(size=10, color="black"),
              axis.title.x = element_text(size=10),
              axis.title.y = element_text(size=10),
              legend.text = element_text(size=10),
              legend.title = element_text(size=10),
              legend.position = "bottom",
              legend.key=element_blank(),
              legend.key.size = unit(0.5, "cm"),
              legend.spacing.x = unit(0.1, "cm"),
              legend.spacing.y = unit(0.1, "cm"),
              panel.background = element_blank(), 
              panel.border = element_rect(colour = "black", fill=NA, size=1),
              plot.background = element_blank()))

Now that our workspace is set up, time for the actual pipeline.

DADA2 Amplicon Sequence Data Pipeline:

Defining file path so R can find our raw fastq files. Make sure forward reads (R1) and reverse reads (R2) are in the same folder. This will need to be changed for different analyses.

path <- "/Users/kuehnlab/Publishing/AIMS_KonzaSyn_ITS/fastq"
list.files(path)

##   [1] "01m01-B_S34_L001_R1_001.fastq"  "01m01-B_S34_L001_R2_001.fastq" 
##   [3] "01m01-L_S4_L001_R1_001.fastq"   "01m01-L_S4_L001_R2_001.fastq"  
##   [5] "01m01-S_S148_L001_R1_001.fastq" "01m01-S_S148_L001_R2_001.fastq"
##   [7] "01m02-B_S46_L001_R1_001.fastq"  "01m02-B_S46_L001_R2_001.fastq" 
##   [9] "01m02-L_S16_L001_R1_001.fastq"  "01m02-L_S16_L001_R2_001.fastq" 
##  [11] "01m02-S_S160_L001_R1_001.fastq" "01m02-S_S160_L001_R2_001.fastq"
##  [13] "01m03-B_S58_L001_R1_001.fastq"  "01m03-B_S58_L001_R2_001.fastq" 
##  [15] "01m03-L_S28_L001_R1_001.fastq"  "01m03-L_S28_L001_R2_001.fastq" 
##  [17] "01m03-S_S172_L001_R1_001.fastq" "01m03-S_S172_L001_R2_001.fastq"
##  [19] "01m03-W_S142_L001_R1_001.fastq" "01m03-W_S142_L001_R2_001.fastq"
##  [21] "01m04-B_S70_L001_R1_001.fastq"  "01m04-B_S70_L001_R2_001.fastq" 
##  [23] "01m04-L_S40_L001_R1_001.fastq"  "01m04-L_S40_L001_R2_001.fastq" 
##  [25] "01m04-S_S184_L001_R1_001.fastq" "01m04-S_S184_L001_R2_001.fastq"
##  [27] "01m05-B_S82_L001_R1_001.fastq"  "01m05-B_S82_L001_R2_001.fastq" 
##  [29] "01m05-L_S52_L001_R1_001.fastq"  "01m05-L_S52_L001_R2_001.fastq" 
##  [31] "01m05-S_S101_L001_R1_001.fastq" "01m05-S_S101_L001_R2_001.fastq"
##  [33] "01m06-B_S94_L001_R1_001.fastq"  "01m06-B_S94_L001_R2_001.fastq" 
##  [35] "01m06-L_S64_L001_R1_001.fastq"  "01m06-L_S64_L001_R2_001.fastq" 
##  [37] "01m06-S_S113_L001_R1_001.fastq" "01m06-S_S113_L001_R2_001.fastq"
##  [39] "02m01-B_S92_L001_R1_001.fastq"  "02m01-B_S92_L001_R2_001.fastq" 
##  [41] "02m01-L_S74_L001_R1_001.fastq"  "02m01-L_S74_L001_R2_001.fastq" 
##  [43] "02m01-S_S111_L001_R1_001.fastq" "02m01-S_S111_L001_R2_001.fastq"
##  [45] "02m02-B_S9_L001_R1_001.fastq"   "02m02-B_S9_L001_R2_001.fastq"  
##  [47] "02m02-L_S86_L001_R1_001.fastq"  "02m02-L_S86_L001_R2_001.fastq" 
##  [49] "02m02-S_S123_L001_R1_001.fastq" "02m02-S_S123_L001_R2_001.fastq"
##  [51] "02m03-B_S21_L001_R1_001.fastq"  "02m03-B_S21_L001_R2_001.fastq" 
##  [53] "02m03-L_S3_L001_R1_001.fastq"   "02m03-L_S3_L001_R2_001.fastq"  
##  [55] "02m03-S_S135_L001_R1_001.fastq" "02m03-S_S135_L001_R2_001.fastq"
##  [57] "02m04-L_S15_L001_R1_001.fastq"  "02m04-L_S15_L001_R2_001.fastq" 
##  [59] "02m04-S_S147_L001_R1_001.fastq" "02m04-S_S147_L001_R2_001.fastq"
##  [61] "02m05-B_S45_L001_R1_001.fastq"  "02m05-B_S45_L001_R2_001.fastq" 
##  [63] "02m05-L_S27_L001_R1_001.fastq"  "02m05-L_S27_L001_R2_001.fastq" 
##  [65] "02m06-B_S57_L001_R1_001.fastq"  "02m06-B_S57_L001_R2_001.fastq" 
##  [67] "02m06-L_S39_L001_R1_001.fastq"  "02m06-L_S39_L001_R2_001.fastq" 
##  [69] "02m06-S_S171_L001_R1_001.fastq" "02m06-S_S171_L001_R2_001.fastq"
##  [71] "02m07-B_S69_L001_R1_001.fastq"  "02m07-B_S69_L001_R2_001.fastq" 
##  [73] "02m07-L_S51_L001_R1_001.fastq"  "02m07-L_S51_L001_R2_001.fastq" 
##  [75] "02m07-S_S183_L001_R1_001.fastq" "02m07-S_S183_L001_R2_001.fastq"
##  [77] "02m08-B_S81_L001_R1_001.fastq"  "02m08-B_S81_L001_R2_001.fastq" 
##  [79] "02m08-L_S63_L001_R1_001.fastq"  "02m08-L_S63_L001_R2_001.fastq" 
##  [81] "02m08-S_S100_L001_R1_001.fastq" "02m08-S_S100_L001_R2_001.fastq"
##  [83] "02m09-B_S93_L001_R1_001.fastq"  "02m09-B_S93_L001_R2_001.fastq" 
##  [85] "02m09-L_S75_L001_R1_001.fastq"  "02m09-L_S75_L001_R2_001.fastq" 
##  [87] "02m09-S_S112_L001_R1_001.fastq" "02m09-S_S112_L001_R2_001.fastq"
##  [89] "02m10-B_S10_L001_R1_001.fastq"  "02m10-B_S10_L001_R2_001.fastq" 
##  [91] "02m10-L_S87_L001_R1_001.fastq"  "02m10-L_S87_L001_R2_001.fastq" 
##  [93] "02m10-S_S124_L001_R1_001.fastq" "02m10-S_S124_L001_R2_001.fastq"
##  [95] "02m11-B_S22_L001_R1_001.fastq"  "02m11-B_S22_L001_R2_001.fastq" 
##  [97] "02m11-S_S136_L001_R1_001.fastq" "02m11-S_S136_L001_R2_001.fastq"
##  [99] "02m11-W_S106_L001_R1_001.fastq" "02m11-W_S106_L001_R2_001.fastq"
## [101] "04m01-B_S11_L001_R1_001.fastq"  "04m01-B_S11_L001_R2_001.fastq" 
## [103] "04m01-L_S76_L001_R1_001.fastq"  "04m01-L_S76_L001_R2_001.fastq" 
## [105] "04m01-S_S125_L001_R1_001.fastq" "04m01-S_S125_L001_R2_001.fastq"
## [107] "04m02-B_S23_L001_R1_001.fastq"  "04m02-B_S23_L001_R2_001.fastq" 
## [109] "04m02-L_S88_L001_R1_001.fastq"  "04m02-L_S88_L001_R2_001.fastq" 
## [111] "04m02-S_S137_L001_R1_001.fastq" "04m02-S_S137_L001_R2_001.fastq"
## [113] "04m03-B_S35_L001_R1_001.fastq"  "04m03-B_S35_L001_R2_001.fastq" 
## [115] "04m03-L_S5_L001_R1_001.fastq"   "04m03-L_S5_L001_R2_001.fastq"  
## [117] "04m03-S_S149_L001_R1_001.fastq" "04m03-S_S149_L001_R2_001.fastq"
## [119] "04m04-B_S47_L001_R1_001.fastq"  "04m04-B_S47_L001_R2_001.fastq" 
## [121] "04m04-L_S17_L001_R1_001.fastq"  "04m04-L_S17_L001_R2_001.fastq" 
## [123] "04m04-S_S161_L001_R1_001.fastq" "04m04-S_S161_L001_R2_001.fastq"
## [125] "04m05-B_S59_L001_R1_001.fastq"  "04m05-B_S59_L001_R2_001.fastq" 
## [127] "04m05-L_S29_L001_R1_001.fastq"  "04m05-L_S29_L001_R2_001.fastq" 
## [129] "04m05-S_S173_L001_R1_001.fastq" "04m05-S_S173_L001_R2_001.fastq"
## [131] "04m06-B_S71_L001_R1_001.fastq"  "04m06-B_S71_L001_R2_001.fastq" 
## [133] "04m06-L_S41_L001_R1_001.fastq"  "04m06-L_S41_L001_R2_001.fastq" 
## [135] "04m06-S_S185_L001_R1_001.fastq" "04m06-S_S185_L001_R2_001.fastq"
## [137] "04m07-L_S53_L001_R1_001.fastq"  "04m07-L_S53_L001_R2_001.fastq" 
## [139] "04m07-S_S102_L001_R1_001.fastq" "04m07-S_S102_L001_R2_001.fastq"
## [141] "04m08-B_S83_L001_R1_001.fastq"  "04m08-B_S83_L001_R2_001.fastq" 
## [143] "04m08-L_S65_L001_R1_001.fastq"  "04m08-L_S65_L001_R2_001.fastq" 
## [145] "04m08-S_S114_L001_R1_001.fastq" "04m08-S_S114_L001_R2_001.fastq"
## [147] "04m09-B_S95_L001_R1_001.fastq"  "04m09-B_S95_L001_R2_001.fastq" 
## [149] "04m09-L_S77_L001_R1_001.fastq"  "04m09-L_S77_L001_R2_001.fastq" 
## [151] "04m09-S_S126_L001_R1_001.fastq" "04m09-S_S126_L001_R2_001.fastq"
## [153] "04m10-B_S12_L001_R1_001.fastq"  "04m10-B_S12_L001_R2_001.fastq" 
## [155] "04m10-L_S89_L001_R1_001.fastq"  "04m10-L_S89_L001_R2_001.fastq" 
## [157] "04m10-S_S138_L001_R1_001.fastq" "04m10-S_S138_L001_R2_001.fastq"
## [159] "04m11-B_S24_L001_R1_001.fastq"  "04m11-B_S24_L001_R2_001.fastq" 
## [161] "04m11-L_S6_L001_R1_001.fastq"   "04m11-L_S6_L001_R2_001.fastq"  
## [163] "04m11-S_S150_L001_R1_001.fastq" "04m11-S_S150_L001_R2_001.fastq"
## [165] "04m11-W_S108_L001_R1_001.fastq" "04m11-W_S108_L001_R2_001.fastq"
## [167] "04m12-B_S36_L001_R1_001.fastq"  "04m12-B_S36_L001_R2_001.fastq" 
## [169] "04m12-L_S18_L001_R1_001.fastq"  "04m12-L_S18_L001_R2_001.fastq" 
## [171] "04m12-S_S162_L001_R1_001.fastq" "04m12-S_S162_L001_R2_001.fastq"
## [173] "04m13-B_S48_L001_R1_001.fastq"  "04m13-B_S48_L001_R2_001.fastq" 
## [175] "04m13-L_S30_L001_R1_001.fastq"  "04m13-L_S30_L001_R2_001.fastq" 
## [177] "04m13-S_S174_L001_R1_001.fastq" "04m13-S_S174_L001_R2_001.fastq"
## [179] "04t01-B_S55_L001_R1_001.fastq"  "04t01-B_S55_L001_R2_001.fastq" 
## [181] "04t01-L_S37_L001_R1_001.fastq"  "04t01-L_S37_L001_R2_001.fastq" 
## [183] "04t01-S_S169_L001_R1_001.fastq" "04t01-S_S169_L001_R2_001.fastq"
## [185] "04t02-B_S67_L001_R1_001.fastq"  "04t02-B_S67_L001_R2_001.fastq" 
## [187] "04t02-L_S49_L001_R1_001.fastq"  "04t02-L_S49_L001_R2_001.fastq" 
## [189] "04t02-S_S181_L001_R1_001.fastq" "04t02-S_S181_L001_R2_001.fastq"
## [191] "04w01-B_S79_L001_R1_001.fastq"  "04w01-B_S79_L001_R2_001.fastq" 
## [193] "04w01-L_S61_L001_R1_001.fastq"  "04w01-L_S61_L001_R2_001.fastq" 
## [195] "04w01-S_S98_L001_R1_001.fastq"  "04w01-S_S98_L001_R2_001.fastq" 
## [197] "04w02-B_S91_L001_R1_001.fastq"  "04w02-B_S91_L001_R2_001.fastq" 
## [199] "04w02-L_S73_L001_R1_001.fastq"  "04w02-L_S73_L001_R2_001.fastq" 
## [201] "04w02-S_S110_L001_R1_001.fastq" "04w02-S_S110_L001_R2_001.fastq"
## [203] "04w03-B_S8_L001_R1_001.fastq"   "04w03-B_S8_L001_R2_001.fastq"  
## [205] "04w03-L_S85_L001_R1_001.fastq"  "04w03-L_S85_L001_R2_001.fastq" 
## [207] "04w03-S_S122_L001_R1_001.fastq" "04w03-S_S122_L001_R2_001.fastq"
## [209] "04w04-L_S2_L001_R1_001.fastq"   "04w04-L_S2_L001_R2_001.fastq"  
## [211] "04w04-S_S134_L001_R1_001.fastq" "04w04-S_S134_L001_R2_001.fastq"
## [213] "20m01-L_S14_L001_R1_001.fastq"  "20m01-L_S14_L001_R2_001.fastq" 
## [215] "20m01-S_S146_L001_R1_001.fastq" "20m01-S_S146_L001_R2_001.fastq"
## [217] "20m02-B_S44_L001_R1_001.fastq"  "20m02-B_S44_L001_R2_001.fastq" 
## [219] "20m02-L_S26_L001_R1_001.fastq"  "20m02-L_S26_L001_R2_001.fastq" 
## [221] "20m02-S_S158_L001_R1_001.fastq" "20m02-S_S158_L001_R2_001.fastq"
## [223] "20m02-W_S152_L001_R1_001.fastq" "20m02-W_S152_L001_R2_001.fastq"
## [225] "20m03-B_S56_L001_R1_001.fastq"  "20m03-B_S56_L001_R2_001.fastq" 
## [227] "20m03-L_S38_L001_R1_001.fastq"  "20m03-L_S38_L001_R2_001.fastq" 
## [229] "20m03-S_S170_L001_R1_001.fastq" "20m03-S_S170_L001_R2_001.fastq"
## [231] "20m04-B_S68_L001_R1_001.fastq"  "20m04-B_S68_L001_R2_001.fastq" 
## [233] "20m04-L_S50_L001_R1_001.fastq"  "20m04-L_S50_L001_R2_001.fastq" 
## [235] "20m04-S_S182_L001_R1_001.fastq" "20m04-S_S182_L001_R2_001.fastq"
## [237] "20m04-W_S164_L001_R1_001.fastq" "20m04-W_S164_L001_R2_001.fastq"
## [239] "20m05-B_S80_L001_R1_001.fastq"  "20m05-B_S80_L001_R2_001.fastq" 
## [241] "20m05-L_S62_L001_R1_001.fastq"  "20m05-L_S62_L001_R2_001.fastq" 
## [243] "20m05-S_S99_L001_R1_001.fastq"  "20m05-S_S99_L001_R2_001.fastq" 
## [245] "cutadapt"                       "filtN"                         
## [247] "sfm01-B_S60_L001_R1_001.fastq"  "sfm01-B_S60_L001_R2_001.fastq" 
## [249] "sfm01-L_S42_L001_R1_001.fastq"  "sfm01-L_S42_L001_R2_001.fastq" 
## [251] "sfm01-S_S186_L001_R1_001.fastq" "sfm01-S_S186_L001_R2_001.fastq"
## [253] "sfm02-B_S72_L001_R1_001.fastq"  "sfm02-B_S72_L001_R2_001.fastq" 
## [255] "sfm02-L_S54_L001_R1_001.fastq"  "sfm02-L_S54_L001_R2_001.fastq" 
## [257] "sfm02-S_S103_L001_R1_001.fastq" "sfm02-S_S103_L001_R2_001.fastq"
## [259] "sfm03-B_S84_L001_R1_001.fastq"  "sfm03-B_S84_L001_R2_001.fastq" 
## [261] "sfm03-L_S66_L001_R1_001.fastq"  "sfm03-L_S66_L001_R2_001.fastq" 
## [263] "sfm03-S_S115_L001_R1_001.fastq" "sfm03-S_S115_L001_R2_001.fastq"
## [265] "sfm04-B_S96_L001_R1_001.fastq"  "sfm04-B_S96_L001_R2_001.fastq" 
## [267] "sfm04-L_S78_L001_R1_001.fastq"  "sfm04-L_S78_L001_R2_001.fastq" 
## [269] "sfm04-S_S127_L001_R1_001.fastq" "sfm04-S_S127_L001_R2_001.fastq"
## [271] "sfm05-B_S109_L001_R1_001.fastq" "sfm05-B_S109_L001_R2_001.fastq"
## [273] "sfm05-L_S90_L001_R1_001.fastq"  "sfm05-L_S90_L001_R2_001.fastq" 
## [275] "sfm05-S_S139_L001_R1_001.fastq" "sfm05-S_S139_L001_R2_001.fastq"
## [277] "sfm06-B_S121_L001_R1_001.fastq" "sfm06-B_S121_L001_R2_001.fastq"
## [279] "sfm06-L_S7_L001_R1_001.fastq"   "sfm06-L_S7_L001_R2_001.fastq"  
## [281] "sfm06-S_S151_L001_R1_001.fastq" "sfm06-S_S151_L001_R2_001.fastq"
## [283] "sfm07-B_S133_L001_R1_001.fastq" "sfm07-B_S133_L001_R2_001.fastq"
## [285] "sfm07-L_S19_L001_R1_001.fastq"  "sfm07-L_S19_L001_R2_001.fastq" 
## [287] "sfm07-S_S163_L001_R1_001.fastq" "sfm07-S_S163_L001_R2_001.fastq"
## [289] "Sft01-B_S31_L001_R1_001.fastq"  "Sft01-B_S31_L001_R2_001.fastq" 
## [291] "Sft01-L_S13_L001_R1_001.fastq"  "Sft01-L_S13_L001_R2_001.fastq" 
## [293] "Sft01-S_S145_L001_R1_001.fastq" "Sft01-S_S145_L001_R2_001.fastq"
## [295] "Sft02-B_S43_L001_R1_001.fastq"  "Sft02-B_S43_L001_R2_001.fastq" 
## [297] "Sft02-L_S25_L001_R1_001.fastq"  "Sft02-L_S25_L001_R2_001.fastq" 
## [299] "Sft02-S_S157_L001_R1_001.fastq" "Sft02-S_S157_L001_R2_001.fastq"

Now that forward and reverse reads are loaded, we’ll sort and name the samples.

#Forward and reverse fastq filenames 
fnFs <- sort(list.files(path, pattern="_R1_001.fastq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2_001.fastq", full.names = TRUE))

library(stringr)
#Extract sample names
sample.names <- sapply(strsplit(basename(fnFs), "_L001_R1_001"), `[`, 1)
sample.names <- sapply(strsplit(basename(fnFs), "_S"), `[`, 1)
sample.names <- sapply(str_replace(sample.names, "-", "_"),`[`, 1)
sample.names <- sapply(paste0('kz.s_', sample.names),`[`, 1)
sample.names

##   kz.s_01m01_B   kz.s_01m01_L   kz.s_01m01_S   kz.s_01m02_B   kz.s_01m02_L 
## "kz.s_01m01_B" "kz.s_01m01_L" "kz.s_01m01_S" "kz.s_01m02_B" "kz.s_01m02_L" 
##   kz.s_01m02_S   kz.s_01m03_B   kz.s_01m03_L   kz.s_01m03_S   kz.s_01m03_W 
## "kz.s_01m02_S" "kz.s_01m03_B" "kz.s_01m03_L" "kz.s_01m03_S" "kz.s_01m03_W" 
##   kz.s_01m04_B   kz.s_01m04_L   kz.s_01m04_S   kz.s_01m05_B   kz.s_01m05_L 
## "kz.s_01m04_B" "kz.s_01m04_L" "kz.s_01m04_S" "kz.s_01m05_B" "kz.s_01m05_L" 
##   kz.s_01m05_S   kz.s_01m06_B   kz.s_01m06_L   kz.s_01m06_S   kz.s_02m01_B 
## "kz.s_01m05_S" "kz.s_01m06_B" "kz.s_01m06_L" "kz.s_01m06_S" "kz.s_02m01_B" 
##   kz.s_02m01_L   kz.s_02m01_S   kz.s_02m02_B   kz.s_02m02_L   kz.s_02m02_S 
## "kz.s_02m01_L" "kz.s_02m01_S" "kz.s_02m02_B" "kz.s_02m02_L" "kz.s_02m02_S" 
##   kz.s_02m03_B   kz.s_02m03_L   kz.s_02m03_S   kz.s_02m04_L   kz.s_02m04_S 
## "kz.s_02m03_B" "kz.s_02m03_L" "kz.s_02m03_S" "kz.s_02m04_L" "kz.s_02m04_S" 
##   kz.s_02m05_B   kz.s_02m05_L   kz.s_02m06_B   kz.s_02m06_L   kz.s_02m06_S 
## "kz.s_02m05_B" "kz.s_02m05_L" "kz.s_02m06_B" "kz.s_02m06_L" "kz.s_02m06_S" 
##   kz.s_02m07_B   kz.s_02m07_L   kz.s_02m07_S   kz.s_02m08_B   kz.s_02m08_L 
## "kz.s_02m07_B" "kz.s_02m07_L" "kz.s_02m07_S" "kz.s_02m08_B" "kz.s_02m08_L" 
##   kz.s_02m08_S   kz.s_02m09_B   kz.s_02m09_L   kz.s_02m09_S   kz.s_02m10_B 
## "kz.s_02m08_S" "kz.s_02m09_B" "kz.s_02m09_L" "kz.s_02m09_S" "kz.s_02m10_B" 
##   kz.s_02m10_L   kz.s_02m10_S   kz.s_02m11_B   kz.s_02m11_S   kz.s_02m11_W 
## "kz.s_02m10_L" "kz.s_02m10_S" "kz.s_02m11_B" "kz.s_02m11_S" "kz.s_02m11_W" 
##   kz.s_04m01_B   kz.s_04m01_L   kz.s_04m01_S   kz.s_04m02_B   kz.s_04m02_L 
## "kz.s_04m01_B" "kz.s_04m01_L" "kz.s_04m01_S" "kz.s_04m02_B" "kz.s_04m02_L" 
##   kz.s_04m02_S   kz.s_04m03_B   kz.s_04m03_L   kz.s_04m03_S   kz.s_04m04_B 
## "kz.s_04m02_S" "kz.s_04m03_B" "kz.s_04m03_L" "kz.s_04m03_S" "kz.s_04m04_B" 
##   kz.s_04m04_L   kz.s_04m04_S   kz.s_04m05_B   kz.s_04m05_L   kz.s_04m05_S 
## "kz.s_04m04_L" "kz.s_04m04_S" "kz.s_04m05_B" "kz.s_04m05_L" "kz.s_04m05_S" 
##   kz.s_04m06_B   kz.s_04m06_L   kz.s_04m06_S   kz.s_04m07_L   kz.s_04m07_S 
## "kz.s_04m06_B" "kz.s_04m06_L" "kz.s_04m06_S" "kz.s_04m07_L" "kz.s_04m07_S" 
##   kz.s_04m08_B   kz.s_04m08_L   kz.s_04m08_S   kz.s_04m09_B   kz.s_04m09_L 
## "kz.s_04m08_B" "kz.s_04m08_L" "kz.s_04m08_S" "kz.s_04m09_B" "kz.s_04m09_L" 
##   kz.s_04m09_S   kz.s_04m10_B   kz.s_04m10_L   kz.s_04m10_S   kz.s_04m11_B 
## "kz.s_04m09_S" "kz.s_04m10_B" "kz.s_04m10_L" "kz.s_04m10_S" "kz.s_04m11_B" 
##   kz.s_04m11_L   kz.s_04m11_S   kz.s_04m11_W   kz.s_04m12_B   kz.s_04m12_L 
## "kz.s_04m11_L" "kz.s_04m11_S" "kz.s_04m11_W" "kz.s_04m12_B" "kz.s_04m12_L" 
##   kz.s_04m12_S   kz.s_04m13_B   kz.s_04m13_L   kz.s_04m13_S   kz.s_04t01_B 
## "kz.s_04m12_S" "kz.s_04m13_B" "kz.s_04m13_L" "kz.s_04m13_S" "kz.s_04t01_B" 
##   kz.s_04t01_L   kz.s_04t01_S   kz.s_04t02_B   kz.s_04t02_L   kz.s_04t02_S 
## "kz.s_04t01_L" "kz.s_04t01_S" "kz.s_04t02_B" "kz.s_04t02_L" "kz.s_04t02_S" 
##   kz.s_04w01_B   kz.s_04w01_L   kz.s_04w01_S   kz.s_04w02_B   kz.s_04w02_L 
## "kz.s_04w01_B" "kz.s_04w01_L" "kz.s_04w01_S" "kz.s_04w02_B" "kz.s_04w02_L" 
##   kz.s_04w02_S   kz.s_04w03_B   kz.s_04w03_L   kz.s_04w03_S   kz.s_04w04_L 
## "kz.s_04w02_S" "kz.s_04w03_B" "kz.s_04w03_L" "kz.s_04w03_S" "kz.s_04w04_L" 
##   kz.s_04w04_S   kz.s_20m01_L   kz.s_20m01_S   kz.s_20m02_B   kz.s_20m02_L 
## "kz.s_04w04_S" "kz.s_20m01_L" "kz.s_20m01_S" "kz.s_20m02_B" "kz.s_20m02_L" 
##   kz.s_20m02_S   kz.s_20m02_W   kz.s_20m03_B   kz.s_20m03_L   kz.s_20m03_S 
## "kz.s_20m02_S" "kz.s_20m02_W" "kz.s_20m03_B" "kz.s_20m03_L" "kz.s_20m03_S" 
##   kz.s_20m04_B   kz.s_20m04_L   kz.s_20m04_S   kz.s_20m04_W   kz.s_20m05_B 
## "kz.s_20m04_B" "kz.s_20m04_L" "kz.s_20m04_S" "kz.s_20m04_W" "kz.s_20m05_B" 
##   kz.s_20m05_L   kz.s_20m05_S   kz.s_sfm01_B   kz.s_sfm01_L   kz.s_sfm01_S 
## "kz.s_20m05_L" "kz.s_20m05_S" "kz.s_sfm01_B" "kz.s_sfm01_L" "kz.s_sfm01_S" 
##   kz.s_sfm02_B   kz.s_sfm02_L   kz.s_sfm02_S   kz.s_sfm03_B   kz.s_sfm03_L 
## "kz.s_sfm02_B" "kz.s_sfm02_L" "kz.s_sfm02_S" "kz.s_sfm03_B" "kz.s_sfm03_L" 
##   kz.s_sfm03_S   kz.s_sfm04_B   kz.s_sfm04_L   kz.s_sfm04_S   kz.s_sfm05_B 
## "kz.s_sfm03_S" "kz.s_sfm04_B" "kz.s_sfm04_L" "kz.s_sfm04_S" "kz.s_sfm05_B" 
##   kz.s_sfm05_L   kz.s_sfm05_S   kz.s_sfm06_B   kz.s_sfm06_L   kz.s_sfm06_S 
## "kz.s_sfm05_L" "kz.s_sfm05_S" "kz.s_sfm06_B" "kz.s_sfm06_L" "kz.s_sfm06_S" 
##   kz.s_sfm07_B   kz.s_sfm07_L   kz.s_sfm07_S   kz.s_Sft01_B   kz.s_Sft01_L 
## "kz.s_sfm07_B" "kz.s_sfm07_L" "kz.s_sfm07_S" "kz.s_Sft01_B" "kz.s_Sft01_L" 
##   kz.s_Sft01_S   kz.s_Sft02_B   kz.s_Sft02_L   kz.s_Sft02_S 
## "kz.s_Sft01_S" "kz.s_Sft02_B" "kz.s_Sft02_L" "kz.s_Sft02_S"

Identify primers

I am adding this step to ensure that primers have been properly removed

#BITS
FWD <- "ACCTGCGGARGGATCA"  

## B58S3
REV <- "GAGATCCRTTGYTRAAAGTT" 

allOrients <- function(primer) {
    # Create all orientations of the input sequence
    require(Biostrings)
    dna <- DNAString(primer)  # The Biostrings works w/ DNAString objects rather than character vectors
    orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna), 
        RevComp = reverseComplement(dna))
    return(sapply(orients, toString))  # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
FWD.orients

##            Forward         Complement            Reverse            RevComp 
## "ACCTGCGGARGGATCA" "TGGACGCCTYCCTAGT" "ACTAGGRAGGCGTCCA" "TGATCCYTCCGCAGGT"

###The presence of ambiguous bases (Ns) in the sequencing reads makes accurate mapping of short primer sequences difficult. Next we are going to “pre-filter” the sequences just to remove those with Ns, but perform no other filtering.
fnFs.filtN <- file.path(path, "filtN", basename(fnFs)) 
# Put N-filterd files in filtN/ subdirectory
fnRs.filtN <- file.path(path, "filtN", basename(fnRs))
filterAndTrim(fnFs, fnFs.filtN, fnRs, fnRs.filtN, maxN = 0, multithread = TRUE)

#We are now ready to count the number of times the primers appear in the forward and reverse read, while considering all possible primer orientations. Identifying and counting the primers on one set of paired end FASTQ files is sufficient, assuming all the files were created using the same library preparation, so we’ll just process the first sample.

primerHits <- function(primer, fn) {
    # Counts number of reads in which the primer is found
    nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
    return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[[1]]), 
    FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[[1]]), 
    REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[[1]]), 
    REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[[1]]))

##                  Forward Complement Reverse RevComp
## FWD.ForwardReads       0          0       0       0
## FWD.ReverseReads       0          0       0    9753
## REV.ForwardReads       0          0       0   13874
## REV.ReverseReads       0          0       0       0

             Forward Complement Reverse RevComp

FWD.ForwardReads 0 0 0 0 FWD.ReverseReads 0 0 0 9753 REV.ForwardReads 0 0 0 13874 REV.ReverseReads 0 0 0 0

#cutadapt <- "/Users/kuehnlab/opt/anaconda3/envs/cutadaptenv/bin/cutadapt" # CHANGE ME to the cutadapt path on your machine
cutadapt <- "/Users/kuehnlab/miniconda3/envs/qiime2-2022.11/bin/cutadapt"

system2(cutadapt, args = "--version") # Run shell commands from R

NOTE: -m 45 == minimum length of 45, because my I’d like to go ahead and drop those itty bitty reads altogether.

path.cut <- file.path(path, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))

FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV.RC) 
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV, "-A", FWD.RC) 
# Run Cutadapt

### Note, Charlie initially added argument "-m 1" below to remove remnant reads of length zero. Now attempting with min length more stringent min length, something just below downstream minimum length, e.g. 45 is below downstream length of 50...

for(i in seq_along(fnFs)) {
  system2(cutadapt, args = c(R1.flags, R2.flags, "-m", 45, "-n", 2, # -n 2 required to remove FWD and REV from reads
                             "-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
                             fnFs.filtN[i], fnRs.filtN[i])) # input files
}

###Charlie note: The output gives a warning to check the adapter sequences: WARNING: The adapter is preceded by 'A' extremely often. The provided adapter sequence could be incomplete at its 5' end. Ignore this warning when trimming primers.

HOWEVER, we can Ignore this warning , because we’re trimming primers. Adapters are on the outside of the primers, so by trimming primers, we are without a doubt also trimming off adapters. So just ignore the warning, we’re good.

#Sanity check, did it work?

rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]), 
    FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]), 
    REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]), 
    REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))

##                  Forward Complement Reverse RevComp
## FWD.ForwardReads       0          0       0       0
## FWD.ReverseReads       0          0       0       0
## REV.ForwardReads       0          0       0       0
## REV.ReverseReads       0          0       0       0

# Forward and reverse fastq filenames have the format:
cutFs <- sort(list.files(path.cut, pattern = "_R1_001.fastq", full.names = TRUE))
cutRs <- sort(list.files(path.cut, pattern = "_R2_001.fastq", full.names = TRUE))

# Extract sample names, assuming filenames have format:
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.names <- unname(sapply(cutFs, get.sample.name))
head(sample.names)

## [1] "01m01-B" "01m01-L" "01m01-S" "01m02-B" "01m02-L" "01m02-S"

library(stringr)
#Extract sample names
sample.names <- sapply(str_replace(sample.names, "-", "_"),`[`, 1)
#sample.names <- sapply(paste0('kz.s_', sample.names),`[`, 1)
sample.names

##   01m01_B   01m01_L   01m01_S   01m02_B   01m02_L   01m02_S   01m03_B   01m03_L 
## "01m01_B" "01m01_L" "01m01_S" "01m02_B" "01m02_L" "01m02_S" "01m03_B" "01m03_L" 
##   01m03_S   01m03_W   01m04_B   01m04_L   01m04_S   01m05_B   01m05_L   01m05_S 
## "01m03_S" "01m03_W" "01m04_B" "01m04_L" "01m04_S" "01m05_B" "01m05_L" "01m05_S" 
##   01m06_B   01m06_L   01m06_S   02m01_B   02m01_L   02m01_S   02m02_B   02m02_L 
## "01m06_B" "01m06_L" "01m06_S" "02m01_B" "02m01_L" "02m01_S" "02m02_B" "02m02_L" 
##   02m02_S   02m03_B   02m03_L   02m03_S   02m04_L   02m04_S   02m05_B   02m05_L 
## "02m02_S" "02m03_B" "02m03_L" "02m03_S" "02m04_L" "02m04_S" "02m05_B" "02m05_L" 
##   02m06_B   02m06_L   02m06_S   02m07_B   02m07_L   02m07_S   02m08_B   02m08_L 
## "02m06_B" "02m06_L" "02m06_S" "02m07_B" "02m07_L" "02m07_S" "02m08_B" "02m08_L" 
##   02m08_S   02m09_B   02m09_L   02m09_S   02m10_B   02m10_L   02m10_S   02m11_B 
## "02m08_S" "02m09_B" "02m09_L" "02m09_S" "02m10_B" "02m10_L" "02m10_S" "02m11_B" 
##   02m11_S   02m11_W   04m01_B   04m01_L   04m01_S   04m02_B   04m02_L   04m02_S 
## "02m11_S" "02m11_W" "04m01_B" "04m01_L" "04m01_S" "04m02_B" "04m02_L" "04m02_S" 
##   04m03_B   04m03_L   04m03_S   04m04_B   04m04_L   04m04_S   04m05_B   04m05_L 
## "04m03_B" "04m03_L" "04m03_S" "04m04_B" "04m04_L" "04m04_S" "04m05_B" "04m05_L" 
##   04m05_S   04m06_B   04m06_L   04m06_S   04m07_L   04m07_S   04m08_B   04m08_L 
## "04m05_S" "04m06_B" "04m06_L" "04m06_S" "04m07_L" "04m07_S" "04m08_B" "04m08_L" 
##   04m08_S   04m09_B   04m09_L   04m09_S   04m10_B   04m10_L   04m10_S   04m11_B 
## "04m08_S" "04m09_B" "04m09_L" "04m09_S" "04m10_B" "04m10_L" "04m10_S" "04m11_B" 
##   04m11_L   04m11_S   04m11_W   04m12_B   04m12_L   04m12_S   04m13_B   04m13_L 
## "04m11_L" "04m11_S" "04m11_W" "04m12_B" "04m12_L" "04m12_S" "04m13_B" "04m13_L" 
##   04m13_S   04t01_B   04t01_L   04t01_S   04t02_B   04t02_L   04t02_S   04w01_B 
## "04m13_S" "04t01_B" "04t01_L" "04t01_S" "04t02_B" "04t02_L" "04t02_S" "04w01_B" 
##   04w01_L   04w01_S   04w02_B   04w02_L   04w02_S   04w03_B   04w03_L   04w03_S 
## "04w01_L" "04w01_S" "04w02_B" "04w02_L" "04w02_S" "04w03_B" "04w03_L" "04w03_S" 
##   04w04_L   04w04_S   20m01_L   20m01_S   20m02_B   20m02_L   20m02_S   20m02_W 
## "04w04_L" "04w04_S" "20m01_L" "20m01_S" "20m02_B" "20m02_L" "20m02_S" "20m02_W" 
##   20m03_B   20m03_L   20m03_S   20m04_B   20m04_L   20m04_S   20m04_W   20m05_B 
## "20m03_B" "20m03_L" "20m03_S" "20m04_B" "20m04_L" "20m04_S" "20m04_W" "20m05_B" 
##   20m05_L   20m05_S   sfm01_B   sfm01_L   sfm01_S   sfm02_B   sfm02_L   sfm02_S 
## "20m05_L" "20m05_S" "sfm01_B" "sfm01_L" "sfm01_S" "sfm02_B" "sfm02_L" "sfm02_S" 
##   sfm03_B   sfm03_L   sfm03_S   sfm04_B   sfm04_L   sfm04_S   sfm05_B   sfm05_L 
## "sfm03_B" "sfm03_L" "sfm03_S" "sfm04_B" "sfm04_L" "sfm04_S" "sfm05_B" "sfm05_L" 
##   sfm05_S   sfm06_B   sfm06_L   sfm06_S   sfm07_B   sfm07_L   sfm07_S   Sft01_B 
## "sfm05_S" "sfm06_B" "sfm06_L" "sfm06_S" "sfm07_B" "sfm07_L" "sfm07_S" "Sft01_B" 
##   Sft01_L   Sft01_S   Sft02_B   Sft02_L   Sft02_S 
## "Sft01_L" "Sft01_S" "Sft02_B" "Sft02_L" "Sft02_S"

Inspect and Filter Sequence Quality

#Inspect read quality profiles for forward reads
plotQualityProfile(cutFs[c(6,12,36,62,84,120)])

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the dada2 package.
##   Please report the issue at <https://github.com/benjjneb/dada2/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#Inspect read quality profiles for reverse reads
plotQualityProfile(cutRs[c(6,12,36,62,84,120)])

Setting file paths for the samples we’re about to trim:

#Filter and Trim
filtFs <- file.path(path.cut, "filtered", basename(cutFs))
filtRs <- file.path(path.cut, "filtered", basename(cutRs))

names(filtFs) <- sample.names
names(filtRs) <- sample.names

and now for the trimming, note that I set multi-thread to TRUE because I am on a Mac. You’ll have to set that to FALSE if you’re on a PC. The default expected error (maxEE) is 2, but I’ve relaxed them here to reflect our lower quality data. Cranking

##Trim trim trim!!
#Filter reads where both sets maintain a quality score ~30+ (error rate 1 in 1,000)
## Mac can multi-threat this, windows cannot ##
out <- filterAndTrim(cutFs, filtFs, cutRs, filtRs, 
                     #truncLen=c(220,120),minLen=50, 
                     maxN=0, maxEE=c(2,3), truncQ=2, rm.phix=TRUE,
                     compress=TRUE, multithread=TRUE) 
out

##                                reads.in reads.out
## 01m01-B_S34_L001_R1_001.fastq     43975     35155
## 01m01-L_S4_L001_R1_001.fastq      46129     21641
## 01m01-S_S148_L001_R1_001.fastq    27321     17639
## 01m02-B_S46_L001_R1_001.fastq     76462     48015
## 01m02-L_S16_L001_R1_001.fastq     76187     49960
## 01m02-S_S160_L001_R1_001.fastq     8234      5196
## 01m03-B_S58_L001_R1_001.fastq     60429     47185
## 01m03-L_S28_L001_R1_001.fastq     83613     43967
## 01m03-S_S172_L001_R1_001.fastq    45097     30592
## 01m03-W_S142_L001_R1_001.fastq     5784      4212
## 01m04-B_S70_L001_R1_001.fastq     35636     26533
## 01m04-L_S40_L001_R1_001.fastq     50393     25707
## 01m04-S_S184_L001_R1_001.fastq    37777     26690
## 01m05-B_S82_L001_R1_001.fastq     59978     42177
## 01m05-L_S52_L001_R1_001.fastq     86716     44488
## 01m05-S_S101_L001_R1_001.fastq    25576     17428
## 01m06-B_S94_L001_R1_001.fastq     24238     16770
## 01m06-L_S64_L001_R1_001.fastq     74432     43527
## 01m06-S_S113_L001_R1_001.fastq    38987     24338
## 02m01-B_S92_L001_R1_001.fastq     34548     29051
## 02m01-L_S74_L001_R1_001.fastq     76393     42366
## 02m01-S_S111_L001_R1_001.fastq    40790     31940
## 02m02-B_S9_L001_R1_001.fastq      43955     31931
## 02m02-L_S86_L001_R1_001.fastq     48777     24853
## 02m02-S_S123_L001_R1_001.fastq    35672     26983
## 02m03-B_S21_L001_R1_001.fastq     76013     45518
## 02m03-L_S3_L001_R1_001.fastq      51031     33325
## 02m03-S_S135_L001_R1_001.fastq    64523     49053
## 02m04-L_S15_L001_R1_001.fastq     63636     44504
## 02m04-S_S147_L001_R1_001.fastq    22136     15866
## 02m05-B_S45_L001_R1_001.fastq     56844     44683
## 02m05-L_S27_L001_R1_001.fastq     98563     65773
## 02m06-B_S57_L001_R1_001.fastq     75874     50930
## 02m06-L_S39_L001_R1_001.fastq     76059     55956
## 02m06-S_S171_L001_R1_001.fastq    55988     38555
## 02m07-B_S69_L001_R1_001.fastq     62143     45774
## 02m07-L_S51_L001_R1_001.fastq    118246     76496
## 02m07-S_S183_L001_R1_001.fastq    75934     54335
## 02m08-B_S81_L001_R1_001.fastq     40275     33122
## 02m08-L_S63_L001_R1_001.fastq     86204     51111
## 02m08-S_S100_L001_R1_001.fastq     9311      6092
## 02m09-B_S93_L001_R1_001.fastq     43859     34324
## 02m09-L_S75_L001_R1_001.fastq     90511     59295
## 02m09-S_S112_L001_R1_001.fastq    30696     22450
## 02m10-B_S10_L001_R1_001.fastq     44514     28590
## 02m10-L_S87_L001_R1_001.fastq     69759     33581
## 02m10-S_S124_L001_R1_001.fastq    46159     30020
## 02m11-B_S22_L001_R1_001.fastq     36351     25529
## 02m11-S_S136_L001_R1_001.fastq    56263     41257
## 02m11-W_S106_L001_R1_001.fastq     7937      3909
## 04m01-B_S11_L001_R1_001.fastq     78076     48864
## 04m01-L_S76_L001_R1_001.fastq     92460     38731
## 04m01-S_S125_L001_R1_001.fastq    52804     34502
## 04m02-B_S23_L001_R1_001.fastq     55827     37210
## 04m02-L_S88_L001_R1_001.fastq     68735     38968
## 04m02-S_S137_L001_R1_001.fastq    50943     37498
## 04m03-B_S35_L001_R1_001.fastq     78294     50963
## 04m03-L_S5_L001_R1_001.fastq      18335      7496
## 04m03-S_S149_L001_R1_001.fastq    43851     27207
## 04m04-B_S47_L001_R1_001.fastq     69126     50264
## 04m04-L_S17_L001_R1_001.fastq     47853     32014
## 04m04-S_S161_L001_R1_001.fastq    22364     17493
## 04m05-B_S59_L001_R1_001.fastq     74954     57688
## 04m05-L_S29_L001_R1_001.fastq     68377     34665
## 04m05-S_S173_L001_R1_001.fastq    27680     16379
## 04m06-B_S71_L001_R1_001.fastq     34662     27772
## 04m06-L_S41_L001_R1_001.fastq     75006     44800
## 04m06-S_S185_L001_R1_001.fastq    29846     20424
## 04m07-L_S53_L001_R1_001.fastq     62504     32670
## 04m07-S_S102_L001_R1_001.fastq    29284     23360
## 04m08-B_S83_L001_R1_001.fastq      3731      2517
## 04m08-L_S65_L001_R1_001.fastq     50221     36940
## 04m08-S_S114_L001_R1_001.fastq    17655     12813
## 04m09-B_S95_L001_R1_001.fastq     50351     40705
## 04m09-L_S77_L001_R1_001.fastq     58023     30525
## 04m09-S_S126_L001_R1_001.fastq    13880      8955
## 04m10-B_S12_L001_R1_001.fastq     74336     43139
## 04m10-L_S89_L001_R1_001.fastq     43184      8014
## 04m10-S_S138_L001_R1_001.fastq    59960     43071
## 04m11-B_S24_L001_R1_001.fastq     59008     38772
## 04m11-L_S6_L001_R1_001.fastq      52224     32835
## 04m11-S_S150_L001_R1_001.fastq    44390     29117
## 04m11-W_S108_L001_R1_001.fastq    11267      6099
## 04m12-B_S36_L001_R1_001.fastq     51883     41966
## 04m12-L_S18_L001_R1_001.fastq     43673     26574
## 04m12-S_S162_L001_R1_001.fastq    48382     33630
## 04m13-B_S48_L001_R1_001.fastq     42861     11062
## 04m13-L_S30_L001_R1_001.fastq     96874     67399
## 04m13-S_S174_L001_R1_001.fastq    43386     27330
## 04t01-B_S55_L001_R1_001.fastq     32696     17862
## 04t01-L_S37_L001_R1_001.fastq     63732     39008
## 04t01-S_S169_L001_R1_001.fastq    21724     12956
## 04t02-B_S67_L001_R1_001.fastq     70267     51565
## 04t02-L_S49_L001_R1_001.fastq     85594     52321
## 04t02-S_S181_L001_R1_001.fastq    18697     14196
## 04w01-B_S79_L001_R1_001.fastq     49036     24535
## 04w01-L_S61_L001_R1_001.fastq     93652     53797
## 04w01-S_S98_L001_R1_001.fastq     65217     43712
## 04w02-B_S91_L001_R1_001.fastq     15274     10167
## 04w02-L_S73_L001_R1_001.fastq     78463     40381
## 04w02-S_S110_L001_R1_001.fastq    63734     43429
## 04w03-B_S8_L001_R1_001.fastq      35725     24927
## 04w03-L_S85_L001_R1_001.fastq     46078     27475
## 04w03-S_S122_L001_R1_001.fastq    61651     38401
## 04w04-L_S2_L001_R1_001.fastq      59198     41100
## 04w04-S_S134_L001_R1_001.fastq    23573     19202
## 20m01-L_S14_L001_R1_001.fastq     60831     41520
## 20m01-S_S146_L001_R1_001.fastq    45584     25085
## 20m02-B_S44_L001_R1_001.fastq     43946     26292
## 20m02-L_S26_L001_R1_001.fastq     53083     35968
## 20m02-S_S158_L001_R1_001.fastq    30027     18465
## 20m02-W_S152_L001_R1_001.fastq   104730     26660
## 20m03-B_S56_L001_R1_001.fastq     37303     17611
## 20m03-L_S38_L001_R1_001.fastq     76136     51384
## 20m03-S_S170_L001_R1_001.fastq    51600     27714
## 20m04-B_S68_L001_R1_001.fastq     61875     48388
## 20m04-L_S50_L001_R1_001.fastq     93897     64032
## 20m04-S_S182_L001_R1_001.fastq    81142     55204
## 20m04-W_S164_L001_R1_001.fastq    33894     27346
## 20m05-B_S80_L001_R1_001.fastq     54867     41113
## 20m05-L_S62_L001_R1_001.fastq     70338     38147
## 20m05-S_S99_L001_R1_001.fastq     20796     15475
## sfm01-B_S60_L001_R1_001.fastq     59667     40562
## sfm01-L_S42_L001_R1_001.fastq     64524     36699
## sfm01-S_S186_L001_R1_001.fastq    21906     14603
## sfm02-B_S72_L001_R1_001.fastq     72687     55736
## sfm02-L_S54_L001_R1_001.fastq    100754     41594
## sfm02-S_S103_L001_R1_001.fastq    54966     37870
## sfm03-B_S84_L001_R1_001.fastq     43139     25719
## sfm03-L_S66_L001_R1_001.fastq     95030     48888
## sfm03-S_S115_L001_R1_001.fastq     3453      2427
## sfm04-B_S96_L001_R1_001.fastq     37509     23067
## sfm04-L_S78_L001_R1_001.fastq     93703     54593
## sfm04-S_S127_L001_R1_001.fastq    53871     36920
## sfm05-B_S109_L001_R1_001.fastq    62392     48030
## sfm05-L_S90_L001_R1_001.fastq     81246     35565
## sfm05-S_S139_L001_R1_001.fastq    37735     27047
## sfm06-B_S121_L001_R1_001.fastq    25301     19708
## sfm06-L_S7_L001_R1_001.fastq      37064     25171
## sfm06-S_S151_L001_R1_001.fastq    36389     22485
## sfm07-B_S133_L001_R1_001.fastq    71696     52955
## sfm07-L_S19_L001_R1_001.fastq     57542     34494
## sfm07-S_S163_L001_R1_001.fastq    37167     24037
## Sft01-B_S31_L001_R1_001.fastq     74753     51304
## Sft01-L_S13_L001_R1_001.fastq     62788     40703
## Sft01-S_S145_L001_R1_001.fastq    47648     30106
## Sft02-B_S43_L001_R1_001.fastq     82307     66158
## Sft02-L_S25_L001_R1_001.fastq    100297     63591
## Sft02-S_S157_L001_R1_001.fastq    32398     21410

Now to inspect our filtered sequences:

#Re-inspect read quality profiles
plotQualityProfile(filtFs[c(6,12,36,62,84,120)])

#Re-inspect read quality profiles
plotQualityProfile(filtRs[c(6,12,36,62,84,120)])

Now to track down errors and remove them.

Generate an Error Model

Buckle up, Chuck, this here Machine Learning Algorithm learnErrors will generate a parametric error model for the filtered sequence data. Dune.

#Learn error rates
#!!~30 min each!!
errF <- learnErrors(filtFs, multithread=TRUE)

## 100430093 total bases in 503355 reads from 17 samples will be used for learning the error rates.

#Learn error rates
#!!~30 min each!!
errR <- learnErrors(filtRs, multithread=TRUE)

## 100896686 total bases in 503355 reads from 17 samples will be used for learning the error rates.

Plotting errors

#plot error forward
plotErrors(errF, nominalQ=TRUE)

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

#plot error reverse
plotErrors(errR, nominalQ=TRUE)

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

#Dereplication Dereplications means the computer only needs to analyse each identical sequence once, lumping anlyses and saving brain cells.

#Dereplication
derep_forward <- derepFastq(filtFs, verbose=TRUE)
derep_reverse <- derepFastq(filtRs, verbose=TRUE)
names(derep_forward) <- sample.names
names(derep_reverse) <- sample.names

##Infer ASVs

#Inferring ASVs
#Pool or pseudopool if particularly interested in rare taxa (e.g. singletons)

dadaFs <- dada(derep_forward, err=errF, pool="pseudo", multithread=TRUE)

## Sample 1 - 35155 reads in 7599 unique sequences.
## Sample 2 - 21641 reads in 5481 unique sequences.
## Sample 3 - 17639 reads in 5387 unique sequences.
## Sample 4 - 48015 reads in 11413 unique sequences.
## Sample 5 - 49960 reads in 10715 unique sequences.
## Sample 6 - 5196 reads in 1594 unique sequences.
## Sample 7 - 47185 reads in 10913 unique sequences.
## Sample 8 - 43967 reads in 7861 unique sequences.
## Sample 9 - 30592 reads in 8917 unique sequences.
## Sample 10 - 4212 reads in 939 unique sequences.
## Sample 11 - 26533 reads in 5865 unique sequences.
## Sample 12 - 25707 reads in 5671 unique sequences.
## Sample 13 - 26690 reads in 7788 unique sequences.
## Sample 14 - 42177 reads in 9790 unique sequences.
## Sample 15 - 44488 reads in 10074 unique sequences.
## Sample 16 - 17428 reads in 5966 unique sequences.
## Sample 17 - 16770 reads in 4587 unique sequences.
## Sample 18 - 43527 reads in 7969 unique sequences.
## Sample 19 - 24338 reads in 7825 unique sequences.
## Sample 20 - 29051 reads in 6106 unique sequences.
## Sample 21 - 42366 reads in 9959 unique sequences.
## Sample 22 - 31940 reads in 7605 unique sequences.
## Sample 23 - 31931 reads in 7066 unique sequences.
## Sample 24 - 24853 reads in 6108 unique sequences.
## Sample 25 - 26983 reads in 7225 unique sequences.
## Sample 26 - 45518 reads in 9536 unique sequences.
## Sample 27 - 33325 reads in 7030 unique sequences.
## Sample 28 - 49053 reads in 12983 unique sequences.
## Sample 29 - 44504 reads in 10216 unique sequences.
## Sample 30 - 15866 reads in 4942 unique sequences.
## Sample 31 - 44683 reads in 10600 unique sequences.
## Sample 32 - 65773 reads in 16371 unique sequences.
## Sample 33 - 50930 reads in 13615 unique sequences.
## Sample 34 - 55956 reads in 12619 unique sequences.
## Sample 35 - 38555 reads in 11302 unique sequences.
## Sample 36 - 45774 reads in 13015 unique sequences.
## Sample 37 - 76496 reads in 13285 unique sequences.
## Sample 38 - 54335 reads in 14752 unique sequences.
## Sample 39 - 33122 reads in 5396 unique sequences.
## Sample 40 - 51111 reads in 11328 unique sequences.
## Sample 41 - 6092 reads in 2116 unique sequences.
## Sample 42 - 34324 reads in 6540 unique sequences.
## Sample 43 - 59295 reads in 10301 unique sequences.
## Sample 44 - 22450 reads in 5700 unique sequences.
## Sample 45 - 28590 reads in 6320 unique sequences.
## Sample 46 - 33581 reads in 6108 unique sequences.
## Sample 47 - 30020 reads in 10896 unique sequences.
## Sample 48 - 25529 reads in 5282 unique sequences.
## Sample 49 - 41257 reads in 13471 unique sequences.
## Sample 50 - 3909 reads in 829 unique sequences.
## Sample 51 - 48864 reads in 12472 unique sequences.
## Sample 52 - 38731 reads in 8470 unique sequences.
## Sample 53 - 34502 reads in 11814 unique sequences.
## Sample 54 - 37210 reads in 9353 unique sequences.
## Sample 55 - 38968 reads in 10800 unique sequences.
## Sample 56 - 37498 reads in 11004 unique sequences.
## Sample 57 - 50963 reads in 11834 unique sequences.
## Sample 58 - 7496 reads in 2417 unique sequences.
## Sample 59 - 27207 reads in 10317 unique sequences.
## Sample 60 - 50264 reads in 14496 unique sequences.
## Sample 61 - 32014 reads in 7671 unique sequences.
## Sample 62 - 17493 reads in 4833 unique sequences.
## Sample 63 - 57688 reads in 14843 unique sequences.
## Sample 64 - 34665 reads in 7597 unique sequences.
## Sample 65 - 16379 reads in 5737 unique sequences.
## Sample 66 - 27772 reads in 6224 unique sequences.
## Sample 67 - 44800 reads in 10815 unique sequences.
## Sample 68 - 20424 reads in 6864 unique sequences.
## Sample 69 - 32670 reads in 7476 unique sequences.
## Sample 70 - 23360 reads in 6992 unique sequences.
## Sample 71 - 2517 reads in 644 unique sequences.
## Sample 72 - 36940 reads in 9443 unique sequences.
## Sample 73 - 12813 reads in 3970 unique sequences.
## Sample 74 - 40705 reads in 9377 unique sequences.
## Sample 75 - 30525 reads in 7366 unique sequences.
## Sample 76 - 8955 reads in 3180 unique sequences.
## Sample 77 - 43139 reads in 9379 unique sequences.
## Sample 78 - 8014 reads in 2568 unique sequences.
## Sample 79 - 43071 reads in 13201 unique sequences.
## Sample 80 - 38772 reads in 8791 unique sequences.
## Sample 81 - 32835 reads in 9333 unique sequences.
## Sample 82 - 29117 reads in 10389 unique sequences.
## Sample 83 - 6099 reads in 1602 unique sequences.
## Sample 84 - 41966 reads in 7071 unique sequences.
## Sample 85 - 26574 reads in 4630 unique sequences.
## Sample 86 - 33630 reads in 10155 unique sequences.
## Sample 87 - 11062 reads in 2752 unique sequences.
## Sample 88 - 67399 reads in 10744 unique sequences.
## Sample 89 - 27330 reads in 8238 unique sequences.
## Sample 90 - 17862 reads in 5454 unique sequences.
## Sample 91 - 39008 reads in 8378 unique sequences.
## Sample 92 - 12956 reads in 4208 unique sequences.
## Sample 93 - 51565 reads in 13343 unique sequences.
## Sample 94 - 52321 reads in 12895 unique sequences.
## Sample 95 - 14196 reads in 4309 unique sequences.
## Sample 96 - 24535 reads in 5767 unique sequences.
## Sample 97 - 53797 reads in 8268 unique sequences.
## Sample 98 - 43712 reads in 12838 unique sequences.
## Sample 99 - 10167 reads in 2482 unique sequences.
## Sample 100 - 40381 reads in 8852 unique sequences.
## Sample 101 - 43429 reads in 13556 unique sequences.
## Sample 102 - 24927 reads in 5558 unique sequences.
## Sample 103 - 27475 reads in 4081 unique sequences.
## Sample 104 - 38401 reads in 11979 unique sequences.
## Sample 105 - 41100 reads in 8316 unique sequences.
## Sample 106 - 19202 reads in 4113 unique sequences.
## Sample 107 - 41520 reads in 9419 unique sequences.
## Sample 108 - 25085 reads in 8659 unique sequences.
## Sample 109 - 26292 reads in 7411 unique sequences.
## Sample 110 - 35968 reads in 5852 unique sequences.
## Sample 111 - 18465 reads in 6403 unique sequences.
## Sample 112 - 26660 reads in 8839 unique sequences.
## Sample 113 - 17611 reads in 7322 unique sequences.
## Sample 114 - 51384 reads in 9978 unique sequences.
## Sample 115 - 27714 reads in 8668 unique sequences.
## Sample 116 - 48388 reads in 11825 unique sequences.
## Sample 117 - 64032 reads in 11718 unique sequences.
## Sample 118 - 55204 reads in 14667 unique sequences.
## Sample 119 - 27346 reads in 5977 unique sequences.
## Sample 120 - 41113 reads in 7377 unique sequences.
## Sample 121 - 38147 reads in 8952 unique sequences.
## Sample 122 - 15475 reads in 5501 unique sequences.
## Sample 123 - 40562 reads in 10360 unique sequences.
## Sample 124 - 36699 reads in 9138 unique sequences.
## Sample 125 - 14603 reads in 5395 unique sequences.
## Sample 126 - 55736 reads in 13993 unique sequences.
## Sample 127 - 41594 reads in 8183 unique sequences.
## Sample 128 - 37870 reads in 10587 unique sequences.
## Sample 129 - 25719 reads in 5532 unique sequences.
## Sample 130 - 48888 reads in 11002 unique sequences.
## Sample 131 - 2427 reads in 732 unique sequences.
## Sample 132 - 23067 reads in 5778 unique sequences.
## Sample 133 - 54593 reads in 16248 unique sequences.
## Sample 134 - 36920 reads in 11459 unique sequences.
## Sample 135 - 48030 reads in 14138 unique sequences.
## Sample 136 - 35565 reads in 9469 unique sequences.
## Sample 137 - 27047 reads in 7354 unique sequences.
## Sample 138 - 19708 reads in 4737 unique sequences.
## Sample 139 - 25171 reads in 7147 unique sequences.
## Sample 140 - 22485 reads in 8064 unique sequences.
## Sample 141 - 52955 reads in 16457 unique sequences.
## Sample 142 - 34494 reads in 9953 unique sequences.
## Sample 143 - 24037 reads in 8138 unique sequences.
## Sample 144 - 51304 reads in 14735 unique sequences.
## Sample 145 - 40703 reads in 9651 unique sequences.
## Sample 146 - 30106 reads in 11318 unique sequences.
## Sample 147 - 66158 reads in 14299 unique sequences.
## Sample 148 - 63591 reads in 14108 unique sequences.
## Sample 149 - 21410 reads in 6913 unique sequences.
## 
##    selfConsist step 2

dadaRs <- dada(derep_reverse, err=errR, pool="pseudo", multithread=TRUE)

## Sample 1 - 35155 reads in 11437 unique sequences.
## Sample 2 - 21641 reads in 12686 unique sequences.
## Sample 3 - 17639 reads in 9308 unique sequences.
## Sample 4 - 48015 reads in 20405 unique sequences.
## Sample 5 - 49960 reads in 20340 unique sequences.
## Sample 6 - 5196 reads in 2601 unique sequences.
## Sample 7 - 47185 reads in 16310 unique sequences.
## Sample 8 - 43967 reads in 19406 unique sequences.
## Sample 9 - 30592 reads in 15757 unique sequences.
## Sample 10 - 4212 reads in 1256 unique sequences.
## Sample 11 - 26533 reads in 8925 unique sequences.
## Sample 12 - 25707 reads in 13050 unique sequences.
## Sample 13 - 26690 reads in 12716 unique sequences.
## Sample 14 - 42177 reads in 19209 unique sequences.
## Sample 15 - 44488 reads in 21799 unique sequences.
## Sample 16 - 17428 reads in 9171 unique sequences.
## Sample 17 - 16770 reads in 6970 unique sequences.
## Sample 18 - 43527 reads in 19214 unique sequences.
## Sample 19 - 24338 reads in 12255 unique sequences.
## Sample 20 - 29051 reads in 8740 unique sequences.
## Sample 21 - 42366 reads in 22893 unique sequences.
## Sample 22 - 31940 reads in 11598 unique sequences.
## Sample 23 - 31931 reads in 11927 unique sequences.
## Sample 24 - 24853 reads in 10001 unique sequences.
## Sample 25 - 26983 reads in 10302 unique sequences.
## Sample 26 - 45518 reads in 20988 unique sequences.
## Sample 27 - 33325 reads in 15302 unique sequences.
## Sample 28 - 49053 reads in 18010 unique sequences.
## Sample 29 - 44504 reads in 19442 unique sequences.
## Sample 30 - 15866 reads in 7613 unique sequences.
## Sample 31 - 44683 reads in 17108 unique sequences.
## Sample 32 - 65773 reads in 27050 unique sequences.
## Sample 33 - 50930 reads in 21911 unique sequences.
## Sample 34 - 55956 reads in 21036 unique sequences.
## Sample 35 - 38555 reads in 18003 unique sequences.
## Sample 36 - 45774 reads in 19748 unique sequences.
## Sample 37 - 76496 reads in 33556 unique sequences.
## Sample 38 - 54335 reads in 23710 unique sequences.
## Sample 39 - 33122 reads in 12574 unique sequences.
## Sample 40 - 51111 reads in 25272 unique sequences.
## Sample 41 - 6092 reads in 3648 unique sequences.
## Sample 42 - 34324 reads in 11370 unique sequences.
## Sample 43 - 59295 reads in 27134 unique sequences.
## Sample 44 - 22450 reads in 8829 unique sequences.
## Sample 45 - 28590 reads in 12053 unique sequences.
## Sample 46 - 33581 reads in 15618 unique sequences.
## Sample 47 - 30020 reads in 15598 unique sequences.
## Sample 48 - 25529 reads in 9946 unique sequences.
## Sample 49 - 41257 reads in 18637 unique sequences.
## Sample 50 - 3909 reads in 1701 unique sequences.
## Sample 51 - 48864 reads in 23572 unique sequences.
## Sample 52 - 38731 reads in 24311 unique sequences.
## Sample 53 - 34502 reads in 16904 unique sequences.
## Sample 54 - 37210 reads in 17287 unique sequences.
## Sample 55 - 38968 reads in 14812 unique sequences.
## Sample 56 - 37498 reads in 16147 unique sequences.
## Sample 57 - 50963 reads in 22805 unique sequences.
## Sample 58 - 7496 reads in 4323 unique sequences.
## Sample 59 - 27207 reads in 16174 unique sequences.
## Sample 60 - 50264 reads in 25241 unique sequences.
## Sample 61 - 32014 reads in 16078 unique sequences.
## Sample 62 - 17493 reads in 6877 unique sequences.
## Sample 63 - 57688 reads in 24856 unique sequences.
## Sample 64 - 34665 reads in 18393 unique sequences.
## Sample 65 - 16379 reads in 8838 unique sequences.
## Sample 66 - 27772 reads in 10821 unique sequences.
## Sample 67 - 44800 reads in 20605 unique sequences.
## Sample 68 - 20424 reads in 9824 unique sequences.
## Sample 69 - 32670 reads in 16120 unique sequences.
## Sample 70 - 23360 reads in 9968 unique sequences.
## Sample 71 - 2517 reads in 1271 unique sequences.
## Sample 72 - 36940 reads in 16313 unique sequences.
## Sample 73 - 12813 reads in 5628 unique sequences.
## Sample 74 - 40705 reads in 15227 unique sequences.
## Sample 75 - 30525 reads in 17980 unique sequences.
## Sample 76 - 8955 reads in 4674 unique sequences.
## Sample 77 - 43139 reads in 21394 unique sequences.
## Sample 78 - 8014 reads in 4148 unique sequences.
## Sample 79 - 43071 reads in 16695 unique sequences.
## Sample 80 - 38772 reads in 18204 unique sequences.
## Sample 81 - 32835 reads in 14875 unique sequences.
## Sample 82 - 29117 reads in 15395 unique sequences.
## Sample 83 - 6099 reads in 3186 unique sequences.
## Sample 84 - 41966 reads in 14485 unique sequences.
## Sample 85 - 26574 reads in 12412 unique sequences.
## Sample 86 - 33630 reads in 15899 unique sequences.
## Sample 87 - 11062 reads in 6490 unique sequences.
## Sample 88 - 67399 reads in 22734 unique sequences.
## Sample 89 - 27330 reads in 14887 unique sequences.
## Sample 90 - 17862 reads in 9593 unique sequences.
## Sample 91 - 39008 reads in 17610 unique sequences.
## Sample 92 - 12956 reads in 7687 unique sequences.
## Sample 93 - 51565 reads in 20765 unique sequences.
## Sample 94 - 52321 reads in 20612 unique sequences.
## Sample 95 - 14196 reads in 5241 unique sequences.
## Sample 96 - 24535 reads in 12938 unique sequences.
## Sample 97 - 53797 reads in 24981 unique sequences.
## Sample 98 - 43712 reads in 21120 unique sequences.
## Sample 99 - 10167 reads in 4478 unique sequences.
## Sample 100 - 40381 reads in 21888 unique sequences.
## Sample 101 - 43429 reads in 21262 unique sequences.
## Sample 102 - 24927 reads in 9586 unique sequences.
## Sample 103 - 27475 reads in 6783 unique sequences.
## Sample 104 - 38401 reads in 19877 unique sequences.
## Sample 105 - 41100 reads in 17416 unique sequences.
## Sample 106 - 19202 reads in 6393 unique sequences.
## Sample 107 - 41520 reads in 19343 unique sequences.
## Sample 108 - 25085 reads in 15280 unique sequences.
## Sample 109 - 26292 reads in 13920 unique sequences.
## Sample 110 - 35968 reads in 14181 unique sequences.
## Sample 111 - 18465 reads in 10412 unique sequences.
## Sample 112 - 26660 reads in 18256 unique sequences.
## Sample 113 - 17611 reads in 10805 unique sequences.
## Sample 114 - 51384 reads in 22931 unique sequences.
## Sample 115 - 27714 reads in 17396 unique sequences.
## Sample 116 - 48388 reads in 18471 unique sequences.
## Sample 117 - 64032 reads in 27510 unique sequences.
## Sample 118 - 55204 reads in 28109 unique sequences.
## Sample 119 - 27346 reads in 8893 unique sequences.
## Sample 120 - 41113 reads in 17040 unique sequences.
## Sample 121 - 38147 reads in 19143 unique sequences.
## Sample 122 - 15475 reads in 7448 unique sequences.
## Sample 123 - 40562 reads in 19735 unique sequences.
## Sample 124 - 36699 reads in 21021 unique sequences.
## Sample 125 - 14603 reads in 8060 unique sequences.
## Sample 126 - 55736 reads in 24986 unique sequences.
## Sample 127 - 41594 reads in 20195 unique sequences.
## Sample 128 - 37870 reads in 17026 unique sequences.
## Sample 129 - 25719 reads in 13987 unique sequences.
## Sample 130 - 48888 reads in 23633 unique sequences.
## Sample 131 - 2427 reads in 1030 unique sequences.
## Sample 132 - 23067 reads in 11889 unique sequences.
## Sample 133 - 54593 reads in 28382 unique sequences.
## Sample 134 - 36920 reads in 16240 unique sequences.
## Sample 135 - 48030 reads in 18951 unique sequences.
## Sample 136 - 35565 reads in 17980 unique sequences.
## Sample 137 - 27047 reads in 12144 unique sequences.
## Sample 138 - 19708 reads in 6603 unique sequences.
## Sample 139 - 25171 reads in 11877 unique sequences.
## Sample 140 - 22485 reads in 12389 unique sequences.
## Sample 141 - 52955 reads in 22474 unique sequences.
## Sample 142 - 34494 reads in 16175 unique sequences.
## Sample 143 - 24037 reads in 11831 unique sequences.
## Sample 144 - 51304 reads in 23154 unique sequences.
## Sample 145 - 40703 reads in 19175 unique sequences.
## Sample 146 - 30106 reads in 17560 unique sequences.
## Sample 147 - 66158 reads in 20897 unique sequences.
## Sample 148 - 63591 reads in 25860 unique sequences.
## Sample 149 - 21410 reads in 10885 unique sequences.
## 
##    selfConsist step 2

dadaFs[[1]]

## dada-class: object describing DADA2 denoising results
## 427 sequence variants were inferred from 7599 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

NOW WE WILL MERGE #MERGE F and R

#Merge forward and reverse reads (make contigs)

mergers <- mergePairs(dadaFs, derep_forward, dadaRs, derep_reverse, 
                      #trimOverhang=TRUE, minOverlap=45, 
                      verbose=TRUE)
## Note: new ITS DADA2 tutorial using our primers does not suggest trimming overhang or using a minOverlap greater than default of 12. This should result in retaining more reads, and we'll rely on chimera detection to detect spurious mergers. ...

Now construct a count table for our newly merged ASVs.

##Construct a Count Table

#Construct ASV table
seqtab <- makeSequenceTable(mergers)
dim(seqtab)

## [1]   149 14219

Project1216R: 149 samples, 14219 putative ASVs…

table(nchar(getSequences(seqtab)))

## 
##  47  48  50  52  53  55  57  60  63  65  66  67  68  69  70  71  72  73  74  75 
##   1   1   1   2   1   1   1   1   3   6   2   3   5   1   3   1   5   8   6  13 
##  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95 
##   8  12  12  12  15  12  20  14   5  17  14  12   5   8   5  12  12  12   6   9 
##  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 
##  10  13  17  16  24  39  26  28  39  28  21  42  37  26  44  61  34  76  45  32 
## 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 
##  29  39  40  34  24  26  23  42  40  42  42  47  42  27  27  30  36  42  31  41 
## 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 
##  36  55  98  64  80  88  98 136 116  88 110 101 145 164 116  96  73  80  95  88 
## 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 
## 140  97  93 156  99  94 134 121 136 102 149 132 130  90 135 121  91 120 109 173 
## 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 
## 128 115 111 112  89 104 108  85  82 134 116 107  79 107 103  75 154  90 117 107 
## 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 
## 135 106 119 120  96  95  92 101 107  75  53  77  71  65  50  40  72  44  46  57 
## 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 
##  48  55  61  48  32  37  65  43  34  48  49  66  41  34  40  37  50  34  54  52 
## 236 237 238 239 240 241 242 243 244 245 246 247 248 250 251 252 253 254 255 256 
##  58  40  54  55  70  85  72  37  54  58  58  23  39   1 135 109  97  39  42  28 
## 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 
##  32  37  56  34  24  25  38  41  37  41  44  18  29  30  47  32  27  15  32  31 
## 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 
## 132  27  33  18  29  25  23  16  32  25  19  23  19  22   8  11  10   7  19   7 
## 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 
##  14   9   9  24  38  21  10   7  15  10   7  20  23  22  21  10  26  41   7  13 
## 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 
##  21  19   8   7  13  19   8   8  12  13   5   8   7   8  23   9   3   3   3   8 
## 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 
##  12   9   2   3   2   7  16   6   5   6   6   3  11   9   1  11   5   2   2   5 
## 357 358 359 360 361 362 363 364 365 366 367 369 370 371 372 373 374 375 376 377 
##   1   6  11   3   3   9   6  10   9  16   4   2   2   2   4   3   1   1   2   1 
## 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 
##  10   4   3   3   7   4   2   4   3   2   2   1   1   9   6   3   1   5  10   6 
## 398 399 400 401 402 403 404 405 407 408 409 410 411 412 414 415 416 417 418 419 
##   7   1   3   3   6   4   3   4   1   1   1   2   2   2   1   2   1   2   3   3 
## 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 
##   9   4   1   2   1   1  57   6   6  27   4   4   1   1   1   1   1   8   1   4 
## 440 441 442 443 444 445 446 448 449 450 451 452 455 456 457 458 462 464 465 466 
##   5   3   4   1   3   1   3   2   7  47   3   8   5  23   2   1   1   1   8   1 
## 469 471 472 473 475 476 478 480 481 482 484 
##  22   3   2   8   1   1   1   1   1   1   1

seqtab <- seqtab[,nchar(colnames(seqtab)) %in% 74:430]
table(nchar(getSequences(seqtab)))

## 
##  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93 
##   6  13   8  12  12  12  15  12  20  14   5  17  14  12   5   8   5  12  12  12 
##  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 
##   6   9  10  13  17  16  24  39  26  28  39  28  21  42  37  26  44  61  34  76 
## 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 
##  45  32  29  39  40  34  24  26  23  42  40  42  42  47  42  27  27  30  36  42 
## 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 
##  31  41  36  55  98  64  80  88  98 136 116  88 110 101 145 164 116  96  73  80 
## 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 
##  95  88 140  97  93 156  99  94 134 121 136 102 149 132 130  90 135 121  91 120 
## 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 
## 109 173 128 115 111 112  89 104 108  85  82 134 116 107  79 107 103  75 154  90 
## 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 
## 117 107 135 106 119 120  96  95  92 101 107  75  53  77  71  65  50  40  72  44 
## 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 
##  46  57  48  55  61  48  32  37  65  43  34  48  49  66  41  34  40  37  50  34 
## 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 250 251 252 253 254 
##  54  52  58  40  54  55  70  85  72  37  54  58  58  23  39   1 135 109  97  39 
## 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 
##  42  28  32  37  56  34  24  25  38  41  37  41  44  18  29  30  47  32  27  15 
## 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 
##  32  31 132  27  33  18  29  25  23  16  32  25  19  23  19  22   8  11  10   7 
## 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 
##  19   7  14   9   9  24  38  21  10   7  15  10   7  20  23  22  21  10  26  41 
## 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 
##   7  13  21  19   8   7  13  19   8   8  12  13   5   8   7   8  23   9   3   3 
## 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 
##   3   8  12   9   2   3   2   7  16   6   5   6   6   3  11   9   1  11   5   2 
## 355 356 357 358 359 360 361 362 363 364 365 366 367 369 370 371 372 373 374 375 
##   2   5   1   6  11   3   3   9   6  10   9  16   4   2   2   2   4   3   1   1 
## 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 
##   2   1  10   4   3   3   7   4   2   4   3   2   2   1   1   9   6   3   1   5 
## 396 397 398 399 400 401 402 403 404 405 407 408 409 410 411 412 414 415 416 417 
##  10   6   7   1   3   3   6   4   3   4   1   1   1   2   2   2   1   2   1   2 
## 418 419 420 421 422 423 424 425 426 427 428 429 430 
##   3   3   9   4   1   2   1   1  57   6   6  27   4

dim(seqtab)

## [1]   149 13980

review

13980 ASVs, 149 samples.

Now to remove chimeras:

##Remove Chimeras

#Remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)

## Identified 124 bimeras out of 13980 input sequences.

## Identified 83 bimeras out of 2858 input sequences.
(sum(seqtab.nochim)/sum(seqtab))

## [1] 0.9962187

(1-(sum(seqtab.nochim)/sum(seqtab)))

## [1] 0.003781343

###OLD
#Identified 35 bimeras out of 7907 input sequences.
#[1] 0.9979256
#[1] 0.002074373


### NEW
#Identified 106 bimeras out of 13589 input sequences.
#[1] 0.9961605
#[1] 0.003839465

### 1216R March...
#Identified 94 bimeras out of 13270 input sequences.
#[1] 0.9962695
#[1] 0.003730497

### 1216R April
#Identified 96 bimeras out of 13660 input sequences.
#[1] 0.9962817
#[1] 0.003718295

##Identified 124 bimeras out of 13980 input sequences.
#[1] 0.9962187
#[1] 0.003781343

Track Number of Reads Retained Through Pipeline

#Count reads dropped through each step in the pipeline
getN <- function(x) sum(getUniques(x))
track <- data.frame(row.names=sample.names, dada2_input=out[,1],
                    filtered=out[,2], dada_f=sapply(dadaFs, getN),
                    dada_r=sapply(dadaRs, getN), merged=sapply(mergers, getN),
                    nonchim=rowSums(seqtab.nochim),
                    final_perc_reads_retained=round(rowSums(seqtab.nochim)/out[,1]*100, 1))
#View(track)
mean(track[,7])

## [1] 56.22013

min(track[,7])

## [1] 14.3

### change file name with new run-throughs!!
write.csv(track, "/Users/kuehnlab/Publishing/AIMS_KonzaSyn_ITS/dada2_final_read_tracking09-15-2023.csv")

Classify ASV Taxonomy

We are using a combination of taxonomy assignment functions to provide alternatives to the handful of spurious classification provided by the otherwise accurate naive Bayesian approach of the dada2 assignTaxonomy() function. Using the ensembleTex package, we’ll merge the outputs

Using current UNITE general release for all eukaryotes: 9.0 2022-10-16 All eukaryotes 17 683 308 588 Current https://doi.org/10.15156/BIO/2483914 When using this resource, please cite it as follows: Abarenkov, Kessy; Zirk, Allan; Piirmann, Timo; Pöhönen, Raivo; Ivanov, Filipp; Nilsson, R. Henrik; Kõljalg, Urmas (2022): UNITE general FASTA release for eukaryotes 2. Version 16.10.2022. UNITE Community. https://doi.org/10.15156/BIO/2483914

Includes global and 3% distance singletons.

overnight!!!

###
#Set Working Directory
setwd('/Users/kuehnlab/Publishing/AIMS_KonzaSyn_ITS')
#Classify ASVs
#!!~20 min each!!
tax_info <- assignTaxonomy(seqtab.nochim, "sh_general_release_dynamic_s_all_29.11.2022.fasta", 
                           multithread=TRUE, tryRC=TRUE)

## UNITE fungal taxonomic reference detected.

taxa.print <- tax_info
rownames(taxa.print) <- NULL
head(taxa.print)

##      Kingdom           Phylum             Class                 
## [1,] NA                NA                 NA                    
## [2,] "k__Fungi"        "p__Ascomycota"    "c__Dothideomycetes"  
## [3,] "k__Stramenopila" "p__Ochrophyta"    "c__Bacillariophyceae"
## [4,] NA                NA                 NA                    
## [5,] "k__Fungi"        "p__Ascomycota"    "c__Dothideomycetes"  
## [6,] "k__Fungi"        "p__Basidiomycota" "c__Agaricomycetes"   
##      Order             Family               Genus            
## [1,] NA                NA                   NA               
## [2,] "o__Pleosporales" "f__Pleosporaceae"   "g__Alternaria"  
## [3,] "o__Naviculales"  "f__Naviculaceae"    "g__Navicula"    
## [4,] NA                NA                   NA               
## [5,] "o__Capnodiales"  "f__Cladosporiaceae" "g__Cladosporium"
## [6,] "o__Phallales"    "f__Phallaceae"      "g__Phallus"     
##      Species             
## [1,] NA                  
## [2,] "s__tenuissima"     
## [3,] "s__trivialis"      
## [4,] NA                  
## [5,] "s__cladosporioides"
## [6,] "s__hadriani"

#Generate FASTA file
asv_seqs <- colnames(seqtab.nochim)
asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")
for (i in 1:dim(seqtab.nochim)[2]) {
  asv_headers[i] <- paste(">ASV", i, sep="_")
}
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, "dada2fasta09-15-2023.fasta")

#Generate ASV table (.count_table file)
asv_tab <- t(seqtab.nochim)
row.names(asv_tab) <- sub(">", "", asv_headers)
write.table(t(asv_tab), "practice09-15-2023.count_table", sep="\t", quote=F, col.names=NA)

#Generate taxonomy table
rownames(tax_info) <- gsub(pattern=">", replacement="", x=asv_headers)
write.csv(tax_info, "dada2_taxonomy09-15-2023.csv")


### Chunk 3

library("Biostrings")
library(dplyr)
library(stringr)
library(usethis)
library(DECIPHER); packageVersion("DECIPHER")

## Loading required package: RSQLite

## Loading required package: parallel

## [1] '2.26.0'

## [1] '2.14.0'

# read our sample fastas into DECIPHER format
fas <- "dada2fasta09-15-2023.fasta"
idseqs <- readDNAStringSet(fas) 


load("TrainingSet_UNITE_decipher_03.30.2023.RData") # CHANGE TO THE PATH OF YOUR TRAINING SET
ids <- IdTaxa(idseqs, trainingSet, strand="both", threshold=50, processors=NULL, verbose=FALSE) # use all processors

### The rank labels got messed up somehow, finding a workaround.....
 testtt<- sapply(ids,"[[",1)
 
 taxmatx<-stringi::stri_list2matrix(testtt, byrow=TRUE)

 ranks <- c("root","kingdom", "phylum", "class", "order", "family", "genus", "species") 
colnames(taxmatx)<-ranks
taxid_df<- as.data.frame(taxmatx[1:nrow(as.data.frame(idseqs)),2:8])
row.names(taxid_df)<- names(idseqs)
head(taxid_df)

##                 kingdom           phylum                class
## ASV_1 unclassified_Root             <NA>                 <NA>
## ASV_2          k__Fungi    p__Ascomycota   c__Dothideomycetes
## ASV_3   k__Stramenopila    p__Ochrophyta c__Bacillariophyceae
## ASV_4 unclassified_Root             <NA>                 <NA>
## ASV_5          k__Fungi    p__Ascomycota   c__Dothideomycetes
## ASV_6          k__Fungi p__Basidiomycota    c__Agaricomycetes
##                                 order           family         genus
## ASV_1                            <NA>             <NA>          <NA>
## ASV_2                 o__Pleosporales f__Pleosporaceae g__Alternaria
## ASV_3                  o__Naviculales  f__Naviculaceae   g__Navicula
## ASV_4                            <NA>             <NA>          <NA>
## ASV_5 unclassified_c__Dothideomycetes             <NA>          <NA>
## ASV_6                    o__Phallales    f__Phallaceae    g__Phallus
##                          species
## ASV_1                       <NA>
## ASV_2 unclassified_g__Alternaria
## ASV_3      s__Navicula_trivialis
## ASV_4                       <NA>
## ASV_5                       <NA>
## ASV_6        s__Phallus_hadriani

library(stringr)
library(dplyr)
#library(magrittr)

idtax_table_commonizing<- taxid_df 
is.na(idtax_table_commonizing$kingdom) <- startsWith(idtax_table_commonizing$kingdom, "unclassified_")
is.na(idtax_table_commonizing$phylum) <- startsWith(idtax_table_commonizing$phylum, "unclassified_")
is.na(idtax_table_commonizing$class) <- startsWith(idtax_table_commonizing$class, "unclassified_")
is.na(idtax_table_commonizing$order) <- startsWith(idtax_table_commonizing$order, "unclassified_")
is.na(idtax_table_commonizing$family) <- startsWith(idtax_table_commonizing$family, "unclassified_")
is.na(idtax_table_commonizing$genus) <- startsWith(idtax_table_commonizing$genus, "unclassified_")
is.na(idtax_table_commonizing$species) <- startsWith(idtax_table_commonizing$species, "unclassified_")
is.na(idtax_table_commonizing$species) <- endsWith(idtax_table_commonizing$species, "_sp")

#DECIPHER idtaxa uses redundant labeling, putting "s__Genus_species" in the species rank. We want to remove the "Genus" from the middle of this string to make it interoperable with the Bayesian taxonomy from dada2. 


idtax_table_commonizing$species<- str_replace(idtax_table_commonizing$species, pattern = "__\\w+\\_", replacement = "__")

#blessed be, it worked! Now save it for the love of all that is holy. 

write.csv(idtax_table_commonizing, "idtax_tab_09-15-2023.csv")

##[Corals] not in Kansas anymore The Bayesian approach has misclassified some fungi and stramenopiles as Metazoa and Viridiplantae. For example: for ASV_1178 and ASV_1300 (note that ASV # may change upon re-running) Bayesian approach IDed as:

ASV_1178 k__Metazoa p__Cnidaria c__Anthozoa o__Zoantharia f__Zoanthidae g__Palythoa s__grandis ASV_1300 k__Metazoa p__Cnidaria c__Anthozoa o__Zoantharia f__Zoanthidae g__Palythoa s__grandis

But there are certainly no corals in Kansas. IDTAXA IDed as:

ASV_1178 k__Fungi p__Ascomycota c__Dothideomycetes o__Pleosporales f__Phaeosphaeriaceae g__Leptospora NA ASV_1300 k__Fungi p__Ascomycota c__Dothideomycetes o__Pleosporales f__Phaeosphaeriaceae g__Leptospora NA

The two approachs agree on most fungi and stramenopiles, but it is the bayesian approach that misclassifies a minority of fungal and stramenopile sequences as Metazoans and Viridiplantaeans.

For ASVs identified as Metazoa and Viridiplantae by Bayesian approach, substitute in the taxonomy generated by IDTAXA:

cured_bayes_tax<- as.data.frame(tax_info)
colnames(idtax_table_commonizing)<-colnames(tax_info)

rows_to_replace<- cured_bayes_tax$Kingdom=='k__Metazoa' | cured_bayes_tax$Kingdom=='k__Viridiplantae'

idtemp<-idtax_table_commonizing

# Loop over the rows to perform replacement
for(i in which(rows_to_replace)) {
  cured_bayes_tax[i, ] <- idtemp[i, ]
}

library(tibble)

## 
## Attaching package: 'tibble'

## The following object is masked from 'package:ShortRead':
## 
##     view

taxidf <- tibble::as_data_frame(idtemp)

## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` (with slightly different semantics) to convert to a
##   tibble, or `as.data.frame()` to convert to a data frame.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

row.names(taxidf) <- row.names(idtemp)

## Warning: Setting row names on a tibble is deprecated.

taxidf <- tibble::rownames_to_column(idtemp, "ASV")

bayesdf <- tibble::as_data_frame(cured_bayes_tax)
row.names(bayesdf) <- row.names(cured_bayes_tax)

## Warning: Setting row names on a tibble is deprecated.

bayesdf <- tibble::rownames_to_column(cured_bayes_tax, "ASV")

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ purrr     1.0.1
## ✔ lubridate 1.9.2     ✔ readr     2.1.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks Biostrings::collapse(), IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ purrr::compact()      masks XVector::compact()
## ✖ purrr::compose()      masks ShortRead::compose()
## ✖ dplyr::count()        masks matrixStats::count()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks GenomicAlignments::first(), S4Vectors::first()
## ✖ dplyr::id()           masks ShortRead::id()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ dplyr::last()         masks GenomicAlignments::last()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks GenomicAlignments::second(), S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice()        masks XVector::slice(), IRanges::slice()
## ✖ tibble::view()        masks ShortRead::view()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

taxidf <- as.data.frame(taxidf)
bayesdf<- as.data.frame(bayesdf)


library(ensembleTax)

xx <- list(taxidf, bayesdf)
names(xx) <- c("idtax", "bayes")
eTax1 <- assign.ensembleTax(xx, 
                     tablenames = names(xx), 
                     ranknames = colnames(taxidf)[2:ncol(taxidf)],
                     weights=rep(1,length(xx)), 
                   #  ranknames = c("kingdom","phylum","class","order","family","genus","species"),
                     tiebreakz = c("bayes"), 
                     count.na=FALSE, 
                     assign.threshold = 0
                     )
# show the 3 individuals again for easy viewing:
lapply(xx, FUN = head)

## $idtax
##     ASV         Kingdom           Phylum                Class           Order
## 1 ASV_1            <NA>             <NA>                 <NA>            <NA>
## 2 ASV_2        k__Fungi    p__Ascomycota   c__Dothideomycetes o__Pleosporales
## 3 ASV_3 k__Stramenopila    p__Ochrophyta c__Bacillariophyceae  o__Naviculales
## 4 ASV_4            <NA>             <NA>                 <NA>            <NA>
## 5 ASV_5        k__Fungi    p__Ascomycota   c__Dothideomycetes            <NA>
## 6 ASV_6        k__Fungi p__Basidiomycota    c__Agaricomycetes    o__Phallales
##             Family         Genus      Species
## 1             <NA>          <NA>         <NA>
## 2 f__Pleosporaceae g__Alternaria         <NA>
## 3  f__Naviculaceae   g__Navicula s__trivialis
## 4             <NA>          <NA>         <NA>
## 5             <NA>          <NA>         <NA>
## 6    f__Phallaceae    g__Phallus  s__hadriani
## 
## $bayes
##     ASV         Kingdom           Phylum                Class           Order
## 1 ASV_1            <NA>             <NA>                 <NA>            <NA>
## 2 ASV_2        k__Fungi    p__Ascomycota   c__Dothideomycetes o__Pleosporales
## 3 ASV_3 k__Stramenopila    p__Ochrophyta c__Bacillariophyceae  o__Naviculales
## 4 ASV_4            <NA>             <NA>                 <NA>            <NA>
## 5 ASV_5        k__Fungi    p__Ascomycota   c__Dothideomycetes  o__Capnodiales
## 6 ASV_6        k__Fungi p__Basidiomycota    c__Agaricomycetes    o__Phallales
##               Family           Genus            Species
## 1               <NA>            <NA>               <NA>
## 2   f__Pleosporaceae   g__Alternaria      s__tenuissima
## 3    f__Naviculaceae     g__Navicula       s__trivialis
## 4               <NA>            <NA>               <NA>
## 5 f__Cladosporiaceae g__Cladosporium s__cladosporioides
## 6      f__Phallaceae      g__Phallus        s__hadriani

#Set Working Directory
setwd('/Users/kuehnlab/Publishing/AIMS_KonzaSyn_ITS')

head(eTax1)

##     ASV         Kingdom           Phylum                Class           Order
## 1 ASV_1            <NA>             <NA>                 <NA>            <NA>
## 2 ASV_2        k__Fungi    p__Ascomycota   c__Dothideomycetes o__Pleosporales
## 3 ASV_3 k__Stramenopila    p__Ochrophyta c__Bacillariophyceae  o__Naviculales
## 4 ASV_4            <NA>             <NA>                 <NA>            <NA>
## 5 ASV_5        k__Fungi    p__Ascomycota   c__Dothideomycetes  o__Capnodiales
## 6 ASV_6        k__Fungi p__Basidiomycota    c__Agaricomycetes    o__Phallales
##               Family           Genus            Species
## 1               <NA>            <NA>               <NA>
## 2   f__Pleosporaceae   g__Alternaria      s__tenuissima
## 3    f__Naviculaceae     g__Navicula       s__trivialis
## 4               <NA>            <NA>               <NA>
## 5 f__Cladosporiaceae g__Cladosporium s__cladosporioides
## 6      f__Phallaceae      g__Phallus        s__hadriani

write.csv(eTax1, "ensembleTax_tab_kzsyn_09-15-2023.csv")

Now create a version of fungi-only taxonomy and another for all eukaryotes identified to the Kindgom-level or lower.

row.names(eTax1)<-eTax1$ASV
eTax1<- eTax1[,2:8]

#Set Working Directory
setwd('/Users/kuehnlab/Publishing/AIMS_KonzaSyn_ITS')

fungitax<- filter(eTax1, Kingdom=='k__Fungi')

write.csv(fungitax, "final_fungi_tax_tab_kzsyn_09-15-2023.csv")

full_euktax<- filter(eTax1, !is.na(Kingdom))

write.csv(full_euktax, "final_eukaryote_tax_tab_kzsyn_09-15-2023.csv")

#THIS IS DONE. With the outputs generated, you can move on to data analysis, which I will conduct in a seperate .RMD file.

Konza_synoptic_ITS1_bioinformatics_pipeline

Charles T. Bond

2023-09-15