It loads the package before it begins or else it gets the error again
library(dada2)
packageVersion("dada2")
## [1] '1.26.0'
library(ShortRead)
packageVersion("ShortRead")
## [1] '1.56.1'
library(Biostrings)
packageVersion("Biostrings")
## [1] '2.66.0'
library(phyloseq)
library(vegan)
library(ggplot2)
library(tidyr)
library(dplyr)
library(pheatmap)
library(GUniFrac)
#library(metagMisc)
#library(raster)
#library(pals)
#library(RColorBrewer)
#library(ragg)
library(ggpubr)
Going to drop some citations for key packages here: McMurdie PJ, Holmes S (2013). “phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data.” PLoS ONE, 8(4), e61217. http://dx.plos.org/10.1371/journal.pone.0061217.
Now the our libraries are loaded, ## Set Theme Setting themes determines how our figures will look, readability and publishability. This can be modified later, but here is the starting point:
#Set theme
theme_set(theme_bw() + theme(
plot.title = element_blank(),
axis.text.x = element_text(size=10, color="black"),
axis.text.y = element_text(size=10, color="black"),
axis.title.x = element_text(size=10),
axis.title.y = element_text(size=10),
legend.text = element_text(size=10),
legend.title = element_text(size=10),
legend.position = "bottom",
legend.key=element_blank(),
legend.key.size = unit(0.5, "cm"),
legend.spacing.x = unit(0.1, "cm"),
legend.spacing.y = unit(0.1, "cm"),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.background = element_blank()))
Now that our workspace is set up, time for the actual pipeline.
Defining file path so R can find our raw fastq files. Make sure forward reads (R1) and reverse reads (R2) are in the same folder. This will need to be changed for different analyses.
path <- "/Users/kuehnlab/Publishing/AIMS_KonzaSyn_ITS/fastq"
list.files(path)
## [1] "01m01-B_S34_L001_R1_001.fastq" "01m01-B_S34_L001_R2_001.fastq"
## [3] "01m01-L_S4_L001_R1_001.fastq" "01m01-L_S4_L001_R2_001.fastq"
## [5] "01m01-S_S148_L001_R1_001.fastq" "01m01-S_S148_L001_R2_001.fastq"
## [7] "01m02-B_S46_L001_R1_001.fastq" "01m02-B_S46_L001_R2_001.fastq"
## [9] "01m02-L_S16_L001_R1_001.fastq" "01m02-L_S16_L001_R2_001.fastq"
## [11] "01m02-S_S160_L001_R1_001.fastq" "01m02-S_S160_L001_R2_001.fastq"
## [13] "01m03-B_S58_L001_R1_001.fastq" "01m03-B_S58_L001_R2_001.fastq"
## [15] "01m03-L_S28_L001_R1_001.fastq" "01m03-L_S28_L001_R2_001.fastq"
## [17] "01m03-S_S172_L001_R1_001.fastq" "01m03-S_S172_L001_R2_001.fastq"
## [19] "01m03-W_S142_L001_R1_001.fastq" "01m03-W_S142_L001_R2_001.fastq"
## [21] "01m04-B_S70_L001_R1_001.fastq" "01m04-B_S70_L001_R2_001.fastq"
## [23] "01m04-L_S40_L001_R1_001.fastq" "01m04-L_S40_L001_R2_001.fastq"
## [25] "01m04-S_S184_L001_R1_001.fastq" "01m04-S_S184_L001_R2_001.fastq"
## [27] "01m05-B_S82_L001_R1_001.fastq" "01m05-B_S82_L001_R2_001.fastq"
## [29] "01m05-L_S52_L001_R1_001.fastq" "01m05-L_S52_L001_R2_001.fastq"
## [31] "01m05-S_S101_L001_R1_001.fastq" "01m05-S_S101_L001_R2_001.fastq"
## [33] "01m06-B_S94_L001_R1_001.fastq" "01m06-B_S94_L001_R2_001.fastq"
## [35] "01m06-L_S64_L001_R1_001.fastq" "01m06-L_S64_L001_R2_001.fastq"
## [37] "01m06-S_S113_L001_R1_001.fastq" "01m06-S_S113_L001_R2_001.fastq"
## [39] "02m01-B_S92_L001_R1_001.fastq" "02m01-B_S92_L001_R2_001.fastq"
## [41] "02m01-L_S74_L001_R1_001.fastq" "02m01-L_S74_L001_R2_001.fastq"
## [43] "02m01-S_S111_L001_R1_001.fastq" "02m01-S_S111_L001_R2_001.fastq"
## [45] "02m02-B_S9_L001_R1_001.fastq" "02m02-B_S9_L001_R2_001.fastq"
## [47] "02m02-L_S86_L001_R1_001.fastq" "02m02-L_S86_L001_R2_001.fastq"
## [49] "02m02-S_S123_L001_R1_001.fastq" "02m02-S_S123_L001_R2_001.fastq"
## [51] "02m03-B_S21_L001_R1_001.fastq" "02m03-B_S21_L001_R2_001.fastq"
## [53] "02m03-L_S3_L001_R1_001.fastq" "02m03-L_S3_L001_R2_001.fastq"
## [55] "02m03-S_S135_L001_R1_001.fastq" "02m03-S_S135_L001_R2_001.fastq"
## [57] "02m04-L_S15_L001_R1_001.fastq" "02m04-L_S15_L001_R2_001.fastq"
## [59] "02m04-S_S147_L001_R1_001.fastq" "02m04-S_S147_L001_R2_001.fastq"
## [61] "02m05-B_S45_L001_R1_001.fastq" "02m05-B_S45_L001_R2_001.fastq"
## [63] "02m05-L_S27_L001_R1_001.fastq" "02m05-L_S27_L001_R2_001.fastq"
## [65] "02m06-B_S57_L001_R1_001.fastq" "02m06-B_S57_L001_R2_001.fastq"
## [67] "02m06-L_S39_L001_R1_001.fastq" "02m06-L_S39_L001_R2_001.fastq"
## [69] "02m06-S_S171_L001_R1_001.fastq" "02m06-S_S171_L001_R2_001.fastq"
## [71] "02m07-B_S69_L001_R1_001.fastq" "02m07-B_S69_L001_R2_001.fastq"
## [73] "02m07-L_S51_L001_R1_001.fastq" "02m07-L_S51_L001_R2_001.fastq"
## [75] "02m07-S_S183_L001_R1_001.fastq" "02m07-S_S183_L001_R2_001.fastq"
## [77] "02m08-B_S81_L001_R1_001.fastq" "02m08-B_S81_L001_R2_001.fastq"
## [79] "02m08-L_S63_L001_R1_001.fastq" "02m08-L_S63_L001_R2_001.fastq"
## [81] "02m08-S_S100_L001_R1_001.fastq" "02m08-S_S100_L001_R2_001.fastq"
## [83] "02m09-B_S93_L001_R1_001.fastq" "02m09-B_S93_L001_R2_001.fastq"
## [85] "02m09-L_S75_L001_R1_001.fastq" "02m09-L_S75_L001_R2_001.fastq"
## [87] "02m09-S_S112_L001_R1_001.fastq" "02m09-S_S112_L001_R2_001.fastq"
## [89] "02m10-B_S10_L001_R1_001.fastq" "02m10-B_S10_L001_R2_001.fastq"
## [91] "02m10-L_S87_L001_R1_001.fastq" "02m10-L_S87_L001_R2_001.fastq"
## [93] "02m10-S_S124_L001_R1_001.fastq" "02m10-S_S124_L001_R2_001.fastq"
## [95] "02m11-B_S22_L001_R1_001.fastq" "02m11-B_S22_L001_R2_001.fastq"
## [97] "02m11-S_S136_L001_R1_001.fastq" "02m11-S_S136_L001_R2_001.fastq"
## [99] "02m11-W_S106_L001_R1_001.fastq" "02m11-W_S106_L001_R2_001.fastq"
## [101] "04m01-B_S11_L001_R1_001.fastq" "04m01-B_S11_L001_R2_001.fastq"
## [103] "04m01-L_S76_L001_R1_001.fastq" "04m01-L_S76_L001_R2_001.fastq"
## [105] "04m01-S_S125_L001_R1_001.fastq" "04m01-S_S125_L001_R2_001.fastq"
## [107] "04m02-B_S23_L001_R1_001.fastq" "04m02-B_S23_L001_R2_001.fastq"
## [109] "04m02-L_S88_L001_R1_001.fastq" "04m02-L_S88_L001_R2_001.fastq"
## [111] "04m02-S_S137_L001_R1_001.fastq" "04m02-S_S137_L001_R2_001.fastq"
## [113] "04m03-B_S35_L001_R1_001.fastq" "04m03-B_S35_L001_R2_001.fastq"
## [115] "04m03-L_S5_L001_R1_001.fastq" "04m03-L_S5_L001_R2_001.fastq"
## [117] "04m03-S_S149_L001_R1_001.fastq" "04m03-S_S149_L001_R2_001.fastq"
## [119] "04m04-B_S47_L001_R1_001.fastq" "04m04-B_S47_L001_R2_001.fastq"
## [121] "04m04-L_S17_L001_R1_001.fastq" "04m04-L_S17_L001_R2_001.fastq"
## [123] "04m04-S_S161_L001_R1_001.fastq" "04m04-S_S161_L001_R2_001.fastq"
## [125] "04m05-B_S59_L001_R1_001.fastq" "04m05-B_S59_L001_R2_001.fastq"
## [127] "04m05-L_S29_L001_R1_001.fastq" "04m05-L_S29_L001_R2_001.fastq"
## [129] "04m05-S_S173_L001_R1_001.fastq" "04m05-S_S173_L001_R2_001.fastq"
## [131] "04m06-B_S71_L001_R1_001.fastq" "04m06-B_S71_L001_R2_001.fastq"
## [133] "04m06-L_S41_L001_R1_001.fastq" "04m06-L_S41_L001_R2_001.fastq"
## [135] "04m06-S_S185_L001_R1_001.fastq" "04m06-S_S185_L001_R2_001.fastq"
## [137] "04m07-L_S53_L001_R1_001.fastq" "04m07-L_S53_L001_R2_001.fastq"
## [139] "04m07-S_S102_L001_R1_001.fastq" "04m07-S_S102_L001_R2_001.fastq"
## [141] "04m08-B_S83_L001_R1_001.fastq" "04m08-B_S83_L001_R2_001.fastq"
## [143] "04m08-L_S65_L001_R1_001.fastq" "04m08-L_S65_L001_R2_001.fastq"
## [145] "04m08-S_S114_L001_R1_001.fastq" "04m08-S_S114_L001_R2_001.fastq"
## [147] "04m09-B_S95_L001_R1_001.fastq" "04m09-B_S95_L001_R2_001.fastq"
## [149] "04m09-L_S77_L001_R1_001.fastq" "04m09-L_S77_L001_R2_001.fastq"
## [151] "04m09-S_S126_L001_R1_001.fastq" "04m09-S_S126_L001_R2_001.fastq"
## [153] "04m10-B_S12_L001_R1_001.fastq" "04m10-B_S12_L001_R2_001.fastq"
## [155] "04m10-L_S89_L001_R1_001.fastq" "04m10-L_S89_L001_R2_001.fastq"
## [157] "04m10-S_S138_L001_R1_001.fastq" "04m10-S_S138_L001_R2_001.fastq"
## [159] "04m11-B_S24_L001_R1_001.fastq" "04m11-B_S24_L001_R2_001.fastq"
## [161] "04m11-L_S6_L001_R1_001.fastq" "04m11-L_S6_L001_R2_001.fastq"
## [163] "04m11-S_S150_L001_R1_001.fastq" "04m11-S_S150_L001_R2_001.fastq"
## [165] "04m11-W_S108_L001_R1_001.fastq" "04m11-W_S108_L001_R2_001.fastq"
## [167] "04m12-B_S36_L001_R1_001.fastq" "04m12-B_S36_L001_R2_001.fastq"
## [169] "04m12-L_S18_L001_R1_001.fastq" "04m12-L_S18_L001_R2_001.fastq"
## [171] "04m12-S_S162_L001_R1_001.fastq" "04m12-S_S162_L001_R2_001.fastq"
## [173] "04m13-B_S48_L001_R1_001.fastq" "04m13-B_S48_L001_R2_001.fastq"
## [175] "04m13-L_S30_L001_R1_001.fastq" "04m13-L_S30_L001_R2_001.fastq"
## [177] "04m13-S_S174_L001_R1_001.fastq" "04m13-S_S174_L001_R2_001.fastq"
## [179] "04t01-B_S55_L001_R1_001.fastq" "04t01-B_S55_L001_R2_001.fastq"
## [181] "04t01-L_S37_L001_R1_001.fastq" "04t01-L_S37_L001_R2_001.fastq"
## [183] "04t01-S_S169_L001_R1_001.fastq" "04t01-S_S169_L001_R2_001.fastq"
## [185] "04t02-B_S67_L001_R1_001.fastq" "04t02-B_S67_L001_R2_001.fastq"
## [187] "04t02-L_S49_L001_R1_001.fastq" "04t02-L_S49_L001_R2_001.fastq"
## [189] "04t02-S_S181_L001_R1_001.fastq" "04t02-S_S181_L001_R2_001.fastq"
## [191] "04w01-B_S79_L001_R1_001.fastq" "04w01-B_S79_L001_R2_001.fastq"
## [193] "04w01-L_S61_L001_R1_001.fastq" "04w01-L_S61_L001_R2_001.fastq"
## [195] "04w01-S_S98_L001_R1_001.fastq" "04w01-S_S98_L001_R2_001.fastq"
## [197] "04w02-B_S91_L001_R1_001.fastq" "04w02-B_S91_L001_R2_001.fastq"
## [199] "04w02-L_S73_L001_R1_001.fastq" "04w02-L_S73_L001_R2_001.fastq"
## [201] "04w02-S_S110_L001_R1_001.fastq" "04w02-S_S110_L001_R2_001.fastq"
## [203] "04w03-B_S8_L001_R1_001.fastq" "04w03-B_S8_L001_R2_001.fastq"
## [205] "04w03-L_S85_L001_R1_001.fastq" "04w03-L_S85_L001_R2_001.fastq"
## [207] "04w03-S_S122_L001_R1_001.fastq" "04w03-S_S122_L001_R2_001.fastq"
## [209] "04w04-L_S2_L001_R1_001.fastq" "04w04-L_S2_L001_R2_001.fastq"
## [211] "04w04-S_S134_L001_R1_001.fastq" "04w04-S_S134_L001_R2_001.fastq"
## [213] "20m01-L_S14_L001_R1_001.fastq" "20m01-L_S14_L001_R2_001.fastq"
## [215] "20m01-S_S146_L001_R1_001.fastq" "20m01-S_S146_L001_R2_001.fastq"
## [217] "20m02-B_S44_L001_R1_001.fastq" "20m02-B_S44_L001_R2_001.fastq"
## [219] "20m02-L_S26_L001_R1_001.fastq" "20m02-L_S26_L001_R2_001.fastq"
## [221] "20m02-S_S158_L001_R1_001.fastq" "20m02-S_S158_L001_R2_001.fastq"
## [223] "20m02-W_S152_L001_R1_001.fastq" "20m02-W_S152_L001_R2_001.fastq"
## [225] "20m03-B_S56_L001_R1_001.fastq" "20m03-B_S56_L001_R2_001.fastq"
## [227] "20m03-L_S38_L001_R1_001.fastq" "20m03-L_S38_L001_R2_001.fastq"
## [229] "20m03-S_S170_L001_R1_001.fastq" "20m03-S_S170_L001_R2_001.fastq"
## [231] "20m04-B_S68_L001_R1_001.fastq" "20m04-B_S68_L001_R2_001.fastq"
## [233] "20m04-L_S50_L001_R1_001.fastq" "20m04-L_S50_L001_R2_001.fastq"
## [235] "20m04-S_S182_L001_R1_001.fastq" "20m04-S_S182_L001_R2_001.fastq"
## [237] "20m04-W_S164_L001_R1_001.fastq" "20m04-W_S164_L001_R2_001.fastq"
## [239] "20m05-B_S80_L001_R1_001.fastq" "20m05-B_S80_L001_R2_001.fastq"
## [241] "20m05-L_S62_L001_R1_001.fastq" "20m05-L_S62_L001_R2_001.fastq"
## [243] "20m05-S_S99_L001_R1_001.fastq" "20m05-S_S99_L001_R2_001.fastq"
## [245] "cutadapt" "filtN"
## [247] "sfm01-B_S60_L001_R1_001.fastq" "sfm01-B_S60_L001_R2_001.fastq"
## [249] "sfm01-L_S42_L001_R1_001.fastq" "sfm01-L_S42_L001_R2_001.fastq"
## [251] "sfm01-S_S186_L001_R1_001.fastq" "sfm01-S_S186_L001_R2_001.fastq"
## [253] "sfm02-B_S72_L001_R1_001.fastq" "sfm02-B_S72_L001_R2_001.fastq"
## [255] "sfm02-L_S54_L001_R1_001.fastq" "sfm02-L_S54_L001_R2_001.fastq"
## [257] "sfm02-S_S103_L001_R1_001.fastq" "sfm02-S_S103_L001_R2_001.fastq"
## [259] "sfm03-B_S84_L001_R1_001.fastq" "sfm03-B_S84_L001_R2_001.fastq"
## [261] "sfm03-L_S66_L001_R1_001.fastq" "sfm03-L_S66_L001_R2_001.fastq"
## [263] "sfm03-S_S115_L001_R1_001.fastq" "sfm03-S_S115_L001_R2_001.fastq"
## [265] "sfm04-B_S96_L001_R1_001.fastq" "sfm04-B_S96_L001_R2_001.fastq"
## [267] "sfm04-L_S78_L001_R1_001.fastq" "sfm04-L_S78_L001_R2_001.fastq"
## [269] "sfm04-S_S127_L001_R1_001.fastq" "sfm04-S_S127_L001_R2_001.fastq"
## [271] "sfm05-B_S109_L001_R1_001.fastq" "sfm05-B_S109_L001_R2_001.fastq"
## [273] "sfm05-L_S90_L001_R1_001.fastq" "sfm05-L_S90_L001_R2_001.fastq"
## [275] "sfm05-S_S139_L001_R1_001.fastq" "sfm05-S_S139_L001_R2_001.fastq"
## [277] "sfm06-B_S121_L001_R1_001.fastq" "sfm06-B_S121_L001_R2_001.fastq"
## [279] "sfm06-L_S7_L001_R1_001.fastq" "sfm06-L_S7_L001_R2_001.fastq"
## [281] "sfm06-S_S151_L001_R1_001.fastq" "sfm06-S_S151_L001_R2_001.fastq"
## [283] "sfm07-B_S133_L001_R1_001.fastq" "sfm07-B_S133_L001_R2_001.fastq"
## [285] "sfm07-L_S19_L001_R1_001.fastq" "sfm07-L_S19_L001_R2_001.fastq"
## [287] "sfm07-S_S163_L001_R1_001.fastq" "sfm07-S_S163_L001_R2_001.fastq"
## [289] "Sft01-B_S31_L001_R1_001.fastq" "Sft01-B_S31_L001_R2_001.fastq"
## [291] "Sft01-L_S13_L001_R1_001.fastq" "Sft01-L_S13_L001_R2_001.fastq"
## [293] "Sft01-S_S145_L001_R1_001.fastq" "Sft01-S_S145_L001_R2_001.fastq"
## [295] "Sft02-B_S43_L001_R1_001.fastq" "Sft02-B_S43_L001_R2_001.fastq"
## [297] "Sft02-L_S25_L001_R1_001.fastq" "Sft02-L_S25_L001_R2_001.fastq"
## [299] "Sft02-S_S157_L001_R1_001.fastq" "Sft02-S_S157_L001_R2_001.fastq"
Now that forward and reverse reads are loaded, we’ll sort and name the samples.
#Forward and reverse fastq filenames
fnFs <- sort(list.files(path, pattern="_R1_001.fastq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2_001.fastq", full.names = TRUE))
library(stringr)
#Extract sample names
sample.names <- sapply(strsplit(basename(fnFs), "_L001_R1_001"), `[`, 1)
sample.names <- sapply(strsplit(basename(fnFs), "_S"), `[`, 1)
sample.names <- sapply(str_replace(sample.names, "-", "_"),`[`, 1)
sample.names <- sapply(paste0('kz.s_', sample.names),`[`, 1)
sample.names
## kz.s_01m01_B kz.s_01m01_L kz.s_01m01_S kz.s_01m02_B kz.s_01m02_L
## "kz.s_01m01_B" "kz.s_01m01_L" "kz.s_01m01_S" "kz.s_01m02_B" "kz.s_01m02_L"
## kz.s_01m02_S kz.s_01m03_B kz.s_01m03_L kz.s_01m03_S kz.s_01m03_W
## "kz.s_01m02_S" "kz.s_01m03_B" "kz.s_01m03_L" "kz.s_01m03_S" "kz.s_01m03_W"
## kz.s_01m04_B kz.s_01m04_L kz.s_01m04_S kz.s_01m05_B kz.s_01m05_L
## "kz.s_01m04_B" "kz.s_01m04_L" "kz.s_01m04_S" "kz.s_01m05_B" "kz.s_01m05_L"
## kz.s_01m05_S kz.s_01m06_B kz.s_01m06_L kz.s_01m06_S kz.s_02m01_B
## "kz.s_01m05_S" "kz.s_01m06_B" "kz.s_01m06_L" "kz.s_01m06_S" "kz.s_02m01_B"
## kz.s_02m01_L kz.s_02m01_S kz.s_02m02_B kz.s_02m02_L kz.s_02m02_S
## "kz.s_02m01_L" "kz.s_02m01_S" "kz.s_02m02_B" "kz.s_02m02_L" "kz.s_02m02_S"
## kz.s_02m03_B kz.s_02m03_L kz.s_02m03_S kz.s_02m04_L kz.s_02m04_S
## "kz.s_02m03_B" "kz.s_02m03_L" "kz.s_02m03_S" "kz.s_02m04_L" "kz.s_02m04_S"
## kz.s_02m05_B kz.s_02m05_L kz.s_02m06_B kz.s_02m06_L kz.s_02m06_S
## "kz.s_02m05_B" "kz.s_02m05_L" "kz.s_02m06_B" "kz.s_02m06_L" "kz.s_02m06_S"
## kz.s_02m07_B kz.s_02m07_L kz.s_02m07_S kz.s_02m08_B kz.s_02m08_L
## "kz.s_02m07_B" "kz.s_02m07_L" "kz.s_02m07_S" "kz.s_02m08_B" "kz.s_02m08_L"
## kz.s_02m08_S kz.s_02m09_B kz.s_02m09_L kz.s_02m09_S kz.s_02m10_B
## "kz.s_02m08_S" "kz.s_02m09_B" "kz.s_02m09_L" "kz.s_02m09_S" "kz.s_02m10_B"
## kz.s_02m10_L kz.s_02m10_S kz.s_02m11_B kz.s_02m11_S kz.s_02m11_W
## "kz.s_02m10_L" "kz.s_02m10_S" "kz.s_02m11_B" "kz.s_02m11_S" "kz.s_02m11_W"
## kz.s_04m01_B kz.s_04m01_L kz.s_04m01_S kz.s_04m02_B kz.s_04m02_L
## "kz.s_04m01_B" "kz.s_04m01_L" "kz.s_04m01_S" "kz.s_04m02_B" "kz.s_04m02_L"
## kz.s_04m02_S kz.s_04m03_B kz.s_04m03_L kz.s_04m03_S kz.s_04m04_B
## "kz.s_04m02_S" "kz.s_04m03_B" "kz.s_04m03_L" "kz.s_04m03_S" "kz.s_04m04_B"
## kz.s_04m04_L kz.s_04m04_S kz.s_04m05_B kz.s_04m05_L kz.s_04m05_S
## "kz.s_04m04_L" "kz.s_04m04_S" "kz.s_04m05_B" "kz.s_04m05_L" "kz.s_04m05_S"
## kz.s_04m06_B kz.s_04m06_L kz.s_04m06_S kz.s_04m07_L kz.s_04m07_S
## "kz.s_04m06_B" "kz.s_04m06_L" "kz.s_04m06_S" "kz.s_04m07_L" "kz.s_04m07_S"
## kz.s_04m08_B kz.s_04m08_L kz.s_04m08_S kz.s_04m09_B kz.s_04m09_L
## "kz.s_04m08_B" "kz.s_04m08_L" "kz.s_04m08_S" "kz.s_04m09_B" "kz.s_04m09_L"
## kz.s_04m09_S kz.s_04m10_B kz.s_04m10_L kz.s_04m10_S kz.s_04m11_B
## "kz.s_04m09_S" "kz.s_04m10_B" "kz.s_04m10_L" "kz.s_04m10_S" "kz.s_04m11_B"
## kz.s_04m11_L kz.s_04m11_S kz.s_04m11_W kz.s_04m12_B kz.s_04m12_L
## "kz.s_04m11_L" "kz.s_04m11_S" "kz.s_04m11_W" "kz.s_04m12_B" "kz.s_04m12_L"
## kz.s_04m12_S kz.s_04m13_B kz.s_04m13_L kz.s_04m13_S kz.s_04t01_B
## "kz.s_04m12_S" "kz.s_04m13_B" "kz.s_04m13_L" "kz.s_04m13_S" "kz.s_04t01_B"
## kz.s_04t01_L kz.s_04t01_S kz.s_04t02_B kz.s_04t02_L kz.s_04t02_S
## "kz.s_04t01_L" "kz.s_04t01_S" "kz.s_04t02_B" "kz.s_04t02_L" "kz.s_04t02_S"
## kz.s_04w01_B kz.s_04w01_L kz.s_04w01_S kz.s_04w02_B kz.s_04w02_L
## "kz.s_04w01_B" "kz.s_04w01_L" "kz.s_04w01_S" "kz.s_04w02_B" "kz.s_04w02_L"
## kz.s_04w02_S kz.s_04w03_B kz.s_04w03_L kz.s_04w03_S kz.s_04w04_L
## "kz.s_04w02_S" "kz.s_04w03_B" "kz.s_04w03_L" "kz.s_04w03_S" "kz.s_04w04_L"
## kz.s_04w04_S kz.s_20m01_L kz.s_20m01_S kz.s_20m02_B kz.s_20m02_L
## "kz.s_04w04_S" "kz.s_20m01_L" "kz.s_20m01_S" "kz.s_20m02_B" "kz.s_20m02_L"
## kz.s_20m02_S kz.s_20m02_W kz.s_20m03_B kz.s_20m03_L kz.s_20m03_S
## "kz.s_20m02_S" "kz.s_20m02_W" "kz.s_20m03_B" "kz.s_20m03_L" "kz.s_20m03_S"
## kz.s_20m04_B kz.s_20m04_L kz.s_20m04_S kz.s_20m04_W kz.s_20m05_B
## "kz.s_20m04_B" "kz.s_20m04_L" "kz.s_20m04_S" "kz.s_20m04_W" "kz.s_20m05_B"
## kz.s_20m05_L kz.s_20m05_S kz.s_sfm01_B kz.s_sfm01_L kz.s_sfm01_S
## "kz.s_20m05_L" "kz.s_20m05_S" "kz.s_sfm01_B" "kz.s_sfm01_L" "kz.s_sfm01_S"
## kz.s_sfm02_B kz.s_sfm02_L kz.s_sfm02_S kz.s_sfm03_B kz.s_sfm03_L
## "kz.s_sfm02_B" "kz.s_sfm02_L" "kz.s_sfm02_S" "kz.s_sfm03_B" "kz.s_sfm03_L"
## kz.s_sfm03_S kz.s_sfm04_B kz.s_sfm04_L kz.s_sfm04_S kz.s_sfm05_B
## "kz.s_sfm03_S" "kz.s_sfm04_B" "kz.s_sfm04_L" "kz.s_sfm04_S" "kz.s_sfm05_B"
## kz.s_sfm05_L kz.s_sfm05_S kz.s_sfm06_B kz.s_sfm06_L kz.s_sfm06_S
## "kz.s_sfm05_L" "kz.s_sfm05_S" "kz.s_sfm06_B" "kz.s_sfm06_L" "kz.s_sfm06_S"
## kz.s_sfm07_B kz.s_sfm07_L kz.s_sfm07_S kz.s_Sft01_B kz.s_Sft01_L
## "kz.s_sfm07_B" "kz.s_sfm07_L" "kz.s_sfm07_S" "kz.s_Sft01_B" "kz.s_Sft01_L"
## kz.s_Sft01_S kz.s_Sft02_B kz.s_Sft02_L kz.s_Sft02_S
## "kz.s_Sft01_S" "kz.s_Sft02_B" "kz.s_Sft02_L" "kz.s_Sft02_S"
I am adding this step to ensure that primers have been properly removed
#BITS
FWD <- "ACCTGCGGARGGATCA"
## B58S3
REV <- "GAGATCCRTTGYTRAAAGTT"
allOrients <- function(primer) {
# Create all orientations of the input sequence
require(Biostrings)
dna <- DNAString(primer) # The Biostrings works w/ DNAString objects rather than character vectors
orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna),
RevComp = reverseComplement(dna))
return(sapply(orients, toString)) # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
FWD.orients
## Forward Complement Reverse RevComp
## "ACCTGCGGARGGATCA" "TGGACGCCTYCCTAGT" "ACTAGGRAGGCGTCCA" "TGATCCYTCCGCAGGT"
###The presence of ambiguous bases (Ns) in the sequencing reads makes accurate mapping of short primer sequences difficult. Next we are going to “pre-filter” the sequences just to remove those with Ns, but perform no other filtering.
fnFs.filtN <- file.path(path, "filtN", basename(fnFs))
# Put N-filterd files in filtN/ subdirectory
fnRs.filtN <- file.path(path, "filtN", basename(fnRs))
filterAndTrim(fnFs, fnFs.filtN, fnRs, fnRs.filtN, maxN = 0, multithread = TRUE)
#We are now ready to count the number of times the primers appear in the forward and reverse read, while considering all possible primer orientations. Identifying and counting the primers on one set of paired end FASTQ files is sufficient, assuming all the files were created using the same library preparation, so we’ll just process the first sample.
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[[1]]))
## Forward Complement Reverse RevComp
## FWD.ForwardReads 0 0 0 0
## FWD.ReverseReads 0 0 0 9753
## REV.ForwardReads 0 0 0 13874
## REV.ReverseReads 0 0 0 0
Forward Complement Reverse RevComp
FWD.ForwardReads 0 0 0 0 FWD.ReverseReads 0 0 0 9753 REV.ForwardReads 0 0 0 13874 REV.ReverseReads 0 0 0 0
#cutadapt <- "/Users/kuehnlab/opt/anaconda3/envs/cutadaptenv/bin/cutadapt" # CHANGE ME to the cutadapt path on your machine
cutadapt <- "/Users/kuehnlab/miniconda3/envs/qiime2-2022.11/bin/cutadapt"
system2(cutadapt, args = "--version") # Run shell commands from R
NOTE: -m 45 == minimum length of 45, because my I’d like to go ahead and drop those itty bitty reads altogether.
path.cut <- file.path(path, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut)
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))
FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)
# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
R1.flags <- paste("-g", FWD, "-a", REV.RC)
# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
R2.flags <- paste("-G", REV, "-A", FWD.RC)
# Run Cutadapt
### Note, Charlie initially added argument "-m 1" below to remove remnant reads of length zero. Now attempting with min length more stringent min length, something just below downstream minimum length, e.g. 45 is below downstream length of 50...
for(i in seq_along(fnFs)) {
system2(cutadapt, args = c(R1.flags, R2.flags, "-m", 45, "-n", 2, # -n 2 required to remove FWD and REV from reads
"-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
fnFs.filtN[i], fnRs.filtN[i])) # input files
}
###Charlie note: The output gives a warning to check the adapter
sequences: WARNING:
The adapter is preceded by 'A' extremely often.
The provided adapter sequence could be incomplete at its 5' end.
Ignore this warning when trimming primers.
HOWEVER, we can Ignore this warning , because we’re
trimming primers. Adapters are on the outside of the
primers, so by trimming primers, we are without a doubt also trimming
off adapters. So just ignore the warning, we’re good.
#Sanity check, did it work?
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))
## Forward Complement Reverse RevComp
## FWD.ForwardReads 0 0 0 0
## FWD.ReverseReads 0 0 0 0
## REV.ForwardReads 0 0 0 0
## REV.ReverseReads 0 0 0 0
# Forward and reverse fastq filenames have the format:
cutFs <- sort(list.files(path.cut, pattern = "_R1_001.fastq", full.names = TRUE))
cutRs <- sort(list.files(path.cut, pattern = "_R2_001.fastq", full.names = TRUE))
# Extract sample names, assuming filenames have format:
get.sample.name <- function(fname) strsplit(basename(fname), "_")[[1]][1]
sample.names <- unname(sapply(cutFs, get.sample.name))
head(sample.names)
## [1] "01m01-B" "01m01-L" "01m01-S" "01m02-B" "01m02-L" "01m02-S"
library(stringr)
#Extract sample names
sample.names <- sapply(str_replace(sample.names, "-", "_"),`[`, 1)
#sample.names <- sapply(paste0('kz.s_', sample.names),`[`, 1)
sample.names
## 01m01_B 01m01_L 01m01_S 01m02_B 01m02_L 01m02_S 01m03_B 01m03_L
## "01m01_B" "01m01_L" "01m01_S" "01m02_B" "01m02_L" "01m02_S" "01m03_B" "01m03_L"
## 01m03_S 01m03_W 01m04_B 01m04_L 01m04_S 01m05_B 01m05_L 01m05_S
## "01m03_S" "01m03_W" "01m04_B" "01m04_L" "01m04_S" "01m05_B" "01m05_L" "01m05_S"
## 01m06_B 01m06_L 01m06_S 02m01_B 02m01_L 02m01_S 02m02_B 02m02_L
## "01m06_B" "01m06_L" "01m06_S" "02m01_B" "02m01_L" "02m01_S" "02m02_B" "02m02_L"
## 02m02_S 02m03_B 02m03_L 02m03_S 02m04_L 02m04_S 02m05_B 02m05_L
## "02m02_S" "02m03_B" "02m03_L" "02m03_S" "02m04_L" "02m04_S" "02m05_B" "02m05_L"
## 02m06_B 02m06_L 02m06_S 02m07_B 02m07_L 02m07_S 02m08_B 02m08_L
## "02m06_B" "02m06_L" "02m06_S" "02m07_B" "02m07_L" "02m07_S" "02m08_B" "02m08_L"
## 02m08_S 02m09_B 02m09_L 02m09_S 02m10_B 02m10_L 02m10_S 02m11_B
## "02m08_S" "02m09_B" "02m09_L" "02m09_S" "02m10_B" "02m10_L" "02m10_S" "02m11_B"
## 02m11_S 02m11_W 04m01_B 04m01_L 04m01_S 04m02_B 04m02_L 04m02_S
## "02m11_S" "02m11_W" "04m01_B" "04m01_L" "04m01_S" "04m02_B" "04m02_L" "04m02_S"
## 04m03_B 04m03_L 04m03_S 04m04_B 04m04_L 04m04_S 04m05_B 04m05_L
## "04m03_B" "04m03_L" "04m03_S" "04m04_B" "04m04_L" "04m04_S" "04m05_B" "04m05_L"
## 04m05_S 04m06_B 04m06_L 04m06_S 04m07_L 04m07_S 04m08_B 04m08_L
## "04m05_S" "04m06_B" "04m06_L" "04m06_S" "04m07_L" "04m07_S" "04m08_B" "04m08_L"
## 04m08_S 04m09_B 04m09_L 04m09_S 04m10_B 04m10_L 04m10_S 04m11_B
## "04m08_S" "04m09_B" "04m09_L" "04m09_S" "04m10_B" "04m10_L" "04m10_S" "04m11_B"
## 04m11_L 04m11_S 04m11_W 04m12_B 04m12_L 04m12_S 04m13_B 04m13_L
## "04m11_L" "04m11_S" "04m11_W" "04m12_B" "04m12_L" "04m12_S" "04m13_B" "04m13_L"
## 04m13_S 04t01_B 04t01_L 04t01_S 04t02_B 04t02_L 04t02_S 04w01_B
## "04m13_S" "04t01_B" "04t01_L" "04t01_S" "04t02_B" "04t02_L" "04t02_S" "04w01_B"
## 04w01_L 04w01_S 04w02_B 04w02_L 04w02_S 04w03_B 04w03_L 04w03_S
## "04w01_L" "04w01_S" "04w02_B" "04w02_L" "04w02_S" "04w03_B" "04w03_L" "04w03_S"
## 04w04_L 04w04_S 20m01_L 20m01_S 20m02_B 20m02_L 20m02_S 20m02_W
## "04w04_L" "04w04_S" "20m01_L" "20m01_S" "20m02_B" "20m02_L" "20m02_S" "20m02_W"
## 20m03_B 20m03_L 20m03_S 20m04_B 20m04_L 20m04_S 20m04_W 20m05_B
## "20m03_B" "20m03_L" "20m03_S" "20m04_B" "20m04_L" "20m04_S" "20m04_W" "20m05_B"
## 20m05_L 20m05_S sfm01_B sfm01_L sfm01_S sfm02_B sfm02_L sfm02_S
## "20m05_L" "20m05_S" "sfm01_B" "sfm01_L" "sfm01_S" "sfm02_B" "sfm02_L" "sfm02_S"
## sfm03_B sfm03_L sfm03_S sfm04_B sfm04_L sfm04_S sfm05_B sfm05_L
## "sfm03_B" "sfm03_L" "sfm03_S" "sfm04_B" "sfm04_L" "sfm04_S" "sfm05_B" "sfm05_L"
## sfm05_S sfm06_B sfm06_L sfm06_S sfm07_B sfm07_L sfm07_S Sft01_B
## "sfm05_S" "sfm06_B" "sfm06_L" "sfm06_S" "sfm07_B" "sfm07_L" "sfm07_S" "Sft01_B"
## Sft01_L Sft01_S Sft02_B Sft02_L Sft02_S
## "Sft01_L" "Sft01_S" "Sft02_B" "Sft02_L" "Sft02_S"
#Inspect read quality profiles for forward reads
plotQualityProfile(cutFs[c(6,12,36,62,84,120)])
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the dada2 package.
## Please report the issue at <https://github.com/benjjneb/dada2/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Inspect read quality profiles for reverse reads
plotQualityProfile(cutRs[c(6,12,36,62,84,120)])
Setting file paths for the samples we’re about to trim:
#Filter and Trim
filtFs <- file.path(path.cut, "filtered", basename(cutFs))
filtRs <- file.path(path.cut, "filtered", basename(cutRs))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
and now for the trimming, note that I set multi-thread to TRUE because I am on a Mac. You’ll have to set that to FALSE if you’re on a PC. The default expected error (maxEE) is 2, but I’ve relaxed them here to reflect our lower quality data. Cranking
##Trim trim trim!!
#Filter reads where both sets maintain a quality score ~30+ (error rate 1 in 1,000)
## Mac can multi-threat this, windows cannot ##
out <- filterAndTrim(cutFs, filtFs, cutRs, filtRs,
#truncLen=c(220,120),minLen=50,
maxN=0, maxEE=c(2,3), truncQ=2, rm.phix=TRUE,
compress=TRUE, multithread=TRUE)
out
## reads.in reads.out
## 01m01-B_S34_L001_R1_001.fastq 43975 35155
## 01m01-L_S4_L001_R1_001.fastq 46129 21641
## 01m01-S_S148_L001_R1_001.fastq 27321 17639
## 01m02-B_S46_L001_R1_001.fastq 76462 48015
## 01m02-L_S16_L001_R1_001.fastq 76187 49960
## 01m02-S_S160_L001_R1_001.fastq 8234 5196
## 01m03-B_S58_L001_R1_001.fastq 60429 47185
## 01m03-L_S28_L001_R1_001.fastq 83613 43967
## 01m03-S_S172_L001_R1_001.fastq 45097 30592
## 01m03-W_S142_L001_R1_001.fastq 5784 4212
## 01m04-B_S70_L001_R1_001.fastq 35636 26533
## 01m04-L_S40_L001_R1_001.fastq 50393 25707
## 01m04-S_S184_L001_R1_001.fastq 37777 26690
## 01m05-B_S82_L001_R1_001.fastq 59978 42177
## 01m05-L_S52_L001_R1_001.fastq 86716 44488
## 01m05-S_S101_L001_R1_001.fastq 25576 17428
## 01m06-B_S94_L001_R1_001.fastq 24238 16770
## 01m06-L_S64_L001_R1_001.fastq 74432 43527
## 01m06-S_S113_L001_R1_001.fastq 38987 24338
## 02m01-B_S92_L001_R1_001.fastq 34548 29051
## 02m01-L_S74_L001_R1_001.fastq 76393 42366
## 02m01-S_S111_L001_R1_001.fastq 40790 31940
## 02m02-B_S9_L001_R1_001.fastq 43955 31931
## 02m02-L_S86_L001_R1_001.fastq 48777 24853
## 02m02-S_S123_L001_R1_001.fastq 35672 26983
## 02m03-B_S21_L001_R1_001.fastq 76013 45518
## 02m03-L_S3_L001_R1_001.fastq 51031 33325
## 02m03-S_S135_L001_R1_001.fastq 64523 49053
## 02m04-L_S15_L001_R1_001.fastq 63636 44504
## 02m04-S_S147_L001_R1_001.fastq 22136 15866
## 02m05-B_S45_L001_R1_001.fastq 56844 44683
## 02m05-L_S27_L001_R1_001.fastq 98563 65773
## 02m06-B_S57_L001_R1_001.fastq 75874 50930
## 02m06-L_S39_L001_R1_001.fastq 76059 55956
## 02m06-S_S171_L001_R1_001.fastq 55988 38555
## 02m07-B_S69_L001_R1_001.fastq 62143 45774
## 02m07-L_S51_L001_R1_001.fastq 118246 76496
## 02m07-S_S183_L001_R1_001.fastq 75934 54335
## 02m08-B_S81_L001_R1_001.fastq 40275 33122
## 02m08-L_S63_L001_R1_001.fastq 86204 51111
## 02m08-S_S100_L001_R1_001.fastq 9311 6092
## 02m09-B_S93_L001_R1_001.fastq 43859 34324
## 02m09-L_S75_L001_R1_001.fastq 90511 59295
## 02m09-S_S112_L001_R1_001.fastq 30696 22450
## 02m10-B_S10_L001_R1_001.fastq 44514 28590
## 02m10-L_S87_L001_R1_001.fastq 69759 33581
## 02m10-S_S124_L001_R1_001.fastq 46159 30020
## 02m11-B_S22_L001_R1_001.fastq 36351 25529
## 02m11-S_S136_L001_R1_001.fastq 56263 41257
## 02m11-W_S106_L001_R1_001.fastq 7937 3909
## 04m01-B_S11_L001_R1_001.fastq 78076 48864
## 04m01-L_S76_L001_R1_001.fastq 92460 38731
## 04m01-S_S125_L001_R1_001.fastq 52804 34502
## 04m02-B_S23_L001_R1_001.fastq 55827 37210
## 04m02-L_S88_L001_R1_001.fastq 68735 38968
## 04m02-S_S137_L001_R1_001.fastq 50943 37498
## 04m03-B_S35_L001_R1_001.fastq 78294 50963
## 04m03-L_S5_L001_R1_001.fastq 18335 7496
## 04m03-S_S149_L001_R1_001.fastq 43851 27207
## 04m04-B_S47_L001_R1_001.fastq 69126 50264
## 04m04-L_S17_L001_R1_001.fastq 47853 32014
## 04m04-S_S161_L001_R1_001.fastq 22364 17493
## 04m05-B_S59_L001_R1_001.fastq 74954 57688
## 04m05-L_S29_L001_R1_001.fastq 68377 34665
## 04m05-S_S173_L001_R1_001.fastq 27680 16379
## 04m06-B_S71_L001_R1_001.fastq 34662 27772
## 04m06-L_S41_L001_R1_001.fastq 75006 44800
## 04m06-S_S185_L001_R1_001.fastq 29846 20424
## 04m07-L_S53_L001_R1_001.fastq 62504 32670
## 04m07-S_S102_L001_R1_001.fastq 29284 23360
## 04m08-B_S83_L001_R1_001.fastq 3731 2517
## 04m08-L_S65_L001_R1_001.fastq 50221 36940
## 04m08-S_S114_L001_R1_001.fastq 17655 12813
## 04m09-B_S95_L001_R1_001.fastq 50351 40705
## 04m09-L_S77_L001_R1_001.fastq 58023 30525
## 04m09-S_S126_L001_R1_001.fastq 13880 8955
## 04m10-B_S12_L001_R1_001.fastq 74336 43139
## 04m10-L_S89_L001_R1_001.fastq 43184 8014
## 04m10-S_S138_L001_R1_001.fastq 59960 43071
## 04m11-B_S24_L001_R1_001.fastq 59008 38772
## 04m11-L_S6_L001_R1_001.fastq 52224 32835
## 04m11-S_S150_L001_R1_001.fastq 44390 29117
## 04m11-W_S108_L001_R1_001.fastq 11267 6099
## 04m12-B_S36_L001_R1_001.fastq 51883 41966
## 04m12-L_S18_L001_R1_001.fastq 43673 26574
## 04m12-S_S162_L001_R1_001.fastq 48382 33630
## 04m13-B_S48_L001_R1_001.fastq 42861 11062
## 04m13-L_S30_L001_R1_001.fastq 96874 67399
## 04m13-S_S174_L001_R1_001.fastq 43386 27330
## 04t01-B_S55_L001_R1_001.fastq 32696 17862
## 04t01-L_S37_L001_R1_001.fastq 63732 39008
## 04t01-S_S169_L001_R1_001.fastq 21724 12956
## 04t02-B_S67_L001_R1_001.fastq 70267 51565
## 04t02-L_S49_L001_R1_001.fastq 85594 52321
## 04t02-S_S181_L001_R1_001.fastq 18697 14196
## 04w01-B_S79_L001_R1_001.fastq 49036 24535
## 04w01-L_S61_L001_R1_001.fastq 93652 53797
## 04w01-S_S98_L001_R1_001.fastq 65217 43712
## 04w02-B_S91_L001_R1_001.fastq 15274 10167
## 04w02-L_S73_L001_R1_001.fastq 78463 40381
## 04w02-S_S110_L001_R1_001.fastq 63734 43429
## 04w03-B_S8_L001_R1_001.fastq 35725 24927
## 04w03-L_S85_L001_R1_001.fastq 46078 27475
## 04w03-S_S122_L001_R1_001.fastq 61651 38401
## 04w04-L_S2_L001_R1_001.fastq 59198 41100
## 04w04-S_S134_L001_R1_001.fastq 23573 19202
## 20m01-L_S14_L001_R1_001.fastq 60831 41520
## 20m01-S_S146_L001_R1_001.fastq 45584 25085
## 20m02-B_S44_L001_R1_001.fastq 43946 26292
## 20m02-L_S26_L001_R1_001.fastq 53083 35968
## 20m02-S_S158_L001_R1_001.fastq 30027 18465
## 20m02-W_S152_L001_R1_001.fastq 104730 26660
## 20m03-B_S56_L001_R1_001.fastq 37303 17611
## 20m03-L_S38_L001_R1_001.fastq 76136 51384
## 20m03-S_S170_L001_R1_001.fastq 51600 27714
## 20m04-B_S68_L001_R1_001.fastq 61875 48388
## 20m04-L_S50_L001_R1_001.fastq 93897 64032
## 20m04-S_S182_L001_R1_001.fastq 81142 55204
## 20m04-W_S164_L001_R1_001.fastq 33894 27346
## 20m05-B_S80_L001_R1_001.fastq 54867 41113
## 20m05-L_S62_L001_R1_001.fastq 70338 38147
## 20m05-S_S99_L001_R1_001.fastq 20796 15475
## sfm01-B_S60_L001_R1_001.fastq 59667 40562
## sfm01-L_S42_L001_R1_001.fastq 64524 36699
## sfm01-S_S186_L001_R1_001.fastq 21906 14603
## sfm02-B_S72_L001_R1_001.fastq 72687 55736
## sfm02-L_S54_L001_R1_001.fastq 100754 41594
## sfm02-S_S103_L001_R1_001.fastq 54966 37870
## sfm03-B_S84_L001_R1_001.fastq 43139 25719
## sfm03-L_S66_L001_R1_001.fastq 95030 48888
## sfm03-S_S115_L001_R1_001.fastq 3453 2427
## sfm04-B_S96_L001_R1_001.fastq 37509 23067
## sfm04-L_S78_L001_R1_001.fastq 93703 54593
## sfm04-S_S127_L001_R1_001.fastq 53871 36920
## sfm05-B_S109_L001_R1_001.fastq 62392 48030
## sfm05-L_S90_L001_R1_001.fastq 81246 35565
## sfm05-S_S139_L001_R1_001.fastq 37735 27047
## sfm06-B_S121_L001_R1_001.fastq 25301 19708
## sfm06-L_S7_L001_R1_001.fastq 37064 25171
## sfm06-S_S151_L001_R1_001.fastq 36389 22485
## sfm07-B_S133_L001_R1_001.fastq 71696 52955
## sfm07-L_S19_L001_R1_001.fastq 57542 34494
## sfm07-S_S163_L001_R1_001.fastq 37167 24037
## Sft01-B_S31_L001_R1_001.fastq 74753 51304
## Sft01-L_S13_L001_R1_001.fastq 62788 40703
## Sft01-S_S145_L001_R1_001.fastq 47648 30106
## Sft02-B_S43_L001_R1_001.fastq 82307 66158
## Sft02-L_S25_L001_R1_001.fastq 100297 63591
## Sft02-S_S157_L001_R1_001.fastq 32398 21410
Now to inspect our filtered sequences:
#Re-inspect read quality profiles
plotQualityProfile(filtFs[c(6,12,36,62,84,120)])
#Re-inspect read quality profiles
plotQualityProfile(filtRs[c(6,12,36,62,84,120)])
Now to track down errors and remove them.
Buckle up, Chuck, this here Machine Learning
Algorithm learnErrors will generate a parametric
error model for the filtered sequence data. Dune.
#Learn error rates
#!!~30 min each!!
errF <- learnErrors(filtFs, multithread=TRUE)
## 100430093 total bases in 503355 reads from 17 samples will be used for learning the error rates.
#Learn error rates
#!!~30 min each!!
errR <- learnErrors(filtRs, multithread=TRUE)
## 100896686 total bases in 503355 reads from 17 samples will be used for learning the error rates.
Plotting errors
#plot error forward
plotErrors(errF, nominalQ=TRUE)
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
#plot error reverse
plotErrors(errR, nominalQ=TRUE)
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
#Dereplication Dereplications means the computer only needs to analyse each identical sequence once, lumping anlyses and saving brain cells.
#Dereplication
derep_forward <- derepFastq(filtFs, verbose=TRUE)
derep_reverse <- derepFastq(filtRs, verbose=TRUE)
names(derep_forward) <- sample.names
names(derep_reverse) <- sample.names
##Infer ASVs
#Inferring ASVs
#Pool or pseudopool if particularly interested in rare taxa (e.g. singletons)
dadaFs <- dada(derep_forward, err=errF, pool="pseudo", multithread=TRUE)
## Sample 1 - 35155 reads in 7599 unique sequences.
## Sample 2 - 21641 reads in 5481 unique sequences.
## Sample 3 - 17639 reads in 5387 unique sequences.
## Sample 4 - 48015 reads in 11413 unique sequences.
## Sample 5 - 49960 reads in 10715 unique sequences.
## Sample 6 - 5196 reads in 1594 unique sequences.
## Sample 7 - 47185 reads in 10913 unique sequences.
## Sample 8 - 43967 reads in 7861 unique sequences.
## Sample 9 - 30592 reads in 8917 unique sequences.
## Sample 10 - 4212 reads in 939 unique sequences.
## Sample 11 - 26533 reads in 5865 unique sequences.
## Sample 12 - 25707 reads in 5671 unique sequences.
## Sample 13 - 26690 reads in 7788 unique sequences.
## Sample 14 - 42177 reads in 9790 unique sequences.
## Sample 15 - 44488 reads in 10074 unique sequences.
## Sample 16 - 17428 reads in 5966 unique sequences.
## Sample 17 - 16770 reads in 4587 unique sequences.
## Sample 18 - 43527 reads in 7969 unique sequences.
## Sample 19 - 24338 reads in 7825 unique sequences.
## Sample 20 - 29051 reads in 6106 unique sequences.
## Sample 21 - 42366 reads in 9959 unique sequences.
## Sample 22 - 31940 reads in 7605 unique sequences.
## Sample 23 - 31931 reads in 7066 unique sequences.
## Sample 24 - 24853 reads in 6108 unique sequences.
## Sample 25 - 26983 reads in 7225 unique sequences.
## Sample 26 - 45518 reads in 9536 unique sequences.
## Sample 27 - 33325 reads in 7030 unique sequences.
## Sample 28 - 49053 reads in 12983 unique sequences.
## Sample 29 - 44504 reads in 10216 unique sequences.
## Sample 30 - 15866 reads in 4942 unique sequences.
## Sample 31 - 44683 reads in 10600 unique sequences.
## Sample 32 - 65773 reads in 16371 unique sequences.
## Sample 33 - 50930 reads in 13615 unique sequences.
## Sample 34 - 55956 reads in 12619 unique sequences.
## Sample 35 - 38555 reads in 11302 unique sequences.
## Sample 36 - 45774 reads in 13015 unique sequences.
## Sample 37 - 76496 reads in 13285 unique sequences.
## Sample 38 - 54335 reads in 14752 unique sequences.
## Sample 39 - 33122 reads in 5396 unique sequences.
## Sample 40 - 51111 reads in 11328 unique sequences.
## Sample 41 - 6092 reads in 2116 unique sequences.
## Sample 42 - 34324 reads in 6540 unique sequences.
## Sample 43 - 59295 reads in 10301 unique sequences.
## Sample 44 - 22450 reads in 5700 unique sequences.
## Sample 45 - 28590 reads in 6320 unique sequences.
## Sample 46 - 33581 reads in 6108 unique sequences.
## Sample 47 - 30020 reads in 10896 unique sequences.
## Sample 48 - 25529 reads in 5282 unique sequences.
## Sample 49 - 41257 reads in 13471 unique sequences.
## Sample 50 - 3909 reads in 829 unique sequences.
## Sample 51 - 48864 reads in 12472 unique sequences.
## Sample 52 - 38731 reads in 8470 unique sequences.
## Sample 53 - 34502 reads in 11814 unique sequences.
## Sample 54 - 37210 reads in 9353 unique sequences.
## Sample 55 - 38968 reads in 10800 unique sequences.
## Sample 56 - 37498 reads in 11004 unique sequences.
## Sample 57 - 50963 reads in 11834 unique sequences.
## Sample 58 - 7496 reads in 2417 unique sequences.
## Sample 59 - 27207 reads in 10317 unique sequences.
## Sample 60 - 50264 reads in 14496 unique sequences.
## Sample 61 - 32014 reads in 7671 unique sequences.
## Sample 62 - 17493 reads in 4833 unique sequences.
## Sample 63 - 57688 reads in 14843 unique sequences.
## Sample 64 - 34665 reads in 7597 unique sequences.
## Sample 65 - 16379 reads in 5737 unique sequences.
## Sample 66 - 27772 reads in 6224 unique sequences.
## Sample 67 - 44800 reads in 10815 unique sequences.
## Sample 68 - 20424 reads in 6864 unique sequences.
## Sample 69 - 32670 reads in 7476 unique sequences.
## Sample 70 - 23360 reads in 6992 unique sequences.
## Sample 71 - 2517 reads in 644 unique sequences.
## Sample 72 - 36940 reads in 9443 unique sequences.
## Sample 73 - 12813 reads in 3970 unique sequences.
## Sample 74 - 40705 reads in 9377 unique sequences.
## Sample 75 - 30525 reads in 7366 unique sequences.
## Sample 76 - 8955 reads in 3180 unique sequences.
## Sample 77 - 43139 reads in 9379 unique sequences.
## Sample 78 - 8014 reads in 2568 unique sequences.
## Sample 79 - 43071 reads in 13201 unique sequences.
## Sample 80 - 38772 reads in 8791 unique sequences.
## Sample 81 - 32835 reads in 9333 unique sequences.
## Sample 82 - 29117 reads in 10389 unique sequences.
## Sample 83 - 6099 reads in 1602 unique sequences.
## Sample 84 - 41966 reads in 7071 unique sequences.
## Sample 85 - 26574 reads in 4630 unique sequences.
## Sample 86 - 33630 reads in 10155 unique sequences.
## Sample 87 - 11062 reads in 2752 unique sequences.
## Sample 88 - 67399 reads in 10744 unique sequences.
## Sample 89 - 27330 reads in 8238 unique sequences.
## Sample 90 - 17862 reads in 5454 unique sequences.
## Sample 91 - 39008 reads in 8378 unique sequences.
## Sample 92 - 12956 reads in 4208 unique sequences.
## Sample 93 - 51565 reads in 13343 unique sequences.
## Sample 94 - 52321 reads in 12895 unique sequences.
## Sample 95 - 14196 reads in 4309 unique sequences.
## Sample 96 - 24535 reads in 5767 unique sequences.
## Sample 97 - 53797 reads in 8268 unique sequences.
## Sample 98 - 43712 reads in 12838 unique sequences.
## Sample 99 - 10167 reads in 2482 unique sequences.
## Sample 100 - 40381 reads in 8852 unique sequences.
## Sample 101 - 43429 reads in 13556 unique sequences.
## Sample 102 - 24927 reads in 5558 unique sequences.
## Sample 103 - 27475 reads in 4081 unique sequences.
## Sample 104 - 38401 reads in 11979 unique sequences.
## Sample 105 - 41100 reads in 8316 unique sequences.
## Sample 106 - 19202 reads in 4113 unique sequences.
## Sample 107 - 41520 reads in 9419 unique sequences.
## Sample 108 - 25085 reads in 8659 unique sequences.
## Sample 109 - 26292 reads in 7411 unique sequences.
## Sample 110 - 35968 reads in 5852 unique sequences.
## Sample 111 - 18465 reads in 6403 unique sequences.
## Sample 112 - 26660 reads in 8839 unique sequences.
## Sample 113 - 17611 reads in 7322 unique sequences.
## Sample 114 - 51384 reads in 9978 unique sequences.
## Sample 115 - 27714 reads in 8668 unique sequences.
## Sample 116 - 48388 reads in 11825 unique sequences.
## Sample 117 - 64032 reads in 11718 unique sequences.
## Sample 118 - 55204 reads in 14667 unique sequences.
## Sample 119 - 27346 reads in 5977 unique sequences.
## Sample 120 - 41113 reads in 7377 unique sequences.
## Sample 121 - 38147 reads in 8952 unique sequences.
## Sample 122 - 15475 reads in 5501 unique sequences.
## Sample 123 - 40562 reads in 10360 unique sequences.
## Sample 124 - 36699 reads in 9138 unique sequences.
## Sample 125 - 14603 reads in 5395 unique sequences.
## Sample 126 - 55736 reads in 13993 unique sequences.
## Sample 127 - 41594 reads in 8183 unique sequences.
## Sample 128 - 37870 reads in 10587 unique sequences.
## Sample 129 - 25719 reads in 5532 unique sequences.
## Sample 130 - 48888 reads in 11002 unique sequences.
## Sample 131 - 2427 reads in 732 unique sequences.
## Sample 132 - 23067 reads in 5778 unique sequences.
## Sample 133 - 54593 reads in 16248 unique sequences.
## Sample 134 - 36920 reads in 11459 unique sequences.
## Sample 135 - 48030 reads in 14138 unique sequences.
## Sample 136 - 35565 reads in 9469 unique sequences.
## Sample 137 - 27047 reads in 7354 unique sequences.
## Sample 138 - 19708 reads in 4737 unique sequences.
## Sample 139 - 25171 reads in 7147 unique sequences.
## Sample 140 - 22485 reads in 8064 unique sequences.
## Sample 141 - 52955 reads in 16457 unique sequences.
## Sample 142 - 34494 reads in 9953 unique sequences.
## Sample 143 - 24037 reads in 8138 unique sequences.
## Sample 144 - 51304 reads in 14735 unique sequences.
## Sample 145 - 40703 reads in 9651 unique sequences.
## Sample 146 - 30106 reads in 11318 unique sequences.
## Sample 147 - 66158 reads in 14299 unique sequences.
## Sample 148 - 63591 reads in 14108 unique sequences.
## Sample 149 - 21410 reads in 6913 unique sequences.
##
## selfConsist step 2
dadaRs <- dada(derep_reverse, err=errR, pool="pseudo", multithread=TRUE)
## Sample 1 - 35155 reads in 11437 unique sequences.
## Sample 2 - 21641 reads in 12686 unique sequences.
## Sample 3 - 17639 reads in 9308 unique sequences.
## Sample 4 - 48015 reads in 20405 unique sequences.
## Sample 5 - 49960 reads in 20340 unique sequences.
## Sample 6 - 5196 reads in 2601 unique sequences.
## Sample 7 - 47185 reads in 16310 unique sequences.
## Sample 8 - 43967 reads in 19406 unique sequences.
## Sample 9 - 30592 reads in 15757 unique sequences.
## Sample 10 - 4212 reads in 1256 unique sequences.
## Sample 11 - 26533 reads in 8925 unique sequences.
## Sample 12 - 25707 reads in 13050 unique sequences.
## Sample 13 - 26690 reads in 12716 unique sequences.
## Sample 14 - 42177 reads in 19209 unique sequences.
## Sample 15 - 44488 reads in 21799 unique sequences.
## Sample 16 - 17428 reads in 9171 unique sequences.
## Sample 17 - 16770 reads in 6970 unique sequences.
## Sample 18 - 43527 reads in 19214 unique sequences.
## Sample 19 - 24338 reads in 12255 unique sequences.
## Sample 20 - 29051 reads in 8740 unique sequences.
## Sample 21 - 42366 reads in 22893 unique sequences.
## Sample 22 - 31940 reads in 11598 unique sequences.
## Sample 23 - 31931 reads in 11927 unique sequences.
## Sample 24 - 24853 reads in 10001 unique sequences.
## Sample 25 - 26983 reads in 10302 unique sequences.
## Sample 26 - 45518 reads in 20988 unique sequences.
## Sample 27 - 33325 reads in 15302 unique sequences.
## Sample 28 - 49053 reads in 18010 unique sequences.
## Sample 29 - 44504 reads in 19442 unique sequences.
## Sample 30 - 15866 reads in 7613 unique sequences.
## Sample 31 - 44683 reads in 17108 unique sequences.
## Sample 32 - 65773 reads in 27050 unique sequences.
## Sample 33 - 50930 reads in 21911 unique sequences.
## Sample 34 - 55956 reads in 21036 unique sequences.
## Sample 35 - 38555 reads in 18003 unique sequences.
## Sample 36 - 45774 reads in 19748 unique sequences.
## Sample 37 - 76496 reads in 33556 unique sequences.
## Sample 38 - 54335 reads in 23710 unique sequences.
## Sample 39 - 33122 reads in 12574 unique sequences.
## Sample 40 - 51111 reads in 25272 unique sequences.
## Sample 41 - 6092 reads in 3648 unique sequences.
## Sample 42 - 34324 reads in 11370 unique sequences.
## Sample 43 - 59295 reads in 27134 unique sequences.
## Sample 44 - 22450 reads in 8829 unique sequences.
## Sample 45 - 28590 reads in 12053 unique sequences.
## Sample 46 - 33581 reads in 15618 unique sequences.
## Sample 47 - 30020 reads in 15598 unique sequences.
## Sample 48 - 25529 reads in 9946 unique sequences.
## Sample 49 - 41257 reads in 18637 unique sequences.
## Sample 50 - 3909 reads in 1701 unique sequences.
## Sample 51 - 48864 reads in 23572 unique sequences.
## Sample 52 - 38731 reads in 24311 unique sequences.
## Sample 53 - 34502 reads in 16904 unique sequences.
## Sample 54 - 37210 reads in 17287 unique sequences.
## Sample 55 - 38968 reads in 14812 unique sequences.
## Sample 56 - 37498 reads in 16147 unique sequences.
## Sample 57 - 50963 reads in 22805 unique sequences.
## Sample 58 - 7496 reads in 4323 unique sequences.
## Sample 59 - 27207 reads in 16174 unique sequences.
## Sample 60 - 50264 reads in 25241 unique sequences.
## Sample 61 - 32014 reads in 16078 unique sequences.
## Sample 62 - 17493 reads in 6877 unique sequences.
## Sample 63 - 57688 reads in 24856 unique sequences.
## Sample 64 - 34665 reads in 18393 unique sequences.
## Sample 65 - 16379 reads in 8838 unique sequences.
## Sample 66 - 27772 reads in 10821 unique sequences.
## Sample 67 - 44800 reads in 20605 unique sequences.
## Sample 68 - 20424 reads in 9824 unique sequences.
## Sample 69 - 32670 reads in 16120 unique sequences.
## Sample 70 - 23360 reads in 9968 unique sequences.
## Sample 71 - 2517 reads in 1271 unique sequences.
## Sample 72 - 36940 reads in 16313 unique sequences.
## Sample 73 - 12813 reads in 5628 unique sequences.
## Sample 74 - 40705 reads in 15227 unique sequences.
## Sample 75 - 30525 reads in 17980 unique sequences.
## Sample 76 - 8955 reads in 4674 unique sequences.
## Sample 77 - 43139 reads in 21394 unique sequences.
## Sample 78 - 8014 reads in 4148 unique sequences.
## Sample 79 - 43071 reads in 16695 unique sequences.
## Sample 80 - 38772 reads in 18204 unique sequences.
## Sample 81 - 32835 reads in 14875 unique sequences.
## Sample 82 - 29117 reads in 15395 unique sequences.
## Sample 83 - 6099 reads in 3186 unique sequences.
## Sample 84 - 41966 reads in 14485 unique sequences.
## Sample 85 - 26574 reads in 12412 unique sequences.
## Sample 86 - 33630 reads in 15899 unique sequences.
## Sample 87 - 11062 reads in 6490 unique sequences.
## Sample 88 - 67399 reads in 22734 unique sequences.
## Sample 89 - 27330 reads in 14887 unique sequences.
## Sample 90 - 17862 reads in 9593 unique sequences.
## Sample 91 - 39008 reads in 17610 unique sequences.
## Sample 92 - 12956 reads in 7687 unique sequences.
## Sample 93 - 51565 reads in 20765 unique sequences.
## Sample 94 - 52321 reads in 20612 unique sequences.
## Sample 95 - 14196 reads in 5241 unique sequences.
## Sample 96 - 24535 reads in 12938 unique sequences.
## Sample 97 - 53797 reads in 24981 unique sequences.
## Sample 98 - 43712 reads in 21120 unique sequences.
## Sample 99 - 10167 reads in 4478 unique sequences.
## Sample 100 - 40381 reads in 21888 unique sequences.
## Sample 101 - 43429 reads in 21262 unique sequences.
## Sample 102 - 24927 reads in 9586 unique sequences.
## Sample 103 - 27475 reads in 6783 unique sequences.
## Sample 104 - 38401 reads in 19877 unique sequences.
## Sample 105 - 41100 reads in 17416 unique sequences.
## Sample 106 - 19202 reads in 6393 unique sequences.
## Sample 107 - 41520 reads in 19343 unique sequences.
## Sample 108 - 25085 reads in 15280 unique sequences.
## Sample 109 - 26292 reads in 13920 unique sequences.
## Sample 110 - 35968 reads in 14181 unique sequences.
## Sample 111 - 18465 reads in 10412 unique sequences.
## Sample 112 - 26660 reads in 18256 unique sequences.
## Sample 113 - 17611 reads in 10805 unique sequences.
## Sample 114 - 51384 reads in 22931 unique sequences.
## Sample 115 - 27714 reads in 17396 unique sequences.
## Sample 116 - 48388 reads in 18471 unique sequences.
## Sample 117 - 64032 reads in 27510 unique sequences.
## Sample 118 - 55204 reads in 28109 unique sequences.
## Sample 119 - 27346 reads in 8893 unique sequences.
## Sample 120 - 41113 reads in 17040 unique sequences.
## Sample 121 - 38147 reads in 19143 unique sequences.
## Sample 122 - 15475 reads in 7448 unique sequences.
## Sample 123 - 40562 reads in 19735 unique sequences.
## Sample 124 - 36699 reads in 21021 unique sequences.
## Sample 125 - 14603 reads in 8060 unique sequences.
## Sample 126 - 55736 reads in 24986 unique sequences.
## Sample 127 - 41594 reads in 20195 unique sequences.
## Sample 128 - 37870 reads in 17026 unique sequences.
## Sample 129 - 25719 reads in 13987 unique sequences.
## Sample 130 - 48888 reads in 23633 unique sequences.
## Sample 131 - 2427 reads in 1030 unique sequences.
## Sample 132 - 23067 reads in 11889 unique sequences.
## Sample 133 - 54593 reads in 28382 unique sequences.
## Sample 134 - 36920 reads in 16240 unique sequences.
## Sample 135 - 48030 reads in 18951 unique sequences.
## Sample 136 - 35565 reads in 17980 unique sequences.
## Sample 137 - 27047 reads in 12144 unique sequences.
## Sample 138 - 19708 reads in 6603 unique sequences.
## Sample 139 - 25171 reads in 11877 unique sequences.
## Sample 140 - 22485 reads in 12389 unique sequences.
## Sample 141 - 52955 reads in 22474 unique sequences.
## Sample 142 - 34494 reads in 16175 unique sequences.
## Sample 143 - 24037 reads in 11831 unique sequences.
## Sample 144 - 51304 reads in 23154 unique sequences.
## Sample 145 - 40703 reads in 19175 unique sequences.
## Sample 146 - 30106 reads in 17560 unique sequences.
## Sample 147 - 66158 reads in 20897 unique sequences.
## Sample 148 - 63591 reads in 25860 unique sequences.
## Sample 149 - 21410 reads in 10885 unique sequences.
##
## selfConsist step 2
dadaFs[[1]]
## dada-class: object describing DADA2 denoising results
## 427 sequence variants were inferred from 7599 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
NOW WE WILL MERGE #MERGE F and R
#Merge forward and reverse reads (make contigs)
mergers <- mergePairs(dadaFs, derep_forward, dadaRs, derep_reverse,
#trimOverhang=TRUE, minOverlap=45,
verbose=TRUE)
## Note: new ITS DADA2 tutorial using our primers does not suggest trimming overhang or using a minOverlap greater than default of 12. This should result in retaining more reads, and we'll rely on chimera detection to detect spurious mergers. ...
Now construct a count table for our newly merged ASVs.
##Construct a Count Table
#Construct ASV table
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
## [1] 149 14219
Project1216R: 149 samples, 14219 putative ASVs…
table(nchar(getSequences(seqtab)))
##
## 47 48 50 52 53 55 57 60 63 65 66 67 68 69 70 71 72 73 74 75
## 1 1 1 2 1 1 1 1 3 6 2 3 5 1 3 1 5 8 6 13
## 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
## 8 12 12 12 15 12 20 14 5 17 14 12 5 8 5 12 12 12 6 9
## 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## 10 13 17 16 24 39 26 28 39 28 21 42 37 26 44 61 34 76 45 32
## 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
## 29 39 40 34 24 26 23 42 40 42 42 47 42 27 27 30 36 42 31 41
## 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
## 36 55 98 64 80 88 98 136 116 88 110 101 145 164 116 96 73 80 95 88
## 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175
## 140 97 93 156 99 94 134 121 136 102 149 132 130 90 135 121 91 120 109 173
## 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
## 128 115 111 112 89 104 108 85 82 134 116 107 79 107 103 75 154 90 117 107
## 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
## 135 106 119 120 96 95 92 101 107 75 53 77 71 65 50 40 72 44 46 57
## 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235
## 48 55 61 48 32 37 65 43 34 48 49 66 41 34 40 37 50 34 54 52
## 236 237 238 239 240 241 242 243 244 245 246 247 248 250 251 252 253 254 255 256
## 58 40 54 55 70 85 72 37 54 58 58 23 39 1 135 109 97 39 42 28
## 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
## 32 37 56 34 24 25 38 41 37 41 44 18 29 30 47 32 27 15 32 31
## 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
## 132 27 33 18 29 25 23 16 32 25 19 23 19 22 8 11 10 7 19 7
## 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316
## 14 9 9 24 38 21 10 7 15 10 7 20 23 22 21 10 26 41 7 13
## 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
## 21 19 8 7 13 19 8 8 12 13 5 8 7 8 23 9 3 3 3 8
## 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356
## 12 9 2 3 2 7 16 6 5 6 6 3 11 9 1 11 5 2 2 5
## 357 358 359 360 361 362 363 364 365 366 367 369 370 371 372 373 374 375 376 377
## 1 6 11 3 3 9 6 10 9 16 4 2 2 2 4 3 1 1 2 1
## 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397
## 10 4 3 3 7 4 2 4 3 2 2 1 1 9 6 3 1 5 10 6
## 398 399 400 401 402 403 404 405 407 408 409 410 411 412 414 415 416 417 418 419
## 7 1 3 3 6 4 3 4 1 1 1 2 2 2 1 2 1 2 3 3
## 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439
## 9 4 1 2 1 1 57 6 6 27 4 4 1 1 1 1 1 8 1 4
## 440 441 442 443 444 445 446 448 449 450 451 452 455 456 457 458 462 464 465 466
## 5 3 4 1 3 1 3 2 7 47 3 8 5 23 2 1 1 1 8 1
## 469 471 472 473 475 476 478 480 481 482 484
## 22 3 2 8 1 1 1 1 1 1 1
seqtab <- seqtab[,nchar(colnames(seqtab)) %in% 74:430]
table(nchar(getSequences(seqtab)))
##
## 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
## 6 13 8 12 12 12 15 12 20 14 5 17 14 12 5 8 5 12 12 12
## 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
## 6 9 10 13 17 16 24 39 26 28 39 28 21 42 37 26 44 61 34 76
## 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
## 45 32 29 39 40 34 24 26 23 42 40 42 42 47 42 27 27 30 36 42
## 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153
## 31 41 36 55 98 64 80 88 98 136 116 88 110 101 145 164 116 96 73 80
## 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
## 95 88 140 97 93 156 99 94 134 121 136 102 149 132 130 90 135 121 91 120
## 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
## 109 173 128 115 111 112 89 104 108 85 82 134 116 107 79 107 103 75 154 90
## 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
## 117 107 135 106 119 120 96 95 92 101 107 75 53 77 71 65 50 40 72 44
## 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
## 46 57 48 55 61 48 32 37 65 43 34 48 49 66 41 34 40 37 50 34
## 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 250 251 252 253 254
## 54 52 58 40 54 55 70 85 72 37 54 58 58 23 39 1 135 109 97 39
## 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274
## 42 28 32 37 56 34 24 25 38 41 37 41 44 18 29 30 47 32 27 15
## 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294
## 32 31 132 27 33 18 29 25 23 16 32 25 19 23 19 22 8 11 10 7
## 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
## 19 7 14 9 9 24 38 21 10 7 15 10 7 20 23 22 21 10 26 41
## 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334
## 7 13 21 19 8 7 13 19 8 8 12 13 5 8 7 8 23 9 3 3
## 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354
## 3 8 12 9 2 3 2 7 16 6 5 6 6 3 11 9 1 11 5 2
## 355 356 357 358 359 360 361 362 363 364 365 366 367 369 370 371 372 373 374 375
## 2 5 1 6 11 3 3 9 6 10 9 16 4 2 2 2 4 3 1 1
## 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
## 2 1 10 4 3 3 7 4 2 4 3 2 2 1 1 9 6 3 1 5
## 396 397 398 399 400 401 402 403 404 405 407 408 409 410 411 412 414 415 416 417
## 10 6 7 1 3 3 6 4 3 4 1 1 1 2 2 2 1 2 1 2
## 418 419 420 421 422 423 424 425 426 427 428 429 430
## 3 3 9 4 1 2 1 1 57 6 6 27 4
dim(seqtab)
## [1] 149 13980
13980 ASVs, 149 samples.
Now to remove chimeras:
##Remove Chimeras
#Remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
## Identified 124 bimeras out of 13980 input sequences.
## Identified 83 bimeras out of 2858 input sequences.
(sum(seqtab.nochim)/sum(seqtab))
## [1] 0.9962187
(1-(sum(seqtab.nochim)/sum(seqtab)))
## [1] 0.003781343
###OLD
#Identified 35 bimeras out of 7907 input sequences.
#[1] 0.9979256
#[1] 0.002074373
### NEW
#Identified 106 bimeras out of 13589 input sequences.
#[1] 0.9961605
#[1] 0.003839465
### 1216R March...
#Identified 94 bimeras out of 13270 input sequences.
#[1] 0.9962695
#[1] 0.003730497
### 1216R April
#Identified 96 bimeras out of 13660 input sequences.
#[1] 0.9962817
#[1] 0.003718295
##Identified 124 bimeras out of 13980 input sequences.
#[1] 0.9962187
#[1] 0.003781343
#Count reads dropped through each step in the pipeline
getN <- function(x) sum(getUniques(x))
track <- data.frame(row.names=sample.names, dada2_input=out[,1],
filtered=out[,2], dada_f=sapply(dadaFs, getN),
dada_r=sapply(dadaRs, getN), merged=sapply(mergers, getN),
nonchim=rowSums(seqtab.nochim),
final_perc_reads_retained=round(rowSums(seqtab.nochim)/out[,1]*100, 1))
#View(track)
mean(track[,7])
## [1] 56.22013
min(track[,7])
## [1] 14.3
### change file name with new run-throughs!!
write.csv(track, "/Users/kuehnlab/Publishing/AIMS_KonzaSyn_ITS/dada2_final_read_tracking09-15-2023.csv")
We are using a combination of taxonomy assignment functions to provide alternatives to the handful of spurious classification provided by the otherwise accurate naive Bayesian approach of the dada2 assignTaxonomy() function. Using the ensembleTex package, we’ll merge the outputs
Using current UNITE general release for all eukaryotes: 9.0 2022-10-16 All eukaryotes 17 683 308 588 Current https://doi.org/10.15156/BIO/2483914 When using this resource, please cite it as follows: Abarenkov, Kessy; Zirk, Allan; Piirmann, Timo; Pöhönen, Raivo; Ivanov, Filipp; Nilsson, R. Henrik; Kõljalg, Urmas (2022): UNITE general FASTA release for eukaryotes 2. Version 16.10.2022. UNITE Community. https://doi.org/10.15156/BIO/2483914
Includes global and 3% distance singletons.
overnight!!!
###
#Set Working Directory
setwd('/Users/kuehnlab/Publishing/AIMS_KonzaSyn_ITS')
#Classify ASVs
#!!~20 min each!!
tax_info <- assignTaxonomy(seqtab.nochim, "sh_general_release_dynamic_s_all_29.11.2022.fasta",
multithread=TRUE, tryRC=TRUE)
## UNITE fungal taxonomic reference detected.
taxa.print <- tax_info
rownames(taxa.print) <- NULL
head(taxa.print)
## Kingdom Phylum Class
## [1,] NA NA NA
## [2,] "k__Fungi" "p__Ascomycota" "c__Dothideomycetes"
## [3,] "k__Stramenopila" "p__Ochrophyta" "c__Bacillariophyceae"
## [4,] NA NA NA
## [5,] "k__Fungi" "p__Ascomycota" "c__Dothideomycetes"
## [6,] "k__Fungi" "p__Basidiomycota" "c__Agaricomycetes"
## Order Family Genus
## [1,] NA NA NA
## [2,] "o__Pleosporales" "f__Pleosporaceae" "g__Alternaria"
## [3,] "o__Naviculales" "f__Naviculaceae" "g__Navicula"
## [4,] NA NA NA
## [5,] "o__Capnodiales" "f__Cladosporiaceae" "g__Cladosporium"
## [6,] "o__Phallales" "f__Phallaceae" "g__Phallus"
## Species
## [1,] NA
## [2,] "s__tenuissima"
## [3,] "s__trivialis"
## [4,] NA
## [5,] "s__cladosporioides"
## [6,] "s__hadriani"
#Generate FASTA file
asv_seqs <- colnames(seqtab.nochim)
asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")
for (i in 1:dim(seqtab.nochim)[2]) {
asv_headers[i] <- paste(">ASV", i, sep="_")
}
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, "dada2fasta09-15-2023.fasta")
#Generate ASV table (.count_table file)
asv_tab <- t(seqtab.nochim)
row.names(asv_tab) <- sub(">", "", asv_headers)
write.table(t(asv_tab), "practice09-15-2023.count_table", sep="\t", quote=F, col.names=NA)
#Generate taxonomy table
rownames(tax_info) <- gsub(pattern=">", replacement="", x=asv_headers)
write.csv(tax_info, "dada2_taxonomy09-15-2023.csv")
### Chunk 3
library("Biostrings")
library(dplyr)
library(stringr)
library(usethis)
library(DECIPHER); packageVersion("DECIPHER")
## Loading required package: RSQLite
## Loading required package: parallel
## [1] '2.26.0'
## [1] '2.14.0'
# read our sample fastas into DECIPHER format
fas <- "dada2fasta09-15-2023.fasta"
idseqs <- readDNAStringSet(fas)
load("TrainingSet_UNITE_decipher_03.30.2023.RData") # CHANGE TO THE PATH OF YOUR TRAINING SET
ids <- IdTaxa(idseqs, trainingSet, strand="both", threshold=50, processors=NULL, verbose=FALSE) # use all processors
### The rank labels got messed up somehow, finding a workaround.....
testtt<- sapply(ids,"[[",1)
taxmatx<-stringi::stri_list2matrix(testtt, byrow=TRUE)
ranks <- c("root","kingdom", "phylum", "class", "order", "family", "genus", "species")
colnames(taxmatx)<-ranks
taxid_df<- as.data.frame(taxmatx[1:nrow(as.data.frame(idseqs)),2:8])
row.names(taxid_df)<- names(idseqs)
head(taxid_df)
## kingdom phylum class
## ASV_1 unclassified_Root <NA> <NA>
## ASV_2 k__Fungi p__Ascomycota c__Dothideomycetes
## ASV_3 k__Stramenopila p__Ochrophyta c__Bacillariophyceae
## ASV_4 unclassified_Root <NA> <NA>
## ASV_5 k__Fungi p__Ascomycota c__Dothideomycetes
## ASV_6 k__Fungi p__Basidiomycota c__Agaricomycetes
## order family genus
## ASV_1 <NA> <NA> <NA>
## ASV_2 o__Pleosporales f__Pleosporaceae g__Alternaria
## ASV_3 o__Naviculales f__Naviculaceae g__Navicula
## ASV_4 <NA> <NA> <NA>
## ASV_5 unclassified_c__Dothideomycetes <NA> <NA>
## ASV_6 o__Phallales f__Phallaceae g__Phallus
## species
## ASV_1 <NA>
## ASV_2 unclassified_g__Alternaria
## ASV_3 s__Navicula_trivialis
## ASV_4 <NA>
## ASV_5 <NA>
## ASV_6 s__Phallus_hadriani
library(stringr)
library(dplyr)
#library(magrittr)
idtax_table_commonizing<- taxid_df
is.na(idtax_table_commonizing$kingdom) <- startsWith(idtax_table_commonizing$kingdom, "unclassified_")
is.na(idtax_table_commonizing$phylum) <- startsWith(idtax_table_commonizing$phylum, "unclassified_")
is.na(idtax_table_commonizing$class) <- startsWith(idtax_table_commonizing$class, "unclassified_")
is.na(idtax_table_commonizing$order) <- startsWith(idtax_table_commonizing$order, "unclassified_")
is.na(idtax_table_commonizing$family) <- startsWith(idtax_table_commonizing$family, "unclassified_")
is.na(idtax_table_commonizing$genus) <- startsWith(idtax_table_commonizing$genus, "unclassified_")
is.na(idtax_table_commonizing$species) <- startsWith(idtax_table_commonizing$species, "unclassified_")
is.na(idtax_table_commonizing$species) <- endsWith(idtax_table_commonizing$species, "_sp")
#DECIPHER idtaxa uses redundant labeling, putting "s__Genus_species" in the species rank. We want to remove the "Genus" from the middle of this string to make it interoperable with the Bayesian taxonomy from dada2.
idtax_table_commonizing$species<- str_replace(idtax_table_commonizing$species, pattern = "__\\w+\\_", replacement = "__")
#blessed be, it worked! Now save it for the love of all that is holy.
write.csv(idtax_table_commonizing, "idtax_tab_09-15-2023.csv")
##[Corals] not in Kansas anymore The Bayesian approach has misclassified some fungi and stramenopiles as Metazoa and Viridiplantae. For example: for ASV_1178 and ASV_1300 (note that ASV # may change upon re-running) Bayesian approach IDed as:
ASV_1178 k__Metazoa p__Cnidaria c__Anthozoa o__Zoantharia f__Zoanthidae g__Palythoa s__grandis ASV_1300 k__Metazoa p__Cnidaria c__Anthozoa o__Zoantharia f__Zoanthidae g__Palythoa s__grandis
But there are certainly no corals in Kansas. IDTAXA IDed as:
ASV_1178 k__Fungi p__Ascomycota c__Dothideomycetes o__Pleosporales f__Phaeosphaeriaceae g__Leptospora NA ASV_1300 k__Fungi p__Ascomycota c__Dothideomycetes o__Pleosporales f__Phaeosphaeriaceae g__Leptospora NA
The two approachs agree on most fungi and stramenopiles, but it is the bayesian approach that misclassifies a minority of fungal and stramenopile sequences as Metazoans and Viridiplantaeans.
For ASVs identified as Metazoa and Viridiplantae by Bayesian approach, substitute in the taxonomy generated by IDTAXA:
cured_bayes_tax<- as.data.frame(tax_info)
colnames(idtax_table_commonizing)<-colnames(tax_info)
rows_to_replace<- cured_bayes_tax$Kingdom=='k__Metazoa' | cured_bayes_tax$Kingdom=='k__Viridiplantae'
idtemp<-idtax_table_commonizing
# Loop over the rows to perform replacement
for(i in which(rows_to_replace)) {
cured_bayes_tax[i, ] <- idtemp[i, ]
}
library(tibble)
##
## Attaching package: 'tibble'
## The following object is masked from 'package:ShortRead':
##
## view
taxidf <- tibble::as_data_frame(idtemp)
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` (with slightly different semantics) to convert to a
## tibble, or `as.data.frame()` to convert to a data frame.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
row.names(taxidf) <- row.names(idtemp)
## Warning: Setting row names on a tibble is deprecated.
taxidf <- tibble::rownames_to_column(idtemp, "ASV")
bayesdf <- tibble::as_data_frame(cured_bayes_tax)
row.names(bayesdf) <- row.names(cured_bayes_tax)
## Warning: Setting row names on a tibble is deprecated.
bayesdf <- tibble::rownames_to_column(cured_bayes_tax, "ASV")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ purrr 1.0.1
## ✔ lubridate 1.9.2 ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks Biostrings::collapse(), IRanges::collapse()
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ purrr::compact() masks XVector::compact()
## ✖ purrr::compose() masks ShortRead::compose()
## ✖ dplyr::count() masks matrixStats::count()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks GenomicAlignments::first(), S4Vectors::first()
## ✖ dplyr::id() masks ShortRead::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks GenomicAlignments::last()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks GenomicAlignments::second(), S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice() masks XVector::slice(), IRanges::slice()
## ✖ tibble::view() masks ShortRead::view()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
taxidf <- as.data.frame(taxidf)
bayesdf<- as.data.frame(bayesdf)
library(ensembleTax)
xx <- list(taxidf, bayesdf)
names(xx) <- c("idtax", "bayes")
eTax1 <- assign.ensembleTax(xx,
tablenames = names(xx),
ranknames = colnames(taxidf)[2:ncol(taxidf)],
weights=rep(1,length(xx)),
# ranknames = c("kingdom","phylum","class","order","family","genus","species"),
tiebreakz = c("bayes"),
count.na=FALSE,
assign.threshold = 0
)
# show the 3 individuals again for easy viewing:
lapply(xx, FUN = head)
## $idtax
## ASV Kingdom Phylum Class Order
## 1 ASV_1 <NA> <NA> <NA> <NA>
## 2 ASV_2 k__Fungi p__Ascomycota c__Dothideomycetes o__Pleosporales
## 3 ASV_3 k__Stramenopila p__Ochrophyta c__Bacillariophyceae o__Naviculales
## 4 ASV_4 <NA> <NA> <NA> <NA>
## 5 ASV_5 k__Fungi p__Ascomycota c__Dothideomycetes <NA>
## 6 ASV_6 k__Fungi p__Basidiomycota c__Agaricomycetes o__Phallales
## Family Genus Species
## 1 <NA> <NA> <NA>
## 2 f__Pleosporaceae g__Alternaria <NA>
## 3 f__Naviculaceae g__Navicula s__trivialis
## 4 <NA> <NA> <NA>
## 5 <NA> <NA> <NA>
## 6 f__Phallaceae g__Phallus s__hadriani
##
## $bayes
## ASV Kingdom Phylum Class Order
## 1 ASV_1 <NA> <NA> <NA> <NA>
## 2 ASV_2 k__Fungi p__Ascomycota c__Dothideomycetes o__Pleosporales
## 3 ASV_3 k__Stramenopila p__Ochrophyta c__Bacillariophyceae o__Naviculales
## 4 ASV_4 <NA> <NA> <NA> <NA>
## 5 ASV_5 k__Fungi p__Ascomycota c__Dothideomycetes o__Capnodiales
## 6 ASV_6 k__Fungi p__Basidiomycota c__Agaricomycetes o__Phallales
## Family Genus Species
## 1 <NA> <NA> <NA>
## 2 f__Pleosporaceae g__Alternaria s__tenuissima
## 3 f__Naviculaceae g__Navicula s__trivialis
## 4 <NA> <NA> <NA>
## 5 f__Cladosporiaceae g__Cladosporium s__cladosporioides
## 6 f__Phallaceae g__Phallus s__hadriani
#Set Working Directory
setwd('/Users/kuehnlab/Publishing/AIMS_KonzaSyn_ITS')
head(eTax1)
## ASV Kingdom Phylum Class Order
## 1 ASV_1 <NA> <NA> <NA> <NA>
## 2 ASV_2 k__Fungi p__Ascomycota c__Dothideomycetes o__Pleosporales
## 3 ASV_3 k__Stramenopila p__Ochrophyta c__Bacillariophyceae o__Naviculales
## 4 ASV_4 <NA> <NA> <NA> <NA>
## 5 ASV_5 k__Fungi p__Ascomycota c__Dothideomycetes o__Capnodiales
## 6 ASV_6 k__Fungi p__Basidiomycota c__Agaricomycetes o__Phallales
## Family Genus Species
## 1 <NA> <NA> <NA>
## 2 f__Pleosporaceae g__Alternaria s__tenuissima
## 3 f__Naviculaceae g__Navicula s__trivialis
## 4 <NA> <NA> <NA>
## 5 f__Cladosporiaceae g__Cladosporium s__cladosporioides
## 6 f__Phallaceae g__Phallus s__hadriani
write.csv(eTax1, "ensembleTax_tab_kzsyn_09-15-2023.csv")
Now create a version of fungi-only taxonomy and another for all eukaryotes identified to the Kindgom-level or lower.
row.names(eTax1)<-eTax1$ASV
eTax1<- eTax1[,2:8]
#Set Working Directory
setwd('/Users/kuehnlab/Publishing/AIMS_KonzaSyn_ITS')
fungitax<- filter(eTax1, Kingdom=='k__Fungi')
write.csv(fungitax, "final_fungi_tax_tab_kzsyn_09-15-2023.csv")
full_euktax<- filter(eTax1, !is.na(Kingdom))
write.csv(full_euktax, "final_eukaryote_tax_tab_kzsyn_09-15-2023.csv")
#THIS IS DONE. With the outputs generated, you can move on to data analysis, which I will conduct in a seperate .RMD file.