library(tidyverse)
# library(xlsx)
# library(rotemfuns)
# library(vegan)
library(funrar)
library(subSeq)
library(qiime2R)
library(ggpubr)
# library(magrittr)
library(forcats)
library(rmarkdown)
# library(kableExtra)
library(gghighlight)
library(microbiomeutilities)
source("/pita/users/rotem/scripts/qiime2_funcs.R")

# Assumes that the merged table already exists in the processed directory.
librarynumber = "14"
mappath = paste0("/pita/pub/data/16S_DBs/maps/DB1-", librarynumber, "_premap_v1.csv")
dbmap = read.csv(mappath)
dbmap$reads_number = dbmap$reads_number %>% as.character() %>% as.numeric()
map = sample_data(dbmap)
sample_names(map) = dbmap %>% pull(1)

Check The integrity of library 14

Import experiment data

biompath = paste0("/pita/pub/data/16S_DBs/processed/16S_DB1-",librarynumber,  "_merged/table.qza")
taxpath = paste0("/pita/pub/data/16S_DBs/processed/16S_DB1-",librarynumber
                 ,"_merged/taxonomy.qza")
biom = read_qza(biompath)$data
biom = biom %>% data.frame() 
# setdiff(dbmap$X.SampleID, colnames(biom))
# biom = biom %>% select(sample_names(map))
min_reads_cutoff = 2000
biom = biom[apply(biom, 2, sum) > min_reads_cutoff]

rare_biom = vegan::rrarefy(biom %>% t(), 10000) %>% t()
otu = phyloseq::otu_table(rare_biom, taxa_are_rows = T)
otu = phyloseq::prune_taxa(taxa_sums(otu) > 10, otu)
relative_otu = otu_table(funrar::make_relative(otu@.Data), taxa_are_rows = T)
relative_otu = filter_taxa(relative_otu,  function(x) mean(x) > 1e-5, TRUE)

pretax = read_qza(taxpath)$data
tax = pretax %>% phyloseq::tax_table()
phyloseq::taxa_names(tax) = pretax$Feature.ID

tree = ape::rtree(ntaxa(tax), rooted = T, tip.label = phyloseq::taxa_names(tax))

abs_exp = phyloseq(otu, map, tax, tree)
exp = phyloseq(relative_otu, map, tax, tree)

Below cutoff

Samples with less than 2000 reads per sample.

dbmap %>% 
  filter(Database == paste0("DB", librarynumber)) %>% 
  filter(reads_number < min_reads_cutoff) %>% select(1, sample_ID, Cohort, reads_number) %>% 
  paged_table()

repliactes <- c("024-M3","042-S","034-S","G-055-1","G-056-1")
repliactes_ID <- sample_data(exp) %>% data.frame() %>% 
                  filter(sample_ID %in% repliactes) %>% rownames()

set.seed(.5781)
number_of_subsamples = 1000
randomsample <- sample_data(exp) %>% data.frame() %>% rownames() %>%
                   sample(number_of_subsamples)
samples <- union(repliactes_ID, randomsample)

subexp <- samples %>% prune_samples(exp)

ord <- ordinate(subexp, method = "PCoA", distance = "unifrac")
pl = plot_ordination(subexp, ord, color = "sample_ID") + theme_q2r()

pl + 
  gghighlight(sample_ID %in% repliactes, use_direct_label = FALSE) + 
  ggtitle(paste("PCoA for random sampling of", number_of_subsamples, "samples from all DB's including constant controls"))

Check Library integrity

Check The integrity of library 14

Import experiment data

Below cutoff