#knitr::opts_chunk$set(echo = FALSE)
#Install tools
#tidyverse and data.table from CRAN
#phyloseq from Bioconductor (run lines 9-12 to install)
# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# 
# BiocManager::install("phyloseq")

#load libraries
library(tidyverse, quietly = TRUE)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(phyloseq, quietly = TRUE)
library(data.table, quietly = TRUE)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

R Markdown

This document shows the steps for pulling in folders of tabular reports from Kraken2 to do the following: -read all files into a single list object -create dataframes in the phyloseq-style (otuDF, otuTab, sampleTab, taxTab) -calculate Shannon Diversity Index, Richness, F:B ratio, etc for each sample -plot numerical measures on box plots by condition

#list folders/conditions to read in
#must setup "data" folder with a folder for each condition inside of it
myFolders <- list.files('data')

#read taxonomic data from .tabular files in myFolders
#make sampleTab from folder names or looking up info on NCBI
DataList <- list() #otu_table and tax_table basis
sampleTab <- matrix(c("Sample", "Gender", "COVID_Depression"), # data
  nrow = 0, # number of rows
  ncol = 3 ) #number of columns
## Warning in matrix(c("Sample", "Gender", "COVID_Depression"), nrow = 0, ncol =
## 3): non-empty data for zero-extent matrix
   #sample_data

#loop through folders
#set file names without .tabular to be sample names in DataList
for (folder in myFolders){
  foldername <- paste0('data/',folder)
  myFiles<-list.files(foldername)
  for(i in 1:length(myFiles)) { 
    input_name <- paste0(foldername,"/",myFiles[i])
    out_name <- gsub(" .tabular","",myFiles[i])
    col_name <- paste0(out_name,"count")
    df1 <- read.table(input_name,sep="\t",header=FALSE)
    names(df1)<-c('Classification',col_name)
    DataList[[out_name]] <-df1
    myRow <- c(out_name,strsplit(folder,"_")[[1]][1],strsplit(folder,"_")[[1]][2])
    sampleTab <-rbind(sampleTab,myRow)
  }
}
#create sampleTab as a dataframe that is convertible to phyloseq format
#sampleTab has information from survey like Gender that can be coded into folder names
#Clean up sampleTab to be in phyloseq format
row.names(sampleTab)<-sampleTab[,1]
sampleTab<-sampleTab[,-1]
colnames(sampleTab)<-c('Gender','Condition')
sampleTab <- as.data.frame(sampleTab)
#need to create taxTab, and otuTab to build a phyloseq object
#create matrix for taxTab to be in phyloseq format
taxList <- unique(rbindlist(DataList,use.names = FALSE)$Classification)
taxListSpecies <- grepl("\\|s_",taxList)
nTax <- length(taxList)
OTUids <- paste0("OTU",seq(1,nTax))
taxTab = matrix(nrow = nTax, ncol = 7)
rownames(taxTab) <- OTUids
colnames(taxTab) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
for (i in 1:nTax){
  taxTab[i,7] <- gsub("d__.*s__","",taxList[i],perl=TRUE)
  restName <- gsub("\\|s__.*","",taxList[i])
  taxTab[i,6] <- gsub("d__.*g__","",restName,perl=TRUE)
  restName <- gsub("\\|g__.*","",restName)
  taxTab[i,5] <- gsub("d__.*f__","",restName,perl=TRUE)
  restName <- gsub("\\|f__.*","",restName)
  taxTab[i,4] <- gsub("d__.*o__","",restName,perl=TRUE)
  restName <- gsub("\\|o__.*","",restName)
  taxTab[i,3] <- gsub("d__.*c__","",restName,perl=TRUE)
  restName <- gsub("\\|c__.*","",restName)
  taxTab[i,2] <- gsub("d__.*p__","",restName,perl=TRUE)
  restName <- gsub("\\|p__.*","",restName)
  taxTab[i,1] <- gsub("d__","",restName,perl=TRUE)
}
taxTab <- gsub("d__.*","",taxTab)
taxTab <- replace(taxTab, taxTab=='', NA)
#build final part of phyloseq class, a matrix of counts per sample per OTUid
otuDF <- data.frame(Classification=taxList, OTU=OTUids,Sorting=seq(1,nTax))
mySamples<-row.names(sampleTab)
for (i in 1:length(mySamples)){
  otuDF<-merge(otuDF,DataList[[mySamples[i]]], by='Classification',all.x=TRUE)
}
row.names(otuDF)<-otuDF[,2]
otuDF <- otuDF[order(otuDF$Sorting), ]
otuDF<-otuDF[,-c(1,2,3)]
otuDF[is.na(otuDF)] <- 0
otuTab <- data.matrix(otuDF)
colnames(otuTab)<-row.names(sampleTab)

#convert to formal class objects
OTU = otu_table(otuTab, taxa_are_rows = TRUE)
TAX = tax_table(taxTab)
SAM = sample_data(sampleTab)


#create phyloseq object with sample data
physeq = phyloseq(OTU, TAX,SAM,warnings=FALSE)
remotes::install_github("statdivlab/corncob")
## Skipping install of 'corncob' from a github remote, the SHA1 (7c2c9634) has not changed since last install.
##   Use `force = TRUE` to force installation
library(corncob)
library(phyloseq)
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
set.seed(123)
DA_Gender <- differentialTest(formula = ~ Gender,
                        phi.formula = ~ Gender,
                        formula_null = ~ 1,
                        test = "Wald", boot = FALSE,
                        phi.formula_null = ~ 1,
                        data = physeq,
                        fdr_cutoff = 0.05)
## Error in if (is.nan(val) || any(phi <= sqrt(.Machine$double.eps)) || any(phi >=  : 
##   missing value where TRUE/FALSE needed
library(tidyverse)
library(RColorBrewer)
byGender <- otuDF[DA_Gender$significant_taxa,]
library(data.table)
byGenderT <- transpose(byGender)
# get row and colnames in order
colnames(byGenderT) <- rownames(byGender)
newcol<- gsub("count","",colnames(byGender))
lookup <- as.data.frame(cbind(OTUids,taxList))
row.names(lookup)=lookup$OTUids
myTax <- lookup[DA_Gender$significant_taxa,]
rownames(byGenderT) <- newcol
byGenderM <- merge(sampleTab,byGenderT,by='row.names')

byGenderM  %>%
    pivot_longer(cols=rownames(byGender),
    names_to = "OTU", 
    values_to = "Count") ->byGendertidy
byGendertidy2 <- merge(byGendertidy,lookup, by.x='OTU',by.y='OTUids')
byGendertidy3 <- byGendertidy2[grepl("\\|s",byGendertidy2$taxList),]
byGendertidy3$taxList <- gsub("d__.*s__","",byGendertidy3$taxList,perl=TRUE)
p<-byGendertidy3 %>%
    ggplot(data =.,
           aes(y = Gender,
               x = taxList,
               fill = Count))+
    geom_tile()+
    scale_fill_gradientn("Number of Reads",
                         colors = brewer.pal(11,name="Spectral"),
                         limits=c(0,175))+
    theme(axis.text.x = element_text(angle = 90, hjust=1))+
    labs(x="Taxonomic Classification",
         y="OTU",
         title="Differential Abundance by OTU")+
    theme(plot.background = element_rect(fill="gray"),
          legend.background = element_rect(fill="gray"))
p