#knitr::opts_chunk$set(echo = FALSE)
#Install tools
#tidyverse and data.table from CRAN
#phyloseq from Bioconductor (run lines 9-12 to install)
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("phyloseq")
#load libraries
library(tidyverse, quietly = TRUE)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(phyloseq, quietly = TRUE)
library(data.table, quietly = TRUE)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
This document shows the steps for pulling in folders of tabular reports from Kraken2 to do the following: -read all files into a single list object -create dataframes in the phyloseq-style (otuDF, otuTab, sampleTab, taxTab) -calculate Shannon Diversity Index, Richness, F:B ratio, etc for each sample -plot numerical measures on box plots by condition
#list folders/conditions to read in
#must setup "data" folder with a folder for each condition inside of it
myFolders <- list.files('data')
#read taxonomic data from .tabular files in myFolders
#make sampleTab from folder names or looking up info on NCBI
DataList <- list() #otu_table and tax_table basis
sampleTab <- matrix(c("Sample", "Gender", "COVID_Depression"), # data
nrow = 0, # number of rows
ncol = 3 ) #number of columns
## Warning in matrix(c("Sample", "Gender", "COVID_Depression"), nrow = 0, ncol =
## 3): non-empty data for zero-extent matrix
#sample_data
#loop through folders
#set file names without .tabular to be sample names in DataList
for (folder in myFolders){
foldername <- paste0('data/',folder)
myFiles<-list.files(foldername)
for(i in 1:length(myFiles)) {
input_name <- paste0(foldername,"/",myFiles[i])
out_name <- gsub(" .tabular","",myFiles[i])
col_name <- paste0(out_name,"count")
df1 <- read.table(input_name,sep="\t",header=FALSE)
names(df1)<-c('Classification',col_name)
DataList[[out_name]] <-df1
myRow <- c(out_name,strsplit(folder,"_")[[1]][1],strsplit(folder,"_")[[1]][2])
sampleTab <-rbind(sampleTab,myRow)
}
}
#create sampleTab as a dataframe that is convertible to phyloseq format
#sampleTab has information from survey like Gender that can be coded into folder names
#Clean up sampleTab to be in phyloseq format
row.names(sampleTab)<-sampleTab[,1]
sampleTab<-sampleTab[,-1]
colnames(sampleTab)<-c('Gender','Condition')
sampleTab <- as.data.frame(sampleTab)
#need to create taxTab, and otuTab to build a phyloseq object
#create matrix for taxTab to be in phyloseq format
taxList <- unique(rbindlist(DataList,use.names = FALSE)$Classification)
taxListSpecies <- grepl("\\|s_",taxList)
nTax <- length(taxList)
OTUids <- paste0("OTU",seq(1,nTax))
taxTab = matrix(nrow = nTax, ncol = 7)
rownames(taxTab) <- OTUids
colnames(taxTab) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
for (i in 1:nTax){
taxTab[i,7] <- gsub("d__.*s__","",taxList[i],perl=TRUE)
restName <- gsub("\\|s__.*","",taxList[i])
taxTab[i,6] <- gsub("d__.*g__","",restName,perl=TRUE)
restName <- gsub("\\|g__.*","",restName)
taxTab[i,5] <- gsub("d__.*f__","",restName,perl=TRUE)
restName <- gsub("\\|f__.*","",restName)
taxTab[i,4] <- gsub("d__.*o__","",restName,perl=TRUE)
restName <- gsub("\\|o__.*","",restName)
taxTab[i,3] <- gsub("d__.*c__","",restName,perl=TRUE)
restName <- gsub("\\|c__.*","",restName)
taxTab[i,2] <- gsub("d__.*p__","",restName,perl=TRUE)
restName <- gsub("\\|p__.*","",restName)
taxTab[i,1] <- gsub("d__","",restName,perl=TRUE)
}
taxTab <- gsub("d__.*","",taxTab)
taxTab <- replace(taxTab, taxTab=='', NA)
#build final part of phyloseq class, a matrix of counts per sample per OTUid
otuDF <- data.frame(Classification=taxList, OTU=OTUids,Sorting=seq(1,nTax))
mySamples<-row.names(sampleTab)
for (i in 1:length(mySamples)){
otuDF<-merge(otuDF,DataList[[mySamples[i]]], by='Classification',all.x=TRUE)
}
row.names(otuDF)<-otuDF[,2]
otuDF <- otuDF[order(otuDF$Sorting), ]
otuDF<-otuDF[,-c(1,2,3)]
otuDF[is.na(otuDF)] <- 0
otuTab <- data.matrix(otuDF)
colnames(otuTab)<-row.names(sampleTab)
#convert to formal class objects
OTU = otu_table(otuTab, taxa_are_rows = TRUE)
TAX = tax_table(taxTab)
SAM = sample_data(sampleTab)
#create phyloseq object with sample data
physeq = phyloseq(OTU, TAX,SAM,warnings=FALSE)
remotes::install_github("statdivlab/corncob")
## Skipping install of 'corncob' from a github remote, the SHA1 (7c2c9634) has not changed since last install.
## Use `force = TRUE` to force installation
library(corncob)
library(phyloseq)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
set.seed(123)
DA_Gender <- differentialTest(formula = ~ Gender,
phi.formula = ~ Gender,
formula_null = ~ 1,
test = "Wald", boot = FALSE,
phi.formula_null = ~ 1,
data = physeq,
fdr_cutoff = 0.05)
## Error in if (is.nan(val) || any(phi <= sqrt(.Machine$double.eps)) || any(phi >= :
## missing value where TRUE/FALSE needed
library(tidyverse)
library(RColorBrewer)
byGender <- otuDF[DA_Gender$significant_taxa,]
library(data.table)
byGenderT <- transpose(byGender)
# get row and colnames in order
colnames(byGenderT) <- rownames(byGender)
newcol<- gsub("count","",colnames(byGender))
lookup <- as.data.frame(cbind(OTUids,taxList))
row.names(lookup)=lookup$OTUids
myTax <- lookup[DA_Gender$significant_taxa,]
rownames(byGenderT) <- newcol
byGenderM <- merge(sampleTab,byGenderT,by='row.names')
byGenderM %>%
pivot_longer(cols=rownames(byGender),
names_to = "OTU",
values_to = "Count") ->byGendertidy
byGendertidy2 <- merge(byGendertidy,lookup, by.x='OTU',by.y='OTUids')
byGendertidy3 <- byGendertidy2[grepl("\\|s",byGendertidy2$taxList),]
byGendertidy3$taxList <- gsub("d__.*s__","",byGendertidy3$taxList,perl=TRUE)
p<-byGendertidy3 %>%
ggplot(data =.,
aes(y = Gender,
x = taxList,
fill = Count))+
geom_tile()+
scale_fill_gradientn("Number of Reads",
colors = brewer.pal(11,name="Spectral"),
limits=c(0,175))+
theme(axis.text.x = element_text(angle = 90, hjust=1))+
labs(x="Taxonomic Classification",
y="OTU",
title="Differential Abundance by OTU")+
theme(plot.background = element_rect(fill="gray"),
legend.background = element_rect(fill="gray"))
p