DA_Heatmap

R Markdown

This document shows the steps for pulling in folders of tabular reports from Kraken2 to do the following: -read all files into a single list object -create dataframes in the phyloseq-style (otuDF, otuTab, sampleTab, taxTab) -calculate Shannon Diversity Index, Richness, F:B ratio, etc for each sample -plot numerical measures on box plots by condition

#list folders/conditions to read in
#must setup "data" folder with a folder for each condition inside of it
myFolders <- list.files('data')

#read taxonomic data from .tabular files in myFolders
#make sampleTab from folder names or looking up info on NCBI
DataList <- list() #otu_table and tax_table basis
sampleTab <- matrix(c("Sample", "Gender", "COVID_Depression"), # data
  nrow = 0, # number of rows
  ncol = 3 ) #number of columns

## Warning in matrix(c("Sample", "Gender", "COVID_Depression"), nrow = 0, ncol =
## 3): non-empty data for zero-extent matrix

   #sample_data

#loop through folders
#set file names without .tabular to be sample names in DataList
for (folder in myFolders){
  foldername <- paste0('data/',folder)
  myFiles<-list.files(foldername)
  for(i in 1:length(myFiles)) { 
    input_name <- paste0(foldername,"/",myFiles[i])
    out_name <- gsub(" .tabular","",myFiles[i])
    col_name <- paste0(out_name,"count")
    df1 <- read.table(input_name,sep="\t",header=FALSE)
    names(df1)<-c('Classification',col_name)
    DataList[[out_name]] <-df1
    myRow <- c(out_name,strsplit(folder,"_")[[1]][1],strsplit(folder,"_")[[1]][2])
    sampleTab <-rbind(sampleTab,myRow)
  }
}

#create sampleTab as a dataframe that is convertible to phyloseq format
#sampleTab has information from survey like Gender that can be coded into folder names
#Clean up sampleTab to be in phyloseq format
row.names(sampleTab)<-sampleTab[,1]
sampleTab<-sampleTab[,-1]
colnames(sampleTab)<-c('Gender','Condition')
sampleTab <- as.data.frame(sampleTab)

#need to create taxTab, and otuTab to build a phyloseq object
#create matrix for taxTab to be in phyloseq format
taxList <- unique(rbindlist(DataList,use.names = FALSE)$Classification)
taxListSpecies <- grepl("\\|s_",taxList)
nTax <- length(taxList)
OTUids <- paste0("OTU",seq(1,nTax))
taxTab = matrix(nrow = nTax, ncol = 7)
rownames(taxTab) <- OTUids
colnames(taxTab) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
for (i in 1:nTax){
  taxTab[i,7] <- gsub("d__.*s__","",taxList[i],perl=TRUE)
  restName <- gsub("\\|s__.*","",taxList[i])
  taxTab[i,6] <- gsub("d__.*g__","",restName,perl=TRUE)
  restName <- gsub("\\|g__.*","",restName)
  taxTab[i,5] <- gsub("d__.*f__","",restName,perl=TRUE)
  restName <- gsub("\\|f__.*","",restName)
  taxTab[i,4] <- gsub("d__.*o__","",restName,perl=TRUE)
  restName <- gsub("\\|o__.*","",restName)
  taxTab[i,3] <- gsub("d__.*c__","",restName,perl=TRUE)
  restName <- gsub("\\|c__.*","",restName)
  taxTab[i,2] <- gsub("d__.*p__","",restName,perl=TRUE)
  restName <- gsub("\\|p__.*","",restName)
  taxTab[i,1] <- gsub("d__","",restName,perl=TRUE)
}
taxTab <- gsub("d__.*","",taxTab)
taxTab <- replace(taxTab, taxTab=='', NA)

#build final part of phyloseq class, a matrix of counts per sample per OTUid
otuDF <- data.frame(Classification=taxList, OTU=OTUids,Sorting=seq(1,nTax))
mySamples<-row.names(sampleTab)
for (i in 1:length(mySamples)){
  otuDF<-merge(otuDF,DataList[[mySamples[i]]], by='Classification',all.x=TRUE)
}
row.names(otuDF)<-otuDF[,2]
otuDF <- otuDF[order(otuDF$Sorting), ]
otuDF<-otuDF[,-c(1,2,3)]
otuDF[is.na(otuDF)] <- 0
otuTab <- data.matrix(otuDF)
colnames(otuTab)<-row.names(sampleTab)

#convert to formal class objects
OTU = otu_table(otuTab, taxa_are_rows = TRUE)
TAX = tax_table(taxTab)
SAM = sample_data(sampleTab)


#create phyloseq object with sample data
physeq = phyloseq(OTU, TAX,SAM,warnings=FALSE)

remotes::install_github("statdivlab/corncob")

## Skipping install of 'corncob' from a github remote, the SHA1 (7c2c9634) has not changed since last install.
##   Use `force = TRUE` to force installation

library(corncob)
library(phyloseq)
library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

set.seed(123)
DA_Gender <- differentialTest(formula = ~ Gender,
                        phi.formula = ~ Gender,
                        formula_null = ~ 1,
                        test = "Wald", boot = FALSE,
                        phi.formula_null = ~ 1,
                        data = physeq,
                        fdr_cutoff = 0.05)

## Error in if (is.nan(val) || any(phi <= sqrt(.Machine$double.eps)) || any(phi >=  : 
##   missing value where TRUE/FALSE needed

library(tidyverse)
library(RColorBrewer)
byGender <- otuDF[DA_Gender$significant_taxa,]
library(data.table)
byGenderT <- transpose(byGender)
# get row and colnames in order
colnames(byGenderT) <- rownames(byGender)
newcol<- gsub("count","",colnames(byGender))
lookup <- as.data.frame(cbind(OTUids,taxList))
row.names(lookup)=lookup$OTUids
myTax <- lookup[DA_Gender$significant_taxa,]
rownames(byGenderT) <- newcol
byGenderM <- merge(sampleTab,byGenderT,by='row.names')

byGenderM  %>%
    pivot_longer(cols=rownames(byGender),
    names_to = "OTU", 
    values_to = "Count") ->byGendertidy

byGendertidy2 <- merge(byGendertidy,lookup, by.x='OTU',by.y='OTUids')
byGendertidy3 <- byGendertidy2[grepl("\\|s",byGendertidy2$taxList),]
byGendertidy3$taxList <- gsub("d__.*s__","",byGendertidy3$taxList,perl=TRUE)

p<-byGendertidy3 %>%
    ggplot(data =.,
           aes(y = Gender,
               x = taxList,
               fill = Count))+
    geom_tile()+
    scale_fill_gradientn("Number of Reads",
                         colors = brewer.pal(11,name="Spectral"),
                         limits=c(0,175))+
    theme(axis.text.x = element_text(angle = 90, hjust=1))+
    labs(x="Taxonomic Classification",
         y="OTU",
         title="Differential Abundance by OTU")+
    theme(plot.background = element_rect(fill="gray"),
          legend.background = element_rect(fill="gray"))
p

DA_Heatmap

Jess Kaufman

2024-03-14

R Markdown