Microbiome Thesis Example

R Markdown

This document shows the steps for pulling in folders of tabular reports from Kraken2 to do the following: -read all files into a single list object -create dataframes in the phyloseq-style (otuDF, otuTab, sampleTab, taxTab) -calculate Shannon Diversity Index, Richness, F:B ratio, etc for each sample -plot numerical measures on box plots by condition

#list folders/conditions to read in
#must setup "data" folder with a folder for each condition inside of it
myFolders <- list.files('data')

#read taxonomic data from .tabular files in myFolders
#make sampleTab from folder names or looking up info on NCBI
DataList <- list() #otu_table and tax_table basis
sampleTab <- matrix(c("Sample", "Gender", "COVID_Depression"), # data
  nrow = 0, # number of rows
  ncol = 3 ) #number of columns

## Warning in matrix(c("Sample", "Gender", "COVID_Depression"), nrow = 0, ncol =
## 3): non-empty data for zero-extent matrix

   #sample_data

#loop through folders
#set file names without .tabular to be sample names in DataList
for (folder in myFolders){
  foldername <- paste0('data/',folder)
  myFiles<-list.files(foldername)
  for(i in 1:length(myFiles)) { 
    input_name <- paste0(foldername,"/",myFiles[i])
    out_name <- gsub(" .tabular","",myFiles[i])
    col_name <- paste0(out_name,"count")
    df1 <- read.table(input_name,sep="\t",header=FALSE)
    names(df1)<-c('Classification',col_name)
    DataList[[out_name]] <-df1
    myRow <- c(out_name,strsplit(folder,"_")[[1]][1],strsplit(folder,"_")[[1]][2])
    sampleTab <-rbind(sampleTab,myRow)
  }
}

#create sampleTab as a dataframe that is convertible to phyloseq format
#sampleTab has information from survey like Gender that can be coded into folder names
#Clean up sampleTab to be in phyloseq format
row.names(sampleTab)<-sampleTab[,1]
sampleTab<-sampleTab[,-1]
colnames(sampleTab)<-c('Gender','Condition')
sampleTab <- as.data.frame(sampleTab)

#need to create taxTab, and otuTab to build a phyloseq object
#create matrix for taxTab to be in phyloseq format
taxList <- unique(rbindlist(DataList,use.names = FALSE)$Classification)
taxListSpecies <- grepl("\\|s_",taxList)
nTax <- length(taxList)
OTUids <- paste0("OTU",seq(1,nTax))
taxTab = matrix(nrow = nTax, ncol = 7)
rownames(taxTab) <- OTUids
colnames(taxTab) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
for (i in 1:nTax){
  taxTab[i,7] <- gsub("d__.*s__","",taxList[i],perl=TRUE)
  restName <- gsub("\\|s__.*","",taxList[i])
  taxTab[i,6] <- gsub("d__.*g__","",restName,perl=TRUE)
  restName <- gsub("\\|g__.*","",restName)
  taxTab[i,5] <- gsub("d__.*f__","",restName,perl=TRUE)
  restName <- gsub("\\|f__.*","",restName)
  taxTab[i,4] <- gsub("d__.*o__","",restName,perl=TRUE)
  restName <- gsub("\\|o__.*","",restName)
  taxTab[i,3] <- gsub("d__.*c__","",restName,perl=TRUE)
  restName <- gsub("\\|c__.*","",restName)
  taxTab[i,2] <- gsub("d__.*p__","",restName,perl=TRUE)
  restName <- gsub("\\|p__.*","",restName)
  taxTab[i,1] <- gsub("d__","",restName,perl=TRUE)
}
taxTab <- gsub("d__.*","",taxTab)
taxTab <- replace(taxTab, taxTab=='', NA)

#build final part of phyloseq class, a matrix of counts per sample per OTUid
otuDF <- data.frame(Classification=taxList, OTU=OTUids,Sorting=seq(1,nTax))
mySamples<-row.names(sampleTab)
for (i in 1:length(mySamples)){
  otuDF<-merge(otuDF,DataList[[mySamples[i]]], by='Classification',all.x=TRUE)
}
row.names(otuDF)<-otuDF[,2]
otuDF <- otuDF[order(otuDF$Sorting), ]
otuDF<-otuDF[,-c(1,2,3)]
otuDF[is.na(otuDF)] <- 0
otuTab <- data.matrix(otuDF)
colnames(otuTab)<-row.names(sampleTab)

#convert to formal class objects
OTU = otu_table(otuTab, taxa_are_rows = TRUE)
TAX = tax_table(taxTab)
SAM = sample_data(sampleTab)


#create phyloseq object with sample data
physeq = phyloseq(OTU, TAX,SAM,warnings=FALSE)

myS <- numeric()
SDI <-numeric()
SE <-numeric()
SVar <-numeric()
F_B <- numeric() #Firmicutes is OTU2 and Bacteroidetes is OTU320, change for your taxTab 

#Start by getting creating a list of species classifications
speciesTab <- as.data.frame(taxTab)
speciesTab<-subset(speciesTab,speciesTab$Species!='')
speciesCounts<-otuTab[row.names(speciesTab),]
for (i in 1:length(mySamples)){
  #subset to species rows only
  sRows <- speciesCounts[,i]
  sRows <- sRows[sRows!=0]
  #find richness/#species
  myS[i] <- length(sRows)
  #total counts and calc proportion, P
  myTotal <- sum(sRows)
  myP <- sRows/myTotal 
  
  #create new vector with p*logp
  myNewColumn = myP*log(myP)
  
  #create new vector with p*logp^
  myNew2 = myP*(log(myP))^2
  
  #Calculate Shannon Diversity Index
  SDI[i] <- -1*sum(myNewColumn)
  
  SS <- sum(myNew2)
  
  SQ <- (sum(myNewColumn))^2
  
  SE[i] <- SDI[i]/log(myS[i])
  
  var_pt1 <- (SS-SQ)/myTotal
  
  var_pt2 <- (myS[i]-1)/(2*myTotal^2)
  
  SVar[i] <- (var_pt1 + var_pt2)^0.5
  
  F_B[i]<- otuDF[2,i]/otuDF[320,i]  #Firmicutes is OTU2 and Bacteroidetes is OTU320, change for your taxTab 
}


myMeasures <- data.frame(ID=mySamples,
                         Gender=sampleTab$Gender,
                         Condition=sampleTab$Condition,
                         Richness=myS,
                         Shannon=SDI,
                         Evenness=SE,
                         Variance=SVar,
                         F_B_ratio=F_B)

ggplot(data = myMeasures, mapping = aes(x = Condition, y = Shannon, fill=Condition)) +
  geom_boxplot()

# Create a data frame
my_data <- data.frame( 
  group = myMeasures$Condition,
  measure = myMeasures$Shannon
)

group_by(my_data, group) %>%
  summarise(
    count = n(),
    mean = mean(measure, na.rm = TRUE),
    sd = sd(measure, na.rm = TRUE)
  )

## # A tibble: 2 × 4
##   group     count  mean    sd
##   <chr>     <int> <dbl> <dbl>
## 1 depressed    60  2.57 0.660
## 2 healthy      60  2.61 0.649

#To run statistics on two groups, you have two choices.  You can do a t-test if 
#you have a normally distributed set of data.  To do the t-test you also have to see 
#if you have equal or unequal variances.

# If you have normal distributions (p-value for both samples > 0.05), you can use 
#a t-test.  Shapiro-Wilks sees if you have normal distributions.
# Shapiro-Wilk normality test for celiac
with(my_data, shapiro.test(measure[group == "depressed"]))# p = 0.00078, not normal

## 
##  Shapiro-Wilk normality test
## 
## data:  measure[group == "depressed"]
## W = 0.92023, p-value = 0.0007834

# Shapiro-Wilk normality test for non-celiac
with(my_data, shapiro.test(measure[group == "healthy"])) # p = 0.7445

## 
##  Shapiro-Wilk normality test
## 
## data:  measure[group == "healthy"]
## W = 0.95503, p-value = 0.027

# For this data, both distributions are not normal so we can't use ttest to compare.
#res.ftest <- var.test(diversity ~ group, data = my_data)
#res.ftest #pvalue < 0.05 means variances are not equal, p-value=.6466 so set 
#var.equal =TRUE

#res <- t.test(SDI_celiac, SDI_nogluten, var.equal = TRUE) #change var.equal based on ftest
#res # p-value = 1.29E-5, fake data is highly significant

# If you don't have normal distributions (p-value for at least one sample less than 0.05)
# Use Wilcoxon test to compare. 
res <- wilcox.test(measure ~ group, data = my_data,
 exact = FALSE)
res #p-value=0.8358, not significant for Shannon Diversity

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  measure by group
## W = 1760, p-value = 0.8358
## alternative hypothesis: true location shift is not equal to 0

#Create single data frame of all biodiversity measures for all samples

myMeasures2 <- data.frame(ID=mySamples,
                         Group=paste0(sampleTab$Gender,sampleTab$Condition),
                         Richness=myS,
                         Shannon=SDI,
                         Evenness=SE,
                         Variance=SVar,
                         F_B_ratio=F_B)

#Plot distribution of variance for 4 groups with box plot

ggplot(data = myMeasures2, mapping = aes(x = Group, y = Variance, fill=Group)) +
  geom_boxplot()

#Use ANOVA to see if there are any significant variations btw groups

variance.aov <- aov(lm(Variance ~ Group, data = myMeasures2))
variance.aov  #Pr(>F) is significant so run post-hoc test to find which groups are sig different

## Call:
##    aov(formula = lm(Variance ~ Group, data = myMeasures2))
## 
## Terms:
##                       Group   Residuals
## Sum of Squares  0.000540435 0.004937618
## Deg. of Freedom           3         116
## 
## Residual standard error: 0.006524237
## Estimated effects may be unbalanced

#Perform post-hoc test

variance.tukey <- TukeyHSD(variance.aov)
variance.tukey #female healthy is significantly different from female depressed and male depressed

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = lm(Variance ~ Group, data = myMeasures2))
## 
## $Group
##                                        diff           lwr           upr
## femalehealthy-femaledepressed  0.0050999291  0.0007088695  0.0094909886
## maledepressed-femaledepressed -0.0001341167 -0.0045251763  0.0042569428
## malehealthy-femaledepressed    0.0011299298 -0.0032611298  0.0055209893
## maledepressed-femalehealthy   -0.0052340458 -0.0096251054 -0.0008429863
## malehealthy-femalehealthy     -0.0039699993 -0.0083610589  0.0004210602
## malehealthy-maledepressed      0.0012640465 -0.0031270130  0.0056551061
##                                   p adj
## femalehealthy-femaledepressed 0.0158608
## maledepressed-femaledepressed 0.9998180
## malehealthy-femaledepressed   0.9079228
## maledepressed-femalehealthy   0.0125417
## malehealthy-femalehealthy     0.0914323
## malehealthy-maledepressed     0.8763363

Microbiome Thesis Example

Jess Kaufman

2024-03-08

R Markdown