#knitr::opts_chunk$set(echo = FALSE)
#Install tools
#tidyverse and data.table from CRAN
#phyloseq from Bioconductor (run lines 9-12 to install)
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("phyloseq")
#load libraries
library(tidyverse, quietly = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(phyloseq, quietly = TRUE)
library(data.table, quietly = TRUE)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
This document shows the steps for pulling in folders of tabular reports from Kraken2 to do the following: -read all files into a single list object -create dataframes in the phyloseq-style (otuDF, otuTab, sampleTab, taxTab) -calculate Shannon Diversity Index, Richness, F:B ratio, etc for each sample -plot numerical measures on box plots by condition
#list folders/conditions to read in
#must setup "data" folder with a folder for each condition inside of it
myFolders <- list.files('data')
#read taxonomic data from .tabular files in myFolders
#make sampleTab from folder names or looking up info on NCBI
DataList <- list() #otu_table and tax_table basis
sampleTab <- matrix(c("Sample", "Gender", "COVID_Depression"), # data
nrow = 0, # number of rows
ncol = 3 ) #number of columns
## Warning in matrix(c("Sample", "Gender", "COVID_Depression"), nrow = 0, ncol =
## 3): non-empty data for zero-extent matrix
#sample_data
#loop through folders
#set file names without .tabular to be sample names in DataList
for (folder in myFolders){
foldername <- paste0('data/',folder)
myFiles<-list.files(foldername)
for(i in 1:length(myFiles)) {
input_name <- paste0(foldername,"/",myFiles[i])
out_name <- gsub(" .tabular","",myFiles[i])
col_name <- paste0(out_name,"count")
df1 <- read.table(input_name,sep="\t",header=FALSE)
names(df1)<-c('Classification',col_name)
DataList[[out_name]] <-df1
myRow <- c(out_name,strsplit(folder,"_")[[1]][1],strsplit(folder,"_")[[1]][2])
sampleTab <-rbind(sampleTab,myRow)
}
}
#create sampleTab as a dataframe that is convertible to phyloseq format
#sampleTab has information from survey like Gender that can be coded into folder names
#Clean up sampleTab to be in phyloseq format
row.names(sampleTab)<-sampleTab[,1]
sampleTab<-sampleTab[,-1]
colnames(sampleTab)<-c('Gender','Condition')
sampleTab <- as.data.frame(sampleTab)
#need to create taxTab, and otuTab to build a phyloseq object
#create matrix for taxTab to be in phyloseq format
taxList <- unique(rbindlist(DataList,use.names = FALSE)$Classification)
taxListSpecies <- grepl("\\|s_",taxList)
nTax <- length(taxList)
OTUids <- paste0("OTU",seq(1,nTax))
taxTab = matrix(nrow = nTax, ncol = 7)
rownames(taxTab) <- OTUids
colnames(taxTab) <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species")
for (i in 1:nTax){
taxTab[i,7] <- gsub("d__.*s__","",taxList[i],perl=TRUE)
restName <- gsub("\\|s__.*","",taxList[i])
taxTab[i,6] <- gsub("d__.*g__","",restName,perl=TRUE)
restName <- gsub("\\|g__.*","",restName)
taxTab[i,5] <- gsub("d__.*f__","",restName,perl=TRUE)
restName <- gsub("\\|f__.*","",restName)
taxTab[i,4] <- gsub("d__.*o__","",restName,perl=TRUE)
restName <- gsub("\\|o__.*","",restName)
taxTab[i,3] <- gsub("d__.*c__","",restName,perl=TRUE)
restName <- gsub("\\|c__.*","",restName)
taxTab[i,2] <- gsub("d__.*p__","",restName,perl=TRUE)
restName <- gsub("\\|p__.*","",restName)
taxTab[i,1] <- gsub("d__","",restName,perl=TRUE)
}
taxTab <- gsub("d__.*","",taxTab)
taxTab <- replace(taxTab, taxTab=='', NA)
#build final part of phyloseq class, a matrix of counts per sample per OTUid
otuDF <- data.frame(Classification=taxList, OTU=OTUids,Sorting=seq(1,nTax))
mySamples<-row.names(sampleTab)
for (i in 1:length(mySamples)){
otuDF<-merge(otuDF,DataList[[mySamples[i]]], by='Classification',all.x=TRUE)
}
row.names(otuDF)<-otuDF[,2]
otuDF <- otuDF[order(otuDF$Sorting), ]
otuDF<-otuDF[,-c(1,2,3)]
otuDF[is.na(otuDF)] <- 0
otuTab <- data.matrix(otuDF)
colnames(otuTab)<-row.names(sampleTab)
#convert to formal class objects
OTU = otu_table(otuTab, taxa_are_rows = TRUE)
TAX = tax_table(taxTab)
SAM = sample_data(sampleTab)
#create phyloseq object with sample data
physeq = phyloseq(OTU, TAX,SAM,warnings=FALSE)
myS <- numeric()
SDI <-numeric()
SE <-numeric()
SVar <-numeric()
F_B <- numeric() #Firmicutes is OTU2 and Bacteroidetes is OTU320, change for your taxTab
#Start by getting creating a list of species classifications
speciesTab <- as.data.frame(taxTab)
speciesTab<-subset(speciesTab,speciesTab$Species!='')
speciesCounts<-otuTab[row.names(speciesTab),]
for (i in 1:length(mySamples)){
#subset to species rows only
sRows <- speciesCounts[,i]
sRows <- sRows[sRows!=0]
#find richness/#species
myS[i] <- length(sRows)
#total counts and calc proportion, P
myTotal <- sum(sRows)
myP <- sRows/myTotal
#create new vector with p*logp
myNewColumn = myP*log(myP)
#create new vector with p*logp^
myNew2 = myP*(log(myP))^2
#Calculate Shannon Diversity Index
SDI[i] <- -1*sum(myNewColumn)
SS <- sum(myNew2)
SQ <- (sum(myNewColumn))^2
SE[i] <- SDI[i]/log(myS[i])
var_pt1 <- (SS-SQ)/myTotal
var_pt2 <- (myS[i]-1)/(2*myTotal^2)
SVar[i] <- (var_pt1 + var_pt2)^0.5
F_B[i]<- otuDF[2,i]/otuDF[320,i] #Firmicutes is OTU2 and Bacteroidetes is OTU320, change for your taxTab
}
myMeasures <- data.frame(ID=mySamples,
Gender=sampleTab$Gender,
Condition=sampleTab$Condition,
Richness=myS,
Shannon=SDI,
Evenness=SE,
Variance=SVar,
F_B_ratio=F_B)
ggplot(data = myMeasures, mapping = aes(x = Condition, y = Shannon, fill=Condition)) +
geom_boxplot()
# Create a data frame
my_data <- data.frame(
group = myMeasures$Condition,
measure = myMeasures$Shannon
)
group_by(my_data, group) %>%
summarise(
count = n(),
mean = mean(measure, na.rm = TRUE),
sd = sd(measure, na.rm = TRUE)
)
## # A tibble: 2 × 4
## group count mean sd
## <chr> <int> <dbl> <dbl>
## 1 depressed 60 2.57 0.660
## 2 healthy 60 2.61 0.649
#To run statistics on two groups, you have two choices. You can do a t-test if
#you have a normally distributed set of data. To do the t-test you also have to see
#if you have equal or unequal variances.
# If you have normal distributions (p-value for both samples > 0.05), you can use
#a t-test. Shapiro-Wilks sees if you have normal distributions.
# Shapiro-Wilk normality test for celiac
with(my_data, shapiro.test(measure[group == "depressed"]))# p = 0.00078, not normal
##
## Shapiro-Wilk normality test
##
## data: measure[group == "depressed"]
## W = 0.92023, p-value = 0.0007834
# Shapiro-Wilk normality test for non-celiac
with(my_data, shapiro.test(measure[group == "healthy"])) # p = 0.7445
##
## Shapiro-Wilk normality test
##
## data: measure[group == "healthy"]
## W = 0.95503, p-value = 0.027
# For this data, both distributions are not normal so we can't use ttest to compare.
#res.ftest <- var.test(diversity ~ group, data = my_data)
#res.ftest #pvalue < 0.05 means variances are not equal, p-value=.6466 so set
#var.equal =TRUE
#res <- t.test(SDI_celiac, SDI_nogluten, var.equal = TRUE) #change var.equal based on ftest
#res # p-value = 1.29E-5, fake data is highly significant
# If you don't have normal distributions (p-value for at least one sample less than 0.05)
# Use Wilcoxon test to compare.
res <- wilcox.test(measure ~ group, data = my_data,
exact = FALSE)
res #p-value=0.8358, not significant for Shannon Diversity
##
## Wilcoxon rank sum test with continuity correction
##
## data: measure by group
## W = 1760, p-value = 0.8358
## alternative hypothesis: true location shift is not equal to 0
#Create single data frame of all biodiversity measures for all samples
myMeasures2 <- data.frame(ID=mySamples,
Group=paste0(sampleTab$Gender,sampleTab$Condition),
Richness=myS,
Shannon=SDI,
Evenness=SE,
Variance=SVar,
F_B_ratio=F_B)
#Plot distribution of variance for 4 groups with box plot
ggplot(data = myMeasures2, mapping = aes(x = Group, y = Variance, fill=Group)) +
geom_boxplot()
#Use ANOVA to see if there are any significant variations btw groups
variance.aov <- aov(lm(Variance ~ Group, data = myMeasures2))
variance.aov #Pr(>F) is significant so run post-hoc test to find which groups are sig different
## Call:
## aov(formula = lm(Variance ~ Group, data = myMeasures2))
##
## Terms:
## Group Residuals
## Sum of Squares 0.000540435 0.004937618
## Deg. of Freedom 3 116
##
## Residual standard error: 0.006524237
## Estimated effects may be unbalanced
#Perform post-hoc test
variance.tukey <- TukeyHSD(variance.aov)
variance.tukey #female healthy is significantly different from female depressed and male depressed
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = lm(Variance ~ Group, data = myMeasures2))
##
## $Group
## diff lwr upr
## femalehealthy-femaledepressed 0.0050999291 0.0007088695 0.0094909886
## maledepressed-femaledepressed -0.0001341167 -0.0045251763 0.0042569428
## malehealthy-femaledepressed 0.0011299298 -0.0032611298 0.0055209893
## maledepressed-femalehealthy -0.0052340458 -0.0096251054 -0.0008429863
## malehealthy-femalehealthy -0.0039699993 -0.0083610589 0.0004210602
## malehealthy-maledepressed 0.0012640465 -0.0031270130 0.0056551061
## p adj
## femalehealthy-femaledepressed 0.0158608
## maledepressed-femaledepressed 0.9998180
## malehealthy-femaledepressed 0.9079228
## maledepressed-femalehealthy 0.0125417
## malehealthy-femalehealthy 0.0914323
## malehealthy-maledepressed 0.8763363