#STEP 1: Read in data from Galaxy Kraken 2 reports
#structure data folder so all files for each condition are in a folder with
#condition name
myFolders <- list.dirs("data")
Sample1<-list.files(myFolders[2])
Sample2<-list.files(myFolders[3])
Samples<-c(Sample1,Sample2)
Conditions<-c(rep(myFolders[2],3),rep(myFolders[3],3))
#Read in each tabular file as element in DataList
#make sure to change folder
DataList <- list()
for(i in 1:length(Samples)) {
input_name <- paste0("/cloud/project/",
Conditions[i],
"/",
Samples[i])
out_name <- gsub(".tabular","",Samples[i],perl=TRUE)
df1 <- read.table(input_name,sep="\t",header=FALSE)
df1 <- cbind(gsub("data\\/","",Conditions[i],perl=TRUE),
out_name,
df1)
names(df1)<-c('Condition','Sample','Classification','Count')
DataList[[out_name]] <-df1
}
#loop to perform diversity calculations
myTidyResults <- data.frame()
#SamplesList<-names(DataList)
#SampleCount=0
for (sample in DataList){
#SampleCount=SampleCount+1
#part 1 of loop, find all values
srows <- sample[grepl("\\|s", sample[["Classification"]]),]
myS <- nrow(srows)
myTotal <- sum(srows$Count)
myP <- srows$Count/myTotal
myNewColumn = myP*log(myP)
myNew2 = myP*log(myP)^2
SDI <- -1*sum(myNewColumn)
Sum2 <- sum(myNew2)
Hmax <- -myTotal*(1/myTotal)*log(1/myTotal)
SE <- SDI/log(myS)
var_pt1 <- (Sum2-(SDI^2))/myTotal
var_pt2 <- (myS-1)/(2*myTotal^2)
SVar <- (var_pt1 + var_pt2)^0.5
#part2 of loop, use rbind to put in dataframe
myTidyResults <- rbind(myTidyResults,
c(sample[1,2],
sample[1,1],
'NumSpecies',
myS))
myTidyResults <- rbind(myTidyResults,
c(sample[1,2],
sample[1,1],
'SDI',
SDI))
myTidyResults <- rbind(myTidyResults,
c(sample[1,2],
sample[1,1],
'Evenness',
SE))
myTidyResults <- rbind(myTidyResults,
c(sample[1,2],
sample[1,1],
'Variance',
SVar))
}
#separate loop for F:B ratio
for (sample in DataList){
#part 1 of loop, find value
b_value <- sample[sample$Classification=="d__Bacteria|p__Bacteroidetes",4]
f_value <- sample[sample$Classification=="d__Bacteria|p__Firmicutes",4]
f_b <-f_value/b_value
#part2 of loop, use rbind to put in dataframe
myTidyResults <- rbind(myTidyResults,
c(sample[1,2],
sample[1,1],
'F_B',
f_b))
}
names(myTidyResults) <- c('Sample','Condition','Measurement','Value')
myTidyResults$Value<-as.numeric(myTidyResults$Value)
myWide <- myTidyResults %>%
pivot_wider(names_from = Measurement, values_from = Value)
#Change y to see other measurements of diversity
ggplot(myWide, aes(x = Condition, y = NumSpecies)) +
geom_boxplot()
## STEP 4: Make tibble of mean and standard deviation for each
measure
my_data <- data.frame(
group = myWide$Condition,
measure = myWide$NumSpecies #change for each diversity measure
)
group_by(my_data, group) %>%
summarise(
count = n(),
mean = mean(measure, na.rm = TRUE),
sd = sd(measure, na.rm = TRUE)
)
## # A tibble: 2 × 4
## group count mean sd
## <chr> <int> <dbl> <dbl>
## 1 Daily 3 117 10.1
## 2 No 3 147. 4.16
#To run statistics on two groups, you have two choices. You can do a t-test if
#you have a normally distributed set of data. To do the t-test you also have to see
#if you have equal or unequal variances.
# If you have normal distributions (p-value for both samples > 0.05), you can use
#a t-test. Shapiro-Wilks sees if you have normal distributions.
# Shapiro-Wilk normality test for daily toothbrushing
with(my_data, shapiro.test(measure[group == "Daily"]))# p = 0.6725, normal
##
## Shapiro-Wilk normality test
##
## data: measure[group == "Daily"]
## W = 0.97087, p-value = 0.6725
# Shapiro-Wilk normality test for no toothbrushing
with(my_data, shapiro.test(measure[group == "No"])) # p = 0.4633
##
## Shapiro-Wilk normality test
##
## data: measure[group == "No"]
## W = 0.92308, p-value = 0.4633
# For this example data, both distributions are normal so use ttest to compare
res.ftest <- var.test(measure ~ group, data = my_data)
res.ftest #pvalue < 0.05 means variances are not equal, p-value=0.2881 so set
##
## F test to compare two variances
##
## data: measure by group
## F = 5.9423, num df = 2, denom df = 2, p-value = 0.2881
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.1523669 231.7500000
## sample estimates:
## ratio of variances
## 5.942308
#var.equal =TRUE
#make two vectors with measurement for each condition
res <- t.test(my_data[my_data$group=="Daily",2],
my_data[my_data$group=="No",2],
var.equal = TRUE) #change var.equal based on ftest
res # p-value = 0.008715, there is a significant difference
##
## Two Sample t-test
##
## data: my_data[my_data$group == "Daily", 2] and my_data[my_data$group == "No", 2]
## t = -4.7895, df = 4, p-value = 0.008715
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -47.91749 -12.74918
## sample estimates:
## mean of x mean of y
## 117.0000 147.3333
# If you don't have normal distributions (p-value for one sample less than 0.05)
# Use Wilcoxon test to compare.
#res <- wilcox.test(distance ~ group, data = my_data,
# exact = FALSE)
#res #p-value=0.008562,
allClassifications <- bind_rows(DataList)
allClassifications <- allClassifications[!grepl("\\|s",
allClassifications[["Classification"]]),]
allClassifications<- allClassifications[!grepl("\\|g",
allClassifications[["Classification"]]),]
allClassifications <- allClassifications[!grepl("\\|f",
allClassifications[["Classification"]]),]
allClassifications <- allClassifications[!grepl("\\|o",
allClassifications[["Classification"]]),]
allClassifications <- allClassifications[!grepl("\\|c",
allClassifications[["Classification"]]),]
phylumClassifications <- allClassifications[grepl("d__Bacteria\\|p__.*",
allClassifications[["Classification"]]),]
names(phylumClassifications)[3]<-"Phylum"
phylumClassifications$Phylum <-gsub("d__Bacteria\\|p__","",phylumClassifications$Phylum,perl=TRUE)
#Plot multiple samples by ERR on single taxonomic bar plot - all phylum
ggplot(phylumClassifications, aes(x = Sample, y = Count, fill = Phylum)) +
geom_bar(stat = "identity") +
theme(legend.position="none")+
theme(axis.text.x = element_text(angle = 60, vjust = 0.5, hjust=1))
#Sum all reads on a single taxonomic bar plot - all phylum
ggplot(phylumClassifications, aes(x= Condition, y = Count, fill = Phylum)) +
geom_bar(stat = "identity") +
theme(legend.position="right")
readSurvey <- function(x){
read_html(GET(paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=",x),timeout(30)))%>%
html_node("table") %>%
html_table()
}
library(rentrez)
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(httr)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
allSurveys<-data.frame(matrix(ncol = 4, nrow = 0))
for (i in 1:nrow(myWide)){
r_search <- entrez_search(db="sra", term=myWide$Sample[i])
all_the_links <- entrez_link(dbfrom='sra', id=r_search$ids, db='biosample')
taxize_summ <- entrez_summary(db="biosample", id=all_the_links$links$sra_biosample)
demodata<-readSurvey(taxize_summ$accession)
demodata<-cbind(myWide$Sample[i],
myWide$Condition[i],
demodata)
allSurveys<-rbind(allSurveys,demodata)
}
names(allSurveys)<-c('Sample','Condition','Question','Response')
myQuestion <- allSurveys[allSurveys$Question=="height cm",]
myQuestion$Response <- as.numeric(myQuestion$Response)
#Change y to see other measurements of diversity
ggplot(myQuestion, aes(x = Condition, y = Response)) +
geom_boxplot()