#STEP 1: Read in data from Galaxy Kraken 2 reports

#structure data folder so all files for each condition are in a folder with
#condition name
myFolders <- list.dirs("data")
Sample1<-list.files(myFolders[2])
Sample2<-list.files(myFolders[3])
Samples<-c(Sample1,Sample2)
Conditions<-c(rep(myFolders[2],3),rep(myFolders[3],3))

#Read in each tabular file as element in DataList
#make sure to change folder
DataList <- list()
for(i in 1:length(Samples)) { 
  input_name <- paste0("/cloud/project/",
                       Conditions[i],
                       "/",
                       Samples[i])
  out_name <- gsub(".tabular","",Samples[i],perl=TRUE)
  df1 <- read.table(input_name,sep="\t",header=FALSE)
  df1 <- cbind(gsub("data\\/","",Conditions[i],perl=TRUE),
               out_name,
               df1)
  names(df1)<-c('Condition','Sample','Classification','Count')
  DataList[[out_name]] <-df1
}

STEP 2: Diversity calculations for all samples

#loop to perform diversity calculations
myTidyResults <- data.frame()
#SamplesList<-names(DataList)
#SampleCount=0
for (sample in DataList){
  #SampleCount=SampleCount+1
  #part 1 of loop, find all values
  srows <- sample[grepl("\\|s", sample[["Classification"]]),]
  myS <- nrow(srows)
  myTotal <- sum(srows$Count)
  myP <- srows$Count/myTotal 
  myNewColumn = myP*log(myP)
  myNew2 = myP*log(myP)^2
  
  SDI <- -1*sum(myNewColumn)
  Sum2 <- sum(myNew2)
  
  Hmax <- -myTotal*(1/myTotal)*log(1/myTotal)
  
  SE <- SDI/log(myS)
  
  var_pt1 <- (Sum2-(SDI^2))/myTotal
  var_pt2 <- (myS-1)/(2*myTotal^2)
  SVar <- (var_pt1 + var_pt2)^0.5
  
  #part2 of loop, use rbind to put in dataframe
  myTidyResults <- rbind(myTidyResults,
                         c(sample[1,2],
                           sample[1,1],
                           'NumSpecies',
                            myS))
  myTidyResults <- rbind(myTidyResults,
                         c(sample[1,2],
                           sample[1,1],
                           'SDI',
                           SDI))
  myTidyResults <- rbind(myTidyResults,
                         c(sample[1,2],
                           sample[1,1],
                           'Evenness',
                           SE))
  myTidyResults <- rbind(myTidyResults,
                         c(sample[1,2],
                           sample[1,1],
                           'Variance',
                           SVar))
}

#separate loop for F:B ratio
for (sample in DataList){
  #part 1 of loop, find value
  b_value <- sample[sample$Classification=="d__Bacteria|p__Bacteroidetes",4]
  f_value <- sample[sample$Classification=="d__Bacteria|p__Firmicutes",4]
  f_b <-f_value/b_value

  #part2 of loop, use rbind to put in dataframe
  myTidyResults <- rbind(myTidyResults,
                         c(sample[1,2],
                           sample[1,1],
                           'F_B',
                           f_b))
}

names(myTidyResults) <- c('Sample','Condition','Measurement','Value')
myTidyResults$Value<-as.numeric(myTidyResults$Value)

STEP 3: Compare each measure using box plot

myWide <- myTidyResults %>%
  pivot_wider(names_from = Measurement, values_from = Value)
#Change y to see other measurements of diversity
ggplot(myWide, aes(x = Condition, y = NumSpecies)) +
  geom_boxplot()

## STEP 4: Make tibble of mean and standard deviation for each measure

my_data <- data.frame( 
  group = myWide$Condition,
  measure = myWide$NumSpecies #change for each diversity measure
)

group_by(my_data, group) %>%
  summarise(
    count = n(),
    mean = mean(measure, na.rm = TRUE),
    sd = sd(measure, na.rm = TRUE)
  )
## # A tibble: 2 × 4
##   group count  mean    sd
##   <chr> <int> <dbl> <dbl>
## 1 Daily     3  117  10.1 
## 2 No        3  147.  4.16

STEP 5: Calculate if measure is statistically significant btw groups

#To run statistics on two groups, you have two choices.  You can do a t-test if 
#you have a normally distributed set of data.  To do the t-test you also have to see 
#if you have equal or unequal variances.

# If you have normal distributions (p-value for both samples > 0.05), you can use 
#a t-test.  Shapiro-Wilks sees if you have normal distributions.
# Shapiro-Wilk normality test for daily toothbrushing
with(my_data, shapiro.test(measure[group == "Daily"]))# p = 0.6725, normal
## 
##  Shapiro-Wilk normality test
## 
## data:  measure[group == "Daily"]
## W = 0.97087, p-value = 0.6725
# Shapiro-Wilk normality test for no toothbrushing
with(my_data, shapiro.test(measure[group == "No"])) # p = 0.4633
## 
##  Shapiro-Wilk normality test
## 
## data:  measure[group == "No"]
## W = 0.92308, p-value = 0.4633
# For this example data, both distributions are normal so use ttest to compare
res.ftest <- var.test(measure ~ group, data = my_data)
res.ftest #pvalue < 0.05 means variances are not equal, p-value=0.2881 so set 
## 
##  F test to compare two variances
## 
## data:  measure by group
## F = 5.9423, num df = 2, denom df = 2, p-value = 0.2881
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##    0.1523669 231.7500000
## sample estimates:
## ratio of variances 
##           5.942308
#var.equal =TRUE

#make two vectors with measurement for each condition
res <- t.test(my_data[my_data$group=="Daily",2], 
              my_data[my_data$group=="No",2], 
              var.equal = TRUE) #change var.equal based on ftest
res # p-value = 0.008715, there is a significant difference
## 
##  Two Sample t-test
## 
## data:  my_data[my_data$group == "Daily", 2] and my_data[my_data$group == "No", 2]
## t = -4.7895, df = 4, p-value = 0.008715
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -47.91749 -12.74918
## sample estimates:
## mean of x mean of y 
##  117.0000  147.3333
# If you don't have normal distributions (p-value for one sample less than 0.05)
# Use Wilcoxon test to compare. 
#res <- wilcox.test(distance ~ group, data = my_data,
# exact = FALSE)
#res #p-value=0.008562, 

STEP 6: Taxonomic Bar Plots

allClassifications <- bind_rows(DataList)
allClassifications <- allClassifications[!grepl("\\|s",
                                            allClassifications[["Classification"]]),]
allClassifications<- allClassifications[!grepl("\\|g",
                                            allClassifications[["Classification"]]),]
allClassifications <- allClassifications[!grepl("\\|f",
                                            allClassifications[["Classification"]]),]
allClassifications <- allClassifications[!grepl("\\|o",
                                            allClassifications[["Classification"]]),]
allClassifications <- allClassifications[!grepl("\\|c",
                                            allClassifications[["Classification"]]),]
phylumClassifications <- allClassifications[grepl("d__Bacteria\\|p__.*",
                                          allClassifications[["Classification"]]),]
names(phylumClassifications)[3]<-"Phylum"
phylumClassifications$Phylum <-gsub("d__Bacteria\\|p__","",phylumClassifications$Phylum,perl=TRUE)

#Plot multiple samples by ERR on single taxonomic bar plot - all phylum 
ggplot(phylumClassifications, aes(x = Sample, y = Count, fill = Phylum)) + 
  geom_bar(stat = "identity") + 
  theme(legend.position="none")+ 
  theme(axis.text.x = element_text(angle = 60, vjust = 0.5, hjust=1))

#Sum all reads on a single taxonomic bar plot - all phylum 
ggplot(phylumClassifications, aes(x= Condition, y = Count, fill = Phylum)) + 
  geom_bar(stat = "identity") + 
  theme(legend.position="right")

STEP 7: Define read survey function

readSurvey <- function(x){
  read_html(GET(paste0("https://www.ncbi.nlm.nih.gov/biosample/?term=",x),timeout(30)))%>% 
    html_node("table") %>%
    html_table()
}

STEP 8: Download survey data

library(rentrez)
library(rvest)
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(httr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
allSurveys<-data.frame(matrix(ncol = 4, nrow = 0))
for (i in 1:nrow(myWide)){
  r_search <- entrez_search(db="sra", term=myWide$Sample[i])
  all_the_links <- entrez_link(dbfrom='sra', id=r_search$ids, db='biosample')
  taxize_summ <- entrez_summary(db="biosample", id=all_the_links$links$sra_biosample)
  demodata<-readSurvey(taxize_summ$accession)
  demodata<-cbind(myWide$Sample[i],
        myWide$Condition[i],
        demodata)
  allSurveys<-rbind(allSurveys,demodata)
}
names(allSurveys)<-c('Sample','Condition','Question','Response')

STEP 9: Plot one measure from survey

myQuestion <- allSurveys[allSurveys$Question=="height cm",]
myQuestion$Response <- as.numeric(myQuestion$Response)
#Change y to see other measurements of diversity
ggplot(myQuestion, aes(x = Condition, y = Response)) +
  geom_boxplot()