Getting things set up in R

Load packages and set working directory in R.

#clear environment
rm(list=ls())

#install phyloseq if needed
# source("https://raw.githubusercontent.com/joey711/phyloseq/master/inst/scripts/installer.R",
#        local = TRUE)
# 
# install_phyloseq(branch = "github")

#load packages
library(phyloseq)
library(ggplot2)
library(readxl)

#set working directory
setwd("/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/Scripts/for_zach")

#set global options
knitr::opts_chunk$set(echo = TRUE,fig.width=11)



Read in data: sintax_taxonomy assignment predictions

#read in sintax_taxonomy predictions
taxa_predictions_1<-read.table(file="/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/zach_16s_pipeline-master/pipeline_output/sintax_taxonomy_predictions.txt",sep="\t",fill=TRUE)

taxa_predictions<-taxa_predictions_1[,-3:-5]
colnames(taxa_predictions)<-c("AmpliconSequenceVariant","Taxa")

taxa_predictions<-taxa_predictions[ taxa_predictions$Taxa != "",]

head(taxa_predictions)
##    AmpliconSequenceVariant
## 7                    Asv14
## 10                    Asv1
## 11                    Asv6
## 12                    Asv2
## 13                   Asv17
## 14                    Asv5
##                                                                                                                                                                     Taxa
## 7                d:bacteria(1.00),p:proteobacteria(1.00),c:betaproteobacteria(1.00),o:burkholderiales(1.00),f:oxalobacteraceae(1.00),g:massilia(0.99),s:consociata(0.44)
## 10                      d:bacteria(1.00),p:tenericutes(1.00),c:mollicutes(1.00),o:entomoplasmatales(1.00),f:spiroplasmataceae(1.00),g:spiroplasma(1.00),s:ixodetis(1.00)
## 11                      d:bacteria(1.00),p:tenericutes(1.00),c:mollicutes(1.00),o:entomoplasmatales(1.00),f:spiroplasmataceae(1.00),g:spiroplasma(1.00),s:ixodetis(0.94)
## 12 d:bacteria(1.00),p:proteobacteria(1.00),c:alphaproteobacteria(1.00),o:rhodospirillales(1.00),f:acetobacteraceae(1.00),g:gluconobacter(0.27),s:kanchanaburiensis(0.24)
## 13         d:bacteria(1.00),p:bacteroidetes(1.00),c:sphingobacteriia(1.00),o:sphingobacteriales(1.00),f:sphingobacteriaceae(1.00),g:pedobacter(1.00),s:antarcticus(0.56)
## 14             d:bacteria(1.00),p:proteobacteria(1.00),c:alphaproteobacteria(1.00),o:rhizobiales(0.99),f:methylocystaceae(0.24),g:methylocystis(0.16),s:echinoides(0.15)



Format data

taxa_pred.df<-taxa_predictions

#create avs.df to merge later
ASV<-taxa_pred.df[,c("AmpliconSequenceVariant")]

#get taxonomy assignments and confidence
taxa_predictions_species<-read.table(text=as.character(taxa_pred.df$Taxa),sep=",")
colnames(taxa_predictions_species)<-c("Domain","Phylum","Class","Order","Family","Genus","Species")

#add Scientific_Name colunmn
taxa_predictions_species$Scientific_Name<-paste(gsub("[^[:alpha:] ]","",gsub("g:","", taxa_predictions_species$Genus)),gsub("[^[:alpha:] ]","", gsub("s:","",taxa_predictions_species$Species)), sep="_")

#capitalize Genera
taxa_predictions_species$Scientific_Name<-stringr::str_to_sentence(taxa_predictions_species$Scientific_Name)

#list of column names
colList<-names(taxa_predictions_species)

#extract taxa and confidence and create data.frame
df.out<-list()
for(i in 1:length(colList)){
    TaxaName<-colList[i]
    newCol<-taxa_predictions_species[names(taxa_predictions_species)==colList[i]]
    newCol[,1]<-as.character(newCol[,1])
    if(colList[i]=="Scientific_Name"){
      firstChar=""
    }else{
      firstChar<-unique(substr(newCol[,1], start=1, stop=2))
    }
    dataNew_1<-gsub(firstChar, "",newCol[,1])
    conf<-read.table(text=dataNew_1,sep="(")
    if(length(names(conf))<2){
      conf$Confidence<-NA
      colnames(conf)<-c("Taxa","Confidence")
    }else{
      colnames(conf)<-c("Taxa","Confidence")
    }
    conf$Confidence<-gsub(")","",conf$Confidence)
    
    #add Taxonomic name
    conf$TaxaName<-TaxaName
    #add group
    #conf$Group<-paste("Group_",seq(1:length(row.names(conf))),sep="")
    
    df.out<-rbind(df.out, conf)
   
  
}

df.out.1<-data.frame(Category=df.out$TaxaName, TaxaName=df.out$Taxa, Confidence=df.out$Confidence)



Function to get split out data by taxanomic category and confidence level

#head(df.out)

# dataIn=df.out.1
# CategoryIn="Family"
# ConfidenceLevel = 0.90

splitTaxaData<-function(dataIn, CategoryIn, ConfidenceLevel){
  
  new.df<-dataIn
  
  new.df$Confidence<-as.numeric(as.character(new.df$Confidence))
  
  #now add conf from Species to Scientific_Name
  spp.conf<-subset(new.df, Category=="Species")[,c("Confidence")]
  
  #unique(new.df$Category)
  sciName.df<-subset(new.df, Category=="Scientific_Name")
  sciName.df$Confidence<-spp.conf
  
  #now add back into df.out
  df.sub<-subset(new.df, Category !="Scientific_Name")
  
  df.out.2<-rbind(df.sub, sciName.df)
  
  #now merge with sample.group.df
  merge.df<-cbind(df.out.2, ASV)
  
  #convert confidence to numeric
  merge.df$Confidence<-as.numeric(merge.df$Confidence)
  
  #remove taxa assignment below a given threshold
  conf.df<-subset(merge.df, c(Category==CategoryIn & Confidence >= ConfidenceLevel))
  
  #make Category a character
  #conf.df$Category<-as.character(conf.df$Category)
  
  #subset by Category
  #conf.df.out<-subset(conf.df, Category==CategoryIn)
  
return(conf.df)
}

#test function splitTaxaData
split.data<-splitTaxaData(dataIn=df.out.1, CategoryIn = "Genus", ConfidenceLevel=0.9)

head(split.data)
##       Category       TaxaName Confidence   ASV
## 15456    Genus       massilia       0.99 Asv14
## 15457    Genus    spiroplasma       1.00  Asv1
## 15458    Genus    spiroplasma       1.00  Asv6
## 15460    Genus     pedobacter       1.00 Asv17
## 15462    Genus friedmanniella       1.00 Asv13
## 15467    Genus        pantoea       0.94 Asv10



Function to add sampling location information to data

#now read in table of total counts per sample

addSamplingInfo<-function(dataIn){
  
  new.df<-dataIn
  
  sampleTable.df<-read.table(
  "/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/zach_16s_pipeline-master/pipeline_output/asv_seqs.count_table.txt", header=FALSE
  )
  
  sampleTable.df<-read.table(
    "/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/zach_16s_pipeline-master/pipeline_output/asv_seqs.count_table.txt",col.names=c("ASV","ZL_1",   "ZL_10",    "ZL_11",    "ZL_12",    "ZL_13",    "ZL_14",    "ZL_15",    "ZL_16",    "ZL_17",    "ZL_18",    "ZL_19",    "ZL_2", "ZL_20",    "ZL_21",    "ZL_3", "ZL_4", "ZL_5", "ZL_6", "ZL_7", "ZL_8", "ZL_9"))
  
  #melt sampleTable.df
  sample.melt<-reshape2::melt(sampleTable.df, id.vars="ASV")
  
  #now merge taxa.conf.100 with sample.melt
  
  conf.merge<-unique(merge(new.df, sample.melt, by="ASV",all.x=TRUE))
  
  #add treatment and locaiton info
  #read in sample location data
  sampleInfo<-data.frame(read_excel(path="/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/Data/Ladin_eDNA_rain_samples_DataSheet.xlsx", sheet = 1, range = NULL, col_names = TRUE,col_types = NULL, na = "", trim_ws = TRUE, skip = 0, n_max=26,progress = readxl_progress(), .name_repair = "unique"))
  
  #change "Sample" to "variable column names
  names(sampleInfo)[names(sampleInfo)=="Sample"]<-"variable"
  
  #merge sampleInfo with taxa.conf.100.merge
  taxa.merge<-merge(conf.merge, sampleInfo, by="variable",all.x=TRUE)
  
  #unique(taxa.merge$Category)
  ######################################################################################################
  #try plotting
  
  #set order of factor levels
  taxa.merge$Category<-factor(taxa.merge$Category, levels=c("Domain","Phylum","Class","Order","Family", "Genus","Species","Scientific_Name"))
  
  #remove all NAs from Location_type
  taxa.merge.2<-taxa.merge[!is.na(taxa.merge$Location_type),]
  
  taxa.merge.2$Location_type<-factor(taxa.merge.2$Location_type, levels=c("Control_DI_H20", "Forest_DI_H20","Control_Rain","Forest_Rain"))
  
  #remove all with value=0
  taxaPresent<-subset(taxa.merge.2, value!=0)
  
  return(taxaPresent)
}

#test function
taxaPresent.df<-addSamplingInfo(dataIn=split.data)

head(taxaPresent.df)
##     variable     ASV Category      TaxaName Confidence value       Date
## 7       ZL_1 Asv3001    Genus lactobacillus       1.00     1 2018-08-10
## 95      ZL_1 Asv3473    Genus     ralstonia       0.99     9 2018-08-10
## 147     ZL_1 Asv3257    Genus   tepidimonas       0.95     1 2018-08-10
## 153     ZL_1  Asv672    Genus acinetobacter       1.00     1 2018-08-10
## 154     ZL_1 Asv1155    Genus acinetobacter       1.00     9 2018-08-10
## 178     ZL_1  Asv340    Genus amnibacterium       0.97     1 2018-08-10
##                    Time Sample_ID Sample_type  Location_type Location  Latitude
## 7   1899-12-31 18:01:00 Control-1      DI_H20 Control_DI_H20  N of EW -75.74594
## 95  1899-12-31 18:01:00 Control-1      DI_H20 Control_DI_H20  N of EW -75.74594
## 147 1899-12-31 18:01:00 Control-1      DI_H20 Control_DI_H20  N of EW -75.74594
## 153 1899-12-31 18:01:00 Control-1      DI_H20 Control_DI_H20  N of EW -75.74594
## 154 1899-12-31 18:01:00 Control-1      DI_H20 Control_DI_H20  N of EW -75.74594
## 178 1899-12-31 18:01:00 Control-1      DI_H20 Control_DI_H20  N of EW -75.74594
##     Longitude Sample_collected Sample_Vol_mL DNA_Conc_ng_per_uL Comments
## 7    39.66495              Yes            30            < 0.050     <NA>
## 95   39.66495              Yes            30            < 0.050     <NA>
## 147  39.66495              Yes            30            < 0.050     <NA>
## 153  39.66495              Yes            30            < 0.050     <NA>
## 154  39.66495              Yes            30            < 0.050     <NA>
## 178  39.66495              Yes            30            < 0.050     <NA>



Function to get unique list of microbial consortia (subtract species in DIH20 from rain samples)

getTaxaRain<-function(dataIn){
  
  new.df<-dataIn
  
  #get list of spp found in DI H20, and remove for rain samples
  remove.df<-subset(new.df,c(Location_type=="Control_DI_H20" | Location_type=="Forest_DI_H20"))
  
  #get list of taxa to remove
  removeList<-sort(unique(as.character(remove.df$TaxaName)))
  
  #create dataframe with taxa to keep (from rain water samples)
  keep.df<-subset(new.df,c(Location_type=="Control_Rain"  | Location_type=="Forest_Rain"))
  
  # spp.keep.df<-subset(spp.controls, ! Taxa %in% speciesRemoveList)
  keepList<-sort(unique(as.character(keep.df$TaxaName)))
  
  #get unique species within keep list
  diffList<-setdiff(keepList,removeList)
  
  #species.plot.data
  keep.data.out<-subset(keep.df, TaxaName %in% diffList)
  #make Sample_ID a factor
  keep.data.out$Sample_ID<-as.factor(keep.data.out$Sample_ID)

  return(keep.data.out)
  
}

#test function
test.df<-getTaxaRain(dataIn=taxaPresent.df)
head(test.df)
##      variable     ASV Category      TaxaName Confidence value       Date
## 1588    ZL_10 Asv2577    Genus    variovorax       0.93     1 2018-08-11
## 1695    ZL_10 Asv3187    Genus   dyadobacter       0.99     6 2018-08-11
## 1716    ZL_10 Asv1057    Genus   dyadobacter       1.00     3 2018-08-11
## 2702    ZL_11 Asv1618    Genus pigmentiphaga       0.95     3 2018-08-11
## 2717    ZL_11 Asv2732    Genus      kluyvera       0.95    20 2018-08-11
## 2934    ZL_11 Asv2776    Genus  chthonomonas       0.96     2 2018-08-11
##                     Time  Sample_ID Sample_type Location_type Location
## 1588 1899-12-31 07:10:00 Control-1B        Rain  Control_Rain  N of EW
## 1695 1899-12-31 07:10:00 Control-1B        Rain  Control_Rain  N of EW
## 1716 1899-12-31 07:10:00 Control-1B        Rain  Control_Rain  N of EW
## 2702 1899-12-31 07:10:00 Control-2A        Rain  Control_Rain  N of EW
## 2717 1899-12-31 07:10:00 Control-2A        Rain  Control_Rain  N of EW
## 2934 1899-12-31 07:10:00 Control-2A        Rain  Control_Rain  N of EW
##       Latitude Longitude Sample_collected Sample_Vol_mL DNA_Conc_ng_per_uL
## 1588 -75.74594  39.66495              Yes          32.5            < 0.050
## 1695 -75.74594  39.66495              Yes          32.5            < 0.050
## 1716 -75.74594  39.66495              Yes          32.5            < 0.050
## 2702 -75.74582  39.66496              Yes             1            < 0.050
## 2717 -75.74582  39.66496              Yes             1            < 0.050
## 2934 -75.74582  39.66496              Yes             1            < 0.050
##          Comments
## 1588         <NA>
## 1695         <NA>
## 1716         <NA>
## 2702 small sample
## 2717 small sample
## 2934 small sample



Function that combines previous functions to get data ready to visualize

#get PlotData function
getPlotData<-function(dataIn,CategoryIn, ConfidenceLevel){
  new.df<-dataIn
  newCategory<-CategoryIn
  newConfidence<-ConfidenceLevel
  my.df.1<-splitTaxaData(dataIn=new.df, CategoryIn = newCategory ,ConfidenceLevel = newConfidence)
  my.df.2<-addSamplingInfo(dataIn=my.df.1)
  my.df.3<-getTaxaRain(dataIn=my.df.2)
  
  return(my.df.3)
}

#test function
new.plot.data<-getPlotData(dataIn=df.out.1, CategoryIn = "Order", ConfidenceLevel = 0.6)
head(new.plot.data)
##       variable     ASV Category          TaxaName Confidence value       Date
## 6427     ZL_11  Asv790    Order acholeplasmatales       1.00   124 2018-08-11
## 6603     ZL_11 Asv3124    Order      prochlorales       0.62     9 2018-08-11
## 7532     ZL_11 Asv1631    Order  gemmatimonadales       0.79     1 2018-08-11
## 9905     ZL_12 Asv3124    Order      prochlorales       0.62     1 2018-08-11
## 11645    ZL_13 Asv3923    Order acholeplasmatales       0.86     1 2018-08-11
## 12493    ZL_13 Asv1631    Order  gemmatimonadales       0.79     9 2018-08-11
##                      Time  Sample_ID Sample_type Location_type Location
## 6427  1899-12-31 07:10:00 Control-2A        Rain  Control_Rain  N of EW
## 6603  1899-12-31 07:10:00 Control-2A        Rain  Control_Rain  N of EW
## 7532  1899-12-31 07:10:00 Control-2A        Rain  Control_Rain  N of EW
## 9905  1899-12-31 07:45:00      C8-1A        Rain   Forest_Rain    EW_C8
## 11645 1899-12-31 07:45:00      C8-1B        Rain   Forest_Rain    EW_C8
## 12493 1899-12-31 07:45:00      C8-1B        Rain   Forest_Rain    EW_C8
##        Latitude Longitude Sample_collected Sample_Vol_mL  DNA_Conc_ng_per_uL
## 6427  -75.74582  39.66496              Yes             1             < 0.050
## 6603  -75.74582  39.66496              Yes             1             < 0.050
## 7532  -75.74582  39.66496              Yes             1             < 0.050
## 9905  -75.74500  39.66152              Yes             9 0.55000000000000004
## 11645 -75.74500  39.66152              Yes            30 0.55900000000000005
## 12493 -75.74500  39.66152              Yes            30 0.55900000000000005
##           Comments
## 6427  small sample
## 6603  small sample
## 7532  small sample
## 9905          <NA>
## 11645         <NA>
## 12493         <NA>



Visualize data among different taxanomic categories and confidence thresholds (here, Confidence >= 0.90)

#use function to get taxanomic assignment plots
categoryList<-unique(as.character(df.out.1$Category))

removeCategories<-c("Domain","Phylum","Class","Species")

#remove Domain and Phylum, since there is only overlap between DI H20 and rain-based reads
categoryList<-categoryList[! categoryList %in% removeCategories]

#loop through categoryList and plot
for(i in 1:length(categoryList)){
  
  newCategory<-categoryList[i]
  
  my.data<-getPlotData(dataIn=df.out.1,CategoryIn = newCategory,ConfidenceLevel = 0.9)
  
  #generate random colors for palette according to number of unique taxa
  n=length(unique(my.data$TaxaName))
  myColors<- randomcoloR::distinctColorPalette(n)

  #plot species unique to rain event in experiment
  speciesPlot<-ggplot(data=my.data, aes(x=variable, y=value))+
    geom_bar(stat="identity",aes(fill=TaxaName), position="fill")+
    theme(legend.position="bottom")+
    labs(x="Sample",y="Relative Proportion")+
    scale_fill_manual(values=myColors)+
    theme(axis.text.x=element_text(size=7))+
    ggtitle(paste(newCategory, " - plot of sintax_taxanomic assignments."))
  #speciesPlot
  
  speciesPlotFacet<-speciesPlot+facet_grid(~Location_type, drop=TRUE,scales="free_x",space="free_x")
  
  print(speciesPlotFacet)

  #save each plot
  #ggsave

  }



Plot taxa only occuring in Forest (i.e., subtract Control_rain taxa from Forest_rain)

#get new Species-level data
spp.data<-getPlotData(dataIn=df.out.1, CategoryIn = "Scientific_Name",ConfidenceLevel = 0.9)

control.df<-subset(spp.data, Location_type=="Control_Rain")
controlList<-sort(unique(as.character(control.df$TaxaName)))

forest.df<-subset(spp.data, Location_type=="Forest_Rain")
forestList<-sort(unique(as.character(forest.df$TaxaName)))

forestUnique<-setdiff(forestList, controlList)

forest.data<-subset(forest.df, TaxaName %in% forestUnique)

  #plot species unique to rain event in experiment
  forestPlot<-ggplot(data=forest.data, aes(x=variable, y=value))+
    geom_bar(stat="identity",aes(fill=TaxaName), position="fill")+
    #theme(legend.position="none")+
    labs(x="Sample",y="Relative Proportion")+
    scale_fill_manual(values=myColors)+
    ggtitle(paste(newCategory, " - assignments unique to forest."))
  #scale_fill_manual(values = RColorBrewer::brewer.pal(12, "Set3"))
  #speciesPlot
  
  # forestPlotFacet<-speciesPlot+facet_wrap(.~Location_type)
  
  print(forestPlot)

  #save each plot
  #ggsave