Forest eDNA experiment

Getting things set up in R

Load packages and set working directory in R.

#clear environment
rm(list=ls())

#install phyloseq if needed
# source("https://raw.githubusercontent.com/joey711/phyloseq/master/inst/scripts/installer.R",
#        local = TRUE)
# 
# install_phyloseq(branch = "github")

#load packages
library(phyloseq)
library(ggplot2)
library(readxl)

#set working directory
setwd("/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/Scripts/for_zach")

#set global options
knitr::opts_chunk$set(echo = TRUE,fig.width=11)

Read in data: sintax_taxonomy assignment predictions

#read in sintax_taxonomy predictions
taxa_predictions_1<-read.table(file="/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/zach_16s_pipeline-master/pipeline_output/sintax_taxonomy_predictions.txt",sep="\t",fill=TRUE)

taxa_predictions<-taxa_predictions_1[,-3:-5]
colnames(taxa_predictions)<-c("AmpliconSequenceVariant","Taxa")

taxa_predictions<-taxa_predictions[ taxa_predictions$Taxa != "",]

head(taxa_predictions)

##    AmpliconSequenceVariant
## 7                    Asv14
## 10                    Asv1
## 11                    Asv6
## 12                    Asv2
## 13                   Asv17
## 14                    Asv5
##                                                                                                                                                                     Taxa
## 7                d:bacteria(1.00),p:proteobacteria(1.00),c:betaproteobacteria(1.00),o:burkholderiales(1.00),f:oxalobacteraceae(1.00),g:massilia(0.99),s:consociata(0.44)
## 10                      d:bacteria(1.00),p:tenericutes(1.00),c:mollicutes(1.00),o:entomoplasmatales(1.00),f:spiroplasmataceae(1.00),g:spiroplasma(1.00),s:ixodetis(1.00)
## 11                      d:bacteria(1.00),p:tenericutes(1.00),c:mollicutes(1.00),o:entomoplasmatales(1.00),f:spiroplasmataceae(1.00),g:spiroplasma(1.00),s:ixodetis(0.94)
## 12 d:bacteria(1.00),p:proteobacteria(1.00),c:alphaproteobacteria(1.00),o:rhodospirillales(1.00),f:acetobacteraceae(1.00),g:gluconobacter(0.27),s:kanchanaburiensis(0.24)
## 13         d:bacteria(1.00),p:bacteroidetes(1.00),c:sphingobacteriia(1.00),o:sphingobacteriales(1.00),f:sphingobacteriaceae(1.00),g:pedobacter(1.00),s:antarcticus(0.56)
## 14             d:bacteria(1.00),p:proteobacteria(1.00),c:alphaproteobacteria(1.00),o:rhizobiales(0.99),f:methylocystaceae(0.24),g:methylocystis(0.16),s:echinoides(0.15)

Format data

taxa_pred.df<-taxa_predictions

#create avs.df to merge later
ASV<-taxa_pred.df[,c("AmpliconSequenceVariant")]

#get taxonomy assignments and confidence
taxa_predictions_species<-read.table(text=as.character(taxa_pred.df$Taxa),sep=",")
colnames(taxa_predictions_species)<-c("Domain","Phylum","Class","Order","Family","Genus","Species")

#add Scientific_Name colunmn
taxa_predictions_species$Scientific_Name<-paste(gsub("[^[:alpha:] ]","",gsub("g:","", taxa_predictions_species$Genus)),gsub("[^[:alpha:] ]","", gsub("s:","",taxa_predictions_species$Species)), sep="_")

#capitalize Genera
taxa_predictions_species$Scientific_Name<-stringr::str_to_sentence(taxa_predictions_species$Scientific_Name)

#list of column names
colList<-names(taxa_predictions_species)

#extract taxa and confidence and create data.frame
df.out<-list()
for(i in 1:length(colList)){
    TaxaName<-colList[i]
    newCol<-taxa_predictions_species[names(taxa_predictions_species)==colList[i]]
    newCol[,1]<-as.character(newCol[,1])
    if(colList[i]=="Scientific_Name"){
      firstChar=""
    }else{
      firstChar<-unique(substr(newCol[,1], start=1, stop=2))
    }
    dataNew_1<-gsub(firstChar, "",newCol[,1])
    conf<-read.table(text=dataNew_1,sep="(")
    if(length(names(conf))<2){
      conf$Confidence<-NA
      colnames(conf)<-c("Taxa","Confidence")
    }else{
      colnames(conf)<-c("Taxa","Confidence")
    }
    conf$Confidence<-gsub(")","",conf$Confidence)
    
    #add Taxonomic name
    conf$TaxaName<-TaxaName
    #add group
    #conf$Group<-paste("Group_",seq(1:length(row.names(conf))),sep="")
    
    df.out<-rbind(df.out, conf)
   
  
}

df.out.1<-data.frame(Category=df.out$TaxaName, TaxaName=df.out$Taxa, Confidence=df.out$Confidence)

Function to get split out data by taxanomic category and confidence level

#head(df.out)

# dataIn=df.out.1
# CategoryIn="Family"
# ConfidenceLevel = 0.90

splitTaxaData<-function(dataIn, CategoryIn, ConfidenceLevel){
  
  new.df<-dataIn
  
  new.df$Confidence<-as.numeric(as.character(new.df$Confidence))
  
  #now add conf from Species to Scientific_Name
  spp.conf<-subset(new.df, Category=="Species")[,c("Confidence")]
  
  #unique(new.df$Category)
  sciName.df<-subset(new.df, Category=="Scientific_Name")
  sciName.df$Confidence<-spp.conf
  
  #now add back into df.out
  df.sub<-subset(new.df, Category !="Scientific_Name")
  
  df.out.2<-rbind(df.sub, sciName.df)
  
  #now merge with sample.group.df
  merge.df<-cbind(df.out.2, ASV)
  
  #convert confidence to numeric
  merge.df$Confidence<-as.numeric(merge.df$Confidence)
  
  #remove taxa assignment below a given threshold
  conf.df<-subset(merge.df, c(Category==CategoryIn & Confidence >= ConfidenceLevel))
  
  #make Category a character
  #conf.df$Category<-as.character(conf.df$Category)
  
  #subset by Category
  #conf.df.out<-subset(conf.df, Category==CategoryIn)
  
return(conf.df)
}

#test function splitTaxaData
split.data<-splitTaxaData(dataIn=df.out.1, CategoryIn = "Genus", ConfidenceLevel=0.9)

head(split.data)

##       Category       TaxaName Confidence   ASV
## 15456    Genus       massilia       0.99 Asv14
## 15457    Genus    spiroplasma       1.00  Asv1
## 15458    Genus    spiroplasma       1.00  Asv6
## 15460    Genus     pedobacter       1.00 Asv17
## 15462    Genus friedmanniella       1.00 Asv13
## 15467    Genus        pantoea       0.94 Asv10

Function to add sampling location information to data

#now read in table of total counts per sample

addSamplingInfo<-function(dataIn){
  
  new.df<-dataIn
  
  sampleTable.df<-read.table(
  "/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/zach_16s_pipeline-master/pipeline_output/asv_seqs.count_table.txt", header=FALSE
  )
  
  sampleTable.df<-read.table(
    "/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/zach_16s_pipeline-master/pipeline_output/asv_seqs.count_table.txt",col.names=c("ASV","ZL_1",   "ZL_10",    "ZL_11",    "ZL_12",    "ZL_13",    "ZL_14",    "ZL_15",    "ZL_16",    "ZL_17",    "ZL_18",    "ZL_19",    "ZL_2", "ZL_20",    "ZL_21",    "ZL_3", "ZL_4", "ZL_5", "ZL_6", "ZL_7", "ZL_8", "ZL_9"))
  
  #melt sampleTable.df
  sample.melt<-reshape2::melt(sampleTable.df, id.vars="ASV")
  
  #now merge taxa.conf.100 with sample.melt
  
  conf.merge<-unique(merge(new.df, sample.melt, by="ASV",all.x=TRUE))
  
  #add treatment and locaiton info
  #read in sample location data
  sampleInfo<-data.frame(read_excel(path="/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/Data/Ladin_eDNA_rain_samples_DataSheet.xlsx", sheet = 1, range = NULL, col_names = TRUE,col_types = NULL, na = "", trim_ws = TRUE, skip = 0, n_max=26,progress = readxl_progress(), .name_repair = "unique"))
  
  #change "Sample" to "variable column names
  names(sampleInfo)[names(sampleInfo)=="Sample"]<-"variable"
  
  #merge sampleInfo with taxa.conf.100.merge
  taxa.merge<-merge(conf.merge, sampleInfo, by="variable",all.x=TRUE)
  
  #unique(taxa.merge$Category)
  ######################################################################################################
  #try plotting
  
  #set order of factor levels
  taxa.merge$Category<-factor(taxa.merge$Category, levels=c("Domain","Phylum","Class","Order","Family", "Genus","Species","Scientific_Name"))
  
  #remove all NAs from Location_type
  taxa.merge.2<-taxa.merge[!is.na(taxa.merge$Location_type),]
  
  taxa.merge.2$Location_type<-factor(taxa.merge.2$Location_type, levels=c("Control_DI_H20", "Forest_DI_H20","Control_Rain","Forest_Rain"))
  
  #remove all with value=0
  taxaPresent<-subset(taxa.merge.2, value!=0)
  
  return(taxaPresent)
}

#test function
taxaPresent.df<-addSamplingInfo(dataIn=split.data)

head(taxaPresent.df)

##     variable     ASV Category      TaxaName Confidence value       Date
## 7       ZL_1 Asv3001    Genus lactobacillus       1.00     1 2018-08-10
## 95      ZL_1 Asv3473    Genus     ralstonia       0.99     9 2018-08-10
## 147     ZL_1 Asv3257    Genus   tepidimonas       0.95     1 2018-08-10
## 153     ZL_1  Asv672    Genus acinetobacter       1.00     1 2018-08-10
## 154     ZL_1 Asv1155    Genus acinetobacter       1.00     9 2018-08-10
## 178     ZL_1  Asv340    Genus amnibacterium       0.97     1 2018-08-10
##                    Time Sample_ID Sample_type  Location_type Location  Latitude
## 7   1899-12-31 18:01:00 Control-1      DI_H20 Control_DI_H20  N of EW -75.74594
## 95  1899-12-31 18:01:00 Control-1      DI_H20 Control_DI_H20  N of EW -75.74594
## 147 1899-12-31 18:01:00 Control-1      DI_H20 Control_DI_H20  N of EW -75.74594
## 153 1899-12-31 18:01:00 Control-1      DI_H20 Control_DI_H20  N of EW -75.74594
## 154 1899-12-31 18:01:00 Control-1      DI_H20 Control_DI_H20  N of EW -75.74594
## 178 1899-12-31 18:01:00 Control-1      DI_H20 Control_DI_H20  N of EW -75.74594
##     Longitude Sample_collected Sample_Vol_mL DNA_Conc_ng_per_uL Comments
## 7    39.66495              Yes            30            < 0.050     <NA>
## 95   39.66495              Yes            30            < 0.050     <NA>
## 147  39.66495              Yes            30            < 0.050     <NA>
## 153  39.66495              Yes            30            < 0.050     <NA>
## 154  39.66495              Yes            30            < 0.050     <NA>
## 178  39.66495              Yes            30            < 0.050     <NA>

Function to get unique list of microbial consortia (subtract species in DIH20 from rain samples)

getTaxaRain<-function(dataIn){
  
  new.df<-dataIn
  
  #get list of spp found in DI H20, and remove for rain samples
  remove.df<-subset(new.df,c(Location_type=="Control_DI_H20" | Location_type=="Forest_DI_H20"))
  
  #get list of taxa to remove
  removeList<-sort(unique(as.character(remove.df$TaxaName)))
  
  #create dataframe with taxa to keep (from rain water samples)
  keep.df<-subset(new.df,c(Location_type=="Control_Rain"  | Location_type=="Forest_Rain"))
  
  # spp.keep.df<-subset(spp.controls, ! Taxa %in% speciesRemoveList)
  keepList<-sort(unique(as.character(keep.df$TaxaName)))
  
  #get unique species within keep list
  diffList<-setdiff(keepList,removeList)
  
  #species.plot.data
  keep.data.out<-subset(keep.df, TaxaName %in% diffList)
  #make Sample_ID a factor
  keep.data.out$Sample_ID<-as.factor(keep.data.out$Sample_ID)

  return(keep.data.out)
  
}

#test function
test.df<-getTaxaRain(dataIn=taxaPresent.df)
head(test.df)

##      variable     ASV Category      TaxaName Confidence value       Date
## 1588    ZL_10 Asv2577    Genus    variovorax       0.93     1 2018-08-11
## 1695    ZL_10 Asv3187    Genus   dyadobacter       0.99     6 2018-08-11
## 1716    ZL_10 Asv1057    Genus   dyadobacter       1.00     3 2018-08-11
## 2702    ZL_11 Asv1618    Genus pigmentiphaga       0.95     3 2018-08-11
## 2717    ZL_11 Asv2732    Genus      kluyvera       0.95    20 2018-08-11
## 2934    ZL_11 Asv2776    Genus  chthonomonas       0.96     2 2018-08-11
##                     Time  Sample_ID Sample_type Location_type Location
## 1588 1899-12-31 07:10:00 Control-1B        Rain  Control_Rain  N of EW
## 1695 1899-12-31 07:10:00 Control-1B        Rain  Control_Rain  N of EW
## 1716 1899-12-31 07:10:00 Control-1B        Rain  Control_Rain  N of EW
## 2702 1899-12-31 07:10:00 Control-2A        Rain  Control_Rain  N of EW
## 2717 1899-12-31 07:10:00 Control-2A        Rain  Control_Rain  N of EW
## 2934 1899-12-31 07:10:00 Control-2A        Rain  Control_Rain  N of EW
##       Latitude Longitude Sample_collected Sample_Vol_mL DNA_Conc_ng_per_uL
## 1588 -75.74594  39.66495              Yes          32.5            < 0.050
## 1695 -75.74594  39.66495              Yes          32.5            < 0.050
## 1716 -75.74594  39.66495              Yes          32.5            < 0.050
## 2702 -75.74582  39.66496              Yes             1            < 0.050
## 2717 -75.74582  39.66496              Yes             1            < 0.050
## 2934 -75.74582  39.66496              Yes             1            < 0.050
##          Comments
## 1588         <NA>
## 1695         <NA>
## 1716         <NA>
## 2702 small sample
## 2717 small sample
## 2934 small sample

Function that combines previous functions to get data ready to visualize

#get PlotData function
getPlotData<-function(dataIn,CategoryIn, ConfidenceLevel){
  new.df<-dataIn
  newCategory<-CategoryIn
  newConfidence<-ConfidenceLevel
  my.df.1<-splitTaxaData(dataIn=new.df, CategoryIn = newCategory ,ConfidenceLevel = newConfidence)
  my.df.2<-addSamplingInfo(dataIn=my.df.1)
  my.df.3<-getTaxaRain(dataIn=my.df.2)
  
  return(my.df.3)
}

#test function
new.plot.data<-getPlotData(dataIn=df.out.1, CategoryIn = "Order", ConfidenceLevel = 0.6)
head(new.plot.data)

##       variable     ASV Category          TaxaName Confidence value       Date
## 6427     ZL_11  Asv790    Order acholeplasmatales       1.00   124 2018-08-11
## 6603     ZL_11 Asv3124    Order      prochlorales       0.62     9 2018-08-11
## 7532     ZL_11 Asv1631    Order  gemmatimonadales       0.79     1 2018-08-11
## 9905     ZL_12 Asv3124    Order      prochlorales       0.62     1 2018-08-11
## 11645    ZL_13 Asv3923    Order acholeplasmatales       0.86     1 2018-08-11
## 12493    ZL_13 Asv1631    Order  gemmatimonadales       0.79     9 2018-08-11
##                      Time  Sample_ID Sample_type Location_type Location
## 6427  1899-12-31 07:10:00 Control-2A        Rain  Control_Rain  N of EW
## 6603  1899-12-31 07:10:00 Control-2A        Rain  Control_Rain  N of EW
## 7532  1899-12-31 07:10:00 Control-2A        Rain  Control_Rain  N of EW
## 9905  1899-12-31 07:45:00      C8-1A        Rain   Forest_Rain    EW_C8
## 11645 1899-12-31 07:45:00      C8-1B        Rain   Forest_Rain    EW_C8
## 12493 1899-12-31 07:45:00      C8-1B        Rain   Forest_Rain    EW_C8
##        Latitude Longitude Sample_collected Sample_Vol_mL  DNA_Conc_ng_per_uL
## 6427  -75.74582  39.66496              Yes             1             < 0.050
## 6603  -75.74582  39.66496              Yes             1             < 0.050
## 7532  -75.74582  39.66496              Yes             1             < 0.050
## 9905  -75.74500  39.66152              Yes             9 0.55000000000000004
## 11645 -75.74500  39.66152              Yes            30 0.55900000000000005
## 12493 -75.74500  39.66152              Yes            30 0.55900000000000005
##           Comments
## 6427  small sample
## 6603  small sample
## 7532  small sample
## 9905          <NA>
## 11645         <NA>
## 12493         <NA>

Visualize data among different taxanomic categories and confidence thresholds (here, Confidence >= 0.90)

#use function to get taxanomic assignment plots
categoryList<-unique(as.character(df.out.1$Category))

removeCategories<-c("Domain","Phylum","Class","Species")

#remove Domain and Phylum, since there is only overlap between DI H20 and rain-based reads
categoryList<-categoryList[! categoryList %in% removeCategories]

#loop through categoryList and plot
for(i in 1:length(categoryList)){
  
  newCategory<-categoryList[i]
  
  my.data<-getPlotData(dataIn=df.out.1,CategoryIn = newCategory,ConfidenceLevel = 0.9)
  
  #generate random colors for palette according to number of unique taxa
  n=length(unique(my.data$TaxaName))
  myColors<- randomcoloR::distinctColorPalette(n)

  #plot species unique to rain event in experiment
  speciesPlot<-ggplot(data=my.data, aes(x=variable, y=value))+
    geom_bar(stat="identity",aes(fill=TaxaName), position="fill")+
    theme(legend.position="bottom")+
    labs(x="Sample",y="Relative Proportion")+
    scale_fill_manual(values=myColors)+
    theme(axis.text.x=element_text(size=7))+
    ggtitle(paste(newCategory, " - plot of sintax_taxanomic assignments."))
  #speciesPlot
  
  speciesPlotFacet<-speciesPlot+facet_grid(~Location_type, drop=TRUE,scales="free_x",space="free_x")
  
  print(speciesPlotFacet)

  #save each plot
  #ggsave

  }

Plot taxa only occuring in Forest (i.e., subtract Control_rain taxa from Forest_rain)

#get new Species-level data
spp.data<-getPlotData(dataIn=df.out.1, CategoryIn = "Scientific_Name",ConfidenceLevel = 0.9)

control.df<-subset(spp.data, Location_type=="Control_Rain")
controlList<-sort(unique(as.character(control.df$TaxaName)))

forest.df<-subset(spp.data, Location_type=="Forest_Rain")
forestList<-sort(unique(as.character(forest.df$TaxaName)))

forestUnique<-setdiff(forestList, controlList)

forest.data<-subset(forest.df, TaxaName %in% forestUnique)

  #plot species unique to rain event in experiment
  forestPlot<-ggplot(data=forest.data, aes(x=variable, y=value))+
    geom_bar(stat="identity",aes(fill=TaxaName), position="fill")+
    #theme(legend.position="none")+
    labs(x="Sample",y="Relative Proportion")+
    scale_fill_manual(values=myColors)+
    ggtitle(paste(newCategory, " - assignments unique to forest."))
  #scale_fill_manual(values = RColorBrewer::brewer.pal(12, "Set3"))
  #speciesPlot
  
  # forestPlotFacet<-speciesPlot+facet_wrap(.~Location_type)
  
  print(forestPlot)

  #save each plot
  #ggsave

Forest eDNA experiment - UDel

Sampling forest microbial consortia using eDNA in throughfall.

Zachary Ladin (zach@udel.edu)

04 February, 2020

Getting things set up in R

Load packages and set working directory in R.

Read in data: sintax_taxonomy assignment predictions

Format data

Function to get split out data by taxanomic category and confidence level

Function to add sampling location information to data

Function to get unique list of microbial consortia (subtract species in DIH20 from rain samples)

Function that combines previous functions to get data ready to visualize

Visualize data among different taxanomic categories and confidence thresholds (here, Confidence >= 0.90)

Plot taxa only occuring in Forest (i.e., subtract Control_rain taxa from Forest_rain)