#clear environment
rm(list=ls())
#install phyloseq if needed
# source("https://raw.githubusercontent.com/joey711/phyloseq/master/inst/scripts/installer.R",
# local = TRUE)
#
# install_phyloseq(branch = "github")
#load packages
library(phyloseq)
library(ggplot2)
library(readxl)
#set working directory
setwd("/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/Scripts/for_zach")
#set global options
knitr::opts_chunk$set(echo = TRUE,fig.width=11)
#read in sintax_taxonomy predictions
taxa_predictions_1<-read.table(file="/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/zach_16s_pipeline-master/pipeline_output/sintax_taxonomy_predictions.txt",sep="\t",fill=TRUE)
taxa_predictions<-taxa_predictions_1[,-3:-5]
colnames(taxa_predictions)<-c("AmpliconSequenceVariant","Taxa")
taxa_predictions<-taxa_predictions[ taxa_predictions$Taxa != "",]
head(taxa_predictions)
## AmpliconSequenceVariant
## 7 Asv14
## 10 Asv1
## 11 Asv6
## 12 Asv2
## 13 Asv17
## 14 Asv5
## Taxa
## 7 d:bacteria(1.00),p:proteobacteria(1.00),c:betaproteobacteria(1.00),o:burkholderiales(1.00),f:oxalobacteraceae(1.00),g:massilia(0.99),s:consociata(0.44)
## 10 d:bacteria(1.00),p:tenericutes(1.00),c:mollicutes(1.00),o:entomoplasmatales(1.00),f:spiroplasmataceae(1.00),g:spiroplasma(1.00),s:ixodetis(1.00)
## 11 d:bacteria(1.00),p:tenericutes(1.00),c:mollicutes(1.00),o:entomoplasmatales(1.00),f:spiroplasmataceae(1.00),g:spiroplasma(1.00),s:ixodetis(0.94)
## 12 d:bacteria(1.00),p:proteobacteria(1.00),c:alphaproteobacteria(1.00),o:rhodospirillales(1.00),f:acetobacteraceae(1.00),g:gluconobacter(0.27),s:kanchanaburiensis(0.24)
## 13 d:bacteria(1.00),p:bacteroidetes(1.00),c:sphingobacteriia(1.00),o:sphingobacteriales(1.00),f:sphingobacteriaceae(1.00),g:pedobacter(1.00),s:antarcticus(0.56)
## 14 d:bacteria(1.00),p:proteobacteria(1.00),c:alphaproteobacteria(1.00),o:rhizobiales(0.99),f:methylocystaceae(0.24),g:methylocystis(0.16),s:echinoides(0.15)
taxa_pred.df<-taxa_predictions
#create avs.df to merge later
ASV<-taxa_pred.df[,c("AmpliconSequenceVariant")]
#get taxonomy assignments and confidence
taxa_predictions_species<-read.table(text=as.character(taxa_pred.df$Taxa),sep=",")
colnames(taxa_predictions_species)<-c("Domain","Phylum","Class","Order","Family","Genus","Species")
#add Scientific_Name colunmn
taxa_predictions_species$Scientific_Name<-paste(gsub("[^[:alpha:] ]","",gsub("g:","", taxa_predictions_species$Genus)),gsub("[^[:alpha:] ]","", gsub("s:","",taxa_predictions_species$Species)), sep="_")
#capitalize Genera
taxa_predictions_species$Scientific_Name<-stringr::str_to_sentence(taxa_predictions_species$Scientific_Name)
#list of column names
colList<-names(taxa_predictions_species)
#extract taxa and confidence and create data.frame
df.out<-list()
for(i in 1:length(colList)){
TaxaName<-colList[i]
newCol<-taxa_predictions_species[names(taxa_predictions_species)==colList[i]]
newCol[,1]<-as.character(newCol[,1])
if(colList[i]=="Scientific_Name"){
firstChar=""
}else{
firstChar<-unique(substr(newCol[,1], start=1, stop=2))
}
dataNew_1<-gsub(firstChar, "",newCol[,1])
conf<-read.table(text=dataNew_1,sep="(")
if(length(names(conf))<2){
conf$Confidence<-NA
colnames(conf)<-c("Taxa","Confidence")
}else{
colnames(conf)<-c("Taxa","Confidence")
}
conf$Confidence<-gsub(")","",conf$Confidence)
#add Taxonomic name
conf$TaxaName<-TaxaName
#add group
#conf$Group<-paste("Group_",seq(1:length(row.names(conf))),sep="")
df.out<-rbind(df.out, conf)
}
df.out.1<-data.frame(Category=df.out$TaxaName, TaxaName=df.out$Taxa, Confidence=df.out$Confidence)
#head(df.out)
# dataIn=df.out.1
# CategoryIn="Family"
# ConfidenceLevel = 0.90
splitTaxaData<-function(dataIn, CategoryIn, ConfidenceLevel){
new.df<-dataIn
new.df$Confidence<-as.numeric(as.character(new.df$Confidence))
#now add conf from Species to Scientific_Name
spp.conf<-subset(new.df, Category=="Species")[,c("Confidence")]
#unique(new.df$Category)
sciName.df<-subset(new.df, Category=="Scientific_Name")
sciName.df$Confidence<-spp.conf
#now add back into df.out
df.sub<-subset(new.df, Category !="Scientific_Name")
df.out.2<-rbind(df.sub, sciName.df)
#now merge with sample.group.df
merge.df<-cbind(df.out.2, ASV)
#convert confidence to numeric
merge.df$Confidence<-as.numeric(merge.df$Confidence)
#remove taxa assignment below a given threshold
conf.df<-subset(merge.df, c(Category==CategoryIn & Confidence >= ConfidenceLevel))
#make Category a character
#conf.df$Category<-as.character(conf.df$Category)
#subset by Category
#conf.df.out<-subset(conf.df, Category==CategoryIn)
return(conf.df)
}
#test function splitTaxaData
split.data<-splitTaxaData(dataIn=df.out.1, CategoryIn = "Genus", ConfidenceLevel=0.9)
head(split.data)
## Category TaxaName Confidence ASV
## 15456 Genus massilia 0.99 Asv14
## 15457 Genus spiroplasma 1.00 Asv1
## 15458 Genus spiroplasma 1.00 Asv6
## 15460 Genus pedobacter 1.00 Asv17
## 15462 Genus friedmanniella 1.00 Asv13
## 15467 Genus pantoea 0.94 Asv10
#now read in table of total counts per sample
addSamplingInfo<-function(dataIn){
new.df<-dataIn
sampleTable.df<-read.table(
"/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/zach_16s_pipeline-master/pipeline_output/asv_seqs.count_table.txt", header=FALSE
)
sampleTable.df<-read.table(
"/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/zach_16s_pipeline-master/pipeline_output/asv_seqs.count_table.txt",col.names=c("ASV","ZL_1", "ZL_10", "ZL_11", "ZL_12", "ZL_13", "ZL_14", "ZL_15", "ZL_16", "ZL_17", "ZL_18", "ZL_19", "ZL_2", "ZL_20", "ZL_21", "ZL_3", "ZL_4", "ZL_5", "ZL_6", "ZL_7", "ZL_8", "ZL_9"))
#melt sampleTable.df
sample.melt<-reshape2::melt(sampleTable.df, id.vars="ASV")
#now merge taxa.conf.100 with sample.melt
conf.merge<-unique(merge(new.df, sample.melt, by="ASV",all.x=TRUE))
#add treatment and locaiton info
#read in sample location data
sampleInfo<-data.frame(read_excel(path="/Users/zach/Dropbox (ZachTeam)/Projects/eDNA_Brazil/Pilot_study/Data/Ladin_eDNA_rain_samples_DataSheet.xlsx", sheet = 1, range = NULL, col_names = TRUE,col_types = NULL, na = "", trim_ws = TRUE, skip = 0, n_max=26,progress = readxl_progress(), .name_repair = "unique"))
#change "Sample" to "variable column names
names(sampleInfo)[names(sampleInfo)=="Sample"]<-"variable"
#merge sampleInfo with taxa.conf.100.merge
taxa.merge<-merge(conf.merge, sampleInfo, by="variable",all.x=TRUE)
#unique(taxa.merge$Category)
######################################################################################################
#try plotting
#set order of factor levels
taxa.merge$Category<-factor(taxa.merge$Category, levels=c("Domain","Phylum","Class","Order","Family", "Genus","Species","Scientific_Name"))
#remove all NAs from Location_type
taxa.merge.2<-taxa.merge[!is.na(taxa.merge$Location_type),]
taxa.merge.2$Location_type<-factor(taxa.merge.2$Location_type, levels=c("Control_DI_H20", "Forest_DI_H20","Control_Rain","Forest_Rain"))
#remove all with value=0
taxaPresent<-subset(taxa.merge.2, value!=0)
return(taxaPresent)
}
#test function
taxaPresent.df<-addSamplingInfo(dataIn=split.data)
head(taxaPresent.df)
## variable ASV Category TaxaName Confidence value Date
## 7 ZL_1 Asv3001 Genus lactobacillus 1.00 1 2018-08-10
## 95 ZL_1 Asv3473 Genus ralstonia 0.99 9 2018-08-10
## 147 ZL_1 Asv3257 Genus tepidimonas 0.95 1 2018-08-10
## 153 ZL_1 Asv672 Genus acinetobacter 1.00 1 2018-08-10
## 154 ZL_1 Asv1155 Genus acinetobacter 1.00 9 2018-08-10
## 178 ZL_1 Asv340 Genus amnibacterium 0.97 1 2018-08-10
## Time Sample_ID Sample_type Location_type Location Latitude
## 7 1899-12-31 18:01:00 Control-1 DI_H20 Control_DI_H20 N of EW -75.74594
## 95 1899-12-31 18:01:00 Control-1 DI_H20 Control_DI_H20 N of EW -75.74594
## 147 1899-12-31 18:01:00 Control-1 DI_H20 Control_DI_H20 N of EW -75.74594
## 153 1899-12-31 18:01:00 Control-1 DI_H20 Control_DI_H20 N of EW -75.74594
## 154 1899-12-31 18:01:00 Control-1 DI_H20 Control_DI_H20 N of EW -75.74594
## 178 1899-12-31 18:01:00 Control-1 DI_H20 Control_DI_H20 N of EW -75.74594
## Longitude Sample_collected Sample_Vol_mL DNA_Conc_ng_per_uL Comments
## 7 39.66495 Yes 30 < 0.050 <NA>
## 95 39.66495 Yes 30 < 0.050 <NA>
## 147 39.66495 Yes 30 < 0.050 <NA>
## 153 39.66495 Yes 30 < 0.050 <NA>
## 154 39.66495 Yes 30 < 0.050 <NA>
## 178 39.66495 Yes 30 < 0.050 <NA>
getTaxaRain<-function(dataIn){
new.df<-dataIn
#get list of spp found in DI H20, and remove for rain samples
remove.df<-subset(new.df,c(Location_type=="Control_DI_H20" | Location_type=="Forest_DI_H20"))
#get list of taxa to remove
removeList<-sort(unique(as.character(remove.df$TaxaName)))
#create dataframe with taxa to keep (from rain water samples)
keep.df<-subset(new.df,c(Location_type=="Control_Rain" | Location_type=="Forest_Rain"))
# spp.keep.df<-subset(spp.controls, ! Taxa %in% speciesRemoveList)
keepList<-sort(unique(as.character(keep.df$TaxaName)))
#get unique species within keep list
diffList<-setdiff(keepList,removeList)
#species.plot.data
keep.data.out<-subset(keep.df, TaxaName %in% diffList)
#make Sample_ID a factor
keep.data.out$Sample_ID<-as.factor(keep.data.out$Sample_ID)
return(keep.data.out)
}
#test function
test.df<-getTaxaRain(dataIn=taxaPresent.df)
head(test.df)
## variable ASV Category TaxaName Confidence value Date
## 1588 ZL_10 Asv2577 Genus variovorax 0.93 1 2018-08-11
## 1695 ZL_10 Asv3187 Genus dyadobacter 0.99 6 2018-08-11
## 1716 ZL_10 Asv1057 Genus dyadobacter 1.00 3 2018-08-11
## 2702 ZL_11 Asv1618 Genus pigmentiphaga 0.95 3 2018-08-11
## 2717 ZL_11 Asv2732 Genus kluyvera 0.95 20 2018-08-11
## 2934 ZL_11 Asv2776 Genus chthonomonas 0.96 2 2018-08-11
## Time Sample_ID Sample_type Location_type Location
## 1588 1899-12-31 07:10:00 Control-1B Rain Control_Rain N of EW
## 1695 1899-12-31 07:10:00 Control-1B Rain Control_Rain N of EW
## 1716 1899-12-31 07:10:00 Control-1B Rain Control_Rain N of EW
## 2702 1899-12-31 07:10:00 Control-2A Rain Control_Rain N of EW
## 2717 1899-12-31 07:10:00 Control-2A Rain Control_Rain N of EW
## 2934 1899-12-31 07:10:00 Control-2A Rain Control_Rain N of EW
## Latitude Longitude Sample_collected Sample_Vol_mL DNA_Conc_ng_per_uL
## 1588 -75.74594 39.66495 Yes 32.5 < 0.050
## 1695 -75.74594 39.66495 Yes 32.5 < 0.050
## 1716 -75.74594 39.66495 Yes 32.5 < 0.050
## 2702 -75.74582 39.66496 Yes 1 < 0.050
## 2717 -75.74582 39.66496 Yes 1 < 0.050
## 2934 -75.74582 39.66496 Yes 1 < 0.050
## Comments
## 1588 <NA>
## 1695 <NA>
## 1716 <NA>
## 2702 small sample
## 2717 small sample
## 2934 small sample
#get PlotData function
getPlotData<-function(dataIn,CategoryIn, ConfidenceLevel){
new.df<-dataIn
newCategory<-CategoryIn
newConfidence<-ConfidenceLevel
my.df.1<-splitTaxaData(dataIn=new.df, CategoryIn = newCategory ,ConfidenceLevel = newConfidence)
my.df.2<-addSamplingInfo(dataIn=my.df.1)
my.df.3<-getTaxaRain(dataIn=my.df.2)
return(my.df.3)
}
#test function
new.plot.data<-getPlotData(dataIn=df.out.1, CategoryIn = "Order", ConfidenceLevel = 0.6)
head(new.plot.data)
## variable ASV Category TaxaName Confidence value Date
## 6427 ZL_11 Asv790 Order acholeplasmatales 1.00 124 2018-08-11
## 6603 ZL_11 Asv3124 Order prochlorales 0.62 9 2018-08-11
## 7532 ZL_11 Asv1631 Order gemmatimonadales 0.79 1 2018-08-11
## 9905 ZL_12 Asv3124 Order prochlorales 0.62 1 2018-08-11
## 11645 ZL_13 Asv3923 Order acholeplasmatales 0.86 1 2018-08-11
## 12493 ZL_13 Asv1631 Order gemmatimonadales 0.79 9 2018-08-11
## Time Sample_ID Sample_type Location_type Location
## 6427 1899-12-31 07:10:00 Control-2A Rain Control_Rain N of EW
## 6603 1899-12-31 07:10:00 Control-2A Rain Control_Rain N of EW
## 7532 1899-12-31 07:10:00 Control-2A Rain Control_Rain N of EW
## 9905 1899-12-31 07:45:00 C8-1A Rain Forest_Rain EW_C8
## 11645 1899-12-31 07:45:00 C8-1B Rain Forest_Rain EW_C8
## 12493 1899-12-31 07:45:00 C8-1B Rain Forest_Rain EW_C8
## Latitude Longitude Sample_collected Sample_Vol_mL DNA_Conc_ng_per_uL
## 6427 -75.74582 39.66496 Yes 1 < 0.050
## 6603 -75.74582 39.66496 Yes 1 < 0.050
## 7532 -75.74582 39.66496 Yes 1 < 0.050
## 9905 -75.74500 39.66152 Yes 9 0.55000000000000004
## 11645 -75.74500 39.66152 Yes 30 0.55900000000000005
## 12493 -75.74500 39.66152 Yes 30 0.55900000000000005
## Comments
## 6427 small sample
## 6603 small sample
## 7532 small sample
## 9905 <NA>
## 11645 <NA>
## 12493 <NA>
#use function to get taxanomic assignment plots
categoryList<-unique(as.character(df.out.1$Category))
removeCategories<-c("Domain","Phylum","Class","Species")
#remove Domain and Phylum, since there is only overlap between DI H20 and rain-based reads
categoryList<-categoryList[! categoryList %in% removeCategories]
#loop through categoryList and plot
for(i in 1:length(categoryList)){
newCategory<-categoryList[i]
my.data<-getPlotData(dataIn=df.out.1,CategoryIn = newCategory,ConfidenceLevel = 0.9)
#generate random colors for palette according to number of unique taxa
n=length(unique(my.data$TaxaName))
myColors<- randomcoloR::distinctColorPalette(n)
#plot species unique to rain event in experiment
speciesPlot<-ggplot(data=my.data, aes(x=variable, y=value))+
geom_bar(stat="identity",aes(fill=TaxaName), position="fill")+
theme(legend.position="bottom")+
labs(x="Sample",y="Relative Proportion")+
scale_fill_manual(values=myColors)+
theme(axis.text.x=element_text(size=7))+
ggtitle(paste(newCategory, " - plot of sintax_taxanomic assignments."))
#speciesPlot
speciesPlotFacet<-speciesPlot+facet_grid(~Location_type, drop=TRUE,scales="free_x",space="free_x")
print(speciesPlotFacet)
#save each plot
#ggsave
}
#get new Species-level data
spp.data<-getPlotData(dataIn=df.out.1, CategoryIn = "Scientific_Name",ConfidenceLevel = 0.9)
control.df<-subset(spp.data, Location_type=="Control_Rain")
controlList<-sort(unique(as.character(control.df$TaxaName)))
forest.df<-subset(spp.data, Location_type=="Forest_Rain")
forestList<-sort(unique(as.character(forest.df$TaxaName)))
forestUnique<-setdiff(forestList, controlList)
forest.data<-subset(forest.df, TaxaName %in% forestUnique)
#plot species unique to rain event in experiment
forestPlot<-ggplot(data=forest.data, aes(x=variable, y=value))+
geom_bar(stat="identity",aes(fill=TaxaName), position="fill")+
#theme(legend.position="none")+
labs(x="Sample",y="Relative Proportion")+
scale_fill_manual(values=myColors)+
ggtitle(paste(newCategory, " - assignments unique to forest."))
#scale_fill_manual(values = RColorBrewer::brewer.pal(12, "Set3"))
#speciesPlot
# forestPlotFacet<-speciesPlot+facet_wrap(.~Location_type)
print(forestPlot)
#save each plot
#ggsave