Import and filter data

Load Packages:

library(tidyverse); packageVersion("tidyverse") # 1.3.0
library(phyloseq); packageVersion("phyloseq") # 1.30.0
library(ggplot2); packageVersion("ggplot2") # 3.3.0
library(ampvis2); packageVersion("ampvis2") # 2.5.9
library(plyr); packageVersion("plyr") # 1.8.6
library(dplyr); packageVersion("dplyr") # 0.8.5
library(gridExtra); packageVersion("gridExtra") # 2.3
library(ggpubr); packageVersion("ggpubr") # 0.3.0
library(vegan); packageVersion("vegan") # 2.5.6
if(!require("devtools"))
  install.packages("devtools")
# source the phyloseq_to_ampvis2() function from the gist
devtools::source_gist("8d0ca4206a66be7ff6d76fc4ab8e66c6")

Import data

1. Import from Qiime2
biom = ("run9-14_unfiltered_feature_table_w_taxonomy_ASV.biom")
biom = import_biom(biom) 

metadata = import_qiime_sample_data("metadata_Biofilms_run9-14_v5.txt")
meta_csv <- read.csv("metadata_Biofilms_run9-14_v5.csv")

Rooted_Tree = ("tree.nwk")
Tree <- read_tree(Rooted_Tree)
2. Change rank names of the data
  • Data include the ASV level (making 8 ranks), but phyloseq and ampvis packages require 7 ranks. Family rank is removed.
colnames(tax_table(biom)) = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "ASV")
rank_names(biom)

tax_table(biom) <- tax_table(biom) [,-5] # Removes the "Family" column
rank_names(biom)
3. Merge to create the phyloseq object
myData = merge_phyloseq(biom,metadata,Tree)

# Order columns for later graphs
ordered <- c("0Sucr_0Muc", "0Sucr_2.5Muc", "0.1Sucr_0.1Muc", "0.1Sucr_2.5Muc", "0.1Sucr_8Muc", 
             "0.5Sucr_2.5Muc", "0.8Sucr_2.5Muc", "Inoculum")
sample_data(myData)$Sucrose_and_mucin <- 
  factor(sample_data(myData)$Sucrose_and_mucin, levels=ordered)

ordered <- c("0Sucr_2.5Muc_0FBS", "0Sucr_0Muc_10FBS", "0Sucr_2.5Muc_10FBS", "0Sucr_2.5Muc_20FBS",  
             "0.1Sucr_0.1Muc_0FBS", "0.1Sucr_2.5Muc_0FBS", "0.1Sucr_8.0Muc_0FBS", "0.1Sucr_2.5Muc_10FBS", 
             "0.1Sucr_0.1Muc_30FBS", "0.1Sucr_2.5Muc_30FBS",
             "0.1Sucr_2.5Muc_50FBS", 
             "0.5Sucr_2.5Muc_0FBS", "0.5Sucr_2.5Muc_10FBS", 
             "0.8Sucr_2.5Muc_0FBS", "0.8Sucr_2.5Muc_10FBS", 
             "Inoculum")
sample_data(myData)$Replicates_2 <- 
  factor(sample_data(myData)$Replicates_2, levels=ordered)

myData
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 869 taxa and 94 samples ]
## sample_data() Sample Data:       [ 94 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 869 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 869 tips and 864 internal nodes ]

Filter the data

1. Remove unassigned
myData <- subset_taxa(myData, Phylum != "Unassigned")
myData
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 764 taxa and 94 samples ]
## sample_data() Sample Data:       [ 94 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 764 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 764 tips and 759 internal nodes ]
2. Remove controls
controls <- c("media", "not_applicable") # names of controls in the column "sample_type"
myData <- subset_samples(myData, !(sample_type %in% controls))
myData
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 764 taxa and 63 samples ]
## sample_data() Sample Data:       [ 63 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 764 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 764 tips and 759 internal nodes ]
3. Remove contaminants
  • Common kit contaminants, non-oral contaminating organisms, and other organisms high controls samples were removed.
  • Two ASVs (of oral species S. oralis and V. dispar) were high in all samples, including controls, and were removed.
Contam_o <- c("o__Rhizobiales")
Contam_g <- c("g__Delftia","g__Yersinia", "g__Listeria", "g__Mogibacterium", "g__Acinetobacter", "g__Jeogalicoccus", "g__Pseudomonas", "g__Pseudoramibacter")
Contam_s <- c("s__platensis","s__subtilis", "s__paradoxus","s__coli","s__aeruginosa", "s__fluorescens")
Contam_a <- c("49ed5dfd41e37e23a4e525c5a21a8fa8","ab2bb49867509a1650d185f93c305875")
Contam_media <- c("s__atypica","s__moorei", "s__salivae","s__concisus")


myData <- subset_taxa(myData, !(Order %in% Contam_o))
myData <- subset_taxa(myData, !(Genus %in% Contam_g))
myData <- subset_taxa(myData, !(Species %in% Contam_s))
myData <- subset_taxa(myData, !(ASV %in% Contam_a))
myData <- subset_taxa(myData, !(ASV %in% Contam_media))
myData
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 675 taxa and 63 samples ]
## sample_data() Sample Data:       [ 63 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 675 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 675 tips and 670 internal nodes ]
4. Remove additional samples not used in the study
myData <- subset_samples(myData, media_type != "spent")
myData
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 675 taxa and 36 samples ]
## sample_data() Sample Data:       [ 36 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 675 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 675 tips and 670 internal nodes ]
5. Remove empty taxa
myData <- prune_taxa(taxa_sums(myData) > 0, myData)
myData 
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 301 taxa and 36 samples ]
## sample_data() Sample Data:       [ 36 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 301 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 301 tips and 299 internal nodes ]

Figure 1

1. Agglomerate to species level
myData_species <- tax_glom(myData, "Species")
myData_species <- prune_taxa(taxa_sums(myData_species) > 0, myData_species)
myData_species
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 153 taxa and 36 samples ]
## sample_data() Sample Data:       [ 36 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 153 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 153 tips and 152 internal nodes ]
2. Convert to an ampvis object
  • Ampvis requires rank names as “Kingdom”, “Phylum”, “Class”, “Order”, “Family”, “Genus”, “Species”; so the names must be changed
  • Merge the replicates for each condition resulting in a mean read count for each condition.
# Change rank names
myData_species_ampvis <- myData_species
colnames(tax_table(myData_species_ampvis)) = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
# rank_names(myData_species_ampvis) # Now 7 ranks ordered in the old way, but remember they are wrong

# Convert to ampvis
myData_species_ampvis <- phyloseq_to_ampvis2(myData_species_ampvis)

# Fix the SampleID column name 
colnames(myData_species_ampvis$metadata) # Need to change "X_SampleID" to just "SampleID"
##  [1] "X_SampleID"                    "BarcodeSequence"              
##  [3] "LinkerPrimerSequence"          "ReversePrimer"                
##  [5] "InputFileName"                 "description"                  
##  [7] "sample_type"                   "dna_kit"                      
##  [9] "sucrose_concentration_percent" "DS_Sucrose"                   
## [11] "AV_Sucrose"                    "Sucrose_and_mucin"            
## [13] "mucin_concentration_g"         "DS_Mucin"                     
## [15] "AV_Mucin"                      "fbs_concentration_percent"    
## [17] "DS_FBS"                        "AV_FBS"                       
## [19] "Closest"                       "Replicates"                   
## [21] "Replicates_2"                  "Replicates_3"                 
## [23] "initial_pH"                    "final_pH"                     
## [25] "OD"                            "media_type"                   
## [27] "study_name"                    "Sample_ID"                    
## [29] "Unique_names"                  "Type"                         
## [31] "Run"                           "DNA_Location"                 
## [33] "Inoculum"                      "DNA_conc"
colnames(myData_species_ampvis$metadata)[colnames(myData_species_ampvis$metadata) == 'X_SampleID'] <- 'SampleID'

# Merging in ampvis takes the mean of the counts for all the merged samples
myData_species_ampvis_mergedRep <- amp_mergereplicates(myData_species_ampvis, "Replicates_2")
myData_species_ampvis_mergedRep 
## ampvis2 object with 4 elements. 
## Summary of OTU table:
## Samples    OTUs 
##      16     153 
## (The read counts have been normalised)
## 
## Assigned taxonomy:
##   Kingdom    Phylum     Class     Order    Family     Genus   Species 
## 153(100%) 153(100%) 153(100%) 153(100%) 153(100%) 153(100%)     0(0%) 
## 
## Metadata variables: 34 
##  SampleID, BarcodeSequence, LinkerPrimerSequence, ReversePrimer, InputFileName, description, sample_type, dna_kit, sucrose_concentration_percent, DS_Sucrose, AV_Sucrose, Sucrose_and_mucin, mucin_concentration_g, DS_Mucin, AV_Mucin, fbs_concentration_percent, DS_FBS, AV_FBS, Closest, Replicates, Replicates_2, Replicates_3, initial_pH, final_pH, OD, media_type, study_name, Sample_ID, Unique_names, Type, Run, DNA_Location, Inoculum, DNA_conc
3. Export abundance and taxonomy tables from ampvis
  • Tables have the read counts averaged across replicates.
abund_table <- myData_species_ampvis_mergedRep$abund
tax_table <- myData_species_ampvis_mergedRep$tax

for_krona <- merge(abund_table, tax_table, by=0)

# write.csv(for_krona "For_Krona/data_unrarefied.csv")
4. Import to Krona
Figure 1: Krona pie charts

Figure 1: Krona pie charts

Figure 2

1. Sucrose Gradient

  • Change in DNA concentration, species count, and final pH between sucrose concentrations 0%, 0.1%, 0.5%, and 0.8%
  • FBS fixed at 10%; mucin fixed at 2.5g/L
a. Subset the data
myData_sucr <- subset_samples(myData, mucin_concentration_g == "2.5")
myData_sucr <- subset_samples(myData_sucr, fbs_concentration_percent == "10")
myData_sucr_species <- tax_glom(myData_sucr, "Species") # agglomerate to species level
myData_sucr_species <- prune_taxa(taxa_sums(myData_sucr_species) > 0, myData_sucr_species) # remove empty taxa after subsetting
myData_sucr_species
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 93 taxa and 10 samples ]
## sample_data() Sample Data:       [ 10 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 93 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 93 tips and 92 internal nodes ]
get_variable(myData_sucr_species, "Unique_names") # There are 10 samples with fixed  10% FBS and 2.5g/L mucin
##  [1] "14 C 10F 2.5M 0S -1"        "14 C 10F 2.5M 0S -2"       
##  [3] "14 C 10F 2.5M 0S -3"        "14 C 10FBS 2.5M 0.1S -1"   
##  [5] "14 C 10FBS 2.5M 0.1S -2"    "14 Trep C 10F 2.5M 0.5S -1"
##  [7] "14 Trep C 10F 2.5M 0.5S -2" "14 Trep C 10F 2.5M 0.8S -1"
##  [9] "14 Trep C 10F 2.5M 0.8S -2" "9 C 10F 2.5M 0.1S"
b. Boxplots of species numbers
sample_names_sucr <- as.character(unique(sample_data(myData_sucr_species)$Sample_ID)) # Extract sample names for the 10 samples
species_number_sucr <-rep(list(NA),10) # Make an empty list where the number of species will go
names(species_number_sucr) <- sample_names_sucr # Add sample names to each place on the list

# Loop to count number of species in each sample
for(k in 1:10){
  one_sample <- prune_samples(sample_data(myData_sucr_species)$Sample_ID %in% sample_names_sucr[k],myData_sucr_species) # Isolate one sample
  one_sample_pruned <- prune_taxa(taxa_sums(one_sample) > 0, one_sample) # Remove all species that are 0 (not present in the sample)
  species_number_sucr[[k]]<-nrow(otu_table(one_sample_pruned)) # write the number of different species for this sample onto the list
}
  
# Generate dataframe for boxplot
species_number_sucr_df <- ldply(species_number_sucr, cbind) # convert to a dataframe
colnames(species_number_sucr_df) <- c("X.SampleID", "species_count")

# Merge Dataframes
meta_species_sucr_df <- merge(meta_csv, species_number_sucr_df, by = "X.SampleID", all.x = F) # Adds metadata to the samples

# Generate boxplot with p-values
Species_Sucr_plot_t_test <- ggplot(meta_species_sucr_df, aes(x=as.factor(sucrose_concentration_percent), y=species_count)) +
  labs(title="Species count for each % sucrose", x="% Sucrose", y="Species count") +
  ylim(20,70) + # Change based on the max and min species values for both sucrose and fbs
  geom_boxplot() +
  stat_compare_means(label.y=c(20)) +
  theme_classic() +
  theme(plot.title = element_text(size=10))
c. Boxplots of DNA concentrations
# Make DNA concentration numeric 
meta_species_sucr_df$DNA_conc <- as.character(meta_species_sucr_df$DNA_conc) # Has to be changed to a character first
meta_species_sucr_df$DNA_conc <- as.numeric(meta_species_sucr_df$DNA_conc)

# Visualize p-values: Specify the comparisons
my_comparisons <- list(c("0", "0.5"), c("0", "0.8"), c("0.1", "0.8")) # only significant comparisions
symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns"))

# Generate boxplot, only show signifcant comparisions
DNA_Sucr_plot_t_test <- ggplot(meta_species_sucr_df, aes(x=as.factor(sucrose_concentration_percent), y=DNA_conc)) +
  labs(title="DNA conc. (ng/µL) for each % sucrose", x="% Sucrose", y="DNA concentration (ng/µL)") +
  ylim(0,250) +
  geom_boxplot() +
  stat_compare_means(comparisons = my_comparisons, method="t.test",
                     label.y = c(225, 245, 195), symnum.args = symnum.args) +
  stat_compare_means(label.y=c(0)) +
  theme_classic() +
  theme(plot.title = element_text(size=10))
d. Boxplots of final pH
# Make DNA concentration numeric 
meta_species_sucr_df$final_pH <- as.character(meta_species_sucr_df$final_pH) # Has to be changed to a character first
meta_species_sucr_df$final_pH <- as.numeric(meta_species_sucr_df$final_pH)

# Visualize p-values: Specify the comparisons
my_comparisons <- list( c("0", "0.5"), c("0", "0.8"), c("0.1", "0.5")) # only significant comparisions
symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns"))

# Generate boxplot, only show signifcant comparisions
ph_Sucr_plot_t_test <- ggplot(meta_species_sucr_df, aes(x=as.factor(sucrose_concentration_percent), y=final_pH)) +
  labs(title="Final pH for each % sucrose", x="% Sucrose", y="Final pH") +
  ylim(4,8.1) +
  stat_compare_means(comparisons = my_comparisons, method="t.test",
                     label.y = c(7.6, 8, 7.2), symnum.args = symnum.args) +
  stat_compare_means(label.y=c(4)) +
  geom_boxplot() +
  theme_classic() +
  theme(plot.title = element_text(size=10))

2. FBS Gradient

  • Change in DNA concentration, species count, and final pH between FBS concentrations 0%, 10%, 30%, and 50%
  • Sucrose fixed at 0.1%; mucin fixed at 2.5g/L
a. Subset the data
myData_fbs <- subset_samples(myData, mucin_concentration_g == "2.5")
myData_fbs <- subset_samples(myData_fbs, sucrose_concentration_percent == "0.1")
myData_fbs_species <- tax_glom(myData_fbs, "Species") # agglomerate to species level
myData_fbs_species <- prune_taxa(taxa_sums(myData_fbs_species) > 0, myData_fbs_species) # remove empty taxa after subsetting
myData_fbs_species
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 87 taxa and 10 samples ]
## sample_data() Sample Data:       [ 10 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 87 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 87 tips and 86 internal nodes ]
get_variable(myData_fbs_species, "Unique_names") # There are 10 samples with fixed  0.1% sucrose and 2.5g/L mucin
##  [1] "14 C 10FBS 2.5M 0.1S -1" "14 C 10FBS 2.5M 0.1S -2"
##  [3] "9 C 0F 2.5M 0.1Suc"      "9 C 0F 2.5M 0.1S"       
##  [5] "9 C control SHI -1"      "9 C 10F 2.5M 0.1S"      
##  [7] "9 C control SHI -2"      "9 C control SHI -3"     
##  [9] "9 C 30F 2.5M 0.1S"       "9 C 50F 2.5M 0.1S"
b. Boxplots of species numbers
sample_names_fbs <-as.character(unique(sample_data(myData_fbs_species)$Sample_ID)) # Extract sample names for the 10 samples
species_number_fbs <-rep(list(NA),10) # Make an empty list where the number of species will go
names(species_number_fbs) <- sample_names_fbs # Add sample names to each place on the list

# Loop to count number of species in each sample
for(k in 1:10){
  one_sample <- prune_samples(sample_data(myData_fbs_species)$Sample_ID %in% sample_names_fbs[k],myData_fbs_species) # Isolate one sample
  one_sample_pruned <- prune_taxa(taxa_sums(one_sample) > 0, one_sample) # Remove all species that are 0 (not present in the sample)
  species_number_fbs[[k]]<-nrow(otu_table(one_sample_pruned)) # write the number of different species for this sample onto the list
  }

# Generate dataframe for boxplot
species_number_fbs_df <- ldply(species_number_fbs, cbind) # convert to a dataframe
colnames(species_number_fbs_df) <- c("X.SampleID", "species_count")

# Merge Dataframes
meta_species_fbs_df <- merge(meta_csv, species_number_fbs_df, by = "X.SampleID", all.x = F) # Adds metadata to the samples

# Generate boxplot with p-values
Species_fbs_plot_t_test <- ggplot(meta_species_fbs_df, aes(x=as.factor(fbs_concentration_percent), y=species_count)) +
  labs(title="Species count for each % FBS", x="% FBS", y="Species count") +
  ylim(20,70) + # Change based on the max and min species values for both sucrose and fbs
  geom_boxplot() +
  stat_compare_means(label.y=c(20)) +
  theme_classic() +
  theme(plot.title = element_text(size=10))
c. Boxplots of DNA concentrations
# Make DNA concentration numeric 
meta_species_fbs_df$DNA_conc <- as.character(meta_species_fbs_df$DNA_conc) # Has to be changed to a character first
meta_species_fbs_df$DNA_conc <- as.numeric(meta_species_fbs_df$DNA_conc)

# Visualize p-values: Specify the comparisons
# There are no signficant comparisions

# Generate boxplot, only show signifcant comparisions
DNA_fbs_plot_t_test <- ggplot(meta_species_fbs_df, aes(x=as.factor(fbs_concentration_percent), y=DNA_conc)) +
  labs(title="DNA conc. (ng/µL) for each % FBS", x="% FBS", y="DNA concentration (ng/µL)") +
  ylim(0,250) +
  geom_boxplot() +
  stat_compare_means(label.y=c(0)) +
  theme_classic() +
  theme(plot.title = element_text(size=10))
d. Boxplots of final pH
# Make DNA concentration numeric 
meta_species_fbs_df$final_pH <- as.character(meta_species_fbs_df$final_pH) # Has to be changed to a character first
meta_species_fbs_df$final_pH <- as.numeric(meta_species_fbs_df$final_pH)

# Visualize p-values: Specify the comparisons
# There are no signficant comparisions

# Generate boxplot, only show signifcant comparisions
ph_fbs_plot_t_test <- ggplot(meta_species_fbs_df, aes(x=as.factor(fbs_concentration_percent), y=final_pH)) +
  # geom_dotplot(binaxis='y', stackdir='center', dotsize=0.5, position=position_dodge(1)) +
  labs(title="Final pH for each % FBS", x="% FBS", y="Final pH") +
  ylim(4,8.1) +
  geom_boxplot() +
  stat_compare_means(label.y=c(4)) +
  theme_classic() +
  theme(plot.title = element_text(size=10))

3. Mucin Gradient

  • Change in DNA concentration, species count, and final pH between mucin concentrations 0.1g/L, 2.5g/L, 8g/L
  • Sucrose fixed at 0.1%; FBS fixed at 0%
a. Subset the data
myData_muc <- subset_samples(myData, sucrose_concentration_percent == "0.1")
myData_muc <- subset_samples(myData_muc, fbs_concentration_percent == "0")
myData_muc_species <- tax_glom(myData_muc, "Species") # agglomerate to species level
myData_muc_species <- prune_taxa(taxa_sums(myData_muc_species) > 0, myData_muc_species) # remove empty taxa after subsetting
myData_muc_species
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 76 taxa and 7 samples ]
## sample_data() Sample Data:       [ 7 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 76 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 76 tips and 75 internal nodes ]
get_variable(myData_muc_species, "Unique_names") # There are 7 samples with fixed  0.1% sucrose and 0% FBS
## [1] "9 C 0F 2.5M 0.1Suc" "9 C 0F 2.5M 0.1S"   "9 C control SHI -1"
## [4] "9 C control SHI -2" "9 C control SHI -3" "9 C 0F 8.0M 0.1S"  
## [7] "9 C 0F 0.1M 0.1S"
b. Boxplots of species numbers
sample_names_muc <-as.character(unique(sample_data(myData_muc_species)$Sample_ID)) # Extract sample names for the 7 samples
species_number_muc <-rep(list(NA),7) # Make an empty list where the number of species will go
names(species_number_muc) <- sample_names_muc # Add sample names to each place on the list

# Loop to count number of species in each sample
for(k in 1:7){
  one_sample <- prune_samples(sample_data(myData_muc_species)$Sample_ID %in% sample_names_muc[k],myData_muc_species) # Isolate one sample
  one_sample_pruned <- prune_taxa(taxa_sums(one_sample) > 0, one_sample) # Remove all species that are 0 (not present in the sample)
  species_number_muc[[k]]<-nrow(otu_table(one_sample_pruned)) # write the number of different species for this sample onto the list
  }

# Generate dataframe for boxplot
species_number_muc_df <- ldply(species_number_muc, cbind) # convert to a dataframe
colnames(species_number_muc_df) <- c("X.SampleID", "species_count")

# Merge Dataframes
meta_species_muc_df <- merge(meta_csv, species_number_muc_df, by = "X.SampleID", all.x = F) # Adds metadata to the samples

# Generate boxplot with p-values
Species_muc_plot_t_test <- ggplot(meta_species_muc_df, aes(x=as.factor(mucin_concentration_g), y=species_count)) +
  # geom_dotplot(binaxis='y', stackdir='center', dotsize=0.5, position=position_dodge(1)) +
  labs(title="Species count for each mucin conc.", x="g/L mucin", y="Species count") +
  ylim(20,70) + # Change based on the max and min species values for both sucrose and fbs
  geom_boxplot() +
  stat_compare_means(label.y=c(20)) +
  theme_classic() +
  theme(plot.title = element_text(size=10))
c. Boxplots of DNA concentrations
# Make DNA concentration numeric 
meta_species_muc_df$DNA_conc <- as.character(meta_species_muc_df$DNA_conc) # Has to be changed to a character first
meta_species_muc_df$DNA_conc <- as.numeric(meta_species_muc_df$DNA_conc)

# Visualize p-values: Specify the comparisons
# There are no signficant comparisions

# Generate boxplot, only show signifcant comparisions
DNA_muc_plot_t_test <- ggplot(meta_species_muc_df, aes(x=as.factor(mucin_concentration_g), y=DNA_conc)) +
  labs(title="DNA conc. (ng/µL) for each mucin conc.", x="g/L mucin", y="DNA concentration (ng/µL)") +
  ylim(0,250) +
  geom_boxplot() +
  stat_compare_means(label.y=c(0)) +
  theme_classic() +
  theme(plot.title = element_text(size=10))
d. Boxplots of final pH
# Make DNA concentration numeric 
meta_species_muc_df$final_pH <- as.character(meta_species_muc_df$final_pH) # Has to be changed to a character first
meta_species_muc_df$final_pH <- as.numeric(meta_species_muc_df$final_pH)

# Visualize p-values: Specify the comparisons
# There are no signficant comparisions

# Generate boxplot, only show signifcant comparisions
ph_muc_plot_t_test <- ggplot(meta_species_muc_df, aes(x=as.factor(mucin_concentration_g), y=final_pH)) +
  # geom_dotplot(binaxis='y', stackdir='center', dotsize=0.5, position=position_dodge(1)) +
  labs(title="Final pH for each mucin conc.", x="g/L mucin", y="Final pH") +
  ylim(4,8.1) +
  geom_boxplot() +
  stat_compare_means(label.y=c(4)) +
  theme_classic() +
  theme(plot.title = element_text(size=10))

4. View the graphs

grid.arrange(DNA_Sucr_plot_t_test, DNA_fbs_plot_t_test, DNA_muc_plot_t_test, 
             Species_Sucr_plot_t_test, Species_fbs_plot_t_test, Species_muc_plot_t_test,
             ph_Sucr_plot_t_test, ph_fbs_plot_t_test, ph_muc_plot_t_test,
             nrow=3, ncol=3)

5. Venn Diagrams

a. Create a map file
# Glom to Species
myData_species <- tax_glom(myData, "Species")

# Remove Empty taxa
myData_species <- prune_taxa(taxa_sums(myData_species) > 0, myData_species)

# Make a dataframe of the list of genus and species names connected to the OTU
tax <- as.data.frame(myData_species@tax_table@.Data) # Isolate the tax table
tax$Genus_species <- paste(tax$Genus,tax$Species,sep=" ") # add a new column of Genus and species names
tax$Genus_species <- sub("g__", "", tax$Genus_species) # remove g__
tax$Genus_species <- sub("s__", "", tax$Genus_species) # remove s__
tax$Genus_species <- gsub("_", " ", tax$Genus_species) # replace all occurences of "_"
MAP_FILE_species <- cbind(OTU_ID = rownames(tax), Genus_species = tax$Genus_species) # Generate map file with correct columns and column names
# View(MAP_FILE_species)
b. Venn Diagram of sucrose
  • Species found in varying sucrose (0%, 0.1%, 0.5%, 0.8%)
  • All conditions examined had 10% FBS and 2.5g/L mucin
# Filter data to contain only conditions with 10% FBS and 2.5g/L mucin
myData_sucr <- subset_samples(myData, mucin_concentration_g == "2.5")
myData_sucr <- subset_samples(myData_sucr, fbs_concentration_percent == "10")

# Agglomerate to species level
myData_sucr_species <- tax_glom(myData_sucr, "Species")

# Remove empty taxa
myData_sucr_species <- prune_taxa(taxa_sums(myData_sucr_species) > 0, myData_sucr_species)

myData_sucr_species
levels(get_variable(myData_sucr_species, "Replicates_2"))

# Create empty lists
replicates_sucr_species<-rep(list(NA),4) # 4 is the number of sucrose conditions being compared
poplist<-rep(list(NA),4)
pops<-as.character(unique(sample_data(myData_sucr_species)$Replicates_2)) 

# Populate List with the Names of Groups we Want to Compare
poplist[[1]]<-pops[1]
poplist[[2]]<-pops[2]
poplist[[3]]<-pops[3]
poplist[[4]]<-pops[4]
names(replicates_sucr_species)<- pops

#Loop over each population, subset the phyloseq object to just that population and work out which ASVs are in that population, store those names in the list
# If the ASV is in just one of the samples of the condition, then it is included in the list!!!!
for(k in 1:4){ # Change number to number of conditions
  phy.sub<-prune_samples(sample_data(myData_sucr_species)$Replicates_2 %in% poplist[[k]],myData_sucr_species) 
  phy.sub.keep<-apply(otu_table(phy.sub),1,function(x){sum(x)>0}) 
  phy.sub.sub<-prune_taxa(phy.sub.keep,phy.sub)
  replicates_sucr_species[[k]]<-rownames(otu_table(phy.sub.sub))
}

# Can check Venn Diagram
# venn(replicates_sucr_species)

# Dataframes we need
replicates_sucr_species # Lists of the species in at least one replicate!
View(MAP_FILE_species) # Key comparing ASV code to species

# Convert to a data frame
replicates_sucr_species_df <- ldply(replicates_sucr_species, cbind)
replicates_sucr_species_df
colnames(replicates_sucr_species_df) <- c("Condition", "OTU_ID")

# Merge the two dataframes based on OTU_ID
# https://stackoverflow.com/questions/25520753/replace-values-from-a-column-of-one-dataframe-by-values-from-another-dataframe
replicates_sucr_species_df_merged <- merge(replicates_sucr_species_df, MAP_FILE_species, by = "OTU_ID", all.x = TRUE)

# Order based on Condition
replicates_sucr_species_df_merged <- replicates_sucr_species_df_merged[order(replicates_sucr_species_df_merged$Condition),]

# Export CSV 
# write.csv(replicates_sucr_species_df_merged, "Gradients_Graphs/Venn_sucrose_change_replicates_SPECIES_all.csv")

The CSV file can be used to generate a Venn Diagram with Interactivenn (http://www.interactivenn.net/)

c. Venn Diagram of FBS
  • Species found in varying FBS (0%, 10%, 30%, 50%)
  • All conditions examined had 0.1% sucrose and 2.5g/L mucin
# Filter data to contain only conditions with 0.1% sucrose and 2.5g/L mucin
myData_fbs <- subset_samples(myData, mucin_concentration_g == "2.5")
myData_fbs <- subset_samples(myData_fbs, sucrose_concentration_percent == "0.1")

# Agglomerate to species level
myData_fbs_species <- tax_glom(myData_fbs, "Species")

# Remove empty taxa
myData_fbs_species <- prune_taxa(taxa_sums(myData_fbs_species) > 0, myData_fbs_species)

myData_fbs_species
levels(get_variable(myData_fbs_species, "Replicates_2")) # 4 replicates

# Create empty lists
replicates_fbs_species<-rep(list(NA),4) # 4 is the number of FBS conditions being compared
poplist<-rep(list(NA),4)
pops<-as.character(unique(sample_data(myData_fbs_species)$Replicates_2)) 

# Populate List with the Names of Groups we Want to Compare
poplist[[1]]<-pops[1]
poplist[[2]]<-pops[2]
poplist[[3]]<-pops[3]
poplist[[4]]<-pops[4]
names(replicates_fbs_species)<- pops

#Loop over each population, subset the phyloseq object to just that population and work out which ASVs are in that population, store those names in the list
# If the ASV is in just one of the samples of the condition, then it is included in the list!!!!
for(k in 1:4){ # Change number to number of conditions
  phy.sub<-prune_samples(sample_data(myData_fbs_species)$Replicates_2 %in% poplist[[k]],myData_fbs_species) # Change the condition column
  phy.sub.keep<-apply(otu_table(phy.sub),1,function(x){sum(x)>0}) 
  phy.sub.sub<-prune_taxa(phy.sub.keep,phy.sub)
  replicates_fbs_species[[k]]<-rownames(otu_table(phy.sub.sub))
}

# Can check Venn Diagram
# venn(replicates_fbs_species)

# Dataframes we need
replicates_fbs_species # Lists of the species in at least one replicate!
View(MAP_FILE_species) # Key comparing ASV code to species

# Convert to a data frame
replicates_fbs_species_df <- ldply(replicates_fbs_species, cbind)
replicates_fbs_species_df
colnames(replicates_fbs_species_df) <- c("Condition", "OTU_ID")

# Merge the two dataframes based on OTU_ID
# https://stackoverflow.com/questions/25520753/replace-values-from-a-column-of-one-dataframe-by-values-from-another-dataframe
replicates_fbs_species_df_merged <- merge(replicates_fbs_species_df, MAP_FILE_species, by = "OTU_ID", all.x = TRUE)

# Order based on Condition
replicates_fbs_species_df_merged <- replicates_fbs_species_df_merged[order(replicates_fbs_species_df_merged$Condition),]

# Export CSV 
# write.csv(replicates_fbs_species_df_merged, "Gradients_Graphs/Venn_fbs_change_replicates_SPECIES_all.csv")

The CSV file can be used to generate a Venn Diagram with Interactivenn (http://www.interactivenn.net/)

d. Venn Diagram of mucin
  • Species found in varying mucin (0.1g/L, 2.5g/L, 8g/L)
  • All conditions examined had 0% FBS and 0.1% sucrose
# Filter data to contain only conditions with 0.1% sucrose and 2.5g/L mucin
myData_muc <- subset_samples(myData, sucrose_concentration_percent == "0.1")
myData_muc <- subset_samples(myData_muc, fbs_concentration_percent == "0")

# Agglomerate to species level
myData_muc_species <- tax_glom(myData_muc, "Species")

# Remove empty taxa
myData_muc_species <- prune_taxa(taxa_sums(myData_muc_species) > 0, myData_muc_species)

myData_muc_species
levels(get_variable(myData_fbs_species, "Replicates_2")) # 3 replicates

# Create empty lists
replicates_muc_species<-rep(list(NA),3) # 3 is the number of mucin conditions we are comparing
poplist<-rep(list(NA),3)
pops<-as.character(unique(sample_data(myData_muc_species)$Replicates_2)) # Change to be the correct column

# Populate List with the Names of Groups we Want to Compare
poplist[[1]]<-pops[1]
poplist[[2]]<-pops[2]
poplist[[3]]<-pops[3]
names(replicates_muc_species)<- pops

#Loop over each population, subset the phyloseq object to just that population and work out which ASVs are in that population, store those names in the list
# If the ASV is in just one of the samples of the condition, then it is included in the list!!!!
for(k in 1:3){ # Change number to number of conditions
  phy.sub<-prune_samples(sample_data(myData_muc_species)$Replicates_2 %in% poplist[[k]],myData_muc_species) # Change the condition column
  phy.sub.keep<-apply(otu_table(phy.sub),1,function(x){sum(x)>0}) 
  phy.sub.sub<-prune_taxa(phy.sub.keep,phy.sub)
  replicates_muc_species[[k]]<-rownames(otu_table(phy.sub.sub))
}

# Can check Venn Diagram
# venn(replicates_muc_species)

# Dataframes we need
replicates_muc_species # Lists of the species in at least one replicate!
View(MAP_FILE_species) # Key comparing ASV code to species

# Convert to a data frame
replicates_muc_species_df <- ldply(replicates_muc_species, cbind)
replicates_muc_species_df
colnames(replicates_muc_species_df) <- c("Condition", "OTU_ID")

# Merge the two dataframes based on OTU_ID
# https://stackoverflow.com/questions/25520753/replace-values-from-a-column-of-one-dataframe-by-values-from-another-dataframe
replicates_muc_species_df_merged <- merge(replicates_muc_species_df, MAP_FILE_species, by = "OTU_ID", all.x = TRUE)

# Order based on Condition
replicates_muc_species_df_merged <- replicates_muc_species_df_merged[order(replicates_muc_species_df_merged$Condition),]

# Export CSV 
# write.csv(replicates_muc_species_df_merged, "Gradients_Graphs/Venn_mucin_change_replicates_SPECIES_all.csv")

The CSV file can be used to generate a Venn Diagram with Interactivenn (http://www.interactivenn.net/)

Figure 3

1. Agglomerate to the phylum level
# Glom to phylum
myData_phylum <- tax_glom(myData, "Phylum")
myData_phylum # 9 phyla
2. Convert to an Ampvis Object
  • Ampvis requires rank names as “Kingdom”, “Phylum”, “Class”, “Order”, “Family”, “Genus”, “Species”; so the names must be changed
  • Merge the replicates for each condition resulting in a mean read count for each condition.
  • Need to do this to average the OTUs across the replicates
# Change rank names
myData_phylum_ampvis <- myData_phylum
colnames(tax_table(myData_phylum_ampvis)) = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
# rank_names(myData_phylum_ampvis) # Now 7 ranks ordered in the old way, but remember they are wrong

# Convert to ampvis
myData_phylum_ampvis <- phyloseq_to_ampvis2(myData_phylum_ampvis)

# Fix the SampleID column name 
colnames(myData_phylum_ampvis$metadata) # Need to change "X_SampleID" to just "SampleID"
colnames(myData_phylum_ampvis$metadata)[colnames(myData_phylum_ampvis$metadata) == 'X_SampleID'] <- 'SampleID'

# Merging in ampvis takes the mean of the counts for all the merged samples
myData_phylum_ampvis_mergedRep <- amp_mergereplicates(myData_phylum_ampvis, "Replicates_2")
myData_phylum_ampvis_mergedRep # 16 samples, 9 Phyla (OTUs here)

# Convert to percent abundance
myData_phylum_ampvis_mergedRep_ra <- normaliseTo100(myData_phylum_ampvis_mergedRep)

#Convert to a dataframe
myData_phylum_ampvis_mergedRep_ra_df <- amp_export_long(myData_phylum_ampvis_mergedRep_ra, 
                                                                 tax_levels = "Phylum")
3. Make the barplot
Barplot <- ggplot(myData_phylum_ampvis_mergedRep_ra_df, aes(x=Sucrose_and_mucin, y = count, fill = Phylum)) +
  geom_bar(stat="identity", position="stack") +
  theme_gray() + # Makes it look more like the ampvis heatmap
  facet_grid(~fbs_concentration_percent, scales="free", space = "free") + 
  theme(axis.text.x = element_text(angle = 45, size=8, vjust = 1, hjust=1),
        axis.text.y = element_text(size=8), axis.title.y = element_text(size=8),
        legend.position="left", legend.text=element_text(size=10), 
        legend.title = element_text(size=10), legend.key.size = unit(0.2, "cm"),
        plot.subtitle = element_text(hjust=0.5, size=10),
        strip.text.x = element_text(size=5), # changes size of FBS labels
        plot.margin=unit(c(0.5,0,0.5,1.5),"cm"), 
        panel.spacing = unit(0.6, "lines")) + # increases the distance between FBS groups
  labs(title="Figure 3", x="Medium condition", y=("Average % Abundance"), 
       caption=NULL, subtitle="% FBS") + 
  scale_y_continuous(expand=c(0,0)) + scale_x_discrete(expand=c(0,0)) # Removes the extra space around the plot
Barplot

Figure 4

1. Filter and rarefy data
# Remove unassigned
myData <- subset_taxa(myData, Phylum != "Unassigned")
myData
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 764 taxa and 94 samples ]
## sample_data() Sample Data:       [ 94 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 764 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 764 tips and 759 internal nodes ]
# Remove controls 
controls <- c("media", "not_applicable") # names of controls in the column "sample_type"
myData <- subset_samples(myData, !(sample_type %in% controls))
myData
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 764 taxa and 63 samples ]
## sample_data() Sample Data:       [ 63 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 764 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 764 tips and 759 internal nodes ]
# RAREFY
set.seed(142)
myData_rarefied <- rarefy_even_depth(myData, sample.size=min(sample_sums(myData)), rngseed=F, replace=T, trimOTUs=T, verbose=T)
# 419OTUs were removed because they are no longer 
# present in any sample after random subsampling

# Remove contaminants
Contam_o <- c("o__Rhizobiales")
Contam_g <- c("g__Delftia","g__Yersinia", "g__Listeria", "g__Mogibacterium", "g__Acinetobacter", "g__Jeogalicoccus", "g__Pseudomonas", "g__Pseudoramibacter")
Contam_s <- c("s__platensis","s__subtilis", "s__paradoxus","s__coli","s__aeruginosa", "s__fluorescens")
Contam_a <- c("49ed5dfd41e37e23a4e525c5a21a8fa8","ab2bb49867509a1650d185f93c305875")
myData_rarefied <- subset_taxa(myData_rarefied, !(Order %in% Contam_o))
myData_rarefied <- subset_taxa(myData_rarefied, !(Genus %in% Contam_g))
myData_rarefied <- subset_taxa(myData_rarefied, !(Species %in% Contam_s))
myData_rarefied <- subset_taxa(myData_rarefied, !(ASV %in% Contam_a))
myData_rarefied
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 329 taxa and 63 samples ]
## sample_data() Sample Data:       [ 63 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 329 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 329 tips and 327 internal nodes ]
myData_unrarefied <- subset_taxa(myData, !(Order %in% Contam_o))
myData_unrarefied <- subset_taxa(myData_unrarefied, !(Genus %in% Contam_g))
myData_unrarefied <- subset_taxa(myData_unrarefied, !(Species %in% Contam_s))
myData_unrarefied <- subset_taxa(myData_unrarefied, !(ASV %in% Contam_a))
myData_unrarefied
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 675 taxa and 63 samples ]
## sample_data() Sample Data:       [ 63 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 675 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 675 tips and 670 internal nodes ]
# Remove spent SHI
myData_rarefied <- subset_samples(myData_rarefied, media_type != "spent")
myData_rarefied
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 329 taxa and 36 samples ]
## sample_data() Sample Data:       [ 36 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 329 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 329 tips and 327 internal nodes ]
myData_unrarefied <- subset_samples(myData_unrarefied, media_type != "spent")
myData_unrarefied
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 675 taxa and 36 samples ]
## sample_data() Sample Data:       [ 36 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 675 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 675 tips and 670 internal nodes ]
# Remove Empty Taxa
myData_rarefied <- prune_taxa(taxa_sums(myData_rarefied) > 0, myData_rarefied)
myData_rarefied 
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 272 taxa and 36 samples ]
## sample_data() Sample Data:       [ 36 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 272 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 272 tips and 270 internal nodes ]
myData_unrarefied <- prune_taxa(taxa_sums(myData_unrarefied) > 0, myData_unrarefied)
myData_unrarefied 
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 301 taxa and 36 samples ]
## sample_data() Sample Data:       [ 36 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 301 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 301 tips and 299 internal nodes ]
2. Alpha Diversity: Observed ASVs

using rarefied data

p_observed <- plot_richness(myData_rarefied, x="Sucrose_and_mucin", color="mucin_concentration_g",
                                          measures=c("Observed")) +
  theme_bw() +
  labs(title="Observed ASVs", color="g/L mucin", subtitle="% FBS", x="% sucrose", y="Observed ASVs") +
  geom_boxplot(aes(color = mucin_concentration_g), alpha = 0.1) +
  facet_grid(~fbs_concentration_percent, scales="free", space = "free") +
  theme(axis.text.x = element_text(angle = 45, size=9, vjust = 1, hjust=1), 
        plot.subtitle = element_text(hjust=0.5, size=10),
        plot.margin=unit(c(0.1,0.1,0.1,2),"cm"), 
        panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
        # panel.border = element_blank(), axis.line = element_line(colour = "black")) +
  scale_x_discrete(labels=c("0Sucr_2.5Muc"="0", "0.1Sucr_0.1Muc"="0.1", "0.1Sucr_2.5Muc"="0.1", "0.1Sucr_8Muc"="0.1",
                            "0.5Sucr_2.5Muc"="0.5", "0Sucr_0Muc"="0", "0.8Sucr_2.5Muc"="0.8", "Inoculum"="Inoculum"))
p_observed

2. Alpha Diversity: Shannon Index

using rarefied data

p_shannon <- plot_richness(myData_rarefied, x="Sucrose_and_mucin", color="mucin_concentration_g",
                            measures=c("Shannon")) +
  theme_bw() +
  labs(title="Shannon Index", color="g/L mucin", subtitle="% FBS", x="% sucrose") +
  geom_boxplot(aes(color = mucin_concentration_g), alpha = 0.1) +
  facet_grid(~fbs_concentration_percent, scales="free", space = "free") +
  theme(axis.text.x = element_text(angle = 45, size=9, vjust = 1, hjust=1), 
        plot.subtitle = element_text(hjust=0.5, size=10),
        plot.margin=unit(c(0.1,0.1,0.1,2),"cm"), 
        panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  scale_x_discrete(labels=c("0Sucr_2.5Muc"="0", "0.1Sucr_0.1Muc"="0.1", "0.1Sucr_2.5Muc"="0.1", "0.1Sucr_8Muc"="0.1",
                            "0.5Sucr_2.5Muc"="0.5", "0Sucr_0Muc"="0", "0.8Sucr_2.5Muc"="0.8", "Inoculum"="Inoculum"))
p_shannon

3. Alpha Diversity: Simpson Index

using rarefied data

p_simpson <- plot_richness(myData_rarefied, x="Sucrose_and_mucin", color="mucin_concentration_g",
                           measures=c("Simpson")) +
  theme_bw() + 
  labs(title="Simpson Index", color="g/L mucin", subtitle="% FBS", x="% sucrose") +
  geom_boxplot(aes(color = mucin_concentration_g), alpha = 0.1) +
  facet_grid(~fbs_concentration_percent, scales="free", space = "free") +
  theme(axis.text.x = element_text(angle = 45, size=9, vjust = 1, hjust=1), 
        plot.subtitle = element_text(hjust=0.5, size=10),
        plot.margin=unit(c(0.1,0.1,0.1,2),"cm"),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  scale_x_discrete(labels=c("0Sucr_2.5Muc"="0", "0.1Sucr_0.1Muc"="0.1", "0.1Sucr_2.5Muc"="0.1", "0.1Sucr_8Muc"="0.1",
                            "0.5Sucr_2.5Muc"="0.5", "0Sucr_0Muc"="0", "0.8Sucr_2.5Muc"="0.8", "Inoculum"="Inoculum"))
p_simpson

4. Beta Diversity: Unweighted Unifrac

Using unrarefied data

myData_ord_unweighted <- ordinate(myData_unrarefied, method="PCoA", distance="unifrac", weighted=F)
myData_ord_unweighted_plot <- plot_ordination(myData_unrarefied, myData_ord_unweighted, type="samples", color="Replicates_2", 
                                                title="Unweighted Unifrac PCoA", axes=1:2) +
  theme_classic()
myData_ord_unweighted_plot

4. Beta Diversity: Weighted Unifrac

Using unrarefied data

myData_ord_weighted <- ordinate(myData_unrarefied, method="PCoA", distance="unifrac", weighted=T)
myData_ord_weighted_plot <- plot_ordination(myData_unrarefied, myData_ord_weighted, type="samples", color="Replicates_2", 
                                              title="Weighted Unifrac PCoA", axes=1:2) +
  theme_classic()
myData_ord_weighted_plot

Figure 5

1. Preprocess the data
# Glom to species
myData_species <- tax_glom(myData, "Species") #agglomerate to the Species level
myData_species # (153 species)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 153 taxa and 36 samples ]
## sample_data() Sample Data:       [ 36 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 153 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 153 tips and 152 internal nodes ]
# Prune empty taxa
myData_species <- prune_samples(sample_sums(myData_species)>0, myData_species)

# Change rank names
myData_species_ampvis <- myData_species
colnames(tax_table(myData_species_ampvis)) = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")

# Convert to ampvis
myData_species_ampvis <- phyloseq_to_ampvis2(myData_species_ampvis)
2. Presence/absence of Treponema
# Normalise OTUs to percent abundance then subset
# https://madsalbertsen.github.io/ampvis2/reference/amp_heatmap.html#preserving-relative-abundances-in-a-subset-of-larger-data
myData_species_ampvis_Trep <- amp_subset_taxa(myData_species_ampvis, tax_vector="g__Treponema", normalise=T)
# 134 OTUs have been filtered 
# Before: 153 OTUs
# After: 19 OTUs

Heatmap_Trep_pa <- amp_heatmap(myData_species_ampvis_Trep, group_by="Sucrose_and_mucin",facet_by="fbs_concentration_percent",
                               normalise=F, tax_aggregate="Genus", tax_add="Family",
                               tax_show=20, tax_empty="remove", 
                               color_vector=c("white","Dark Red"), plot_colorscale="log10", # added a color vector
                               max_abundance = 0.0001, min_abundance = .0000001, # these changed
                               plot_values = F, plot_values_size = 3) + # Change plot values to falst
  theme(axis.text.x = element_text(angle = 45, size=9, vjust = 1),
        axis.text.y = element_text(size=9),
        legend.position="none",
        strip.text.x = element_text(size=5),
        plot.subtitle = element_text(hjust=0.5, size=10),
        plot.caption = element_text(hjust=0.5, size=10),
        plot.title = element_text(hjust=0, size=12)) +
  labs(fill = "Presence/absence", title="Presence/absence of Treponema spp", subtitle="% FBS", caption="Medium condition")
Heatmap_Trep_pa

3. Presence/absence of Saccharibacteria
# Normalise OTUs to percent abundance then subset
# https://madsalbertsen.github.io/ampvis2/reference/amp_heatmap.html#preserving-relative-abundances-in-a-subset-of-larger-data
myData_species_ampvis_TM7 <- amp_subset_taxa(myData_species_ampvis, tax_vector="p__Saccharibacteria_(TM7)", normalise=T)
# 147 OTUs have been filtered 
# Before: 153 OTUs
# After: 6 OTUs

# Presence absence heatmap
Heatmap_tm7_pa <- amp_heatmap(myData_species_ampvis_TM7, group_by="Sucrose_and_mucin",facet_by="fbs_concentration_percent",
                               normalise=F, tax_aggregate="Genus", tax_add="Family",
                               tax_show=20, tax_empty="remove", 
                               color_vector=c("white","Dark Red"), plot_colorscale="log10", # added a color vector
                               max_abundance = 0.0001, min_abundance = .0000001, # these changed
                               plot_values = F, plot_values_size = 3) + # Change plot values to falst
  theme(axis.text.x = element_text(angle = 45, size=9, vjust = 1),
        axis.text.y = element_text(size=9),
        legend.position="none",
        strip.text.x = element_text(size=5),
        plot.subtitle = element_text(hjust=0.5, size=10),
        plot.caption = element_text(hjust=0.5, size=10),
        plot.title = element_text(hjust=0, size=12)) +
  labs(fill = "Presence/absence", title="Presence/absence of TM7 spp", subtitle="% FBS", caption="Medium condition")
Heatmap_tm7_pa

4. Presence/absence of Actinomyces
# Normalise OTUs to percent abundance then subset
# https://madsalbertsen.github.io/ampvis2/reference/amp_heatmap.html#preserving-relative-abundances-in-a-subset-of-larger-data
myData_species_ampvis_Actino <- amp_subset_taxa(myData_species_ampvis, tax_vector="g__Actinomyces", normalise=T)
# 141 OTUs have been filtered 
# Before: 153 OTUs
# After: 12 OTUs
# Warning message:
#   One or more samples have 0 total reads 

# Presence/absence heatmap
Heatmap_Actino_pa <- amp_heatmap(myData_species_ampvis_Actino, group_by="Sucrose_and_mucin",facet_by="fbs_concentration_percent",
                               normalise=F, tax_aggregate="Genus", tax_add="Family",
                               tax_show=20, tax_empty="remove", 
                               color_vector=c("white","Dark Red"), plot_colorscale="log10", # added a color vector
                               max_abundance = 0.0001, min_abundance = .0000001, # these changed
                               plot_values = F, plot_values_size = 3) + # Change plot values to falst
  theme(axis.text.x = element_text(angle = 45, size=9, vjust = 1),
        axis.text.y = element_text(size=9),
        legend.position="none",
        strip.text.x = element_text(size=5),
        plot.subtitle = element_text(hjust=0.5, size=10),
        plot.caption = element_text(hjust=0.5, size=10),
        plot.title = element_text(hjust=0, size=12)) +
  labs(fill = "Presence/absence", title="Presence/absence of Actinomyces spp.", subtitle="% FBS", caption="Medium condition")
Heatmap_Actino_pa

Figure S1

1. Preprocess the data
#Transform Count Data to % Abundance 
myData_ra <- transform_sample_counts(myData, function(x) (x/sum(x))*100)

#Filter
myData_ra <- prune_taxa(taxa_sums(myData_ra) >0, myData_ra)

#Agglomerate Data at the Phylum Level
myData_ra_phylum  = tax_glom(myData_ra, "Phylum", NArm = TRUE)
myData_ra_phylum
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 9 taxa and 36 samples ]
## sample_data() Sample Data:       [ 36 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 9 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 9 tips and 8 internal nodes ]
# Remove inoculum
myData_ra_phylum_2 <- subset_samples(myData_ra_phylum, Type == "Biofilm")
myData_ra_phylum_2
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 9 taxa and 34 samples ]
## sample_data() Sample Data:       [ 34 samples by 34 sample variables ]
## tax_table()   Taxonomy Table:    [ 9 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 9 tips and 8 internal nodes ]
#Filter
myData_ra_phylum_2 <- prune_taxa(taxa_sums(myData_ra_phylum_2) >0, myData_ra_phylum_2)

#Convert to Dataframe
myData_ra_phylum_2_df <- psmelt(myData_ra_phylum_2) 
2. Firmicutes percent abundance vs sucrose concentration
trend_firmicutes <- ggplot(subset(myData_ra_phylum_2_df, Phylum %in% c("p__Firmicutes")),
                             aes(x=as.factor(sucrose_concentration_percent), y=Abundance)) +
  geom_point(size = 1, alpha = 0.5) + # adds the specific points to the graph and makes transparent 
  theme_grey() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"), 
        strip.text.x = element_text(size=6), # changes size of facet labels
        axis.text.x = element_text(angle = 0, size=9), axis.title.x = element_text(size=9),
        axis.text.y = element_text(size=9), axis.title.y = element_text(size=9)) +
  scale_y_continuous(trans = 'log10') + # Expand the y-axis
  geom_smooth(method=lm, aes(group=1), se=T, color="dark grey", formula=y~x) + # https://stackoverflow.com/questions/10911057/adding-a-simple-lm-trend-line-to-a-ggplot-boxplot
  labs(title="Firmicutes % abundance with sucrose conc.", x="% sucrose", y=("% Abundance (log10 transformed)"), subtitle="All medium contitions",
       caption=NULL)
trend_firmicutes

# https://sejohnston.com/2012/08/09/a-quick-and-easy-function-to-plot-lm-results-in-r/
regression_trend_firmicutes <- lm(Abundance ~ as.factor(sucrose_concentration_percent), data = subset(myData_ra_phylum_2_df, Phylum %in% c("p__Firmicutes")))
summary(regression_trend_firmicutes) # Give the R2 value
## 
## Call:
## lm(formula = Abundance ~ as.factor(sucrose_concentration_percent), 
##     data = subset(myData_ra_phylum_2_df, Phylum %in% c("p__Firmicutes")))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -26.5054  -3.9289  -0.1508   3.7473  24.1833 
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                                   38.277      2.858  13.392
## as.factor(sucrose_concentration_percent)0.1   15.964      3.690   4.326
## as.factor(sucrose_concentration_percent)0.5   52.237      4.951  10.552
## as.factor(sucrose_concentration_percent)0.8   51.049      5.347   9.547
##                                                       Pr(>|t|)    
## (Intercept)                                 0.0000000000000343 ***
## as.factor(sucrose_concentration_percent)0.1           0.000155 ***
## as.factor(sucrose_concentration_percent)0.5 0.0000000000129133 ***
## as.factor(sucrose_concentration_percent)0.8 0.0000000001332891 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.039 on 30 degrees of freedom
## Multiple R-squared:  0.8442, Adjusted R-squared:  0.8286 
## F-statistic: 54.18 on 3 and 30 DF,  p-value: 0.000000000003206
2. Bacteroidetes percent abundance vs FBS concentration
trend_Bacteroidetes <- ggplot(subset(myData_ra_phylum_2_df, Phylum %in% c("p__Bacteroidetes")),
                                aes(as.factor(fbs_concentration_percent), Abundance)) +
  geom_point(size = 1, alpha = 0.5) + # adds the specific points to the graph and makes transparent 
  theme_grey() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black"), 
        strip.text.x = element_text(size=6), # changes size of facet labels
        axis.text.x = element_text(angle = 0, size=9), axis.title.x = element_text(size=9), 
        axis.text.y = element_text(size=9), axis.title.y = element_text(size=9)) +
  scale_y_continuous(trans = 'log10') + # Expand the y-axis
  geom_smooth(method=lm, aes(group=1), se=T, color="dark grey") + # https://stackoverflow.com/questions/10911057/adding-a-simple-lm-trend-line-to-a-ggplot-boxplot
  labs(title="Bacteroidetes % abundance with sucrose conc.", x="% FBS", y=("% Abundance (Log10 transformed)"), subtitle="All medium contitions",
       caption=NULL)
trend_Bacteroidetes

# https://sejohnston.com/2012/08/09/a-quick-and-easy-function-to-plot-lm-results-in-r/
regression_trend_Bacteroidetes <- lm(Abundance ~ as.factor(fbs_concentration_percent), data = subset(myData_ra_phylum_2_df, Phylum %in% c("p__Bacteroidetes")))
summary(regression_trend_Bacteroidetes) # Give the R2 value
## 
## Call:
## lm(formula = Abundance ~ as.factor(fbs_concentration_percent), 
##     data = subset(myData_ra_phylum_2_df, Phylum %in% c("p__Bacteroidetes")))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.638 -4.878 -1.339  5.555 15.485 
## 
## Coefficients:
##                                        Estimate Std. Error t value     Pr(>|t|)
## (Intercept)                               8.635      1.945   4.439      0.00012
## as.factor(fbs_concentration_percent)10    2.024      2.751   0.736      0.46771
## as.factor(fbs_concentration_percent)20   12.401      4.492   2.761      0.00989
## as.factor(fbs_concentration_percent)30   13.458      4.010   3.356      0.00222
## as.factor(fbs_concentration_percent)50   56.975      7.278   7.829 0.0000000124
##                                           
## (Intercept)                            ***
## as.factor(fbs_concentration_percent)10    
## as.factor(fbs_concentration_percent)20 ** 
## as.factor(fbs_concentration_percent)30 ** 
## as.factor(fbs_concentration_percent)50 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.013 on 29 degrees of freedom
## Multiple R-squared:  0.7158, Adjusted R-squared:  0.6765 
## F-statistic: 18.26 on 4 and 29 DF,  p-value: 0.0000001363

Finding potential G3 Host

1. Organize the data frame
# Convert to species level 
myData_species <- tax_glom(myData, taxrank = "Species")

# Convert to percent abundance 
myData_species_pa <- transform_sample_counts(myData_species, function(x) (x/sum(x)*100))

# Export OTU and TAX table to a CSV each (can this step be bypassed?)
write.csv(tax_table(myData_species_pa), file="G3_host_MLRM_copy/percent_abundance_TAX_table_for_G3_host_ASV-level2.csv")
write.csv(otu_table(myData_species_pa), file="G3_host_MLRM_copy/percent_abundance_OTU_table_for_G3_host_ASV-level2.csv")

# import the CSVs (did not alter them at all)
TAX_species_for_G3_host <- read.csv(file="G3_host_MLRM_copy/percent_abundance_TAX_table_for_G3_host_ASV-level2.csv")
# View(TAX_species_for_G3_host)
OTU_species_for_G3_host <- read.csv(file="G3_host_MLRM_copy/percent_abundance_OTU_table_for_G3_host_ASV-level2.csv")
# View(TAX_species_for_G3_host)

# Merge the two dataframes based on X (which is the OTU_ID)
# https://stackoverflow.com/questions/25520753/replace-values-from-a-column-of-one-dataframe-by-values-from-another-dataframe
for_G3_host_species_merged_df <- merge(TAX_species_for_G3_host, OTU_species_for_G3_host, by = "X", all.x = TRUE)
# View(for_G3_host_species_merged_df)

# Do some post processing on the data frame
for_G3_host_species_merged_df$ASV <- NULL # remove the empty ASV column
for_G3_host_species_merged_df$X14.033.sub.plaque.Q <- NULL # remove the source plaque column
for_G3_host_species_merged_df$X9.033.sub.plaque.Q <- NULL
colnames(for_G3_host_species_merged_df)
# View(for_G3_host_species_merged_df)
for_G3_host_species_merged_df$Genus_species <- paste(for_G3_host_species_merged_df$Genus,for_G3_host_species_merged_df$Species,sep="_") # create a new column of genus species names 
for_G3_host_species_merged_df$Genus_species <- sub("g__", "", for_G3_host_species_merged_df$Genus_species) # remove g__
for_G3_host_species_merged_df$Genus_species <- sub("s__", "", for_G3_host_species_merged_df$Genus_species) # remove s__
for_G3_host_species_merged_df[c("X", "Kingdom", "Phylum", "Class", "Order", "Genus", "Species")] <- list(NULL) # Remove the extra columns
rownames(for_G3_host_species_merged_df) <- for_G3_host_species_merged_df$Genus_species # make genus species the row names
for_G3_host_species_merged_df$Genus_species <- NULL # remove the genus species column
2. Separate the data frame into two dataframes, one of samples containing HMT-351 and one of samples without HMT-351
which(rownames(for_G3_host_species_merged_df) == "Saccharibacteria_(TM7)_[G-3]_bacterium_HMT_351") 
# This is row number 22

# Make two dataframes, one with samples containing G3  and one samples without any G3 ####
df <- for_G3_host_species_merged_df
df[22,] > 0
# https://stackoverflow.com/questions/40800731/subset-columns-based-on-row-value
# https://win-vector.com/2018/02/27/r-tip-use-drop-false-with-data-frames/
wG3 <- df[,df[22,]>0,drop=FALSE]
# View(wG3)
ncol(wG3) # 19
woutG3 <- df[,df[22,]==0,drop=FALSE]
# View(woutG3)
ncol(woutG3) # 15

wG3 <- t(wG3) # transpose
rownames(wG3) <- c() # remove the row names
# View(wG3) # perfect!

woutG3 <- t(woutG3) # transpose
rownames(woutG3) <- c() # remove the row names
# View(woutG3) # perfect!
3. Perform a one-sided non-parametric Wilcoxon test on each species

We want to know if the mean for wG3 is greater than the mean for wOut G3, so want a one sided Wilcoxon test (non-parametric) - Ho: wG3 - woutG3 = 0 - Ha: wG3 - woutG3 > 0

# Loop #
# (from https://stackoverflow.com/questions/44400278/t-test-for-each-row-in-r)

# Create an empty list that has the same number of values as number of columns (this will be the number of p-values we want at the end)
pvals <- rep(NA, ncol(wG3))
# pvals # empty list where every value is NA

# Make a loop
for(i in 1:ncol(wG3)) pvals[i] <- 
  wilcox.test(wG3[,i], woutG3[,i], alternative="greater", mu=0)$p.value
# [,i] points to the columns
# This is comparing each column from each dataframe, running a t-test and giving just the p-value,
# then writing that to each spot in the list
pvals
typeof(pvals) # its a double which is like an integer 
class(pvals) # it is numeric

# Create a list of the species names
colnames(wG3)
# This pulls up the column names of wG3
names <- colnames(wG3)
names

# Combine the pvals and the names into one data frame
species_level_p_values <- data.frame(names, pvals)
species_level_p_values

# Give the data frame nicer column headers
cols <- c("Species", "p_value")
colnames(species_level_p_values) <- cols
species_level_p_values

# Add a column showing whether it is significant or not
# https://stackoverflow.com/questions/37321020/adding-new-column-with-conditional-values-using-ifelse
species_level_p_values$Significant <- ifelse(species_level_p_values$p_value < 0.001, '**', ifelse(species_level_p_values$p_value < 0.05, "*", "ns"))
View(species_level_p_values)
4. Add average percent abundances of each species
wG3_average_percent_abundance <- colMeans(wG3)
wG3_pa_df <- as.data.frame(wG3_average_percent_abundance)
row.names(wG3_pa_df)
wG3_pa_df <- data.frame(Species = rownames(wG3_pa_df), wG3_pa_df)
rownames(wG3_pa_df) <- c() # remove the row names
View(wG3_pa_df)

woutG3_average_percent_abundance <- colMeans(woutG3)
woutG3_pa_df <- as.data.frame(woutG3_average_percent_abundance)
row.names(woutG3_pa_df)
woutG3_pa_df <- data.frame(Species = rownames(woutG3_pa_df), woutG3_pa_df)
rownames(woutG3_pa_df) <- c() # remove the row names
# View(woutG3_pa_df)

# Combine percent abundances with p-values
species_level_p_values_pa <- merge(wG3_pa_df, woutG3_pa_df, by = "Species", all.x = TRUE)
species_level_p_values_pa <- merge(species_level_p_values_pa, species_level_p_values, by = "Species", all.x = TRUE)
write.csv(species_level_p_values_pa, "G3_host_MLRM_copy/species_level_p_values_pa2.csv")