library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(phyloseq)
library(ape)
library(genefilter)
##
## Attaching package: 'genefilter'
##
## The following object is masked from 'package:readr':
##
## spec
library(ggplot2)
# Import Count table. Skip first row of tsv file, which is just some text
asv_counts <- read_tsv(file="~/export/table.tsv")
## Rows: 193 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): OTU ID
## dbl (11): RACEKX3, SANPEDRO, STOD4, BALK10C, ITA01A, IZR, IZRMEDIA, KIEL9, N...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# And specify that the first column of data are rownames
asv_counts <- column_to_rownames(asv_counts, var = colnames(asv_counts)[1])
# Import taxonomy of ASVs
taxonomy <- read_tsv(file="~/export/taxonomy.tsv")
## Rows: 193 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Feature ID, Taxon
## dbl (1): Confidence
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Import tree file
#tree = read_tree("~/export/tree.nwk")
# Import fasta
asv_fasta <- read.FASTA(file = "~/export/dna-sequences.fasta")
#Import metadata file
sample_info_tab <- read_tsv("~/export/metadata_more_info.tsv")
## Rows: 11 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): sample-ID, type, clade
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Also delete the row with the QIIME2 category codes
#sample_info_tab <- sample_info_tab[-1,]
head(taxonomy)
## # A tibble: 6 × 3
## `Feature ID` Taxon Confi…¹
## <chr> <chr> <dbl>
## 1 000c465694b030b575b0399f2c5fdd29 d__Archaea; p__Halobacterota; c__Met… 0.999
## 2 00af279a7dcd6d0cab8f388df82f2075 d__Archaea; p__Asgardarchaeota; c__O… 0.879
## 3 015c2c21f0b0e140796b2d0a2fa72080 d__Archaea; p__Halobacterota; c__Met… 0.944
## 4 020bf741caada7c0a497b09e997463e2 d__Archaea; p__Halobacterota; c__Met… 0.757
## 5 02c6990d44eaa5a670e6d9b15d02501d d__Archaea; p__Nanoarchaeota; c__Nan… 0.954
## 6 02d8c10ed712b0f49cc734646de936f9 d__Archaea; p__Crenarchaeota; c__Nit… 0.995
## # … with abbreviated variable name ¹Confidence
Taxonomy table, split the taxonomy in columns
taxonomy <- taxonomy %>%
separate(Taxon, into=c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus","Species"), sep=";") %>%
select (-Confidence) %>% #delete the column Confidence
column_to_rownames(var = 'Feature ID') # specify that the "Feature ID" column of data are rownames
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 124 rows [1, 4,
## 6, 7, 10, 11, 12, 13, 16, 17, 20, 21, 22, 25, 27, 28, 30, 32, 33, 36, ...].
head(taxonomy)
## Kingdom Phylum
## 000c465694b030b575b0399f2c5fdd29 d__Archaea p__Halobacterota
## 00af279a7dcd6d0cab8f388df82f2075 d__Archaea p__Asgardarchaeota
## 015c2c21f0b0e140796b2d0a2fa72080 d__Archaea p__Halobacterota
## 020bf741caada7c0a497b09e997463e2 d__Archaea p__Halobacterota
## 02c6990d44eaa5a670e6d9b15d02501d d__Archaea p__Nanoarchaeota
## 02d8c10ed712b0f49cc734646de936f9 d__Archaea p__Crenarchaeota
## Class Order
## 000c465694b030b575b0399f2c5fdd29 c__Methanomicrobia o__Methanomicrobiales
## 00af279a7dcd6d0cab8f388df82f2075 c__Odinarchaeia o__Odinarchaeia
## 015c2c21f0b0e140796b2d0a2fa72080 c__Methanomicrobia o__Methanomicrobiales
## 020bf741caada7c0a497b09e997463e2 c__Methanomicrobia o__Methanomicrobiales
## 02c6990d44eaa5a670e6d9b15d02501d c__Nanoarchaeia o__Woesearchaeales
## 02d8c10ed712b0f49cc734646de936f9 c__Nitrososphaeria o__Nitrososphaerales
## Family
## 000c465694b030b575b0399f2c5fdd29 f__Methanospirillaceae
## 00af279a7dcd6d0cab8f388df82f2075 f__Odinarchaeia
## 015c2c21f0b0e140796b2d0a2fa72080 f__Methanoregulaceae
## 020bf741caada7c0a497b09e997463e2 f__Methanoregulaceae
## 02c6990d44eaa5a670e6d9b15d02501d f__CG1-02-57-44
## 02d8c10ed712b0f49cc734646de936f9 f__Nitrososphaeraceae
## Genus
## 000c465694b030b575b0399f2c5fdd29 g__Methanospirillum
## 00af279a7dcd6d0cab8f388df82f2075 g__Odinarchaeia
## 015c2c21f0b0e140796b2d0a2fa72080 g__Methanoregula
## 020bf741caada7c0a497b09e997463e2 g__Methanolinea
## 02c6990d44eaa5a670e6d9b15d02501d g__CG1-02-57-44
## 02d8c10ed712b0f49cc734646de936f9 g__Candidatus_Nitrocosmicus
## Species
## 000c465694b030b575b0399f2c5fdd29 <NA>
## 00af279a7dcd6d0cab8f388df82f2075 s__uncultured_marine
## 015c2c21f0b0e140796b2d0a2fa72080 s__Methanoregula_formicica
## 020bf741caada7c0a497b09e997463e2 <NA>
## 02c6990d44eaa5a670e6d9b15d02501d s__uncultured_euryarchaeote
## 02d8c10ed712b0f49cc734646de936f9 <NA>
making phyloseq objects out of each of our input data tables (in the last tutorial, I imported the tree using phyloseq so it is already a phyloseq object)
ASV = otu_table(data.frame(asv_counts), taxa_are_rows = TRUE)
TAX = tax_table(as.matrix(taxonomy))
META = sample_data(data.frame(sample_info_tab, row.names = sample_info_tab$`sample-ID`))
The taxonomy in the taxonomy table is retained in one column and the different levels are separated by underscore, eg:
head(taxonomy)
## Kingdom Phylum
## 000c465694b030b575b0399f2c5fdd29 d__Archaea p__Halobacterota
## 00af279a7dcd6d0cab8f388df82f2075 d__Archaea p__Asgardarchaeota
## 015c2c21f0b0e140796b2d0a2fa72080 d__Archaea p__Halobacterota
## 020bf741caada7c0a497b09e997463e2 d__Archaea p__Halobacterota
## 02c6990d44eaa5a670e6d9b15d02501d d__Archaea p__Nanoarchaeota
## 02d8c10ed712b0f49cc734646de936f9 d__Archaea p__Crenarchaeota
## Class Order
## 000c465694b030b575b0399f2c5fdd29 c__Methanomicrobia o__Methanomicrobiales
## 00af279a7dcd6d0cab8f388df82f2075 c__Odinarchaeia o__Odinarchaeia
## 015c2c21f0b0e140796b2d0a2fa72080 c__Methanomicrobia o__Methanomicrobiales
## 020bf741caada7c0a497b09e997463e2 c__Methanomicrobia o__Methanomicrobiales
## 02c6990d44eaa5a670e6d9b15d02501d c__Nanoarchaeia o__Woesearchaeales
## 02d8c10ed712b0f49cc734646de936f9 c__Nitrososphaeria o__Nitrososphaerales
## Family
## 000c465694b030b575b0399f2c5fdd29 f__Methanospirillaceae
## 00af279a7dcd6d0cab8f388df82f2075 f__Odinarchaeia
## 015c2c21f0b0e140796b2d0a2fa72080 f__Methanoregulaceae
## 020bf741caada7c0a497b09e997463e2 f__Methanoregulaceae
## 02c6990d44eaa5a670e6d9b15d02501d f__CG1-02-57-44
## 02d8c10ed712b0f49cc734646de936f9 f__Nitrososphaeraceae
## Genus
## 000c465694b030b575b0399f2c5fdd29 g__Methanospirillum
## 00af279a7dcd6d0cab8f388df82f2075 g__Odinarchaeia
## 015c2c21f0b0e140796b2d0a2fa72080 g__Methanoregula
## 020bf741caada7c0a497b09e997463e2 g__Methanolinea
## 02c6990d44eaa5a670e6d9b15d02501d g__CG1-02-57-44
## 02d8c10ed712b0f49cc734646de936f9 g__Candidatus_Nitrocosmicus
## Species
## 000c465694b030b575b0399f2c5fdd29 <NA>
## 00af279a7dcd6d0cab8f388df82f2075 s__uncultured_marine
## 015c2c21f0b0e140796b2d0a2fa72080 s__Methanoregula_formicica
## 020bf741caada7c0a497b09e997463e2 <NA>
## 02c6990d44eaa5a670e6d9b15d02501d s__uncultured_euryarchaeote
## 02d8c10ed712b0f49cc734646de936f9 <NA>
First check that the inputs are in compatible formats by checking for ASV names with the phyloseq function, taxa_names
head(taxa_names(TAX))
## [1] "000c465694b030b575b0399f2c5fdd29" "00af279a7dcd6d0cab8f388df82f2075"
## [3] "015c2c21f0b0e140796b2d0a2fa72080" "020bf741caada7c0a497b09e997463e2"
## [5] "02c6990d44eaa5a670e6d9b15d02501d" "02d8c10ed712b0f49cc734646de936f9"
head(taxa_names(ASV))
## [1] "000c465694b030b575b0399f2c5fdd29" "00af279a7dcd6d0cab8f388df82f2075"
## [3] "015c2c21f0b0e140796b2d0a2fa72080" "020bf741caada7c0a497b09e997463e2"
## [5] "02c6990d44eaa5a670e6d9b15d02501d" "02d8c10ed712b0f49cc734646de936f9"
And check sample names were also detected
head(sample_names(ASV))
## [1] "RACEKX3" "SANPEDRO" "STOD4" "BALK10C" "ITA01A" "IZR"
head(sample_names(META))
## [1] "RACEKX3" "STOD4" "SANPEDRO" "KIEL9" "IZR" "BALK10C"
Make one phyloseq object, which contains all 4 objects:
ps <- phyloseq(ASV, TAX, META)
Check some features of the phyloseq object
rank_names(ps)
## [1] "Kingdom" "Phylum" "Class" "Order" "Family" "Genus" "Species"
unique(tax_table(ps)[, "Kingdom"])
## Taxonomy Table: [3 taxa by 1 taxonomic ranks]:
## Kingdom
## 000c465694b030b575b0399f2c5fdd29 "d__Archaea"
## 278a7abeb2fa129cd7e5a4dd13bde4cc "d__Eukaryota"
## 304d7ca27eeb601ca2061dbc9e221080 "Unassigned"
table(tax_table(ps)[, "Kingdom"], exclude = NULL)
##
## d__Archaea d__Eukaryota Unassigned
## 191 1 1
remove Eukaryota and Unassigned (and Bacteria I only need Archaea)
ps <- subset_taxa(ps, !is.na(Kingdom) & !Kingdom %in% c("Unassigned", "d__Eukaryota"))
table(tax_table(ps)[, "Kingdom"], exclude = NULL)
##
## d__Archaea
## 191
now I have only archaeal sequences in my data
check for NA or low abundant Phyla
table(tax_table(ps)[, "Phylum"], exclude = NULL)
##
## p__Altiarchaeota p__Asgardarchaeota p__Crenarchaeota
## 3 8 28
## p__Euryarchaeota p__Halobacterota p__Micrarchaeota
## 8 75 4
## p__Nanoarchaeota p__Thermoplasmatota <NA>
## 43 13 9
Filter out NA Phyla
ps <- subset_taxa(ps, !is.na(Phylum) & !Phylum %in% c(""))
table(tax_table(ps)[, "Phylum"], exclude = NULL)
##
## p__Altiarchaeota p__Asgardarchaeota p__Crenarchaeota
## 3 8 28
## p__Euryarchaeota p__Halobacterota p__Micrarchaeota
## 8 75 4
## p__Nanoarchaeota p__Thermoplasmatota
## 43 13
check for NA or low abundant Classes
table(tax_table(ps)[, "Class"], exclude = NULL)
##
## c__Altiarchaeia c__Bathyarchaeia c__Halobacteria c__Lokiarchaeia
## 3 16 1 4
## c__Methanobacteria c__Methanocellia c__Methanomicrobia c__Methanosarcinia
## 7 2 45 27
## c__Micrarchaeia c__Nanoarchaeia c__Nitrososphaeria c__Odinarchaeia
## 4 43 12 1
## c__Thermococci c__Thermoplasmata <NA>
## 1 13 3
Remove NA Classes
ps <- subset_taxa(ps, !is.na(Class) & !Class %in% c(""))
table(tax_table(ps)[, "Class"], exclude = NULL)
##
## c__Altiarchaeia c__Bathyarchaeia c__Halobacteria c__Lokiarchaeia
## 3 16 1 4
## c__Methanobacteria c__Methanocellia c__Methanomicrobia c__Methanosarcinia
## 7 2 45 27
## c__Micrarchaeia c__Nanoarchaeia c__Nitrososphaeria c__Odinarchaeia
## 4 43 12 1
## c__Thermococci c__Thermoplasmata
## 1 13
Check for NA in Order
table(tax_table(ps)[, "Order"], exclude = NULL)
##
## o__Altiarchaeales o__Bathyarchaeia
## 3 16
## o__Halobacterales o__Lokiarchaeia
## 1 4
## o__Methanobacteriales o__Methanocellales
## 7 2
## o__Methanofastidiosales o__Methanomassiliicoccales
## 1 13
## o__Methanomicrobiales o__Methanosarciniales
## 45 27
## o__Micrarchaeales o__Nitrosopumilales
## 4 8
## o__Nitrososphaerales o__Odinarchaeia
## 4 1
## o__Woesearchaeales
## 43
Check NA Family
table(tax_table(ps)[, "Family"], exclude = NULL)
##
## f__Altiarchaeaceae f__Bathyarchaeia
## 3 16
## f__CG1-02-32-21 f__CG1-02-57-44
## 1 3
## f__GW2011 f__Haloferacaceae
## 2 1
## f__Lokiarchaeia f__Methanobacteriaceae
## 4 7
## f__Methanocellaceae f__Methanofastidiosaceae
## 2 1
## f__Methanomassiliicoccaceae f__Methanomethylophilaceae
## 11 2
## f__Methanomicrobiales f__Methanoregulaceae
## 1 33
## f__Methanosaetaceae f__Methanosarcinaceae
## 10 17
## f__Methanospirillaceae f__Micrarchaeales
## 9 3
## f__Nitrosopumilaceae f__Nitrososphaeraceae
## 8 4
## f__Odinarchaeia f__Woesearchaeales
## 1 35
## <NA>
## 5
Filter out NA Family
ps <- subset_taxa(ps, !is.na(Family) & !Family %in% c(""))
table(tax_table(ps)[, "Family"], exclude = NULL)
##
## f__Altiarchaeaceae f__Bathyarchaeia
## 3 16
## f__CG1-02-32-21 f__CG1-02-57-44
## 1 3
## f__GW2011 f__Haloferacaceae
## 2 1
## f__Lokiarchaeia f__Methanobacteriaceae
## 4 7
## f__Methanocellaceae f__Methanofastidiosaceae
## 2 1
## f__Methanomassiliicoccaceae f__Methanomethylophilaceae
## 11 2
## f__Methanomicrobiales f__Methanoregulaceae
## 1 33
## f__Methanosaetaceae f__Methanosarcinaceae
## 10 17
## f__Methanospirillaceae f__Micrarchaeales
## 9 3
## f__Nitrosopumilaceae f__Nitrososphaeraceae
## 8 4
## f__Odinarchaeia f__Woesearchaeales
## 1 35
Check NA Genus
table(tax_table(ps)[, "Genus"], exclude = NULL)
##
## g__AR15 g__Bathyarchaeia
## 2 16
## g__Candidatus_Altiarchaeum g__Candidatus_Methanofastidiosum
## 3 1
## g__Candidatus_Nitrocosmicus g__Candidatus_Nitrosopumilus
## 2 4
## g__Candidatus_Nitrosotenuis g__CG1-02-32-21
## 1 1
## g__CG1-02-57-44 g__Halogranum
## 3 1
## g__Lokiarchaeia g__Methanobacterium
## 4 7
## g__Methanococcoides g__Methanolinea
## 1 2
## g__Methanolobus g__Methanomassiliicoccus
## 5 2
## g__Methanomethylovorans g__Methanoregula
## 6 31
## g__Methanosaeta g__Methanosarcina
## 10 5
## g__Methanospirillum g__Micrarchaeales
## 9 3
## g__Nitrosopumilaceae g__Nitrososphaeraceae
## 2 2
## g__Odinarchaeia g__Rice_Cluster_I
## 1 1
## g__uncultured g__Woesearchaeales
## 10 35
## <NA>
## 4
Filter out NA Genus
ps <- subset_taxa(ps, !is.na(Genus) & !Genus %in% c(""))
table(tax_table(ps)[, "Genus"], exclude = NULL)
##
## g__AR15 g__Bathyarchaeia
## 2 16
## g__Candidatus_Altiarchaeum g__Candidatus_Methanofastidiosum
## 3 1
## g__Candidatus_Nitrocosmicus g__Candidatus_Nitrosopumilus
## 2 4
## g__Candidatus_Nitrosotenuis g__CG1-02-32-21
## 1 1
## g__CG1-02-57-44 g__Halogranum
## 3 1
## g__Lokiarchaeia g__Methanobacterium
## 4 7
## g__Methanococcoides g__Methanolinea
## 1 2
## g__Methanolobus g__Methanomassiliicoccus
## 5 2
## g__Methanomethylovorans g__Methanoregula
## 6 31
## g__Methanosaeta g__Methanosarcina
## 10 5
## g__Methanospirillum g__Micrarchaeales
## 9 3
## g__Nitrosopumilaceae g__Nitrososphaeraceae
## 2 2
## g__Odinarchaeia g__Rice_Cluster_I
## 1 1
## g__uncultured g__Woesearchaeales
## 10 35
or you can rename not filter out NA taxonomic classification
If there are other NA’s in you taxonomy table then do this
tax_table(ps)[is.na(tax_table(ps)[,5])] <- "f__"
tax_table(ps)[is.na(tax_table(ps)[,6])] <- "g__"
# Then you can convert those unclassified such as p__ without phylum name to this or skip this part
tax_table(ps)[tax_table(ps)[,"Family"]== "f__", "Family"] <- "Unknown_Family"
tax_table(ps)[tax_table(ps)[,"Genus"]== "g__", "Genus" ] <- "Unknown_Genus"
Check the filtered tax table
View(ps@tax_table)
phyloseq_relative <- transform_sample_counts(ps, function(x) x / sum(x))
flista<-filterfun(kOverA(1,0.15))
a <- filter_taxa(phyloseq_relative, flista)
filtered_phyloseq <- prune_taxa(a,ps)
View(filtered_phyloseq@tax_table)
only include Methanobacteriales, Methanomicrobiales etc. to leave only symbionts
filtered_phyloseq_methanogens <- subset_taxa(filtered_phyloseq, Order==“Methanomicrobiales” | Order==“Methanobacteriales” | Order==“Methanosarciniales”) View(filtered_phyloseq_methanogens@tax_table)
rename and clean the taxonomy table after filtering
```r
#clean up taxa table
tax_table(ps)[,colnames(tax_table(ps))] <- gsub(tax_table(ps)[,colnames(tax_table(ps))],pattern="g__",replacement="")
tax_table(ps)[,colnames(tax_table(ps))] <- gsub(tax_table(ps)[,colnames(tax_table(ps))],pattern="f__",replacement="")
tax_table(ps)[,colnames(tax_table(ps))] <- gsub(tax_table(ps)[,colnames(tax_table(ps))],pattern="o__",replacement="")
tax_table(ps)[,colnames(tax_table(ps))] <- gsub(tax_table(ps)[,colnames(tax_table(ps))],pattern="c__",replacement="")
tax_table(ps)[,colnames(tax_table(ps))] <- gsub(tax_table(ps)[,colnames(tax_table(ps))],pattern="p__",replacement="")
tax_table(ps)[,colnames(tax_table(ps))] <- gsub(tax_table(ps)[,colnames(tax_table(ps))],pattern="d__",replacement="")
check the taxonomy table
View(ps@tax_table)
after all the filtering, rename the ASVs
rownames(filtered_phyloseq@otu_table) <-
case_when(rownames(filtered_phyloseq@otu_table)=="015c2c21f0b0e140796b2d0a2fa72080" ~ "Methanoregula formicica",
rownames(filtered_phyloseq@otu_table)=="0eb6233014f786a4a526856e4a18324a" ~ "Methanobacterium ASV 1",
rownames(filtered_phyloseq@otu_table)=="1d74bcf4e7d652412f786ee2d65a4a0f" ~ "Methanosaeta ASV 1",
rownames(filtered_phyloseq@otu_table)=="2235b1f103e24aa46eebb31db8e2428d" ~ "Methanoregula ASV 1",
rownames(filtered_phyloseq@otu_table)=="61cbae61dd0773faa39783c90c1f3fe6" ~ "Methanobacteriales",
rownames(filtered_phyloseq@otu_table)=="655dba046ff74bc0d4200bc5730eb407" ~ "Methanoregula ASV 2",
rownames(filtered_phyloseq@otu_table)=="a90aa8573c921a842c3022f35c9f794e" ~ "Methanoregula ASV 3",
rownames(filtered_phyloseq@otu_table)=="aa0c6b48066ab0ba40dddf93359c2b18" ~ "Methanobacterium ASV 2",
rownames(filtered_phyloseq@otu_table)=="acdd6e53ca593f84c5ba3ebf3221acd7" ~ "Methanoregula ASV 4",
rownames(filtered_phyloseq@otu_table)=="b83a52fc891dbd06476c103fcb667b6b" ~ "Methanobacterium ASV 3",
rownames(filtered_phyloseq@otu_table)=="c6beaabd9c67491b6914cdd516379e95" ~ "Methanoregula ASV 5",
rownames(filtered_phyloseq@otu_table)=="e6d1121872022926ff0d1ac375ebb6f5" ~ "Methanolobus bombayensis",
rownames(filtered_phyloseq@otu_table)=="e7146984aeb97f06a4d5089412447d36" ~ "Methanomethylovorans ASV 1",
rownames(filtered_phyloseq@otu_table)=="ead6478a925b701c77a9cace2f4924ba" ~ "Methanomethylovorans ASV 2",
TRUE ~ rownames(filtered_phyloseq@otu_table))
# now figure out how to get a column in the taxonomy file called "symbiont" which only has those IDs in it (but does not have
# anything for the features which are not labelled). probably have to do this manually
# import back
count_table<-read_tsv(file="~/export/table_only_symbionts.tsv")
## Rows: 14 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): OTU_ID
## dbl (11): RACEKX3, SANPEDRO, STOD4, BALK10C, ITA01A, IZR, IZRMEDIA, KIEL9, N...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
count_table<-column_to_rownames(count_table, var=colnames(count_table)[1])
View(count_table)
taxonomy <- read_tsv(file="~/export/taxonomy_symbiont_ASV2.tsv")
## Rows: 193 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Feature ID, Taxon, Type
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(taxonomy)
asv_fasta <- read.FASTA(file="~/export/dna-sequences.fasta")
sample_info_tab<-read_tsv(file="~/export/metadata_more_info.tsv")
## Rows: 11 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): sample-ID, type, clade
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
taxonomy <- taxonomy %>%
separate(Taxon, into=c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus","Species"), sep=";") %>%
column_to_rownames(var = 'Feature ID') # specify that the "Feature ID" column of data are rownames
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 124 rows [1, 4,
## 6, 7, 10, 11, 12, 13, 16, 17, 20, 21, 22, 25, 27, 28, 30, 32, 33, 36, ...].
head(taxonomy)
## Kingdom Phylum
## 000c465694b030b575b0399f2c5fdd29 d__Archaea p__Halobacterota
## 00af279a7dcd6d0cab8f388df82f2075 d__Archaea p__Asgardarchaeota
## 015c2c21f0b0e140796b2d0a2fa72080 d__Archaea p__Halobacterota
## 020bf741caada7c0a497b09e997463e2 d__Archaea p__Halobacterota
## 02c6990d44eaa5a670e6d9b15d02501d d__Archaea p__Nanoarchaeota
## 02d8c10ed712b0f49cc734646de936f9 d__Archaea p__Crenarchaeota
## Class Order
## 000c465694b030b575b0399f2c5fdd29 c__Methanomicrobia o__Methanomicrobiales
## 00af279a7dcd6d0cab8f388df82f2075 c__Odinarchaeia o__Odinarchaeia
## 015c2c21f0b0e140796b2d0a2fa72080 c__Methanomicrobia o__Methanomicrobiales
## 020bf741caada7c0a497b09e997463e2 c__Methanomicrobia o__Methanomicrobiales
## 02c6990d44eaa5a670e6d9b15d02501d c__Nanoarchaeia o__Woesearchaeales
## 02d8c10ed712b0f49cc734646de936f9 c__Nitrososphaeria o__Nitrososphaerales
## Family
## 000c465694b030b575b0399f2c5fdd29 f__Methanospirillaceae
## 00af279a7dcd6d0cab8f388df82f2075 f__Odinarchaeia
## 015c2c21f0b0e140796b2d0a2fa72080 f__Methanoregulaceae
## 020bf741caada7c0a497b09e997463e2 f__Methanoregulaceae
## 02c6990d44eaa5a670e6d9b15d02501d f__CG1-02-57-44
## 02d8c10ed712b0f49cc734646de936f9 f__Nitrososphaeraceae
## Genus
## 000c465694b030b575b0399f2c5fdd29 g__Methanospirillum
## 00af279a7dcd6d0cab8f388df82f2075 g__Odinarchaeia
## 015c2c21f0b0e140796b2d0a2fa72080 g__Methanoregula
## 020bf741caada7c0a497b09e997463e2 g__Methanolinea
## 02c6990d44eaa5a670e6d9b15d02501d g__CG1-02-57-44
## 02d8c10ed712b0f49cc734646de936f9 g__Candidatus_Nitrocosmicus
## Species Type
## 000c465694b030b575b0399f2c5fdd29 <NA> <NA>
## 00af279a7dcd6d0cab8f388df82f2075 s__uncultured_marine <NA>
## 015c2c21f0b0e140796b2d0a2fa72080 s__Methanoregula_formicica symbiont
## 020bf741caada7c0a497b09e997463e2 <NA> <NA>
## 02c6990d44eaa5a670e6d9b15d02501d s__uncultured_euryarchaeote <NA>
## 02d8c10ed712b0f49cc734646de936f9 <NA> <NA>
ASV = otu_table(data.frame(count_table), taxa_are_rows=TRUE)
TAX = tax_table(as.matrix(taxonomy))
META= sample_data(data.frame(sample_info_tab, row.names = sample_info_tab$'sample-ID'))
# make phyloseq object
ps2 <- phyloseq(ASV, TAX, META)
ps2_rel <- transform_sample_counts(ps, function(x) x / sum(x))
#clean up taxa table
tax_table(ps2)[,colnames(tax_table(ps2))] <- gsub(tax_table(ps2)[,colnames(tax_table(ps2))],pattern="g__",replacement="")
tax_table(ps2)[,colnames(tax_table(ps2))] <- gsub(tax_table(ps2)[,colnames(tax_table(ps2))],pattern="f__",replacement="")
tax_table(ps2)[,colnames(tax_table(ps2))] <- gsub(tax_table(ps2)[,colnames(tax_table(ps2))],pattern="o__",replacement="")
tax_table(ps2)[,colnames(tax_table(ps2))] <- gsub(tax_table(ps2)[,colnames(tax_table(ps2))],pattern="c__",replacement="")
tax_table(ps2)[,colnames(tax_table(ps2))] <- gsub(tax_table(ps2)[,colnames(tax_table(ps2))],pattern="p__",replacement="")
tax_table(ps2)[,colnames(tax_table(ps2))] <- gsub(tax_table(ps2)[,colnames(tax_table(ps2))],pattern="d__",replacement="")
# transform to relative abundance
ra_ps2 <- transform_sample_counts(ps2, function(x) x / sum(x))
plot abundances and relative abundances of all samples
# barplot
plot_bar(ps2,fill="Genus")
plot_bar(ra_ps2,fill="Genus", facet_grid=~type) + scale_fill_brewer(palette = "Paired")
plot_bar(ra_ps2,fill="Genus")+facet_grid(rows="type",scales="free_y", space="free")+coord_flip()+scale_fill_brewer(palette="Paired")
plot relative ASV abundances, group by clades
plot_bar(ra_ps2, fill = "Genus")+
facet_grid(rows="clade",scales="free_y",space="free")+coord_flip()+theme(strip.text.y=element_text(angle=360))+
ylab("Relative abundance") + theme(axis.title = element_text(size=10, face="bold")) +
xlab("Clade") +
theme(axis.text.x = element_blank(), axis.text.y = element_text(size=5)) +
theme(legend.text = element_text(size = 7)) + theme(legend.title = element_text(size = 10, face="bold")) +
theme(strip.text.x = element_text(size = 5, face="bold")) +
theme(legend.key = element_rect(size = 0.4)) + theme(legend.position="bottom")+scale_fill_brewer(palette="Paired")
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
plot relative abundances, group by media/symbiont sample
plot_bar(ra_ps2, fill = "Genus")+
facet_grid(rows="type",scales="free_y",space="free")+coord_flip()+
ylab("Relative abundance") + theme(axis.title = element_text(size=10, face="bold")) +
xlab("Strain") +
theme(axis.text.x = element_blank(), axis.text.y = element_text(size=7)) +
theme(legend.text = element_text(size = 7)) + theme(legend.title = element_text(size = 10, face="bold")) +
theme(strip.text.x = element_text(size=7, face="bold")) +
theme(legend.key = element_rect(size=5)) + theme(legend.position="bottom")+scale_fill_brewer(palette="Paired")
plot relative abundances, grouped by clade, vertical, different colors
plot_bar(ra_ps2, fill = "Genus")+facet_grid(cols=vars(clade),scales="free_x",space="free")+
ylab("Relative abundance") + theme(axis.title = element_text(size=12, face="bold")) + xlab("Clade") + theme(axis.text.x = element_text(size=5), axis.text.y = element_text(size=7)) +
theme(legend.text = element_text(size = 7)) + theme(legend.title = element_text(size = 13, face="bold")) +
theme(strip.text.x = element_text(size = 10, face="italic")) +
theme(legend.key = element_rect(size = 0.4)) + theme(legend.position="bottom") +
scale_fill_brewer(palette = "Set2")
let’s check overall how the orders are distributed among samples using phyloseq
# First aglomerate the ASVs at the order level using the phyloseq function, tax_glom
phylumGlommed = tax_glom(ps2, "Order")
# and plot
plot_bar(phylumGlommed, x = "Sample", fill = "Order") +
scale_fill_brewer(palette = "Set2")
Finally we are ready to try some ordinations
What is an ordination? Ordination is a collective term for multivariate techniques which summarize a multidimensional dataset in such a way that when it is projected onto a low dimensional space, any intrinsic pattern the data may possess becomes apparent upon visual inspection (Pielou, 1984).
The first one to try here will be a principal coordinate analysis (PCoA), PCoA tries to represent the distance between objects by projecting their dissimilarity into a 2- or 3-D (Euclidean) space. We will using the “bray” method (Bray-Curtis) to calculate the dissimilarity matrix. You can build the dissimilarity method directly into the PCoA calculation, but first let’s do it on it’s own to check what the matrix looks like:
ps_dist <- distance(ps2, method = "bray")
ps_dist
## RACEKX3 SANPEDRO STOD4 BALK10C ITA01A IZR IZRMEDIA
## SANPEDRO 0.9958338
## STOD4 0.7963143 1.0000000
## BALK10C 1.0000000 1.0000000 1.0000000
## ITA01A 1.0000000 1.0000000 1.0000000 1.0000000
## IZR 1.0000000 1.0000000 1.0000000 0.9895343 1.0000000
## IZRMEDIA 1.0000000 1.0000000 1.0000000 0.9964066 1.0000000 0.8543534
## KIEL9 1.0000000 1.0000000 1.0000000 0.9879284 1.0000000 0.4496730 0.9958080
## NA1 1.0000000 1.0000000 1.0000000 0.9608021 1.0000000 0.9965628 0.9882167
## NA1MEDIA 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.9361360 0.8220641
## SLOV3A 1.0000000 1.0000000 1.0000000 0.9637462 1.0000000 0.9022039 0.9953330
## KIEL9 NA1 NA1MEDIA
## SANPEDRO
## STOD4
## BALK10C
## ITA01A
## IZR
## IZRMEDIA
## KIEL9
## NA1 1.0000000
## NA1MEDIA 0.9408855 0.9893430
## SLOV3A 0.8741132 0.9977072 0.9765020
Next plot the PCoA
out.pcoa <- ordinate(ps2, method = "PCoA", distance = "bray")
pcoa_plot = plot_ordination(ps2, out.pcoa, color="clade", shape = "type") +
geom_point(size = 3)
pcoa_plot
Next, I will demonstrate a weighted UniFrac PCoA. When calculating the distance matrix, this takes into account how many related taxa are shared among each pair of samples using the phylogenetic tree (which we embedded in the phyloseq object). It then performs the PCoA:
out.pcoa <- ordinate(ps, method = “PCoA”, distance = “wunifrac”) wuf_pcoa_plot = plot_ordination(ps, out.pcoa, color =“sample-id”, shape = “type”) + geom_point(size = 3) wuf_pcoa_plot