Dependencies

knitr::opts_chunk$set(dev="png",
                      dpi=200,
                      echo = TRUE,
                      results = 'hide',
                      warning = FALSE,
                      message=FALSE,
                      fig.align='center')

library(pato)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x ggplot2::annotate() masks pato::annotate()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
library(ggsci)
library(ggraph)
library(tidygraph)
## 
## Attaching package: 'tidygraph'
## The following object is masked from 'package:stats':
## 
##     filter
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidygraph':
## 
##     groups
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggforce)
library(ggstatsplot)
## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
## 
## Attaching package: 'ggstatsplot'
## The following object is masked from 'package:igraph':
## 
##     %<-%
library(patchwork)
library(ggnewscale)

Loading data

We are using the plasmids defined by Sergio both the hybrid assembly and the predicted by GPlas. We had annotated the plasmids using PROKKA and typed with MOB-Typer. We load the sequence info (AA and DNA in PATO) and creates the pangenome clustering the proteins in families of 70% of identity and 80% of coverage.

#WGS data
files_fna = dir(".", recursive = T, pattern = "fsa")
mash_wgs <- mash(files_fna, n_cores = 20, type = "nucl")

#Proteins data
files_faa = dir(".", recursive = T, pattern = "faa", full.names = T)
mm_prot <- mmseqs(files_faa, type ="prot",n_cores = 20)

#Genes data
files_ffn = dir(".", recursive = T, pattern = "ffn", full.names = T)
mm_ffn <- mmseqs(files_ffn, type ="nucl",identity = 0.7, n_cores = 20)

Plasmid clustering

First we create the AccNET object for further analysis.

### Accnet object
acc <- accnet(mm_ffn,dist = TRUE)

We export the AccNET network to analize the Accessory Genome in Gephi.

### Export the network to load in Gephi.
export_to_gephi(acc,"Accnet")
## Adding missing grouping variables: `Target`

We use similar criteria than COPLA (Redondo-Salvo et al 2020). They use a criteria of “two plasmids are connected if the share at least the 50% of the small plasmid with a similarity of 70%”. We are using a 70% of similarity in the gene families and 0.5 in the Jaccard similarity. To cluster the plasmids we build a KNN Network with 10 neighbors, minimum similarity of 0.5 (jaccard similarity) and without duplicate edges and find clusters with “Fast-Greedy” algorithm.

kn_network <- knnn(acc,n_neigh = 10,repeats = T, threshold = 0.5)
cl <- clustering(kn_network, method = "greedy")

Loading Metadata

Here we join the data from MOB-Typer, the plasmid length (from PROKKA output) and samples metadata provided by Ana and Sergio from Microreact.

typing <- read_tsv("all.mob")
## Rows: 3116 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (23): sample_id, size, md5, rep_type(s), rep_type_accession(s), relaxase...
## dbl  (3): num_contigs, gc, mash_neighbor_distance
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sizes2 <- read_tsv("lenghs.tsv", col_names = F) %>% rename(ID =1,size = 2)
## Rows: 3116 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): X1
## dbl (1): X2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
typing <- typing %>% 
  mutate(sample_id = gsub("fna","ffn",sample_id)) %>% 
  select(-size) %>% 
  full_join(sizes2 %>% 
              mutate(sample_id = gsub("$",".ffn",ID)))
## Joining, by = "sample_id"
typing <- typing %>% separate(sample_id,c("genome_id","plasmid"),sep = "_plasmid_", remove = F) %>% mutate(plasmid = gsub(".ffn","",plasmid))

metadata <- read_csv("microreact-project-3T9X5PlUD-data.csv")
## Rows: 2064 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (31): id, name, latitude, longitude, Lab, Lab__colour, Isolation source,...
## dbl  (5): ST__autocolour, PANINI Panaroo cluster__autocolour, PopPUNK cluste...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metadata <- metadata %>% 
  mutate(id = gsub("_1$","",id,perl = T)) %>% 
  mutate(id = gsub("#","~",id))

full_meta <- full_join(metadata,typing, by= c("id" = "genome_id")) %>% 
  ungroup() %>% 
  full_join(cl %>% 
              rename(plasmid_cluster = Cluster), by = c("sample_id" = "Source"))

Reformat some variable

We create new variables for PopPunk, Panini and plasmid cluster to only represent the majorities groups (cluster with more than 9 members) in order to simplify the analysis.

full_meta <- full_meta %>% 
  ungroup %>% 
  group_by(plasmid_cluster) %>% 
  mutate(plasmid_cluster_size = n()) %>%
  ungroup() %>% 
  mutate(plasmid_cluster_2 = ifelse(plasmid_cluster_size >= 10,plasmid_cluster,"other")) %>% 
  group_by(`PANINI Panaroo cluster__autocolour`) %>% 
  mutate(PANINI_cluster_size = n()) %>% 
  ungroup() %>%  
  mutate(PANINI_cl = ifelse(PANINI_cluster_size >= 10,`PANINI Panaroo cluster__autocolour`,"Other")) %>% 
  mutate(PANINI_cl = as.factor(PANINI_cl)) %>% 
  group_by(`PopPUNK cluster__autocolour`) %>% 
  mutate(PopPunk_cluster_size = n()) %>% 
  ungroup() %>% 
  mutate(PopPunk_cl = ifelse(PopPunk_cluster_size >= 10,`PopPUNK cluster__autocolour`,"Other"))

Here you can find the final table that you can download.

full_meta <- full_meta %>% mutate(PANINI_cl = as.factor(PANINI_cl), 
                      plasmid_cluster_2 = as.factor(plasmid_cluster_2))

full_meta %>% DT::datatable(extensions = 'Buttons',
            options = list(dom = 'Blfrtip',
                           buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
                           lengthMenu = list(c(10,25,50,-1),
                                             c(10,25,50,"All"))))
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Exporting data to Cytoscape

The best option to arrange the K-NN Network was Cytoscape. We use it to arrange the network and then export the layout to R.

export_to_gephi(kn_network,"toCytoscape")
full_meta %>% write_tsv("toCytoscape.table.tsv")

Importing the Cytoscape arranged network

cyt <- jsonlite::fromJSON("toCytoscape.net.tsv.cyjs")
net <- as_tbl_graph(kn_network) %>% inner_join(full_meta, by = c("name" = "sample_id"))

## Prune network for some edges that connect some small clusters. 
to_remove <- net %>% 
  igraph::as_data_frame() %>% 
  inner_join(full_meta %>% 
               select(sample_id,cl_from = plasmid_cluster_2), by = c("from"="sample_id")) %>% 
  inner_join(full_meta %>% 
               select(sample_id,cl_to = plasmid_cluster_2), by = c("to"="sample_id")) %>% 
  filter(cl_from == "other" | cl_to == "other") %>% 
  filter(cl_from != cl_to) %>% select(from,to) %>% graph_from_data_frame(directed = F)

kn_network_pruned <- kn_network - to_remove 

net <- as_tbl_graph(kn_network_pruned) %>% inner_join(full_meta, by = c("name" = "sample_id"))


full_meta <- full_meta %>% inner_join(cbind(cyt$elements$nodes$data$shared_name,cyt$elements$nodes$position) %>% 
               rename(sample_id =1, x =2, y=3) %>% 
                 mutate(y = y*-1))
## Joining, by = "sample_id"
ly <- net %>% 
  activate(nodes) %>% 
  as.data.frame() %>% 
  inner_join(cbind(cyt$elements$nodes$data$shared_name,cyt$elements$nodes$position) %>% 
               rename(name =1, x =2, y=3)) %>% 
  select(x,y) %>% mutate(y = y*-1) %>% 
  as.matrix()
## Joining, by = "name"

Plots & Graphs

set.seed(11)
cm_palette <- randomcoloR::distinctColorPalette(77)

We plot the Plasmid clustering and the assembly type.

  ggraph(net, layout =ly )+ 
  geom_mark_hull(aes(fill = plasmid_cluster_2, 
                     x= ly[,1], 
                     y = ly[,2],
                     label =plasmid_cluster_2 ),
                 label.margin = margin(0, 0, 0, 0, "mm"),
                 concavity = 3,
                 expand=0.005,
                 radius=0.001, 
                 label.fontsize = 9, show.legend = T)+
  geom_edge_link(color = "grey") + 
  geom_node_point(aes(fill=plasmid_cluster_2, shape = `Hybrid assembly__autocolour`), color ="black", size=3)+
  #geom_node_text(aes(label = plasmid_cluster_2), size = 3, check_overlap = T)+
  scale_color_manual(values = cm_palette)+
  scale_fill_manual(values = cm_palette)+
  scale_shape_manual(values = c(21,23,24))+
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5))+
  labs(title ="K-NN Network Plasmid Clustering")
## Warning: Removed 114 rows containing missing values (geom_point).

We can see that the network has a great modularity and the plasmid cluster are well defined. We found 667 plasmid cluters that we summirised in 59 clusters with >= 10 members and we agreggate the rest in an “other” cluster.

Summary of previous data

We show the previous metadata in order to put in context the results of the plasmids. Here we show the absolute abundance of each Isolate Source and the relative frequency in each PANINI cluster. We have selected the PANINI cluster as population structure but we can change to PopPunk scheme (to discuss)

{full_meta %>% ungroup() %>% 
    filter(!is.na(PANINI_cl)) %>% 
    
  select(id,PANINI_cl,`Isolation source`) %>% distinct() %>% 
  ggplot(aes(x = PANINI_cl, fill = `Isolation source` )) + 
  geom_bar() +
  scale_fill_d3() + coord_flip() + theme_bw() + theme(legend.position = "none")} +

{full_meta %>% ungroup() %>% 
    filter(!is.na(PANINI_cl)) %>%  
    
  select(id,PANINI_cl,`Isolation source`) %>% 
  ggplot(aes(x = PANINI_cl, fill = `Isolation source` )) + 
  geom_bar(position = "fill") +
  scale_fill_d3() + coord_flip() +  theme_bw()}

Plasmid number distribution by PANNINI cluster

Now we show the number distribution of plasmids in each genome cluster. Note that some isolates does not have plasmids. (to discuss: Is it worth doing a statistical analysis?)

wo_plasmids <- full_meta %>% 
  ungroup() %>% 
  filter(is.na(sample_id)) %>% 
  select(id,PANINI_cl) %>% 
  distinct() %>% 
  mutate(Plasmid_number =0)
full_meta %>% 
  group_by(id,PANINI_cl) %>% 
  summarise(Plasmid_number = n()) %>% 
  bind_rows(wo_plasmids) %>% 
  ggplot(aes(x = PANINI_cl, y =Plasmid_number, fill = PANINI_cl)) + 
  geom_boxplot()+
  scale_fill_manual(values = cm_palette) +
  scale_y_continuous(limits = c(-1,10)) + 
  coord_flip()+  
  theme_bw() + guides(fill=guide_legend(ncol=2))
## `summarise()` has grouped output by 'id'. You can override using the `.groups` argument.

We see that there are not big differences among the PANINI clusters.

plasmid number per Isolation Source

We show the plasmid number distribution per Isolate Source

wo_plasmids <- full_meta %>% ungroup() %>% filter(is.na(sample_id)) %>% select(id,`Isolation source`) %>% distinct() %>% mutate(Plasmid_number =0)
full_meta %>% 
  group_by(id,`Isolation source`) %>% 
  summarise(Plasmid_number = n()) %>% 
  bind_rows(wo_plasmids) %>% 
  ggplot(aes(x = `Isolation source`, y =Plasmid_number, fill = `Isolation source`)) + 
  geom_violin() + 
  scale_fill_d3() + 
  coord_flip() +
  scale_y_continuous(limits = c(-1,10)) +  
  theme_bw()
## `summarise()` has grouped output by 'id'. You can override using the `.groups` argument.

Again we do not observe big differences among the isolation sources. It seems that the plasmid distribution is homogeneous amog the sources.

Distributoin of plamids cluster in PANINI cluster

To illustrate the plasmid distribution by PANINI cluster we use a matrix representation. The number represents to absolute abundance of plasmids in each plasmid cluster <-> Panini cluster.

full_meta %>% 
  group_by(plasmid_cluster_2, PANINI_cl) %>% 
  summarise(N = n()) %>% 
  mutate(plasmid_cluster_2 = as.factor(plasmid_cluster_2)) %>% 
  ggplot(aes(x = plasmid_cluster_2, y = PANINI_cl)) + 
  geom_tile(aes(fill = N)) + 
  scale_fill_material(palette = "red") + 
  geom_text(aes(label = N)) + theme_bw()
## `summarise()` has grouped output by 'plasmid_cluster_2'. You can override using the `.groups` argument.

We can represent the same data over the K-NN Network. Each number represent the PANINI cluster and the shadow represent the plasmid cluster.

  ggraph(net, layout =ly )+ 
  geom_mark_hull(aes(fill = plasmid_cluster_2, 
                     x= ly[,1], 
                     y = ly[,2],
                     label =plasmid_cluster_2 ),
                 label.margin = margin(0, 0, 0, 0, "mm"),
                 concavity = 3,
                 expand=0.005,
                 radius=0.001, 
                 label.fontsize = 9, show.legend = F)+
  geom_edge_link(color = "grey") + 
  #geom_node_point(aes(fill=PANINI_cl),color ="black",shapesize=3, shape = 21)+
  geom_node_label(aes(fill=PANINI_cl,label = PANINI_cl),color ="black",size=3)+
  #geom_node_text(aes(label = PANINI_cl), size = 2, check_overlap = T,repel = F)+
  scale_color_manual(values = cm_palette)+
  scale_fill_manual(values = randomcoloR::distinctColorPalette(99))+
  theme_void() + 
  theme(legend.position ="none") +
  labs("PANINI cluster in K-NN Network")
## Warning: Removed 117 rows containing missing values (geom_label).

Groups of plamids with great heterogeneity (6, 20, 13, 14…) can be distinguished while other plasmids are clearly host-specific (32, 142, 144…)

Distribution of plasmids per Isolation Source

{full_meta %>% 
  filter(!is.na(plasmid_cluster_2)) %>% 
  filter(!is.na(`Isolation source`)) %>% 
  ggplot(aes(x = plasmid_cluster_2, fill = `Isolation source`)) + 
  geom_bar() + 
  coord_flip() + 
  scale_fill_d3() +
  theme_bw() + theme(legend.position = "none")} + 
{full_meta %>% 
  filter(!is.na(plasmid_cluster_2)) %>% 
  filter(!is.na(`Isolation source`)) %>% 
  ggplot(aes(x = plasmid_cluster_2, fill = `Isolation source`)) + 
  geom_bar(position = "fill") + 
  coord_flip() + 
  scale_fill_d3() +
  theme_bw()}

 ggraph(net, layout =ly )+ 
  geom_mark_hull(aes(fill = plasmid_cluster_2, 
                     x= ly[,1], 
                     y = ly[,2],
                     label =plasmid_cluster_2 ),
                 label.margin = margin(0, 0, 0, 0, "mm"),
                 concavity = 3,
                 expand=0.005,
                 radius=0.001, 
                 label.fontsize = 9, show.legend = F)+
  scale_fill_manual(values = cm_palette)+
  new_scale_fill()+
  geom_edge_link(color = "grey") + 
  geom_node_point(aes(fill=`Isolation source`),color ="black",size=3, shape = 21)+
  scale_color_d3()+
  scale_fill_d3()+
  theme_void()+
  labs(title="Isolation Source of plasmid host")

Again we can find groups of host-specific plasmids as very heterogeneous groups. The specific groups appear to match both in host and isolation source. To check it, we are going to represent the two variables together.

ggraph(net, layout =ly )+ 
  geom_mark_hull(aes(fill = plasmid_cluster_2, 
                     x= ly[,1], 
                     y = ly[,2],
                     label =plasmid_cluster_2 ),
                 label.margin = margin(0, 0, 0, 0, "mm"),
                 concavity = 3,
                 expand=0.005,
                 radius=0.001, 
                 label.fontsize = 9, show.legend = F)+
  geom_edge_link(color = "grey") + 
  scale_fill_manual(values = cm_palette) +
  new_scale_fill()+
  geom_node_label(aes(fill=`Isolation source`,label = PANINI_cl),color ="black",size=3)+
  scale_color_manual(values = cm_palette)+
  scale_fill_d3()+
  theme_void() + 
  labs(title = "PANINI cluster and Isolation Source")
## Warning: Removed 117 rows containing missing values (geom_label).

We find examples of same plasmid group, same isolation source different Panini cluster that could be indicated plasmids that actually are specifict from the isolation sources, and other in where the variables Panini cluster and isolation sources match, so we can not concluded if there are clustering because the host are very similar.

Plasmid cluster characterization

Now we try to characterize the plasmid clusters: Sizes, Mobilization, Replication, Host Range and Antibiotic resistance.

Size distribution

We represent the plasmid size distribution along with the predicted mobility.

full_meta %>% 
  filter(!is.na(size)) %>% 
  ggplot(aes(x = size, fill = predicted_mobility)) + 
  geom_density(alpha = 0.5, adjust = 0.5) +
  scale_fill_d3() +
  scale_x_log10()+
  theme_bw() +
  labs(title ="Size distribution and Predicted Mobility")

The distribution of the plasmids in enterococcus is quite different to the observed in other families. We find a lot of small mobilizable plasmids. The rest of the distribution is expected data, plasmids that start to be conjugatives after a certain size.

Size distribution per isolate source

full_meta %>% filter(!is.na(size)) %>% 
  filter(`Isolation source` != "NA") %>% 
  ggplot(aes(x = `Isolation source`, y = size, fill = `Isolation source`)) + 
  geom_violin() + 
  scale_fill_d3() + coord_flip() + scale_y_log10() + theme_bw()

As in the number of plasmids, we did not find big differences between the plasmids of the different sources of isolation.

Plasmid size distribution by plasmid cluster

full_meta %>% filter(!is.na(size)) %>% 
  filter(!is.na(plasmid_cluster_2 )) %>% 
  ggplot(aes(x = plasmid_cluster_2, y = size, fill = plasmid_cluster_2)) + 
  geom_boxplot() + 
  scale_fill_manual(values = cm_palette) + 
  coord_flip() + 
  scale_y_log10() + 
  theme_bw() + 
  theme(legend.position = "none")

Predicted mobilization.

full_meta %>% filter(!is.na(plasmid_cluster_2)) %>% 
  filter(!is.na(predicted_mobility)) %>% 
  ggplot(aes(x=plasmid_cluster_2, fill = predicted_mobility)) + geom_bar(position = "fill") + 
  coord_flip() + 
  scale_fill_d3() +
  theme_bw()

ggraph(net, layout =ly )+ 
  geom_mark_hull(aes(fill = plasmid_cluster_2, 
                     x= ly[,1], 
                     y = ly[,2],
                     label =plasmid_cluster_2 ),
                 label.margin = margin(0, 0, 0, 0, "mm"),
                 concavity = 3,
                 expand=0.005,
                 radius=0.001, 
                 label.fontsize = 9, show.legend = F)+
  scale_fill_manual(values = cm_palette)+
  new_scale_fill()+
  geom_edge_link(color = "grey") + 
  geom_node_point(aes(fill=predicted_mobility),color ="black",size=3, shape = 21)+
  scale_color_d3()+
  scale_fill_d3()+
  theme_void() +
  labs(title ="Predicted mobility by MOB-Typer")

We find some heterogenicity of mobilization in some plasmid clusters.

MOB typing distribution.

full_meta %>% 
  select(sample_id,`relaxase_type(s)`) %>% 
  separate_rows(`relaxase_type(s)`,sep =",") %>% 
  distinct() %>% 
  arrange(`relaxase_type(s)`) %>%
  group_by(sample_id) %>% 
  summarise(`relaxase_type(s)` = paste(`relaxase_type(s)`,sep = ",",collapse = ",")) %>%
  full_join(full_meta %>% 
              select(-`relaxase_type(s)`)) %>% 
  filter(!is.na(PANINI_cl)) %>%  
  ggplot(aes(x=plasmid_cluster_2, fill = `relaxase_type(s)`)) + 
  geom_bar(position = "fill") + 
  coord_flip() + 
  scale_fill_d3(palette = "category20") +
  theme_bw()
## Joining, by = "sample_id"

Spliting the multi-mobilization

full_meta %>% filter(!is.na(plasmid_cluster_2)) %>% 
  separate_rows(`relaxase_type(s)`, sep = ",") %>% 
  ggplot(aes(x=plasmid_cluster_2, fill = `relaxase_type(s)`)) + geom_bar(position = "fill") + 
  coord_flip() + 
  scale_fill_d3() +
  theme_bw()

full_meta %>% filter(!is.na(plasmid_cluster_2)) %>% 
  separate_rows(`relaxase_type(s)`, sep = ",") %>% 
  group_by(`relaxase_type(s)`) %>% 
  summarise(N = n()) %>% 
  ggplot(aes(x = `relaxase_type(s)`, y = N, fill =`relaxase_type(s)` )) + geom_col(stats = "identity") + 
  coord_flip() + 
  scale_fill_d3() +
  theme_bw()
## Warning: Ignoring unknown parameters: stats

  full_meta %>% filter(!is.na(plasmid_cluster_2)) %>% 
  separate_rows(`relaxase_type(s)`, sep = ",") %>% 
    group_by(sample_id) %>% 
    summarise(N_replicon = n()) %>% 
    ungroup() %>% 
    mutate(N_replicon = as.factor(N_replicon)) %>% 
    ggplot(aes(x = N_replicon)) + geom_bar(position = "stack") + 
  coord_flip() + 
  scale_fill_d3() +
  theme_bw()

We found several plasmids with multi-mobilization systems, although most contain a single system.

tmp <- full_meta %>% 
  separate_rows(`relaxase_type(s)`,sep =",") %>% 
  distinct() %>% 
  arrange(`relaxase_type(s)`) %>%
  group_by(sample_id) %>% 
  summarise(`relaxase_type(s)` = paste(`relaxase_type(s)`,sep = ",",collapse = ",")) %>%
  select(sample_id, relaxases = `relaxase_type(s)`)


  net %>% 
    inner_join(tmp, by = c("name" = "sample_id")) %>% 
    ggraph(layout =ly )+ 
    geom_mark_hull(aes(fill = plasmid_cluster_2, 
                     x= ly[,1], 
                     y = ly[,2],
                     label =plasmid_cluster_2 ),
                 label.margin = margin(0, 0, 0, 0, "mm"),
                 concavity = 3,
                 expand=0.005,
                 radius=0.001, 
                 label.fontsize = 9, show.legend = F)+
  scale_fill_manual(values = cm_palette)+
  new_scale_fill()+
  geom_edge_link(color = "grey") + 
  geom_node_point(aes(fill=relaxases),color ="black",size=3, shape = 21)+
  scale_color_d3(palette = "category20") +
  scale_fill_d3(palette = "category20")+
  theme_void() +
  labs(title = "Mobilization system")

We can represent the frequency of each replicon in each plasmid cluster.

full_meta %>% 
  filter(!is.na(plasmid_cluster_2)) %>% 
  group_by(plasmid_cluster_2) %>% 
  mutate(plasmid_cluster_size = n()) %>% 
  ungroup() %>% 
  separate_rows(`relaxase_type(s)`, sep = ",") %>% 
  distinct() %>% 
  group_by(`relaxase_type(s)`,plasmid_cluster_2,plasmid_cluster_size) %>% 
  summarise(N = n()) %>% 
  ungroup() %>% 
  mutate(Freq = 100*N/plasmid_cluster_size) %>% 
  ggplot(aes(y=plasmid_cluster_2, x =`relaxase_type(s)`, fill=Freq)) + geom_tile() + 
  scale_fill_material(palette = "red") +
  geom_text(aes(label = sprintf("%0.1f", round(Freq, digits = 2)))) +
  theme_bw() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'relaxase_type(s)', 'plasmid_cluster_2'. You can override using the `.groups` argument.

REP typing (multi-replicon splitted)

We repeat the previous analysis but with the Replication system.

full_meta %>% 
  filter(!is.na(plasmid_cluster_2)) %>% 
  group_by(plasmid_cluster_2) %>% 
  mutate(plasmid_cluster_size = n()) %>% 
  ungroup() %>% 
  separate_rows(`rep_type(s)`, sep = ",") %>% 
  group_by(`rep_type(s)`,plasmid_cluster_2,plasmid_cluster_size) %>% 
  summarise(N = n()) %>% ungroup() %>% 
  mutate(Freq = 100*N/plasmid_cluster_size) %>% 
  filter(`rep_type(s)` != "-") %>% 
  ggplot(aes(x=plasmid_cluster_2, y =`rep_type(s)`, fill=Freq)) + 
  geom_tile() + 
  scale_fill_material(palette = "red") +
  geom_text(aes(label = sprintf("%0.1f", round(Freq, digits = 2)))) +
  theme_bw() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'rep_type(s)', 'plasmid_cluster_2'. You can override using the `.groups` argument.

Here the number plasmid with multi-replicon is higher

  full_meta %>% filter(!is.na(plasmid_cluster_2)) %>% 
  separate_rows(`rep_type(s)`, sep = ",") %>% 
    group_by(sample_id) %>% 
    summarise(N_replicon = n()) %>% 
    ungroup() %>% 
    mutate(N_replicon = as.factor(N_replicon)) %>% 
    ggplot(aes(x = N_replicon)) + geom_bar(position = "stack") + 
  coord_flip() + 
  scale_fill_manual(values = cm_palette) +
  theme_bw()

full_meta %>% filter(!is.na(plasmid_cluster_2)) %>% 
  separate_rows(`rep_type(s)`, sep = ",") %>% 
  group_by(`rep_type(s)`) %>% 
  summarise(N = n()) %>% 
  ggplot(aes(x = `rep_type(s)`, y = N, fill =`rep_type(s)` )) + 
  geom_col(stats = "identity", show.legend = F) + 
  scale_fill_manual(values = cm_palette) +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
## Warning: Ignoring unknown parameters: stats

Although we observed a great variability of replicons, we found some predominant replicons

Host range

Based on the predictions made with MOB-Typer we can represent the host-range of the plasmids.

net %>% activate(nodes) %>% 
  mutate(predicted_host_range_overall_rank = ifelse(is.na(predicted_host_range_overall_rank),"-",predicted_host_range_overall_rank)) %>% 
ggraph(layout =ly )+ 
  geom_mark_hull(aes(fill = plasmid_cluster_2, 
                     x= ly[,1], 
                     y = ly[,2],
                     label =plasmid_cluster_2 ),
                 label.margin = margin(0, 0, 0, 0, "mm"),
                 concavity = 3,
                 expand=0.005,
                 radius=0.001, 
                 label.fontsize = 9, show.legend = F)+
  scale_fill_manual(values = cm_palette)+
  new_scale_fill()+
  geom_edge_link(color = "grey") + 
  geom_node_point(aes(fill=predicted_host_range_overall_rank),color ="black",size=3, shape = 21)+
  scale_color_d3()+
  scale_fill_d3()+
  theme_void() +
  labs(title ="Plasmid Host-Range")

net %>% activate(nodes) %>% 
  mutate(predicted_host_range_overall_name = ifelse(is.na(predicted_host_range_overall_name),"-",predicted_host_range_overall_name)) %>% 
ggraph(layout =ly )+ 
  geom_mark_hull(aes(fill = plasmid_cluster_2, 
                     x= ly[,1], 
                     y = ly[,2],
                     label =plasmid_cluster_2 ),
                 label.margin = margin(0, 0, 0, 0, "mm"),
                 concavity = 3,
                 expand=0.005,
                 radius=0.001, 
                 label.fontsize = 9, show.legend = F)+
  scale_fill_manual(values = cm_palette)+
  new_scale_fill()+
  geom_edge_link(color = "grey") + 
  geom_node_point(aes(fill=predicted_host_range_overall_name),color ="black",size=3, shape = 21)+
  scale_color_d3()+
  scale_fill_manual(values = cm_palette)+
  theme_void() + 
  guides(fill=guide_legend(ncol=1)) +
  labs(title ="Plasmid Host-Range")

full_meta %>% filter(!is.na(plasmid_cluster_2)) %>% 
  group_by(predicted_host_range_overall_name,predicted_host_range_overall_rank) %>% 
  summarise(N = n()) %>% 
  ggplot(aes(fill = predicted_host_range_overall_name, y = N, x =predicted_host_range_overall_rank)) + 
  geom_col(stats = "identity", show.legend = T) + 
  scale_fill_manual(values = cm_palette) +
  guides(fill=guide_legend(ncol=1)) +
  coord_flip()+
  theme_bw() 
## `summarise()` has grouped output by 'predicted_host_range_overall_name'. You can override using the `.groups` argument.
## Warning: Ignoring unknown parameters: stats

Marker genes annotation (Virulence Factors, AbR and Biocide and Metals)

We search Virulence Factors, AbR and Biocide and metals resistance genes. We use 90% identity threshold to filter the results.

annot <- pato::annotate(mm_ffn, type = "nucl", database = c("AbR","VF_A","bacmet"))
annot_filtered <- annot %>% filter(pident > 0.90) %>% group_by(Genome,Protein) %>% slice_max(pident,with_ties = F)

Using the annotation of marker genes (AbR, VF and B&M) we show the frequency of each marker gene in each plasmid cluster.

pl_cl_2_size <- full_meta %>% group_by(plasmid_cluster_2) %>% summarise(plasmid_cluster_size_2 = n())

annot_filtered %>% 
  ungroup() %>% 
  select(Genome,Gene,DataBase) %>% 
  distinct() %>% 
  inner_join(full_meta %>% 
               select(sample_id, plasmid_cluster_2) %>% 
               distinct(), 
             by = c("Genome" = "sample_id")) %>% 
  select(Genome,plasmid_cluster_2,Gene,DataBase) %>% 
  distinct() %>% 
  group_by(Gene,plasmid_cluster_2,DataBase) %>%
  summarise(N = n()) %>% 
  ungroup() %>% 
  inner_join(pl_cl_2_size) %>% 
  mutate(Freq = 100*N/plasmid_cluster_size_2) %>%
  filter(plasmid_cluster_2 != "other") %>% 
  ggplot(aes(x=Gene, y = plasmid_cluster_2, fill = Freq, label = round(Freq))) + 
  geom_tile() + 
  geom_text() +
  scale_fill_material(palette = "red") +
  facet_grid(~DataBase, scales = "free_x", space = "free_x")+
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## `summarise()` has grouped output by 'Gene', 'plasmid_cluster_2'. You can override using the `.groups` argument.
## Joining, by = "plasmid_cluster_2"