Neurogenic

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.0.4

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.0.5

## Warning: package 'tibble' was built under R version 4.0.5

## Warning: package 'tidyr' was built under R version 4.0.5

## Warning: package 'readr' was built under R version 4.0.5

## Warning: package 'dplyr' was built under R version 4.0.5

## Warning: package 'forcats' was built under R version 4.0.3

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(qiime2R)
library(ggsignif)

## Warning: package 'ggsignif' was built under R version 4.0.5

# setwd("~/oldserver/neurogenic/")
setwd("/pita/users/rotem/neurogenic/")
metadata <- "neurogenic_table_with_reads_10Feb22.txt" %>% read_tsv()

## New names:
## • `LYMPHO` -> `LYMPHO...22`
## • `LYMPHO` -> `LYMPHO...23`
## • `` -> `...34`

## Rows: 105 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (18): sampleid, number, name, m/f, birth, date of injury, ASIA, weight, ...
## dbl (14): id_suf, age, BMI, number sample, days from accident, HGB, NEUTRO, ...
## lgl  (2): LYMPHO...22, ...34
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

metadata <- "neurogenic_table_with_reads_19Sep22.csv" %>% read_csv()

## New names:
## Rows: 105 Columns: 34
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (20): sampleid, number, name, m/f, birth, date of injury, para/tetra, AS... dbl
## (13): id_suf, age, number sample, days from accident, HGB, NEUTRO, LYMPH... lgl
## (1): LYMPHO...23
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `LYMPHO` -> `LYMPHO...23`
## • `LYMPHO` -> `LYMPHO...24`

metadata <- metadata %>% filter(number != "37")

metadata$group <- metadata$group %>% factor(levels = 
                                              c("Ambulatoric","Hospitalized", "healthy"))

Cohort

metadata %>% group_by(number) %>%
  # mutate(times = row_number()) %>%
  filter(group == "Hospitalized") %>% inner_join(
    data.frame(qiime2R::read_qza("cmr/faith_pd_vector.qza")$data) %>% 
    rename(Faith = faith_pd) %>% rownames_to_column("sampleid")) %>% 
  group_by(number) %>% mutate(delta_Faith = Faith - lag(Faith)) %>% 
  # filter(number != 8) %>% 
  ggplot(aes(x = `days from accident`, y = fct_reorder(number, Faith))) + 
  geom_line(color = "black") +
  geom_point(size = 4) +
  scale_color_gradient2(low="red", high="blue", mid = "grey") +
  # facet_wrap(~number, ncol = 3)
  theme_classic()  + 
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
  labs(y = "Patient", x = "Days since accident", col = "Faith")

## Joining, by = "sampleid"

Alpha Diversity

# metadata <- "neurogenic_table_with_reads_21Dec21.tsv" %>% read_tsv()
# metadata <- "neurogenic_table_with_reads_24feb22.xlsx" %>% readxl::read_xlsx(1)
# metadata <- "neurogenic_table_with_reads_24feb22.csv" %>% read_csv()
metadata <- metadata %>% filter(number != "37")
data <- qiime2R::read_qza("neurogenic_table.qza")$data
taxonomy <- read_qza("../../../pub/data/16S_DBs/processed/16S_DB1-19_merged/taxonomy.qza")$data %>% data.frame()
shannon <- vegan::diversity(data, MARGIN = 2, index = "shannon") %>% data.frame()

pl <- 
metadata %>% inner_join(
  data.frame(qiime2R::read_qza("cmr2/faith_pd_vector.qza")$data) %>% rownames_to_column("sampleid")) %>% 
  group_by(number) %>% filter(sampleid == last(sampleid)) %>%
  # group_by(number) %>% filter(sampleid == first(sampleid) | sampleid == last(sampleid)) %>%
  # group_by(number) %>% mutate(fl = case_when(sampleid == first(sampleid) ~ "first", sampleid== last(sampleid) ~ "last" )) %>%
  # mutate(group = paste(group, fl)) %>% 
  ggplot(aes(x = group, y = faith_pd, fill = group)) + 
  geom_boxplot(show.legend = F, outlier.alpha = 0) +
  geom_dotplot(binaxis = "y", stackdir = "center", alpha = .3) +
  ggsignif::geom_signif(comparisons = list(c("Hospitalized", "Ambulatoric"), c("healthy", "Hospitalized"), c("Ambulatoric", "healthy")), step_increase = .1, map_signif_level = F) +
  # ggsignif::geom_signif(comparisons = list(c("Hospitalized first", "Hospitalized last"), c("healthy first", "Hospitalized last"), c("Ambulatoric first", "Hospitalized last")), step_increase = .1) +
  theme_classic() + 
  theme(legend.position = "none") +ggtitle("All Hospitalized")

## Joining, by = "sampleid"

pl

## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

# ggsave("faith.png", pl, height = 14, width = 8, units = "cm")

Disease index

# pl <- 
metadata %>% rename(SampleID = sampleid) %>% 
  filter(!(group %>% is.na())) %>% 
  inner_join(read_csv("NG_unidi.csv")) %>% 
  group_by(number) %>% filter(SampleID == last(SampleID)) %>%
  # group_by(number) %>% filter(SampleID == first(SampleID) | SampleID == last(SampleID)) %>%
  # group_by(number) %>% mutate(fl = case_when(SampleID == first(SampleID) ~ "first", SampleID == last(SampleID) ~ "last" )) %>%
  # mutate(group = paste(group, fl)) %>%
  # filter(group != "NG NA") %>% 
  ggplot(aes(x = group, y = UniDI, fill = group)) +
  geom_boxplot(width = .3, show.legend = F, outlier.alpha = 0) +
  geom_dotplot(binaxis = "y", stackdir = "center", alpha = .3) +
  ggsignif::geom_signif(comparisons = list(c("Hospitalized", "Ambulatoric"), c("healthy", "Hospitalized"), c("Ambulatoric", "healthy")), step_increase = .1, map_signif_level = F) +
  # ggsignif::geom_signif(comparisons = mycompr, step_increase = .1, map_signif_level = T) +
  theme_classic() + 
  theme(legend.position = "none") + ggtitle("All Hospitalized")

## Rows: 102 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): SampleID
## dbl (1): UniDI
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining, by = "SampleID"
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

pl

## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

ggsave("UniDi.png", pl, height = 14, width = 8, units = "cm")

## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Taxa bar plot

## tbp  
# metadata <- "neurogenic_table_with_reads_24feb22.csv" %>% read_csv()
metadata <- metadata %>% filter(number != "37")

tbp_table <- read_csv("tbp.tsv/level-2.csv")

## Rows: 105 Columns: 51
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (23): index, number, name, m.f, birth, accident.date, spinal, ASIA, weig...
## dbl (27): k__Archaea;p__Euryarchaeota, k__Bacteria;__, k__Bacteria;p__Actino...
## lgl  (1): LYMPHO
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

tbp_table$group[tbp_table$group  == "NG"] <- "Hospitalized"
tbp_table$group[tbp_table$group  == "NG_clinic"] <- "Ambulatoric"
tbp_table$group <- tbp_table$group %>% factor(levels = c("Ambulatoric","Hospitalized", "healthy"))
tbp_table <- tbp_table %>% 
  left_join(metadata) %>% 
  pivot_longer(cols = contains("k__"), names_to = "Taxon") %>% 
  mutate(Taxon = str_remove(Taxon, "k__Bacteria;")) %>% 
  group_by(number, Taxon) %>% summarise(reads = last(value), group = first(group)) %>% 
  filter(reads > 10)

## Joining, by = c("number", "name", "id_suf", "age", "birth", "ASIA", "weight",
## "BMI", "supplements", "diet", "smoker", "HGB", "NEUTRO", "MONO", "glucose",
## "albumin", "CRP", "sample_code", "reads_number", "group")
## `summarise()` has grouped output by 'number'. You can override using the
## `.groups` argument.

tbp_table <- tbp_table %>% group_by(number) %>% mutate(RA = reads/sum(reads)) %>% ungroup()


low_species <- tbp_table %>% group_by(Taxon) %>% summarise(reads = sum(reads)) %>% arrange(reads) %>% slice_head(n = 8) %>% pull(Taxon)
high_species <- tbp_table %>% group_by(Taxon) %>% summarise(reads = sum(reads)) %>% arrange(desc(reads)) %>% slice_head(n = 4) %>% pull(Taxon)
for(species in low_species) tbp_table <- tbp_table %>% mutate(Taxon = na_if(Taxon, species))

tbp_table %>% 
  # mutate(Taxon = replace_na(Taxon, "Others")) %>%
  # ggplot(aes(x = number, y = reads, fill = Taxon)) + geom_col(position = "fill") + 
  ggplot(aes(x = number, y = RA, fill = Taxon)) + geom_col(position = "fill") + 
  facet_grid(~group, scales = "free_x", space = "free_x") + 
  ylab("") + theme_void() + theme(axis.text.x = element_blank(), strip.text.x = element_text(size = 12))

tbp_table %>% filter(Taxon == "p__Bacteroidetes" | Taxon == "p__Firmicutes") %>% 
  pivot_wider(id_cols = c(number, group), names_from = "Taxon", values_from = "RA") %>% 
  mutate(FB_ratio = p__Firmicutes/p__Bacteroidetes) %>% 
  ggplot() + aes(x = group, y = FB_ratio, color = group) + 
  geom_boxplot() + 
  ggsignif::geom_signif(comparisons = list(
    c("Hospitalized", "Ambulatoric")
  , c("healthy", "Hospitalized")
  , c("Ambulatoric", "healthy")), step_increase = .1) + 
  theme_classic()

## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1 rows containing non-finite values (stat_signif).

tbp_table %>% filter(Taxon %in% high_species) %>% 
  ggplot() + aes(x = group, y = RA, color = group) + 
  geom_boxplot() + geom_jitter(alpha = .5) + 
  facet_wrap(~Taxon, scales = "free_y") +
  ggsignif::geom_signif(comparisons = 
                list(c("Hospitalized", "Ambulatoric")
                   , c("healthy", "Hospitalized")
                   , c("Ambulatoric", "healthy")), step_increase = .1, map_signif_level = F) +
  theme_classic() + theme(strip.background = element_blank())

Alpha first and last

# tbp_table %>% group_by(Taxon) %>% summarise(cor.test())

library(ggpubr)

## 
## Attaching package: 'ggpubr'

## The following object is masked from 'package:qiime2R':
## 
##     mean_sd

tbl <- 
metadata %>% inner_join(
  data.frame(qiime2R::read_qza("cmr2/faith_pd_vector.qza")$data) %>% 
    rownames_to_column("sampleid")) %>% 
  # group_by(number) %>% filter(sampleid == last(sampleid)) %>%
  # group_by(number) %>% filter(sampleid == first(sampleid) | sampleid == last(sampleid)) %>%
  group_by(number) %>% mutate(fl = case_when(sampleid == first(sampleid) ~ "first", sampleid== last(sampleid) ~ "last" )) %>%
  mutate(group = paste(group, fl)) %>% 
  filter(group != "Hospitalized NA")

## Joining, by = "sampleid"

pl <- tbl %>% ggplot(aes(x = group, y = faith_pd, fill = group)) + 
  geom_boxplot(show.legend = F, outlier.alpha = 0) +
  geom_dotplot(binaxis = "y", stackdir = "center", alpha = .3)

mycompr <- list(  c("healthy first", "Ambulatoric first")
                , c("healthy first", "Hospitalized last")
                , c("healthy first", "Hospitalized first")
                , c("Hospitalized first", "Hospitalized last")
                , c("Hospitalized first", "Ambulatoric first")
                , c("Hospitalized last" , "Ambulatoric first")
                )
pl + stat_compare_means(paired = T)

## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

pl +
  # ggsignif::geom_signif(comparisons = list(c("Hospitalized", "Ambulatoric"), c("healthy", "Hospitalized"), c("Ambulatoric", "healthy")), step_increase = .1) +
  ggsignif::geom_signif(comparisons = mycompr, step_increase = .1) +
  theme_classic() +
  theme(legend.position = "none")

## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

df <- data.frame(num_ASV = c(37, 71, 30, 20), healthy = c(T,T,F,F), hospitalized = c(T,F,T,F))
df %>% ggplot(aes(x = healthy, y = num_ASV, fill = hospitalized)) + geom_col() + 
  labs(x = "Higher in healthy", y = "number of ASV", fill = "Higher in hospitalized") + 
  theme_classic()

df %>% gt::gt_preview()

	num_ASV	healthy	hospitalized
1	37	TRUE	TRUE
2	71	TRUE	FALSE
3	30	FALSE	TRUE
4	20	FALSE	FALSE

tbp_table_RA_fill0 <- tbp_table %>% 
  filter(!is.na(Taxon)) %>% 
  # group_by(number) %>% mutate(RA = reads/sum(reads)) %>% ungroup() %>% 
  pivot_wider(id_cols = Taxon, names_from = number, values_from = RA, values_fill = 0) %>% 
  pivot_longer(cols = -1, names_to = "number", values_to = "RA") %>%
    left_join(tbp_table %>% select(number, group)) %>%  distinct(number, Taxon, group, RA)


library(rstatix)
tbp_table_RA_fill0 %>% group_by(Taxon) %>% 
  t_test(RA ~ group) %>% adjust_pvalue(method = "BH") %>% add_significance() %>% 
  gt::gt_preview()

species_signif <- tbp_table_RA_fill0 %>% group_by(Taxon)  %>% group_by(Taxon) %>% 
  t_test(RA ~ group) %>% adjust_pvalue(method = "BH") %>%  filter(p.adj <.05) %>% pull(Taxon) %>% unique()
for(taxon in high_species){
  tbp_table_RA_fill0 %>% filter(Taxon == taxon) %>% 
    ggplot(aes(x = group, y = RA, fill = group)) + geom_boxplot() + 
    geom_jitter()+
    ggsignif::geom_signif(comparisons = list(c("Hospitalized", "Ambulatoric"), c("healthy", "Hospitalized"), c("healthy", "Ambulatoric")),
                step_increase = .1, map_signif_level = F)+ theme_classic() + ggtitle(taxon)
  ggsave(paste0(taxon, ".png"), height = 4)
}

tbp_table_RA_fill0 %>% 
  # filter(Taxon %in% species_signif) %>% 
  ggplot(aes(x = group, y = RA, fill = group)) + 
  geom_jitter(aes(color = group),alpha = .4) +
  geom_boxplot(  outlier.size = 0) + 
  ggsignif::geom_signif(comparisons = list(c("Hospitalized", "Ambulatoric"), c("healthy", "Hospitalized"), c("healthy", "Ambulatoric")),
                        step_increase = .1, map_signif_level = F)+ theme_classic() +
  facet_wrap(~Taxon, scales = "free_y")
ggsave("Taxons_level.png", height = 8, width = 8)

metadata %>% distinct(number, .keep_all = T) %>% 
    group_by(group) %>% 
    summarise(Age = mean(age), Size = n(), Males = sum(str_count(`m/f`, "m"))) %>%
    mutate(`%` = Males/Size*100)

## # A tibble: 3 × 5
##   group          Age  Size Males   `%`
##   <fct>        <dbl> <int> <int> <dbl>
## 1 Ambulatoric   41.1    19    16  84.2
## 2 Hospitalized  45.7    12    12 100  
## 3 healthy       39.6    32    30  93.8

  pl <- tbp_table %>% filter(Taxon %in%  high_species) %>% 
    ggplot(aes(x = group, y = RA, fill = group)) + 
    geom_boxplot() + geom_jitter(alpha = .7)+
    ggsignif::geom_signif(comparisons = 
                            list(  c("Hospitalized", "Ambulatoric")
                                 , c("healthy", "Hospitalized")
                                 , c("healthy", "Ambulatoric")),
                step_increase = .1, map_signif_level = F)+ theme_classic() + 
    facet_wrap(~Taxon) 
  pl

  ggsave("4_highs.png", pl, height = 5)

## Saving 7 x 5 in image

norm_data <- 
data %>% data.frame() %>% rownames_to_column("Feature.ID") %>% pivot_longer(-1) %>% 
  group_by(name) %>% mutate(RA = value/sum(value)) %>% select(-value) %>% 
  pivot_wider(id_cols = Feature.ID, names_from = name, values_from = RA)

NG_ids <- metadata %>% filter(group == "Hospitalized") %>% pull(sampleid)
hel_ids <- metadata %>% filter(group == "healthy") %>% pull(sampleid)



at <- 
read_csv("Corelation_with_time_from_accident.csv") %>% rename(Feature.ID = 1) %>% 
  filter(`_calour_direction` == 'Anti-days from accident') %>% left_join(norm_data)

## New names:
## Rows: 47 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): ...1, _feature_id, _calour_direction dbl (3): _calour_stat, _calour_pval,
## _calour_qval
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining, by = "Feature.ID"
## • `` -> `...1`

dd <- 
  read_csv("Corelation_with_time_from_accident.csv") %>% rename(Feature.ID = 1) %>% 
  filter(`_calour_direction` != 'Anti-days from accident') %>% left_join(norm_data)

## New names:
## Rows: 47 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): ...1, _feature_id, _calour_direction dbl (3): _calour_stat, _calour_pval,
## _calour_qval
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining, by = "Feature.ID"
## • `` -> `...1`

at <- at %>% select(Feature.ID, contains("calour")) %>% 
  cbind(data.frame(Ambulatoric = at %>% select(NG_ids) %>% rowMeans())) %>% 
  cbind(data.frame(Healthy = at %>% select(hel_ids) %>% rowMeans()))

## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(NG_ids)` instead of `NG_ids` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(hel_ids)` instead of `hel_ids` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

dd <- dd %>% select(Feature.ID, contains("calour")) %>% 
  cbind(data.frame(Ambulatoric = dd %>% select(NG_ids) %>% rowMeans())) %>% 
  cbind(data.frame(Healthy = dd %>% select(hel_ids) %>% rowMeans()))


# for(species in low_species) tbp_table <- tbp_table %>% mutate(Taxon = na_if(Taxon, species))


amnon <- at %>% rbind(dd) %>% 
  left_join(taxonomy) %>% 
  separate(Taxon, sep = "; ", into = c("K", "Phylum", "C", "O","F","G", "S"))

## Joining, by = "Feature.ID"

## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 7 rows [15, 23,
## 27, 31, 38, 39, 47].

amnon <- 
  amnon %>% 
  mutate(`_calour_direction` = 
         recode(`_calour_direction`, "Anti-days from accident"= "Decrease over time")) %>% 
  mutate(direction = 
         recode(`_calour_direction`, "days from accident"= "Increase over time"))

bb <- data.frame(line = c(0, .05), direction = NA, Phylum = NA)

amnon %>% 
  ggplot(aes(x = Healthy, y = Ambulatoric
             , shape = direction)) + 
  stat_ellipse(aes(fill = direction), geom = "polygon", alpha = .3) + 
  geom_point(aes(color = Phylum), size = 4, stroke = 3) + 
  scale_shape_manual(values = c(25, 24)) + 
  scale_color_manual(values = c("#F8766D", "#A3A500", "#39B600", "red","#00B0F6","blue")) +
  scale_x_continuous(trans = "log10", breaks = c(10^-5,10^-3,10^-1)) + 
  scale_y_continuous(trans = "log10", breaks = c(10^-5,10^-3,10^-1)) +
  # scale_fill_manual(values  = c("#F8766D", "#A3A500", "#39B600", "red","#00B0F6","blue")) +
  theme_classic() + 
  labs(x = ~"Average abundance in "*underline(healthy)*" group (log scale)"
     , y = ~"Average abundance in "*underline(ambulatoric)*" group (log scale)"
     , shape = ~"Correlation with time\nfrom accident in\nHospitalized group"
     , fill  = ~"Correlation with time\nfrom accident in\nHospitalized group") + 
  geom_line(data = bb, aes(x = line, y = line), linetype = 2, color = "grey") + 
   ggtitle("Bacetria species that corelated with time from accident \nare higher in the ambulatoric group and vice versa") + 
  guides(fill = guide_legend(order = 1), shape = guide_legend(order = 1))

amnon %>% 
  ggplot(aes(x = Healthy, y = Ambulatoric, shape = direction)) + 
  geom_smooth(aes(fill = direction), linetype = "blank", span = 100) +
  geom_point(aes(color = Phylum, fill = direction)
             , size = 4, stroke = 1) + 
  scale_shape_manual(values = c(25, 24)) +
  scale_x_continuous(trans = "log10", breaks = c(10^-5,10^-3,10^-1)) + 
  scale_y_continuous(trans = "log10", breaks = c(10^-5,10^-3,10^-1)) +
  theme_classic() + 
  labs(x = ~"Average abundance in "*underline(healthy)*" group (log scale)"
     , y = ~"Average abundance in "*underline(ambulatoric)*" group (log scale)"
     , shape = ~"Correlation with time\nfrom accident in\nHospitalized group"
     , fill  = ~"Correlation with time\nfrom accident in\nHospitalized group"
     , color  = ~"Phylum") + 
  geom_line(data = bb, aes(x = line, y = line), linetype = 3, color = "black") + 
   ggtitle("Bacetria species that corelated with time from accident \nare higher in the ambulatoric group and vice versa") +
  scale_color_manual(values  = c("#F8766D", "#A3A500", "#39B600", "red","#00B0F6","blue")) + 
  guides(fill = guide_legend(order = 1), shape = guide_legend(order = 1)) + 
  theme(legend.key.size = unit(.1,"native"))

amnon %>% pivot_longer(cols = c(Ambulatoric, Healthy)) %>% 
  ggplot(aes(x = name, y = value %>% log())) + 
  geom_boxplot(outlier.alpha =0) + 
  facet_wrap(~direction, scales = "free") + 
  geom_jitter(aes(color = `_calour_qval`)) + 
  geom_signif(comparisons = list(c("Ambulatoric", "Healthy")))

## Warning: Removed 8 rows containing non-finite values (stat_boxplot).

## Warning: Removed 8 rows containing non-finite values (stat_signif).

## Warning in wilcox.test.default(c(-8.98776548840483, -5.03565510067515,
## -7.03484848840462, : cannot compute exact p-value with ties

## Warning in wilcox.test.default(c(-9.28901687668794, -7.21594866474404,
## -10.124139145837, : cannot compute exact p-value with ties

## Warning: Removed 8 rows containing missing values (geom_point).

taxonomy$Taxon[1]

## [1] "k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Streptophyta; f__; g__; s__"

norm_data %>% left_join(taxonomy) %>% 
  separate(Taxon, sep = "; ", into = c("k", "p", "c", "o","f","g","s")) %>% 
  pivot_longer(contains("DB"), names_to = "sampleid", values_to = "RA") %>% 
  left_join(metadata) %>% 
  # filter(g == "g__Alistipes" | g == "g__Anaerotruncus") %>%
  # filter(g == "g__Alistipes") %>% 
  filter( g == "g__Roseburia") %>% 
  filter(!is.na(group)) %>% 
  ggplot(aes(color = group, x = group, y = RA)) + geom_boxplot() + 
  geom_jitter() + 
  scale_y_continuous(trans = "log10") +
      ggsignif::geom_signif(comparisons = 
                            list(  c("Hospitalized", "Ambulatoric")
                                 , c("healthy", "Hospitalized")
                                 , c("healthy", "Ambulatoric")),
                step_increase = .1, map_signif_level = F) + 
  ggtitle("g__Megamonas") + 
  theme_classic()

## Joining, by = "Feature.ID"

## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 226 rows [9,
## 10, 11, 12, 13, 16, 17, 19, 20, 51, 79, 81, 122, 142, 143, 221, 273, 290, 300,
## 309, ...].

## Joining, by = "sampleid"

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

## Warning: Removed 1796 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1796 rows containing non-finite values (stat_signif).

## Warning: Removed 1796 rows containing missing values (geom_point).

# Maaslin2::Maaslin2(norm_data, metadata, output = "mmaslin", reference = )

"g__Alistipes"

## [1] "g__Alistipes"

"g__Anaerotruncus"

## [1] "g__Anaerotruncus"

"g__Roseburia"

## [1] "g__Roseburia"

"g__Megamonas"

## [1] "g__Megamonas"

taxonomy$Taxon[taxonomy$Taxon %>% str_detect("Megamonas")]

##  [1] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
##  [2] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
##  [3] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
##  [4] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
##  [5] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
##  [6] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
##  [7] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
##  [8] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
##  [9] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
## [10] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
## [11] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
## [12] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
## [13] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__hypermegale"
## [14] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
## [15] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__hypermegale"
## [16] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__hypermegale"
## [17] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"           
## [18] "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Megamonas; s__"

ggsignif::geom_signif()

## geom_signif: na.rm = FALSE, extend_line = 0, parse = FALSE, orientation = NA
## stat_signif: na.rm = FALSE, comparisons = NULL, test = wilcox.test, test.args = NULL, annotations = NULL, map_signif_level = FALSE, y_position = NULL, xmin = NULL, xmax = NULL, margin_top = 0.05, step_increase = 0, tip_length = 0.03, manual = FALSE, orientation = NA
## position_identity