setwd("C:/Users/ROSFRB/Desktop/Daf")

#Libraries

# Packages: install + load 
# install missing CRAN packages
install_cran_if_missing <- function(pkgs) {
  missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
  if (length(missing)) install.packages(missing, dependencies = TRUE)
}

# Helper: install missing Bioconductor packages
install_bioc_if_missing <- function(pkgs) {
  if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
  bioc_missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
  if (length(bioc_missing)) BiocManager::install(bioc_missing, ask = FALSE, update = TRUE)
}

# CRAN packages used in the Rmd
cran_pkgs <- c(
  "tidyverse",   # dplyr, tidyr, readr, tibble, ggplot2, etc.
  "ggpubr",
  "patchwork",
  "vegan",
  "pheatmap",
  "ggnewscale",  # optional (separate color scales on same plot)
  "ape"          # optional (trees for UniFrac / Faith's PD)
)

# Bioconductor packages
bioc_pkgs <- c(
  "phyloseq",
  "DESeq2",
  "apeglm"       # LFC shrinkage
)

# Install what’s missing
install_cran_if_missing(cran_pkgs)
install_bioc_if_missing(bioc_pkgs)

# Load (quietly)
suppressPackageStartupMessages({
  library(tidyverse)
  library(ggpubr)
  library(patchwork)
  library(vegan)
  library(pheatmap)
  library(ggnewscale)  
  library(ape)         
  library(phyloseq)
  library(DESeq2)
  library(apeglm)
})
## Warning: package 'tidyverse' was built under R version 4.5.1
## Warning: package 'ggpubr' was built under R version 4.5.1
## Warning: package 'patchwork' was built under R version 4.5.1
## Warning: package 'vegan' was built under R version 4.5.1
## Warning: package 'permute' was built under R version 4.5.1
## Warning: package 'pheatmap' was built under R version 4.5.1
## Warning: package 'ggnewscale' was built under R version 4.5.1
## Warning: package 'ape' was built under R version 4.5.1
## Warning: package 'generics' was built under R version 4.5.1
## Warning: package 'matrixStats' was built under R version 4.5.1
# Theme 
theme_set(theme_bw(base_size = 12))

# Quick versions readout 
cat("Loaded package versions:\n")
## Loaded package versions:
for (p in c("phyloseq","DESeq2","apeglm","vegan","ggplot2","dplyr","tidyr","patchwork","ggpubr","pheatmap")) {
  if (requireNamespace(p, quietly = TRUE)) {
    cat(sprintf("  %s: %s\n", p, as.character(utils::packageVersion(p))))
  }
}
##   phyloseq: 1.52.0
##   DESeq2: 1.48.1
##   apeglm: 1.30.0
##   vegan: 2.7.1
##   ggplot2: 3.5.2
##   dplyr: 1.1.4
##   tidyr: 1.3.1
##   patchwork: 1.3.2
##   ggpubr: 0.6.1
##   pheatmap: 1.0.13

#Establishing a colour library - Not rainbow!

# ---- Earthy palette (15 colors) ----
pal_earth15 <- c(
  "#2F5D50", # pine
  "#6B8F71", # sage
  "#8C9C68", # olive
  "#4E7A48", # moss
  "#C9A227", # ochre
  "#B98E5A", # tan
  "#D8C7A0", # sand
  "#C46D3B", # terracotta
  "#B56969", # clay
  "#8C4A2F", # rust
  "#7D5A4F", # umber
  "#5B7065", # slate
  "#2F6F72", # deep teal
  "#3F4E6B", # blue slate
  "#50514F"  # stone
)

# Convenience ggplot2 scales
scale_fill_earth15  <- function(...) ggplot2::scale_fill_manual(values = pal_earth15, ...)
scale_color_earth15 <- function(...) ggplot2::scale_color_manual(values = pal_earth15, ...)

Build phyloseq from TSV files

library(phyloseq)
# install.packages("BiocManager"); BiocManager::install("phyloseq")
# file paths 
otu_path  <- "otu_table.tsv"
tax_path  <- "taxonomy.tsv"
meta_path <- "metadata.tsv"

# Read tables
otu <- read.delim(otu_path, check.names = FALSE, row.names = 1)
tax <- read.delim(tax_path, check.names = FALSE, row.names = 1)
meta <- read.delim(meta_path, check.names = FALSE)

# Basic hygiene
# Ensure counts are non-negative integers
otu_mat <- as.matrix(otu)
mode(otu_mat) <- "numeric"
otu_mat[is.na(otu_mat)] <- 0
otu_mat <- round(otu_mat)  

# Ensure taxonomy is matrix with OTU rows
tax_mat <- as.matrix(tax)

# Ensure metadata rownames = SampleID
stopifnot("SampleID" %in% colnames(meta))
rownames(meta) <- meta$SampleID
meta$SampleID <- NULL
sam <- sample_data(meta)

# 3) Align IDs
# OTU rownames should match taxonomy rownames
if (!all(rownames(otu_mat) %in% rownames(tax_mat))) {
  warning("Some OTUs in OTU table are missing from taxonomy; subsetting to intersection.")
}
common_otus <- intersect(rownames(otu_mat), rownames(tax_mat))
otu_mat <- otu_mat[common_otus, , drop = FALSE]
tax_mat <- tax_mat[common_otus, , drop = FALSE]

# Sample names: intersection between OTU columns and metadata rows
if (!all(colnames(otu_mat) %in% rownames(sam))) {
  warning("Some samples in OTU table are missing from metadata; subsetting to intersection.")
}
common_samps <- intersect(colnames(otu_mat), rownames(sam))
otu_mat <- otu_mat[, common_samps, drop = FALSE]
sam <- sam[common_samps, , drop = FALSE]

# 4) Build phyloseq object
OTU <- otu_table(otu_mat, taxa_are_rows = TRUE)
TAX <- tax_table(tax_mat)
SAM <- sam

ps <- phyloseq(OTU, TAX, SAM)

# 5) Quick summary
ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 250 taxa and 100 samples ]
## sample_data() Sample Data:       [ 100 samples by 3 sample variables ]
## tax_table()   Taxonomy Table:    [ 250 taxa by 6 taxonomic ranks ]
# sample_sums(ps)[1:5]
# taxa_sums(ps)[1:5]
# head(tax_table(ps))
# sample_data(ps)[1:5,]

#Rarefaction

# Get OTU/ASV table with samples as rows
otu <- as(otu_table(ps), "matrix")
if (taxa_are_rows(ps)) otu <- t(otu)

# Total reads per sample
tot <- rowSums(otu)

# Choose max depth and step for curves
step    <- 100
raremax <- 20000   # or set a fixed value you prefer

depths <- seq(step, raremax, by = step)

# Compute expected richness at each depth for each sample
# vegan::rarefy returns NA for depths > sample total; we'll drop those.
rar_list <- lapply(seq_len(nrow(otu)), function(i) {
  exp_s <- suppressWarnings(vegan::rarefy(otu[i, , drop = FALSE], sample = depths))
  data.frame(Sample = rownames(otu)[i], Reads = depths, Richness = as.numeric(exp_s))
})

df_rar <- bind_rows(rar_list) %>%
  filter(!is.na(Richness))

# Join sample metadata if available
if (!is.null(sample_data(ps, errorIfNULL = FALSE))) {
  md <- as(sample_data(ps), "data.frame") %>% rownames_to_column("Sample")
  df_rar <- df_rar %>% left_join(md, by = "Sample")
}

# Plot
p <- ggplot(df_rar, aes(x = Reads, y = Richness, group = Sample)) +
  geom_line(alpha = 0.6,col="#2F6F72") +
  labs(
    title = "Rarefaction curves",
    x = "Subsampled reads",
    y = "Observed richness (S)"
  ) +
  theme_minimal()

# Optional: color by a metadata column, e.g., "Group"
if ("Group" %in% names(df_rar)) {
  p <- p + aes(color = Group)
}

# Optional: label endpoints
endpoints <- df_rar %>%
  group_by(Sample) %>%
  slice_max(Reads, n = 1, with_ties = FALSE) %>%
  ungroup()

p <- p+ geom_vline(xintercept = 10000, color = "blue", linetype = "dashed", linewidth = 0.5)

p

Quality control and filtering

# Remove low-prevalence taxa, low-depth samples.
# Document choices ( keep conservative to avoid data leakage norms).

# Minimum sample depth
min_depth <- 10000
ps <- prune_samples(sample_sums(ps) >= min_depth, ps)

# Prevalence filter: keep taxa present in ≥ X% samples
prev_cutoff <- 0.05
prevalence  <- apply(otu_table(ps), 1, function(x) sum(x > 0) / length(x))
ps <- prune_taxa(prevalence >= prev_cutoff, ps)
ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 246 taxa and 50 samples ]
## sample_data() Sample Data:       [ 50 samples by 3 sample variables ]
## tax_table()   Taxonomy Table:    [ 246 taxa by 6 taxonomic ranks ]

Alpha diveristy metrices

# Alpha diversity measures
alpha_df <- estimate_richness(ps, measures = c("Observed", "Shannon", "Simpson"))

# Add metadata
alpha_df <- alpha_df %>%
  rownames_to_column("SampleID") %>%
  left_join(sample_data(ps) %>% data.frame() %>% rownames_to_column("SampleID"),
            by = "SampleID")

head(alpha_df)
##   SampleID Observed  Shannon   Simpson       Diet Activity AgeGroup
## 1     S006      211 4.512662 0.9831177      Vegan      low    18-29
## 2     S007      220 4.548603 0.9833893      Vegan   medium    18-29
## 3     S008      217 4.534260 0.9823765 Vegetarian   medium    45-59
## 4     S010      210 4.482464 0.9813063 Vegetarian      low    18-29
## 5     S016      213 4.548544 0.9835540      Vegan     high    18-29
## 6     S020      219 4.575772 0.9839758      Vegan   medium    45-59

Plotting of Shannon diversity

library(ggplot2)
library(ggpubr)

# Example: Shannon by Diet
p_shannon <- ggplot(alpha_df, aes(x = Diet, y = Shannon, fill = Diet)) +
  geom_boxplot(alpha = 0.8) +
  geom_jitter(width = 0.2, alpha = 0.6, size = 1.5) +
  stat_compare_means(method = "kruskal.test", label.y = max(alpha_df$Shannon) + 0.2) +
  labs(title = "Alpha diversity (Shannon index) by Diet") +
  theme_bw()+
    scale_color_earth15() + 
  scale_fill_earth15()

ggsave("figures/alpha_diversityDiet.png", p_shannon, width=8, height=5, dpi=300)


p_shannon

# Example: Shannon by Diet
p_shannonA <- ggplot(alpha_df, aes(x = AgeGroup, y = Shannon, fill = AgeGroup)) +
  geom_boxplot(alpha = 0.8) +
  geom_jitter(width = 0.2, alpha = 0.6, size = 1.5) +
  stat_compare_means(method = "kruskal.test", label.y = max(alpha_df$Shannon) + 0.2) +
  labs(title = "Alpha diversity (Shannon index) by Age") +
  theme_bw()+
    scale_color_earth15() + 
  scale_fill_earth15()

ggsave("figures/alpha_shannon.png", p_shannonA, width=8, height=5, dpi=300)

p_shannonA

Plotting of alpha diversity (Observed, Shannon and Simpson)

alpha_long <- alpha_df %>%
  pivot_longer(cols = c("Observed","Shannon","Simpson"),
               names_to = "Index", values_to = "Value")

p_alpha <- ggplot(alpha_long, aes(x = Diet, y = Value, fill = Diet)) +
  geom_boxplot(alpha = 0.8) +
  geom_jitter(width = 0.2, alpha = 0.6, size = 1) +
  facet_wrap(~Index, scales = "free_y") +
  stat_compare_means(method = "kruskal.test", label = "p.signif", hide.ns = TRUE) +
  labs(title = "Alpha diversity across diets") +
  theme_bw()+
  scale_color_earth15() + 
  scale_fill_earth15()

ggsave("figures/alpha_diversity.png", p_alpha, width=8, height=5, dpi=300)


p_alpha

Relative abundance as stacked barplots

# Crearing long table from ps_rel
ps_rel <- transform_sample_counts(ps, function(x) x / sum(x))
df_gen <- phyloseq::psmelt(ps_rel)  

# Ensure Diet exists
if (!"Diet" %in% names(df_gen)) df_gen$Diet <- "Unknown"

# Keep top N genera: in this case 15 genera
topN <- 15
top_genera <- df_gen %>%
  group_by(Genus) %>%
  summarise(mean_abund = mean(Abundance, na.rm = TRUE), .groups = "drop") %>%
  slice_max(mean_abund, n = topN) %>%
  pull(Genus)

df_gen <- df_gen %>%
  mutate(
    Genus = ifelse(is.na(Genus) | Genus == "", "Unclassified", Genus),
    Genus_lumped = ifelse(Genus %in% top_genera, Genus, "Other")
  )

# Order genera by global mean 
gen_order <- df_gen %>%
  group_by(Genus_lumped) %>%
  summarise(mean_abund = mean(Abundance, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(mean_abund)) %>%
  pull(Genus_lumped)

df_gen$Genus_lumped <- factor(df_gen$Genus_lumped, levels = gen_order)

# Create a stable per-facet sample ordering 
order_df <- df_gen %>%
  distinct(Sample, Diet) %>%         
  arrange(Diet, Sample) %>%          
  mutate(Sample_ordered = factor(Sample, levels = Sample))

# Join the ordering back
df_gen <- df_gen %>%
  left_join(order_df, by = c("Sample", "Diet"))

# Plotting 
p_bar <- ggplot(df_gen, aes(x = Sample_ordered, y = Abundance, fill = Genus_lumped)) +
  geom_col(width = 0.95) +
  facet_grid(rows = vars(Diet), scales = "free_y", space = "free_y") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(title = "Relative abundance (Genus) per sample",
       x = "Samples (ordered within Diet)", y = "Relative abundance",
       fill = "Genus") +
  theme_bw() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid = element_blank(),
    strip.text = element_text(face = "bold")
  ) +
  scale_fill_earth15()+
  coord_flip()

p_bar

ggsave("figures/relabundance_genus_per_sample_by_diet.png",
       p_bar, width = 8, height = 15, dpi = 300)

Beta diversity analysis based on bray-curtis relative abundance

Plotting Beta diversity as PCoA plots.

# Bray–Curtis distance & PCoA
ord_bc <- ordinate(ps_rel, method = "PCoA", distance = "bray")

# Variance explained for axis labels
evals <- ord_bc$values$Eigenvalues
pct <- round(100 * evals / sum(evals), 1)

# Plot
p_bc <- plot_ordination(ps_rel, ord_bc, color = "AgeGroup") +
  geom_point( size = 3, alpha = 0.8) +
  stat_ellipse(aes(group = Diet, color = Diet), linewidth = 0.8) +
  labs(title = "PCoA — Bray–Curtis",
       x = paste0("PCo1 [", pct[1], "%]"),
       y = paste0("PCo2 [", pct[2], "%]")) +
  theme_bw()+
  labs(shape="Activity level")+
  scale_color_earth15() + 
  scale_fill_earth15()

ggsave("figures/Beta_divAge.png", p_bc, width=8, height=5, dpi=300)


p_bc

p_bA <- plot_ordination(ps_rel, ord_bc, color = "Diet") +
  geom_point(aes(shape = Activity), size = 3, alpha = 0.8) +
  stat_ellipse(aes(group = Diet, color = Diet), linewidth = 0.8) +
  labs(title = "PCoA — Bray–Curtis",
       x = paste0("PCo1 [", pct[1], "%]"),
       y = paste0("PCo2 [", pct[2], "%]")) +
  theme_bw()+
  labs(shape="Beta diveristy - Bray Curtis")+
  scale_color_earth15() + 
  scale_fill_earth15()

ggsave("figures/Beta_diet.png", p_bc, width=8, height=5, dpi=300)


p_bA

Perform Permanova

library(vegan)

# Distance matrix
dist_bc <- phyloseq::distance(ps_rel, method = "bray")

# PERMANOVA (adonis2)
adon_res <- adonis2(as.matrix(dist_bc) ~ Diet, data = sample_data(ps_rel) %>% data.frame())
adon_res
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.matrix(dist_bc) ~ Diet, data = sample_data(ps_rel) %>% data.frame())
##          Df SumOfSqs      R2      F Pr(>F)    
## Model     2  0.42798 0.62401 39.001  0.001 ***
## Residual 47  0.25788 0.37599                  
## Total    49  0.68585 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test homogeneity of group dispersions
bd <- betadisper(as.dist(dist_bc), group = sample_data(ps_rel)$Diet)
anova(bd)
## Analysis of Variance Table
## 
## Response: Distances
##           Df    Sum Sq    Mean Sq F value Pr(>F)
## Groups     2 0.0000995 4.9729e-05  0.4194 0.6599
## Residuals 47 0.0055725 1.1857e-04
plot(bd)

Differential abundance analysis with DESeq2

library(DESeq2)
library(phyloseq)
# Agglomerate to Genus (optional: can also do Phylum or Family)
ps_genus <- tax_glom(ps, taxrank = "Species")

# Make sure sample_data has factors (important for DESeq2 design)
sample_data(ps_genus)$Diet <- factor(sample_data(ps_genus)$Diet)
dds <- phyloseq_to_deseq2(ps_genus, ~ Diet)
## converting counts to integer mode
# Calculate size factors (normalize for sequencing depth)
dds <- DESeq(dds, test = "Wald", fitType = "parametric")
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
BiocManager::install("apeglm")
## Bioconductor version 3.21 (BiocManager 1.30.26), R 4.5.0 (2025-04-11 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'apeglm'
## Installation paths not writeable, unable to update packages
##   path: C:/Program Files/R/R-4.5.0/library
##   packages:
##     boot, lattice, Matrix, mgcv
## Old packages: 'aplot', 'broom', 'cli', 'credentials', 'dbplyr', 'DESeq2',
##   'dtplyr', 'ggplot2', 'googledrive', 'magrittr', 'purrr', 'RcppArmadillo',
##   'stringr', 'tibble', 'xfun'
library(apeglm)
res <- results(dds, contrast = c("Diet", "Vegan", "Carnivore"))
res <- lfcShrink(dds, coef = "Diet_Vegan_vs_Carnivore", res = res)  # shrinks log2FC
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
# Order by adjusted p-value
res_sig <- res[which(res$padj < 0.05), ]
res_sig <- res_sig[order(res_sig$padj), ]

head(res_sig)
## log2 fold change (MAP): Diet Vegan vs Carnivore 
## Wald test p-value: Diet Vegan vs Carnivore 
## DataFrame with 6 rows and 5 columns
##          baseMean log2FoldChange     lfcSE       pvalue         padj
##         <numeric>      <numeric> <numeric>    <numeric>    <numeric>
## OTU0030  356.2332      -0.928139 0.0424042 3.83902e-107 1.42044e-105
## OTU0229 1046.1543       0.725086 0.0453361  2.64158e-58  4.88693e-57
## OTU0099  627.9742      -0.725663 0.0585970  6.94704e-36  8.56802e-35
## OTU0095  599.2641      -0.319581 0.0267394  3.92580e-33  3.63137e-32
## OTU0104  118.3157       1.124072 0.1006569  5.07037e-30  3.75208e-29
## OTU0146   40.5562       1.081001 0.1240993  2.43561e-19  1.30771e-18
tax <- as.data.frame(tax_table(ps_genus))
res_tax <- cbind(as(res_sig, "data.frame"), tax[rownames(res_sig), ])

head(res_tax)
##           baseMean log2FoldChange      lfcSE        pvalue          padj
## OTU0030  356.23324     -0.9281386 0.04240417 3.839021e-107 1.420438e-105
## OTU0229 1046.15435      0.7250861 0.04533609  2.641584e-58  4.886930e-57
## OTU0099  627.97421     -0.7256628 0.05859703  6.947042e-36  8.568018e-35
## OTU0095  599.26411     -0.3195809 0.02673944  3.925804e-33  3.631368e-32
## OTU0104  118.31574      1.1240724 0.10065692  5.070373e-30  3.752076e-29
## OTU0146   40.55617      1.0810010 0.12409930  2.435607e-19  1.307709e-18
##                 Phylum               Class             Order             Family
## OTU0030 Actinobacteria      Actinobacteria Bifidobacteriales Bifidobacteriaceae
## OTU0229   Fusobacteria       Fusobacteriia   Fusobacteriales   Fusobacteriaceae
## OTU0099  Bacteroidetes         Bacteroidia     Bacteroidales     Prevotellaceae
## OTU0095 Proteobacteria Alphaproteobacteria   Rhodobacterales   Rhodobacteraceae
## OTU0104     Firmicutes             Bacilli   Lactobacillales   Streptococcaceae
## OTU0146     Firmicutes          Clostridia     Clostridiales    Lachnospiraceae
##                   Genus                    Species
## OTU0030 Bifidobacterium        Bifidobacterium_sp.
## OTU0229   Fusobacterium Fusobacterium_unclassified
## OTU0099      Prevotella    Prevotella_unclassified
## OTU0095     Rhodobacter           Rhodobacter_spp.
## OTU0104   Streptococcus Streptococcus_unclassified
## OTU0146         Blautia       Blautia_unclassified

plotting of DESeq2 results

library(ggplot2)

DEs <- ggplot(res_tax, aes(x = log2FoldChange, y = -log10(padj), color = Genus)) +
  geom_point(alpha = 0.7, size = 3) +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
  geom_vline(xintercept = c(0, 0), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  theme_bw() +
  labs(title = "Differential abundance: Vegan vs Carnivore",
       x = "Log2 Fold Change", y = "-log10 adjusted p-value")+
  scale_color_earth15() + 
  scale_fill_earth15()

ggsave("figures/Differential_abundance.png", DEs, width=8, height=5, dpi=300)


DEs 

Pathching it all together :)

# kill legends on the others
p_shannon_nl  <- p_shannon  + theme(legend.position = "none")
p_shannonA_nl <- p_shannonA + theme(legend.position = "none")
p_alpha_nl    <- p_alpha    + theme(legend.position = "none")

combined1 <- (
  p_bar| (p / (p_alpha_nl) /
  (p_shannon_nl | p_shannonA_nl) /
  (p_bc | DEs))
) +
  plot_layout(guides = "collect") &
  theme(legend.position = "right")

combined1

ggsave("figures/Combi.png", combined1, width=20, height=15, dpi=300)