setwd("C:/Users/ROSFRB/Desktop/Daf")

#Libraries

# Packages: install + load 
# install missing CRAN packages
install_cran_if_missing <- function(pkgs) {
  missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
  if (length(missing)) install.packages(missing, dependencies = TRUE)
}

# Helper: install missing Bioconductor packages
install_bioc_if_missing <- function(pkgs) {
  if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
  bioc_missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
  if (length(bioc_missing)) BiocManager::install(bioc_missing, ask = FALSE, update = TRUE)
}

# CRAN packages used in the Rmd
cran_pkgs <- c(
  "tidyverse",   # dplyr, tidyr, readr, tibble, ggplot2, etc.
  "ggpubr",
  "patchwork",
  "vegan",
  "pheatmap",
  "ggnewscale",  # optional (separate color scales on same plot)
  "ape"          # optional (trees for UniFrac / Faith's PD)
)

# Bioconductor packages
bioc_pkgs <- c(
  "phyloseq",
  "DESeq2",
  "apeglm"       # LFC shrinkage
)

# Install what’s missing
install_cran_if_missing(cran_pkgs)
install_bioc_if_missing(bioc_pkgs)

# Load (quietly)
suppressPackageStartupMessages({
  library(tidyverse)
  library(ggpubr)
  library(patchwork)
  library(vegan)
  library(pheatmap)
  library(ggnewscale)  
  library(ape)         
  library(phyloseq)
  library(DESeq2)
  library(apeglm)
})

## Warning: package 'tidyverse' was built under R version 4.5.1

## Warning: package 'ggpubr' was built under R version 4.5.1

## Warning: package 'patchwork' was built under R version 4.5.1

## Warning: package 'vegan' was built under R version 4.5.1

## Warning: package 'permute' was built under R version 4.5.1

## Warning: package 'pheatmap' was built under R version 4.5.1

## Warning: package 'ggnewscale' was built under R version 4.5.1

## Warning: package 'ape' was built under R version 4.5.1

## Warning: package 'generics' was built under R version 4.5.1

## Warning: package 'matrixStats' was built under R version 4.5.1

# Theme 
theme_set(theme_bw(base_size = 12))

# Quick versions readout 
cat("Loaded package versions:\n")

## Loaded package versions:

for (p in c("phyloseq","DESeq2","apeglm","vegan","ggplot2","dplyr","tidyr","patchwork","ggpubr","pheatmap")) {
  if (requireNamespace(p, quietly = TRUE)) {
    cat(sprintf("  %s: %s\n", p, as.character(utils::packageVersion(p))))
  }
}

##   phyloseq: 1.52.0
##   DESeq2: 1.48.1
##   apeglm: 1.30.0
##   vegan: 2.7.1
##   ggplot2: 3.5.2
##   dplyr: 1.1.4
##   tidyr: 1.3.1
##   patchwork: 1.3.2
##   ggpubr: 0.6.1
##   pheatmap: 1.0.13

#Establishing a colour library - Not rainbow!

# ---- Earthy palette (15 colors) ----
pal_earth15 <- c(
  "#2F5D50", # pine
  "#6B8F71", # sage
  "#8C9C68", # olive
  "#4E7A48", # moss
  "#C9A227", # ochre
  "#B98E5A", # tan
  "#D8C7A0", # sand
  "#C46D3B", # terracotta
  "#B56969", # clay
  "#8C4A2F", # rust
  "#7D5A4F", # umber
  "#5B7065", # slate
  "#2F6F72", # deep teal
  "#3F4E6B", # blue slate
  "#50514F"  # stone
)

# Convenience ggplot2 scales
scale_fill_earth15  <- function(...) ggplot2::scale_fill_manual(values = pal_earth15, ...)
scale_color_earth15 <- function(...) ggplot2::scale_color_manual(values = pal_earth15, ...)

Build phyloseq from TSV files

library(phyloseq)
# install.packages("BiocManager"); BiocManager::install("phyloseq")
# file paths 
otu_path  <- "otu_table.tsv"
tax_path  <- "taxonomy.tsv"
meta_path <- "metadata.tsv"

# Read tables
otu <- read.delim(otu_path, check.names = FALSE, row.names = 1)
tax <- read.delim(tax_path, check.names = FALSE, row.names = 1)
meta <- read.delim(meta_path, check.names = FALSE)

# Basic hygiene
# Ensure counts are non-negative integers
otu_mat <- as.matrix(otu)
mode(otu_mat) <- "numeric"
otu_mat[is.na(otu_mat)] <- 0
otu_mat <- round(otu_mat)  

# Ensure taxonomy is matrix with OTU rows
tax_mat <- as.matrix(tax)

# Ensure metadata rownames = SampleID
stopifnot("SampleID" %in% colnames(meta))
rownames(meta) <- meta$SampleID
meta$SampleID <- NULL
sam <- sample_data(meta)

# 3) Align IDs
# OTU rownames should match taxonomy rownames
if (!all(rownames(otu_mat) %in% rownames(tax_mat))) {
  warning("Some OTUs in OTU table are missing from taxonomy; subsetting to intersection.")
}
common_otus <- intersect(rownames(otu_mat), rownames(tax_mat))
otu_mat <- otu_mat[common_otus, , drop = FALSE]
tax_mat <- tax_mat[common_otus, , drop = FALSE]

# Sample names: intersection between OTU columns and metadata rows
if (!all(colnames(otu_mat) %in% rownames(sam))) {
  warning("Some samples in OTU table are missing from metadata; subsetting to intersection.")
}
common_samps <- intersect(colnames(otu_mat), rownames(sam))
otu_mat <- otu_mat[, common_samps, drop = FALSE]
sam <- sam[common_samps, , drop = FALSE]

# 4) Build phyloseq object
OTU <- otu_table(otu_mat, taxa_are_rows = TRUE)
TAX <- tax_table(tax_mat)
SAM <- sam

ps <- phyloseq(OTU, TAX, SAM)

# 5) Quick summary
ps

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 250 taxa and 100 samples ]
## sample_data() Sample Data:       [ 100 samples by 3 sample variables ]
## tax_table()   Taxonomy Table:    [ 250 taxa by 6 taxonomic ranks ]

# sample_sums(ps)[1:5]
# taxa_sums(ps)[1:5]
# head(tax_table(ps))
# sample_data(ps)[1:5,]

#Rarefaction

# Get OTU/ASV table with samples as rows
otu <- as(otu_table(ps), "matrix")
if (taxa_are_rows(ps)) otu <- t(otu)

# Total reads per sample
tot <- rowSums(otu)

# Choose max depth and step for curves
step    <- 100
raremax <- 20000   # or set a fixed value you prefer

depths <- seq(step, raremax, by = step)

# Compute expected richness at each depth for each sample
# vegan::rarefy returns NA for depths > sample total; we'll drop those.
rar_list <- lapply(seq_len(nrow(otu)), function(i) {
  exp_s <- suppressWarnings(vegan::rarefy(otu[i, , drop = FALSE], sample = depths))
  data.frame(Sample = rownames(otu)[i], Reads = depths, Richness = as.numeric(exp_s))
})

df_rar <- bind_rows(rar_list) %>%
  filter(!is.na(Richness))

# Join sample metadata if available
if (!is.null(sample_data(ps, errorIfNULL = FALSE))) {
  md <- as(sample_data(ps), "data.frame") %>% rownames_to_column("Sample")
  df_rar <- df_rar %>% left_join(md, by = "Sample")
}

# Plot
p <- ggplot(df_rar, aes(x = Reads, y = Richness, group = Sample)) +
  geom_line(alpha = 0.6,col="#2F6F72") +
  labs(
    title = "Rarefaction curves",
    x = "Subsampled reads",
    y = "Observed richness (S)"
  ) +
  theme_minimal()

# Optional: color by a metadata column, e.g., "Group"
if ("Group" %in% names(df_rar)) {
  p <- p + aes(color = Group)
}

# Optional: label endpoints
endpoints <- df_rar %>%
  group_by(Sample) %>%
  slice_max(Reads, n = 1, with_ties = FALSE) %>%
  ungroup()

p <- p+ geom_vline(xintercept = 10000, color = "blue", linetype = "dashed", linewidth = 0.5)

p

Quality control and filtering

# Remove low-prevalence taxa, low-depth samples.
# Document choices ( keep conservative to avoid data leakage norms).

# Minimum sample depth
min_depth <- 10000
ps <- prune_samples(sample_sums(ps) >= min_depth, ps)

# Prevalence filter: keep taxa present in ≥ X% samples
prev_cutoff <- 0.05
prevalence  <- apply(otu_table(ps), 1, function(x) sum(x > 0) / length(x))
ps <- prune_taxa(prevalence >= prev_cutoff, ps)
ps

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 246 taxa and 50 samples ]
## sample_data() Sample Data:       [ 50 samples by 3 sample variables ]
## tax_table()   Taxonomy Table:    [ 246 taxa by 6 taxonomic ranks ]

Alpha diveristy metrices

# Alpha diversity measures
alpha_df <- estimate_richness(ps, measures = c("Observed", "Shannon", "Simpson"))

# Add metadata
alpha_df <- alpha_df %>%
  rownames_to_column("SampleID") %>%
  left_join(sample_data(ps) %>% data.frame() %>% rownames_to_column("SampleID"),
            by = "SampleID")

head(alpha_df)

##   SampleID Observed  Shannon   Simpson       Diet Activity AgeGroup
## 1     S006      211 4.512662 0.9831177      Vegan      low    18-29
## 2     S007      220 4.548603 0.9833893      Vegan   medium    18-29
## 3     S008      217 4.534260 0.9823765 Vegetarian   medium    45-59
## 4     S010      210 4.482464 0.9813063 Vegetarian      low    18-29
## 5     S016      213 4.548544 0.9835540      Vegan     high    18-29
## 6     S020      219 4.575772 0.9839758      Vegan   medium    45-59

Plotting of Shannon diversity

library(ggplot2)
library(ggpubr)

# Example: Shannon by Diet
p_shannon <- ggplot(alpha_df, aes(x = Diet, y = Shannon, fill = Diet)) +
  geom_boxplot(alpha = 0.8) +
  geom_jitter(width = 0.2, alpha = 0.6, size = 1.5) +
  stat_compare_means(method = "kruskal.test", label.y = max(alpha_df$Shannon) + 0.2) +
  labs(title = "Alpha diversity (Shannon index) by Diet") +
  theme_bw()+
    scale_color_earth15() + 
  scale_fill_earth15()

ggsave("figures/alpha_diversityDiet.png", p_shannon, width=8, height=5, dpi=300)


p_shannon

# Example: Shannon by Diet
p_shannonA <- ggplot(alpha_df, aes(x = AgeGroup, y = Shannon, fill = AgeGroup)) +
  geom_boxplot(alpha = 0.8) +
  geom_jitter(width = 0.2, alpha = 0.6, size = 1.5) +
  stat_compare_means(method = "kruskal.test", label.y = max(alpha_df$Shannon) + 0.2) +
  labs(title = "Alpha diversity (Shannon index) by Age") +
  theme_bw()+
    scale_color_earth15() + 
  scale_fill_earth15()

ggsave("figures/alpha_shannon.png", p_shannonA, width=8, height=5, dpi=300)

p_shannonA

Plotting of alpha diversity (Observed, Shannon and Simpson)

alpha_long <- alpha_df %>%
  pivot_longer(cols = c("Observed","Shannon","Simpson"),
               names_to = "Index", values_to = "Value")

p_alpha <- ggplot(alpha_long, aes(x = Diet, y = Value, fill = Diet)) +
  geom_boxplot(alpha = 0.8) +
  geom_jitter(width = 0.2, alpha = 0.6, size = 1) +
  facet_wrap(~Index, scales = "free_y") +
  stat_compare_means(method = "kruskal.test", label = "p.signif", hide.ns = TRUE) +
  labs(title = "Alpha diversity across diets") +
  theme_bw()+
  scale_color_earth15() + 
  scale_fill_earth15()

ggsave("figures/alpha_diversity.png", p_alpha, width=8, height=5, dpi=300)


p_alpha

Relative abundance as stacked barplots

# Crearing long table from ps_rel
ps_rel <- transform_sample_counts(ps, function(x) x / sum(x))
df_gen <- phyloseq::psmelt(ps_rel)  

# Ensure Diet exists
if (!"Diet" %in% names(df_gen)) df_gen$Diet <- "Unknown"

# Keep top N genera: in this case 15 genera
topN <- 15
top_genera <- df_gen %>%
  group_by(Genus) %>%
  summarise(mean_abund = mean(Abundance, na.rm = TRUE), .groups = "drop") %>%
  slice_max(mean_abund, n = topN) %>%
  pull(Genus)

df_gen <- df_gen %>%
  mutate(
    Genus = ifelse(is.na(Genus) | Genus == "", "Unclassified", Genus),
    Genus_lumped = ifelse(Genus %in% top_genera, Genus, "Other")
  )

# Order genera by global mean 
gen_order <- df_gen %>%
  group_by(Genus_lumped) %>%
  summarise(mean_abund = mean(Abundance, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(mean_abund)) %>%
  pull(Genus_lumped)

df_gen$Genus_lumped <- factor(df_gen$Genus_lumped, levels = gen_order)

# Create a stable per-facet sample ordering 
order_df <- df_gen %>%
  distinct(Sample, Diet) %>%         
  arrange(Diet, Sample) %>%          
  mutate(Sample_ordered = factor(Sample, levels = Sample))

# Join the ordering back
df_gen <- df_gen %>%
  left_join(order_df, by = c("Sample", "Diet"))

# Plotting 
p_bar <- ggplot(df_gen, aes(x = Sample_ordered, y = Abundance, fill = Genus_lumped)) +
  geom_col(width = 0.95) +
  facet_grid(rows = vars(Diet), scales = "free_y", space = "free_y") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(title = "Relative abundance (Genus) per sample",
       x = "Samples (ordered within Diet)", y = "Relative abundance",
       fill = "Genus") +
  theme_bw() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid = element_blank(),
    strip.text = element_text(face = "bold")
  ) +
  scale_fill_earth15()+
  coord_flip()

p_bar

ggsave("figures/relabundance_genus_per_sample_by_diet.png",
       p_bar, width = 8, height = 15, dpi = 300)

Beta diversity analysis based on bray-curtis relative abundance

Plotting Beta diversity as PCoA plots.

# Bray–Curtis distance & PCoA
ord_bc <- ordinate(ps_rel, method = "PCoA", distance = "bray")

# Variance explained for axis labels
evals <- ord_bc$values$Eigenvalues
pct <- round(100 * evals / sum(evals), 1)

# Plot
p_bc <- plot_ordination(ps_rel, ord_bc, color = "AgeGroup") +
  geom_point( size = 3, alpha = 0.8) +
  stat_ellipse(aes(group = Diet, color = Diet), linewidth = 0.8) +
  labs(title = "PCoA — Bray–Curtis",
       x = paste0("PCo1 [", pct[1], "%]"),
       y = paste0("PCo2 [", pct[2], "%]")) +
  theme_bw()+
  labs(shape="Activity level")+
  scale_color_earth15() + 
  scale_fill_earth15()

ggsave("figures/Beta_divAge.png", p_bc, width=8, height=5, dpi=300)


p_bc

p_bA <- plot_ordination(ps_rel, ord_bc, color = "Diet") +
  geom_point(aes(shape = Activity), size = 3, alpha = 0.8) +
  stat_ellipse(aes(group = Diet, color = Diet), linewidth = 0.8) +
  labs(title = "PCoA — Bray–Curtis",
       x = paste0("PCo1 [", pct[1], "%]"),
       y = paste0("PCo2 [", pct[2], "%]")) +
  theme_bw()+
  labs(shape="Beta diveristy - Bray Curtis")+
  scale_color_earth15() + 
  scale_fill_earth15()

ggsave("figures/Beta_diet.png", p_bc, width=8, height=5, dpi=300)


p_bA

Perform Permanova

library(vegan)

# Distance matrix
dist_bc <- phyloseq::distance(ps_rel, method = "bray")

# PERMANOVA (adonis2)
adon_res <- adonis2(as.matrix(dist_bc) ~ Diet, data = sample_data(ps_rel) %>% data.frame())
adon_res

## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.matrix(dist_bc) ~ Diet, data = sample_data(ps_rel) %>% data.frame())
##          Df SumOfSqs      R2      F Pr(>F)    
## Model     2  0.42798 0.62401 39.001  0.001 ***
## Residual 47  0.25788 0.37599                  
## Total    49  0.68585 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Test homogeneity of group dispersions
bd <- betadisper(as.dist(dist_bc), group = sample_data(ps_rel)$Diet)
anova(bd)

## Analysis of Variance Table
## 
## Response: Distances
##           Df    Sum Sq    Mean Sq F value Pr(>F)
## Groups     2 0.0000995 4.9729e-05  0.4194 0.6599
## Residuals 47 0.0055725 1.1857e-04

plot(bd)

Differential abundance analysis with DESeq2

library(DESeq2)
library(phyloseq)
# Agglomerate to Genus (optional: can also do Phylum or Family)
ps_genus <- tax_glom(ps, taxrank = "Species")

# Make sure sample_data has factors (important for DESeq2 design)
sample_data(ps_genus)$Diet <- factor(sample_data(ps_genus)$Diet)

dds <- phyloseq_to_deseq2(ps_genus, ~ Diet)

## converting counts to integer mode

# Calculate size factors (normalize for sequencing depth)
dds <- DESeq(dds, test = "Wald", fitType = "parametric")

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

BiocManager::install("apeglm")

## Bioconductor version 3.21 (BiocManager 1.30.26), R 4.5.0 (2025-04-11 ucrt)

## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'apeglm'

## Installation paths not writeable, unable to update packages
##   path: C:/Program Files/R/R-4.5.0/library
##   packages:
##     boot, lattice, Matrix, mgcv

## Old packages: 'aplot', 'broom', 'cli', 'credentials', 'dbplyr', 'DESeq2',
##   'dtplyr', 'ggplot2', 'googledrive', 'magrittr', 'purrr', 'RcppArmadillo',
##   'stringr', 'tibble', 'xfun'

library(apeglm)
res <- results(dds, contrast = c("Diet", "Vegan", "Carnivore"))
res <- lfcShrink(dds, coef = "Diet_Vegan_vs_Carnivore", res = res)  # shrinks log2FC

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

# Order by adjusted p-value
res_sig <- res[which(res$padj < 0.05), ]
res_sig <- res_sig[order(res_sig$padj), ]

head(res_sig)

## log2 fold change (MAP): Diet Vegan vs Carnivore 
## Wald test p-value: Diet Vegan vs Carnivore 
## DataFrame with 6 rows and 5 columns
##          baseMean log2FoldChange     lfcSE       pvalue         padj
##         <numeric>      <numeric> <numeric>    <numeric>    <numeric>
## OTU0030  356.2332      -0.928139 0.0424042 3.83902e-107 1.42044e-105
## OTU0229 1046.1543       0.725086 0.0453361  2.64158e-58  4.88693e-57
## OTU0099  627.9742      -0.725663 0.0585970  6.94704e-36  8.56802e-35
## OTU0095  599.2641      -0.319581 0.0267394  3.92580e-33  3.63137e-32
## OTU0104  118.3157       1.124072 0.1006569  5.07037e-30  3.75208e-29
## OTU0146   40.5562       1.081001 0.1240993  2.43561e-19  1.30771e-18

tax <- as.data.frame(tax_table(ps_genus))
res_tax <- cbind(as(res_sig, "data.frame"), tax[rownames(res_sig), ])

head(res_tax)

##           baseMean log2FoldChange      lfcSE        pvalue          padj
## OTU0030  356.23324     -0.9281386 0.04240417 3.839021e-107 1.420438e-105
## OTU0229 1046.15435      0.7250861 0.04533609  2.641584e-58  4.886930e-57
## OTU0099  627.97421     -0.7256628 0.05859703  6.947042e-36  8.568018e-35
## OTU0095  599.26411     -0.3195809 0.02673944  3.925804e-33  3.631368e-32
## OTU0104  118.31574      1.1240724 0.10065692  5.070373e-30  3.752076e-29
## OTU0146   40.55617      1.0810010 0.12409930  2.435607e-19  1.307709e-18
##                 Phylum               Class             Order             Family
## OTU0030 Actinobacteria      Actinobacteria Bifidobacteriales Bifidobacteriaceae
## OTU0229   Fusobacteria       Fusobacteriia   Fusobacteriales   Fusobacteriaceae
## OTU0099  Bacteroidetes         Bacteroidia     Bacteroidales     Prevotellaceae
## OTU0095 Proteobacteria Alphaproteobacteria   Rhodobacterales   Rhodobacteraceae
## OTU0104     Firmicutes             Bacilli   Lactobacillales   Streptococcaceae
## OTU0146     Firmicutes          Clostridia     Clostridiales    Lachnospiraceae
##                   Genus                    Species
## OTU0030 Bifidobacterium        Bifidobacterium_sp.
## OTU0229   Fusobacterium Fusobacterium_unclassified
## OTU0099      Prevotella    Prevotella_unclassified
## OTU0095     Rhodobacter           Rhodobacter_spp.
## OTU0104   Streptococcus Streptococcus_unclassified
## OTU0146         Blautia       Blautia_unclassified

plotting of DESeq2 results

library(ggplot2)

DEs <- ggplot(res_tax, aes(x = log2FoldChange, y = -log10(padj), color = Genus)) +
  geom_point(alpha = 0.7, size = 3) +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
  geom_vline(xintercept = c(0, 0), linetype = "dashed") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
  theme_bw() +
  labs(title = "Differential abundance: Vegan vs Carnivore",
       x = "Log2 Fold Change", y = "-log10 adjusted p-value")+
  scale_color_earth15() + 
  scale_fill_earth15()

ggsave("figures/Differential_abundance.png", DEs, width=8, height=5, dpi=300)


DEs

Pathching it all together :)

# kill legends on the others
p_shannon_nl  <- p_shannon  + theme(legend.position = "none")
p_shannonA_nl <- p_shannonA + theme(legend.position = "none")
p_alpha_nl    <- p_alpha    + theme(legend.position = "none")

combined1 <- (
  p_bar| (p / (p_alpha_nl) /
  (p_shannon_nl | p_shannonA_nl) /
  (p_bc | DEs))
) +
  plot_layout(guides = "collect") &
  theme(legend.position = "right")

combined1

ggsave("figures/Combi.png", combined1, width=20, height=15, dpi=300)

Microbiome analysis

Frederik Beck

2025-08-28

Build phyloseq from TSV files

Quality control and filtering

Alpha diveristy metrices

Plotting of Shannon diversity

Plotting of alpha diversity (Observed, Shannon and Simpson)

Relative abundance as stacked barplots

Beta diversity analysis based on bray-curtis relative abundance

Plotting Beta diversity as PCoA plots.

Perform Permanova

Differential abundance analysis with DESeq2

plotting of DESeq2 results

Pathching it all together :)