setwd("C:/Users/ROSFRB/Desktop/Daf")
#Libraries
# Packages: install + load
# install missing CRAN packages
install_cran_if_missing <- function(pkgs) {
missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing)) install.packages(missing, dependencies = TRUE)
}
# Helper: install missing Bioconductor packages
install_bioc_if_missing <- function(pkgs) {
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
bioc_missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(bioc_missing)) BiocManager::install(bioc_missing, ask = FALSE, update = TRUE)
}
# CRAN packages used in the Rmd
cran_pkgs <- c(
"tidyverse", # dplyr, tidyr, readr, tibble, ggplot2, etc.
"ggpubr",
"patchwork",
"vegan",
"pheatmap",
"ggnewscale", # optional (separate color scales on same plot)
"ape" # optional (trees for UniFrac / Faith's PD)
)
# Bioconductor packages
bioc_pkgs <- c(
"phyloseq",
"DESeq2",
"apeglm" # LFC shrinkage
)
# Install what’s missing
install_cran_if_missing(cran_pkgs)
install_bioc_if_missing(bioc_pkgs)
# Load (quietly)
suppressPackageStartupMessages({
library(tidyverse)
library(ggpubr)
library(patchwork)
library(vegan)
library(pheatmap)
library(ggnewscale)
library(ape)
library(phyloseq)
library(DESeq2)
library(apeglm)
})
## Warning: package 'tidyverse' was built under R version 4.5.1
## Warning: package 'ggpubr' was built under R version 4.5.1
## Warning: package 'patchwork' was built under R version 4.5.1
## Warning: package 'vegan' was built under R version 4.5.1
## Warning: package 'permute' was built under R version 4.5.1
## Warning: package 'pheatmap' was built under R version 4.5.1
## Warning: package 'ggnewscale' was built under R version 4.5.1
## Warning: package 'ape' was built under R version 4.5.1
## Warning: package 'generics' was built under R version 4.5.1
## Warning: package 'matrixStats' was built under R version 4.5.1
# Theme
theme_set(theme_bw(base_size = 12))
# Quick versions readout
cat("Loaded package versions:\n")
## Loaded package versions:
for (p in c("phyloseq","DESeq2","apeglm","vegan","ggplot2","dplyr","tidyr","patchwork","ggpubr","pheatmap")) {
if (requireNamespace(p, quietly = TRUE)) {
cat(sprintf(" %s: %s\n", p, as.character(utils::packageVersion(p))))
}
}
## phyloseq: 1.52.0
## DESeq2: 1.48.1
## apeglm: 1.30.0
## vegan: 2.7.1
## ggplot2: 3.5.2
## dplyr: 1.1.4
## tidyr: 1.3.1
## patchwork: 1.3.2
## ggpubr: 0.6.1
## pheatmap: 1.0.13
#Establishing a colour library - Not rainbow!
# ---- Earthy palette (15 colors) ----
pal_earth15 <- c(
"#2F5D50", # pine
"#6B8F71", # sage
"#8C9C68", # olive
"#4E7A48", # moss
"#C9A227", # ochre
"#B98E5A", # tan
"#D8C7A0", # sand
"#C46D3B", # terracotta
"#B56969", # clay
"#8C4A2F", # rust
"#7D5A4F", # umber
"#5B7065", # slate
"#2F6F72", # deep teal
"#3F4E6B", # blue slate
"#50514F" # stone
)
# Convenience ggplot2 scales
scale_fill_earth15 <- function(...) ggplot2::scale_fill_manual(values = pal_earth15, ...)
scale_color_earth15 <- function(...) ggplot2::scale_color_manual(values = pal_earth15, ...)
library(phyloseq)
# install.packages("BiocManager"); BiocManager::install("phyloseq")
# file paths
otu_path <- "otu_table.tsv"
tax_path <- "taxonomy.tsv"
meta_path <- "metadata.tsv"
# Read tables
otu <- read.delim(otu_path, check.names = FALSE, row.names = 1)
tax <- read.delim(tax_path, check.names = FALSE, row.names = 1)
meta <- read.delim(meta_path, check.names = FALSE)
# Basic hygiene
# Ensure counts are non-negative integers
otu_mat <- as.matrix(otu)
mode(otu_mat) <- "numeric"
otu_mat[is.na(otu_mat)] <- 0
otu_mat <- round(otu_mat)
# Ensure taxonomy is matrix with OTU rows
tax_mat <- as.matrix(tax)
# Ensure metadata rownames = SampleID
stopifnot("SampleID" %in% colnames(meta))
rownames(meta) <- meta$SampleID
meta$SampleID <- NULL
sam <- sample_data(meta)
# 3) Align IDs
# OTU rownames should match taxonomy rownames
if (!all(rownames(otu_mat) %in% rownames(tax_mat))) {
warning("Some OTUs in OTU table are missing from taxonomy; subsetting to intersection.")
}
common_otus <- intersect(rownames(otu_mat), rownames(tax_mat))
otu_mat <- otu_mat[common_otus, , drop = FALSE]
tax_mat <- tax_mat[common_otus, , drop = FALSE]
# Sample names: intersection between OTU columns and metadata rows
if (!all(colnames(otu_mat) %in% rownames(sam))) {
warning("Some samples in OTU table are missing from metadata; subsetting to intersection.")
}
common_samps <- intersect(colnames(otu_mat), rownames(sam))
otu_mat <- otu_mat[, common_samps, drop = FALSE]
sam <- sam[common_samps, , drop = FALSE]
# 4) Build phyloseq object
OTU <- otu_table(otu_mat, taxa_are_rows = TRUE)
TAX <- tax_table(tax_mat)
SAM <- sam
ps <- phyloseq(OTU, TAX, SAM)
# 5) Quick summary
ps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 250 taxa and 100 samples ]
## sample_data() Sample Data: [ 100 samples by 3 sample variables ]
## tax_table() Taxonomy Table: [ 250 taxa by 6 taxonomic ranks ]
# sample_sums(ps)[1:5]
# taxa_sums(ps)[1:5]
# head(tax_table(ps))
# sample_data(ps)[1:5,]
#Rarefaction
# Get OTU/ASV table with samples as rows
otu <- as(otu_table(ps), "matrix")
if (taxa_are_rows(ps)) otu <- t(otu)
# Total reads per sample
tot <- rowSums(otu)
# Choose max depth and step for curves
step <- 100
raremax <- 20000 # or set a fixed value you prefer
depths <- seq(step, raremax, by = step)
# Compute expected richness at each depth for each sample
# vegan::rarefy returns NA for depths > sample total; we'll drop those.
rar_list <- lapply(seq_len(nrow(otu)), function(i) {
exp_s <- suppressWarnings(vegan::rarefy(otu[i, , drop = FALSE], sample = depths))
data.frame(Sample = rownames(otu)[i], Reads = depths, Richness = as.numeric(exp_s))
})
df_rar <- bind_rows(rar_list) %>%
filter(!is.na(Richness))
# Join sample metadata if available
if (!is.null(sample_data(ps, errorIfNULL = FALSE))) {
md <- as(sample_data(ps), "data.frame") %>% rownames_to_column("Sample")
df_rar <- df_rar %>% left_join(md, by = "Sample")
}
# Plot
p <- ggplot(df_rar, aes(x = Reads, y = Richness, group = Sample)) +
geom_line(alpha = 0.6,col="#2F6F72") +
labs(
title = "Rarefaction curves",
x = "Subsampled reads",
y = "Observed richness (S)"
) +
theme_minimal()
# Optional: color by a metadata column, e.g., "Group"
if ("Group" %in% names(df_rar)) {
p <- p + aes(color = Group)
}
# Optional: label endpoints
endpoints <- df_rar %>%
group_by(Sample) %>%
slice_max(Reads, n = 1, with_ties = FALSE) %>%
ungroup()
p <- p+ geom_vline(xintercept = 10000, color = "blue", linetype = "dashed", linewidth = 0.5)
p
# Remove low-prevalence taxa, low-depth samples.
# Document choices ( keep conservative to avoid data leakage norms).
# Minimum sample depth
min_depth <- 10000
ps <- prune_samples(sample_sums(ps) >= min_depth, ps)
# Prevalence filter: keep taxa present in ≥ X% samples
prev_cutoff <- 0.05
prevalence <- apply(otu_table(ps), 1, function(x) sum(x > 0) / length(x))
ps <- prune_taxa(prevalence >= prev_cutoff, ps)
ps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 246 taxa and 50 samples ]
## sample_data() Sample Data: [ 50 samples by 3 sample variables ]
## tax_table() Taxonomy Table: [ 246 taxa by 6 taxonomic ranks ]
# Alpha diversity measures
alpha_df <- estimate_richness(ps, measures = c("Observed", "Shannon", "Simpson"))
# Add metadata
alpha_df <- alpha_df %>%
rownames_to_column("SampleID") %>%
left_join(sample_data(ps) %>% data.frame() %>% rownames_to_column("SampleID"),
by = "SampleID")
head(alpha_df)
## SampleID Observed Shannon Simpson Diet Activity AgeGroup
## 1 S006 211 4.512662 0.9831177 Vegan low 18-29
## 2 S007 220 4.548603 0.9833893 Vegan medium 18-29
## 3 S008 217 4.534260 0.9823765 Vegetarian medium 45-59
## 4 S010 210 4.482464 0.9813063 Vegetarian low 18-29
## 5 S016 213 4.548544 0.9835540 Vegan high 18-29
## 6 S020 219 4.575772 0.9839758 Vegan medium 45-59
library(ggplot2)
library(ggpubr)
# Example: Shannon by Diet
p_shannon <- ggplot(alpha_df, aes(x = Diet, y = Shannon, fill = Diet)) +
geom_boxplot(alpha = 0.8) +
geom_jitter(width = 0.2, alpha = 0.6, size = 1.5) +
stat_compare_means(method = "kruskal.test", label.y = max(alpha_df$Shannon) + 0.2) +
labs(title = "Alpha diversity (Shannon index) by Diet") +
theme_bw()+
scale_color_earth15() +
scale_fill_earth15()
ggsave("figures/alpha_diversityDiet.png", p_shannon, width=8, height=5, dpi=300)
p_shannon
# Example: Shannon by Diet
p_shannonA <- ggplot(alpha_df, aes(x = AgeGroup, y = Shannon, fill = AgeGroup)) +
geom_boxplot(alpha = 0.8) +
geom_jitter(width = 0.2, alpha = 0.6, size = 1.5) +
stat_compare_means(method = "kruskal.test", label.y = max(alpha_df$Shannon) + 0.2) +
labs(title = "Alpha diversity (Shannon index) by Age") +
theme_bw()+
scale_color_earth15() +
scale_fill_earth15()
ggsave("figures/alpha_shannon.png", p_shannonA, width=8, height=5, dpi=300)
p_shannonA
alpha_long <- alpha_df %>%
pivot_longer(cols = c("Observed","Shannon","Simpson"),
names_to = "Index", values_to = "Value")
p_alpha <- ggplot(alpha_long, aes(x = Diet, y = Value, fill = Diet)) +
geom_boxplot(alpha = 0.8) +
geom_jitter(width = 0.2, alpha = 0.6, size = 1) +
facet_wrap(~Index, scales = "free_y") +
stat_compare_means(method = "kruskal.test", label = "p.signif", hide.ns = TRUE) +
labs(title = "Alpha diversity across diets") +
theme_bw()+
scale_color_earth15() +
scale_fill_earth15()
ggsave("figures/alpha_diversity.png", p_alpha, width=8, height=5, dpi=300)
p_alpha
# Crearing long table from ps_rel
ps_rel <- transform_sample_counts(ps, function(x) x / sum(x))
df_gen <- phyloseq::psmelt(ps_rel)
# Ensure Diet exists
if (!"Diet" %in% names(df_gen)) df_gen$Diet <- "Unknown"
# Keep top N genera: in this case 15 genera
topN <- 15
top_genera <- df_gen %>%
group_by(Genus) %>%
summarise(mean_abund = mean(Abundance, na.rm = TRUE), .groups = "drop") %>%
slice_max(mean_abund, n = topN) %>%
pull(Genus)
df_gen <- df_gen %>%
mutate(
Genus = ifelse(is.na(Genus) | Genus == "", "Unclassified", Genus),
Genus_lumped = ifelse(Genus %in% top_genera, Genus, "Other")
)
# Order genera by global mean
gen_order <- df_gen %>%
group_by(Genus_lumped) %>%
summarise(mean_abund = mean(Abundance, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(mean_abund)) %>%
pull(Genus_lumped)
df_gen$Genus_lumped <- factor(df_gen$Genus_lumped, levels = gen_order)
# Create a stable per-facet sample ordering
order_df <- df_gen %>%
distinct(Sample, Diet) %>%
arrange(Diet, Sample) %>%
mutate(Sample_ordered = factor(Sample, levels = Sample))
# Join the ordering back
df_gen <- df_gen %>%
left_join(order_df, by = c("Sample", "Diet"))
# Plotting
p_bar <- ggplot(df_gen, aes(x = Sample_ordered, y = Abundance, fill = Genus_lumped)) +
geom_col(width = 0.95) +
facet_grid(rows = vars(Diet), scales = "free_y", space = "free_y") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
labs(title = "Relative abundance (Genus) per sample",
x = "Samples (ordered within Diet)", y = "Relative abundance",
fill = "Genus") +
theme_bw() +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
panel.grid = element_blank(),
strip.text = element_text(face = "bold")
) +
scale_fill_earth15()+
coord_flip()
p_bar
ggsave("figures/relabundance_genus_per_sample_by_diet.png",
p_bar, width = 8, height = 15, dpi = 300)
# Bray–Curtis distance & PCoA
ord_bc <- ordinate(ps_rel, method = "PCoA", distance = "bray")
# Variance explained for axis labels
evals <- ord_bc$values$Eigenvalues
pct <- round(100 * evals / sum(evals), 1)
# Plot
p_bc <- plot_ordination(ps_rel, ord_bc, color = "AgeGroup") +
geom_point( size = 3, alpha = 0.8) +
stat_ellipse(aes(group = Diet, color = Diet), linewidth = 0.8) +
labs(title = "PCoA — Bray–Curtis",
x = paste0("PCo1 [", pct[1], "%]"),
y = paste0("PCo2 [", pct[2], "%]")) +
theme_bw()+
labs(shape="Activity level")+
scale_color_earth15() +
scale_fill_earth15()
ggsave("figures/Beta_divAge.png", p_bc, width=8, height=5, dpi=300)
p_bc
p_bA <- plot_ordination(ps_rel, ord_bc, color = "Diet") +
geom_point(aes(shape = Activity), size = 3, alpha = 0.8) +
stat_ellipse(aes(group = Diet, color = Diet), linewidth = 0.8) +
labs(title = "PCoA — Bray–Curtis",
x = paste0("PCo1 [", pct[1], "%]"),
y = paste0("PCo2 [", pct[2], "%]")) +
theme_bw()+
labs(shape="Beta diveristy - Bray Curtis")+
scale_color_earth15() +
scale_fill_earth15()
ggsave("figures/Beta_diet.png", p_bc, width=8, height=5, dpi=300)
p_bA
library(vegan)
# Distance matrix
dist_bc <- phyloseq::distance(ps_rel, method = "bray")
# PERMANOVA (adonis2)
adon_res <- adonis2(as.matrix(dist_bc) ~ Diet, data = sample_data(ps_rel) %>% data.frame())
adon_res
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = as.matrix(dist_bc) ~ Diet, data = sample_data(ps_rel) %>% data.frame())
## Df SumOfSqs R2 F Pr(>F)
## Model 2 0.42798 0.62401 39.001 0.001 ***
## Residual 47 0.25788 0.37599
## Total 49 0.68585 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test homogeneity of group dispersions
bd <- betadisper(as.dist(dist_bc), group = sample_data(ps_rel)$Diet)
anova(bd)
## Analysis of Variance Table
##
## Response: Distances
## Df Sum Sq Mean Sq F value Pr(>F)
## Groups 2 0.0000995 4.9729e-05 0.4194 0.6599
## Residuals 47 0.0055725 1.1857e-04
plot(bd)
library(DESeq2)
library(phyloseq)
# Agglomerate to Genus (optional: can also do Phylum or Family)
ps_genus <- tax_glom(ps, taxrank = "Species")
# Make sure sample_data has factors (important for DESeq2 design)
sample_data(ps_genus)$Diet <- factor(sample_data(ps_genus)$Diet)
dds <- phyloseq_to_deseq2(ps_genus, ~ Diet)
## converting counts to integer mode
# Calculate size factors (normalize for sequencing depth)
dds <- DESeq(dds, test = "Wald", fitType = "parametric")
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
BiocManager::install("apeglm")
## Bioconductor version 3.21 (BiocManager 1.30.26), R 4.5.0 (2025-04-11 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'apeglm'
## Installation paths not writeable, unable to update packages
## path: C:/Program Files/R/R-4.5.0/library
## packages:
## boot, lattice, Matrix, mgcv
## Old packages: 'aplot', 'broom', 'cli', 'credentials', 'dbplyr', 'DESeq2',
## 'dtplyr', 'ggplot2', 'googledrive', 'magrittr', 'purrr', 'RcppArmadillo',
## 'stringr', 'tibble', 'xfun'
library(apeglm)
res <- results(dds, contrast = c("Diet", "Vegan", "Carnivore"))
res <- lfcShrink(dds, coef = "Diet_Vegan_vs_Carnivore", res = res) # shrinks log2FC
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
# Order by adjusted p-value
res_sig <- res[which(res$padj < 0.05), ]
res_sig <- res_sig[order(res_sig$padj), ]
head(res_sig)
## log2 fold change (MAP): Diet Vegan vs Carnivore
## Wald test p-value: Diet Vegan vs Carnivore
## DataFrame with 6 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## OTU0030 356.2332 -0.928139 0.0424042 3.83902e-107 1.42044e-105
## OTU0229 1046.1543 0.725086 0.0453361 2.64158e-58 4.88693e-57
## OTU0099 627.9742 -0.725663 0.0585970 6.94704e-36 8.56802e-35
## OTU0095 599.2641 -0.319581 0.0267394 3.92580e-33 3.63137e-32
## OTU0104 118.3157 1.124072 0.1006569 5.07037e-30 3.75208e-29
## OTU0146 40.5562 1.081001 0.1240993 2.43561e-19 1.30771e-18
tax <- as.data.frame(tax_table(ps_genus))
res_tax <- cbind(as(res_sig, "data.frame"), tax[rownames(res_sig), ])
head(res_tax)
## baseMean log2FoldChange lfcSE pvalue padj
## OTU0030 356.23324 -0.9281386 0.04240417 3.839021e-107 1.420438e-105
## OTU0229 1046.15435 0.7250861 0.04533609 2.641584e-58 4.886930e-57
## OTU0099 627.97421 -0.7256628 0.05859703 6.947042e-36 8.568018e-35
## OTU0095 599.26411 -0.3195809 0.02673944 3.925804e-33 3.631368e-32
## OTU0104 118.31574 1.1240724 0.10065692 5.070373e-30 3.752076e-29
## OTU0146 40.55617 1.0810010 0.12409930 2.435607e-19 1.307709e-18
## Phylum Class Order Family
## OTU0030 Actinobacteria Actinobacteria Bifidobacteriales Bifidobacteriaceae
## OTU0229 Fusobacteria Fusobacteriia Fusobacteriales Fusobacteriaceae
## OTU0099 Bacteroidetes Bacteroidia Bacteroidales Prevotellaceae
## OTU0095 Proteobacteria Alphaproteobacteria Rhodobacterales Rhodobacteraceae
## OTU0104 Firmicutes Bacilli Lactobacillales Streptococcaceae
## OTU0146 Firmicutes Clostridia Clostridiales Lachnospiraceae
## Genus Species
## OTU0030 Bifidobacterium Bifidobacterium_sp.
## OTU0229 Fusobacterium Fusobacterium_unclassified
## OTU0099 Prevotella Prevotella_unclassified
## OTU0095 Rhodobacter Rhodobacter_spp.
## OTU0104 Streptococcus Streptococcus_unclassified
## OTU0146 Blautia Blautia_unclassified
library(ggplot2)
DEs <- ggplot(res_tax, aes(x = log2FoldChange, y = -log10(padj), color = Genus)) +
geom_point(alpha = 0.7, size = 3) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed") +
geom_vline(xintercept = c(0, 0), linetype = "dashed") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed") +
theme_bw() +
labs(title = "Differential abundance: Vegan vs Carnivore",
x = "Log2 Fold Change", y = "-log10 adjusted p-value")+
scale_color_earth15() +
scale_fill_earth15()
ggsave("figures/Differential_abundance.png", DEs, width=8, height=5, dpi=300)
DEs
# kill legends on the others
p_shannon_nl <- p_shannon + theme(legend.position = "none")
p_shannonA_nl <- p_shannonA + theme(legend.position = "none")
p_alpha_nl <- p_alpha + theme(legend.position = "none")
combined1 <- (
p_bar| (p / (p_alpha_nl) /
(p_shannon_nl | p_shannonA_nl) /
(p_bc | DEs))
) +
plot_layout(guides = "collect") &
theme(legend.position = "right")
combined1
ggsave("figures/Combi.png", combined1, width=20, height=15, dpi=300)