The core microbiome was defined by a high occurrence rate (≥ 80%) across samples from mesophilic anaerobic processes. These genera were evaluated for significant differences across methane yield categories using the Kruskal-Wallis and Wilcoxon rank-sum tests.
# Load packages
library(pheatmap)
library(tibble)
library(readxl)
library(ggplot2)
library(vegan)
library(reshape2)
library(ggtext)# Load presence/absence data
pres_abs_genus_df <- read_xlsx(here("data", "meta-analysis-microbial-data.xlsx"),
sheet = "genera_abs_count") %>%
column_to_rownames("genera") %>%
# Delete samples from thermophilic processes and those with archaeal data only
dplyr::select(-"F4_4",-"F4_10",-"F27_16",-"F4_11",
-"F27_1",-"F27_14", -"F27_15", -"F4_1",
-"F4_6", -"F4_8", -"F27_11", -"F27_5",
-"F27_13", -"F4_5", -"F4_7", -"F27_8",
-"F25_5", -"F27_2", -"F27_3", -"F27_10",
-"F27_9", -"F4_12", -"F27_6", -"F4_9",
-"F4_3", -"F27_4", -"F4_2", -"F27_12",
-"F27_7", -"F25_7", -"F17_5", -"F17_3",
-"F17_6") %>%
t() %>%
as.data.frame() %>%
mutate(across(everything(), ~ ifelse(. > 0, 1, 0))) # Presence-AbsenceSelect core microbiome
# Remove singleton values (genera present in none or only one sample)
pres_abs_genus_trim_df <- pres_abs_genus_df[, which(colSums(pres_abs_genus_df) > 1)]
# Calculate the percentage of presence for each genus
pres_abs_genus_perc_df <- colSums(pres_abs_genus_trim_df)/nrow(pres_abs_genus_trim_df)
# Select genera present in at least 80% of the samples.
core_microbiome <- names(pres_abs_genus_perc_df[pres_abs_genus_perc_df >= 0.8])
# "No_identified_Bact" refers to unknown genera that were excluded from the
# core microbiome in the study
core_microbiome <- core_microbiome[core_microbiome != "No_identified_Bact"]
core_microbiome## [1] "Aminobacterium" "Clostridium" "HA73" "Methanoculleus"
## [5] "Methanosarcina" "Pelotomaculum" "Syntrophomonas" "T78"
# Label the names of the core microbiome with an asterisk (*)
colnames(pres_abs_genus_trim_df) <- ifelse(colnames(pres_abs_genus_trim_df) %in%
core_microbiome,
paste0("*", colnames(pres_abs_genus_trim_df)),
colnames(pres_abs_genus_trim_df))
# Create the heatmap to visualize the core microbiome
pheatmap(pres_abs_genus_trim_df,
clustering_distance_cols = "euclidean",
clustering_distance_rows = "euclidean",
clustering_method = "ward.D2",
fontsize = 3.5,
color = hcl.colors(2, "Geyser", rev = TRUE),
legend_breaks = c(0, 1))Presence–absence matrix for the core microbiome at the genus level. The genera belonging to the core microbiome are highlighted. Blue-green squares means presence and orange squares means absence.
Evaluation of core microbiome
# Load data
relat_abun_genus_cl_df <- read_xlsx(here("data", "meta-analysis-microbial-data.xlsx"),
sheet = "genera_abs_count") %>%
column_to_rownames("genera") %>%
# Delete samples from thermophilic processes and those with archaeal data only
dplyr::select(-"F4_4",-"F4_10",-"F27_16",-"F4_11",
-"F27_1",-"F27_14",-"F27_15", -"F4_1",
-"F4_6", -"F4_8", -"F27_11", -"F27_5",
-"F27_13", -"F4_5", -"F4_7", -"F27_8",
-"F25_5", -"F27_2", -"F27_3", -"F27_10",
-"F27_9", -"F4_12", -"F27_6", -"F4_9",
-"F4_3", -"F27_4", -"F4_2", -"F27_12",
-"F27_7", -"F25_7", -"F17_5", -"F17_3",
-"F17_6") %>%
# Relative abundance
{sweep(., 2, colSums(.), FUN = "/")} %>%
t() %>%
as.data.frame()
# Load database with methane yield categories
CH4_yield_df <- readRDS(file = here("rds","CH4_yield_df.rds")) %>%
column_to_rownames("sample_id")
# Add the 'freedman_class' column with the methane yield categories
relat_abun_genus_cl_df$freedman_class <- CH4_yield_df$freedman_class[match(rownames(relat_abun_genus_cl_df), rownames(CH4_yield_df)) ]
# Remove rows with NA values in methane yield categories
relat_abun_genus_cl_df <- relat_abun_genus_cl_df[!is.na(relat_abun_genus_cl_df$freedman_class),]# Transformation and normalization of the relative abundance of core microbiome genera.
relat_abun_core_trans <- relat_abun_genus_cl_df %>%
dplyr::select(-"freedman_class") %>%
dplyr::select(c(core_microbiome)) %>%
decostand(method = "hellinger") %>%
rbind()
# Add the 'freedman_class' column to the transformed matrix
relat_abun_core_trans$freedman_class <- CH4_yield_df$freedman_class[match(rownames(relat_abun_core_trans),rownames(CH4_yield_df))]
# Convert from 'wide' to 'long' format for ggplot2
relat_abun_core_trans <- melt(relat_abun_core_trans)
# Convert CH4 yield categories en factor class
freedman_levels <- c("C1","C2","C3","C4","C5","C6")
relat_abun_core_trans$freedman_class <- factor(relat_abun_core_trans$freedman_class,
levels = freedman_levels)# Create an empty list to store the results of significance tests and figures.
results <- list()
# Iteration over each variable
for (var in core_microbiome) {
core_data <- relat_abun_core_trans %>%
dplyr::filter(variable == var) %>%
dplyr::select(variable, freedman_class, value) %>%
stats::na.omit()
# Kruskal-Wallis Test
kruskal <- kruskal.test(value ~ freedman_class, data = core_data)
# Wilcoxon test with BH correction
wilcoxon <- pairwise.wilcox.test(core_data$value, core_data$freedman_class, p.adjust.method = "BH")
# Box-and-whisker plot
fixed_colors <- c("#cc3300", "#ff9966", "#004cff", "#ffcc00", "#99cc33", "#339900")
p <- ggplot(core_data, aes(x = freedman_class, y = value, color = freedman_class)) +
geom_boxplot(outlier.shape = NA, outlier.size = 0.5, notch = FALSE) +
geom_jitter(width = 0.2) +
scale_color_manual(values = setNames(fixed_colors, freedman_levels), drop = FALSE) +
theme_bw() +
labs(title = var) +
theme(plot.title = element_markdown(size = 10, face = "bold"))
# Save results in a list
results[[var]] <- list(
kruskal = kruskal,
wilcoxon = wilcoxon,
figure = p
)
}📈 Statistical analysis for Aminobacterium
🔹 Kruskal-Wallis test
Kruskal-Wallis rank sum test
data: value by freedman_class
Kruskal-Wallis chi-squared = 8.8862, df = 5, p-value = 0.1137
🔹 Wilcoxon test
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: core_data$value and core_data$freedman_class
C1 C2 C3 C4 C5
C2 0.190 - - - -
C3 0.065 0.442 - - -
C4 0.209 0.963 0.819 - -
C5 0.311 0.751 0.819 0.751 -
C6 0.131 0.771 0.243 0.751 0.442
P value adjustment method: BH
📈 Statistical analysis for Clostridium
🔹 Kruskal-Wallis test
Kruskal-Wallis rank sum test
data: value by freedman_class
Kruskal-Wallis chi-squared = 11.731, df = 5, p-value = 0.03866
🔹 Wilcoxon test
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: core_data$value and core_data$freedman_class
C1 C2 C3 C4 C5
C2 0.060 - - - -
C3 0.079 0.740 - - -
C4 0.060 0.740 0.740 - -
C5 0.017 0.979 0.740 0.740 -
C6 0.017 0.607 0.740 0.607 0.740
P value adjustment method: BH
📈 Statistical analysis for HA73
🔹 Kruskal-Wallis test
Kruskal-Wallis rank sum test
data: value by freedman_class
Kruskal-Wallis chi-squared = 10.757, df = 5, p-value = 0.05642
🔹 Wilcoxon test
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: core_data$value and core_data$freedman_class
C1 C2 C3 C4 C5
C2 0.16 - - - -
C3 0.20 0.31 - - -
C4 0.16 0.51 0.54 - -
C5 0.31 0.40 1.00 0.57 -
C6 0.16 1.00 0.16 0.31 0.16
P value adjustment method: BH
📈 Statistical analysis for Methanoculleus
🔹 Kruskal-Wallis test
Kruskal-Wallis rank sum test
data: value by freedman_class
Kruskal-Wallis chi-squared = 4.2313, df = 5, p-value = 0.5166
🔹 Wilcoxon test
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: core_data$value and core_data$freedman_class
C1 C2 C3 C4 C5
C2 0.98 - - - -
C3 1.00 1.00 - - -
C4 0.78 0.78 0.78 - -
C5 1.00 0.78 0.97 0.78 -
C6 1.00 0.78 0.89 0.89 0.97
P value adjustment method: BH
📈 Statistical analysis for Methanosarcina
🔹 Kruskal-Wallis test
Kruskal-Wallis rank sum test
data: value by freedman_class
Kruskal-Wallis chi-squared = 12.536, df = 5, p-value = 0.02814
🔹 Wilcoxon test
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: core_data$value and core_data$freedman_class
C1 C2 C3 C4 C5
C2 0.056 - - - -
C3 0.056 0.521 - - -
C4 0.085 0.274 0.645 - -
C5 0.064 0.219 0.740 0.795 -
C6 0.035 0.134 0.795 0.740 0.795
P value adjustment method: BH
📈 Statistical analysis for Pelotomaculum
🔹 Kruskal-Wallis test
Kruskal-Wallis rank sum test
data: value by freedman_class
Kruskal-Wallis chi-squared = 5.699, df = 5, p-value = 0.3366
🔹 Wilcoxon test
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: core_data$value and core_data$freedman_class
C1 C2 C3 C4 C5
C2 0.802 - - - -
C3 0.802 0.085 - - -
C4 1.000 0.622 0.634 - -
C5 1.000 0.634 0.802 1.000 -
C6 1.000 0.802 0.965 1.000 1.000
P value adjustment method: BH
📈 Statistical analysis for Syntrophomonas
🔹 Kruskal-Wallis test
Kruskal-Wallis rank sum test
data: value by freedman_class
Kruskal-Wallis chi-squared = 7.6925, df = 5, p-value = 0.174
🔹 Wilcoxon test
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: core_data$value and core_data$freedman_class
C1 C2 C3 C4 C5
C2 0.83 - - - -
C3 0.87 0.61 - - -
C4 0.57 0.45 0.71 - -
C5 0.45 0.45 0.71 0.96 -
C6 0.10 0.14 0.45 0.45 0.45
P value adjustment method: BH
📈 Statistical analysis for T78
🔹 Kruskal-Wallis test
Kruskal-Wallis rank sum test
data: value by freedman_class
Kruskal-Wallis chi-squared = 16.823, df = 5, p-value = 0.004849
🔹 Wilcoxon test
Pairwise comparisons using Wilcoxon rank sum test with continuity correction
data: core_data$value and core_data$freedman_class
C1 C2 C3 C4 C5
C2 0.017 - - - -
C3 0.021 0.274 - - -
C4 0.023 0.274 0.978 - -
C5 0.017 0.250 0.816 0.816 -
C6 0.017 0.816 0.274 0.274 0.274
P value adjustment method: BH
Abundance of significant genera in the core microbiome according to methane yield categories.
Differential analyses, including ALDEx2, DESeq, edgeR, and LefSe, were performed to evaluate the abundance of the entire microbial community across methane yield categories. It has been suggested that when performing differential analyses, it is important to consider the potential differences in statistical data distributions, transformations, and hypothesis tests corresponding to each analysis. This allowed for the identification of changes in the robustness between potential indicators. Considering the theoretical CH4 yield (~ 400 mL CH4/gVSadd) and the methane yield categories we defined two groups: low CH4 yields (methane yield categories C1, C2, and C3, encompassing intervals < 400 mL CH4/gVSadd) and high CH4 yields (methane yield categories C4, C5, and C6, intervals > 400 mL CH4/gVSadd). Robust attributes were those that appeared in at least two differential analyses. A Venn diagram was used to indicate overlapping taxa in differential analyses.
# Load packages
library(ALDEx2)
library(vegan)
library(DESeq2)
library(edgeR)
library(lefser)
library(ggvenn)
library(stringr)# Load absolute counts
genera_abs_df <- read_xlsx(here("data", "meta-analysis-microbial-data.xlsx"), sheet = "genera_abs_count") %>%
column_to_rownames("genera") %>%
dplyr::select(-"F25_7", -"F17_5", -"F17_3", -"F17_6") %>%
t() %>%
as.data.frame()
# Add the 'freedman_class' column with the methane yield categories
genera_abs_df$freedman_class <- CH4_yield_df$freedman_class[match(rownames(genera_abs_df), rownames(CH4_yield_df))]
# Load rarefied counts from rds object (object created in the Part1)
genera_rar_df <- readRDS(file = here("rds","genera_rarefied_counts.rds")) %>%
t() %>%
as.data.frame()
# Add the 'freedman_class' column with the methane yield categories
genera_rar_df$freedman_class <- CH4_yield_df$freedman_class[match(rownames(genera_rar_df), rownames(CH4_yield_df))]# Data preprocessing
# Delete NA in methane yield categories and automatically convert columns class (character, numeric) according to the values
# Absolute data
genera_abs_df <- na.omit(genera_abs_df) %>%
t() %>%
as.data.frame() %>%
mutate_all(as.numeric) %>%
na.omit()
# Rarefied data
genera_rar_df <- na.omit(genera_rar_df) %>%
t() %>%
as.data.frame() %>%
mutate_all(as.numeric) %>%
na.omit()
# Select rows from CH4_yield_df whose row names match the columns of genera_abs_df,
# and group the methane yield categories into two levels for differential analysis
CH4_yield_sub <- CH4_yield_df[rownames(CH4_yield_df) %in% colnames(genera_abs_df), ] %>%
mutate(CH4_yield_level = case_when(
freedman_class %in% c("C1", "C2", "C3") ~ "low",
freedman_class %in% c("C4", "C5", "C6") ~ "high",
TRUE ~ NA_character_))%>%
dplyr::select("YCH4_mLgVS", "freedman_class", "CH4_yield_level","reference") %>%
mutate(reference = reference %>%
str_replace_all("\\.", "") %>%
str_replace_all(" ", "_")) %>%
mutate_all(~ type.convert(as.character(.), as.is = TRUE))
# Reorder colData to match countData column order
CH4_yield_sub <- CH4_yield_sub[colnames(genera_abs_df), ]ALDEx2
# Seed
set.seed(123)
# Perform centered log-ratio (CLR) transformation and generate Monte Carlo samples
x.clr <- aldex.clr(reads = genera_abs_df, conds = CH4_yield_sub$CH4_yield_level, mc.samples=128, denom = "zero",
verbose = FALSE)
# Perform Welch’s t-test and Wilcoxon rank test for differential abundance
x_tt <- aldex.ttest(x.clr, paired.test = FALSE, verbose = FALSE)
# Estimate effect sizes and confidence intervals for each genera
x_effect <- aldex.effect(x.clr, CI = TRUE, verbose = FALSE)
# Combine the statistical test results and effect size estimates into a single data frame
x.all <- data.frame(x_tt, x_effect)
# Generate a csv archive (for Venn diagram analysis)
write.csv(x.all, here("outs", "ALDEX2.csv"))DESeq2
# Create a DESeq2 dataset from genus-level count data and sample metadata,
# using CH4 yield level as the design factor
dds <- DESeqDataSetFromMatrix(countData = genera_abs_df,
colData = CH4_yield_sub,
design = ~ CH4_yield_level)
# Choose factor levels. DESeq2 functions require specifying the reference level for comparison.
dds$CH4_yield_level <- factor(dds$CH4_yield_level, levels = c("low","high"))
dds$reference <- factor(dds$reference, levels =
c("Svensson_et_al_2018","Peng_et_al_2018","Basak_et_al_2021",
"He_et_al_2017","Lim_et_al_2020","Zhang_et_al_2020",
"Amha_et_al_2017"))
# Runs the DESeq2 pipeline to estimate size factors, dispersions, and fit the
# model for differential analysis
dds <- DESeq(dds)
# Include the "reference" factor as it may influence microbial community selection.
# In experiments with many samples (e.g., 50 or 100), variation affecting observed counts is likely.
# Adding variables to the design helps control additional variation in the counts.
levels(dds$reference) <- sub("-.*", "", levels(dds$reference))
# Re-run the DESeq object to perform the analysis again using a multifactorial design.
# Since the categories are of primary interest, they are placed last in the formula.
design(dds) <- formula(~ reference + CH4_yield_level)
ddsMF <- DESeq(dds)
# Extract results from DESeq analysis
res <- results(ddsMF)
# Generate a csv archive
write.csv(as.data.frame(res), here("outs", "DESeq2.csv"))EdgeR
# Seed
set.seed(123)
# Create a DGEList object from the absolute abundance data (genera_abs_df).
# The 'samples' argument links metadata (CH4_yield_sub) to each sample.
# The 'group' argument defines the experimental grouping variable (CH4_yield_level)
dgeGlm <- DGEList(counts = genera_abs_df,
samples = CH4_yield_sub,
group = CH4_yield_sub$CH4_yield_level)
# Calculate normalization factors to scale raw library sizes,
# accounting for compositional differences between samples.
dgeGlm <- calcNormFactors(dgeGlm)
# Estimate the common dispersion across all genera, assuming a shared dispersion value.
dgeGlm <- estimateCommonDisp(dgeGlm)
# Estimate genera-specific (tagwise) dispersions based on the common dispersion
# as a starting point.
dgeGlm <- estimateTagwiseDisp(dgeGlm)
# Perform an exact test on the DE analysis using the `dgeGlm` object.
# The test is conducted between the "high" and "low" groups,
# with the dispersion method set to "tagwise" (which uses tagwise dispersion estimates).
dex <- exactTest(dgeGlm, pair=c("high","low"), dispersion="tagwise")
# Adjusts the p-values using the Benjamini-Hochberg (BH) method to control for
# false discovery rate (FDR).
fdrvalues <- p.adjust(dex$table$PValue, method='BH')
# The adjusted p-values are then added to the 'dex$table' as a new column 'fdrvalues'.
dex$table$fdr <- fdrvalues
# Picking and FDR cutoff
summary(decideTestsDGE(dex,p=0.05))
summary(decideTestsDGE(dex,p=0.01))
summary(decideTestsDGE(dex,p=0.005))
# Perform differential expression analysis using the decideTestsDGE function with a p-value cutoff
# of 0.05 and BH (Benjamini-Hochberg) adjustment for multiple testing.
de <- decideTestsDGE(dex, p = 0.05, adjust = "BH")
# Extract the row names (genera names) from the 'dex' object where the corresponding test result is
# significant (p-value < 0.05) and store them in 'detags'.
detags <- rownames(dex)[as.logical(de)]
# Convert the 'dex' object to a dataframe
df <- as.data.frame(dex)
# Export the results to a CSV file
out <- topTags(dex, n=Inf)
write.csv(out, here("outs", "EdgeR.csv"))LEfSe
# Seed
set.seed(123)
# Create a SummarizedExperiment object using the counts matrix from genera_rar_df and the associated column data from CH4_yield_sub.
data.se <- SummarizedExperiment(list(counts=as.matrix(genera_rar_df)), colData = CH4_yield_sub)
# Perform LEfSe analysis on the data, using "CH4_yield_level" as the grouping column.
# The LDA threshold is set to 1.5, meaning features with an LDA score greater than 1.5
# will be considered as significantly different between the groups.
res_group <- lefser(data.se, groupCol = "CH4_yield_level", lda.threshold = 1.5)
# Export the results to a CSV file
write.csv(res_group, here("outs", "LEfSe.csv"))Venn diagram
# Load the results data from the differential analysis
ALDEx2_0.05 <- read.csv(here("outs", "ALDEx2.csv")) %>%
as.data.frame() %>%
dplyr::filter(wi.eBH < 0.05) %>%
dplyr::select(1)
DESeq2_0.05 <- read.csv(here("outs", "DESeq2.csv")) %>%
as.data.frame() %>%
dplyr::filter(padj < 0.05) %>%
dplyr::select(1)
EdgeR_0.05 <- read.csv(here("outs", "EdgeR.csv")) %>%
as.data.frame() %>%
dplyr::filter(FDR < 0.05) %>%
dplyr::select(1)
LefSE_0.05 <- read.csv(here("outs", "LEfSe.csv")) %>%
as.data.frame() %>%
dplyr::select(2)# Create a list called 'genera_list' containing the results from four different methods
genera_list <- list(
DeSeq2 = DESeq2_0.05$X,
EdgeR = EdgeR_0.05$X,
LefSE = LefSE_0.05$features,
ALDEx2 = ALDEx2_0.05$X)
# This code creates a Venn diagram using the 'ggvenn' function to visualize the overlap
# between different set
ggvenn(genera_list,
fill_color = c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF"),
stroke_size = 0.5, set_name_size = 4)A Venn diagram illustrates the number of genera obtained and shared between differential analysis methods (EdgeR, LefSe, DESeq2, and ALDEx2).
Determination of robust fold change genera
# Create a list with genus intersections in at least two differential analysis
pairwise_intersections <- combn(names(genera_list), 2, simplify = FALSE, FUN = function(pair) {
inters <- intersect(genera_list[[pair[1]]], genera_list[[pair[2]]])
setNames(list(inters), paste(pair, collapse = "_and_"))
})
# Combines all elements of the list 'pairwise_intersections' into a single vector
pairwise_intersections <- do.call(c, pairwise_intersections)
# Find genera with robust fold change attribute
robust_genera <- unique(unlist(pairwise_intersections))
robust_genera## [1] "Corynebacterium" "Lactobacillus" "Prevotella"
## [4] "Methanothermobacter" "W5" "Sphaerochaeta"
## [7] "Sedimentibacter"
# Transposes the 'genera_abs_df' dataframe and then converts the transposed matrix into a new dataframe.
df_robu_genera <- genera_abs_df %>%
t() %>%
as.data.frame()
# Add information on methane yield to the table of absolute abundances
df_robu_genera <- cbind(df_robu_genera,
CH4_yield_sub[rownames(df_robu_genera),
c("YCH4_mLgVS", "freedman_class", "CH4_yield_level")])
# Add a column with sample names using the row names
df_robu_genera <- df_robu_genera %>%
dplyr::mutate(sampleID = rownames(df_robu_genera)) %>%
dplyr::relocate(sampleID, .before = 1)
# Convert the dataframe from wide format to long format
df_robu_genera <- melt(df_robu_genera)
# Select genera with robust fold change
sub_robu_genera <- subset(df_robu_genera, variable %in%
robust_genera)# Average and standard deviation function
avg_sd <- function(data, varname, groupnames){
require(plyr)
summary_func <- function(x, col){
c(mean = mean(x[[col]], na.rm=TRUE),
sd = sd(x[[col]], na.rm=TRUE))
}
data_sum<-ddply(data, groupnames, .fun=summary_func,
varname)
data_sum <- rename(data_sum, c("mean" = varname))
return(data_sum)
}# Averages and standard deviations of genera abundance at low and high CHâ‚„ yields
counts_data_p <- avg_sd(sub_robu_genera, varname="value",
groupnames=c("CH4_yield_level", "variable"))
# Create a vector with the desired order for plotting the key taxa
# (Methanothermobacter was excluded as it is an archaeon associated with thermophilic processes)
genera_order <- c("Sedimentibacter","Sphaerochaeta","Prevotella",
"Corynebacterium","W5","Lactobacillus")
# Transform genera as factor
counts_data_p$variable <- factor(counts_data_p$variable, levels = genera_order)
# Create a vector with the desired order for plotting the methane yield groups
class_order <- c("high", "low")
# Transform methane yield group as factor
counts_data_p$CH4_yield_level <- factor(counts_data_p$CH4_yield_level, levels = class_order)
# Removes all rows with missing (NA) values
counts_data_p <- na.omit(counts_data_p)# Barplot
ggplot(counts_data_p, aes(x = variable, y = value, fill = CH4_yield_level)) +
geom_bar(stat = "identity", position = "dodge") +
scale_fill_manual(
values = c("low" = "#ff0000ff", "high" = "#008000ff"),
labels = c(expression("Low CH"[4]*" yield"), expression("High CH"[4]*" yield"))) +
labs(x = "Genera with robust fold change", y = "Absolute counts", fill = "Methane yield categories") +
theme_bw() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, size = 10),
axis.title.x = element_text(color = "black", size = 10),
axis.title.y = element_text(color = "black", size = 10)
)The average absolute abundances of key taxa selected from differential abundances within the entire microbial community are depicted. In the chart, the red bars represent the mean abundance observed in the low CH4 yield group (C1, C2, and C3 methane yield categories), while the green bars represent the mean abundance in the high CH4 yield group (C4, C5, and C6 methane yield categories).
The capability of GAMLSS to predict the CH4 yield of key alpha diversity indices and the abundance of key genera identified by Attribute 1. Universality or Attribute 2. Robust fold changes was evaluated. Additionally, to test whether the model performance could be improved, reported physicochemical indicators, such as TAN and acetic acid, were evaluated. The fit of both models, only potential microbial indicator or their combination with physicochemical parameters was compared using p-value, generalized R-squared (R2), and Akaike information criterion (AIC). The Wald test was used to evaluate the significance of the predictive variables used in GAMLSS.
# Load packages
library(gamlss)
library(ggplot2)
library(gamlss.ggplots)
library(ggeffects)
# Deactivate plyr (it causes an error when reading an xlsx file)
detach("package:plyr", unload = TRUE, character.only = TRUE)# Load data
# Alpha diversity
# sOTUs data
a_sotus_df <- readRDS(file = here("rds","df_alpha_div_sotus.rds"))
# Add the suffix '_sotus' to all column names
colnames(a_sotus_df) <- paste0(colnames(a_sotus_df), "_sotus")
# Same with genera data
a_genera_df <- readRDS(file = here("rds","df_alpha_div_genera.rds"))
colnames(a_genera_df) <- paste0(colnames(a_genera_df), "_genera")
# Integrate data on diversity indices, key physicochemical parameters,
# and CHâ‚„ performance categories
alpha_div_df <- cbind(
a_sotus_df,
a_genera_df)
# Relative abundance of microbial community
relat_abun_micr_comm_df <- read_xlsx(here("data", "meta-analysis-microbial-data.xlsx"),
sheet = "genera_abs_count") %>%
column_to_rownames("genera") %>%
# Delete samples from thermophilic processes and those with archaeal data only
dplyr::select(-"F4_4",-"F4_10",-"F27_16",-"F4_11",
-"F27_1",-"F27_14", -"F27_15", -"F4_1",
-"F4_6",-"F4_8",-"F27_11", -"F27_5",
-"F27_13",-"F4_5",-"F4_7",-"F27_8",
-"F25_5", -"F27_2",-"F27_3",-"F27_10",
-"F27_9", -"F4_12",-"F27_6", -"F4_9",
-"F4_3",-"F27_4", -"F4_2", -"F27_12",
-"F27_7",-"F25_7", -"F17_5", -"F17_3",
-"F17_6") %>%
{sweep(., 2, colSums(.), FUN = "/")} %>%
t() %>%
as.data.frame()
# Create a vector of key genera from core microbiome and robust fold change
Key_genera <- union(robust_genera, core_microbiome)
# Extract the relative abundance values of key genera.
relat_abun_key_genera_df <- relat_abun_micr_comm_df %>%
dplyr::select(Key_genera)# Create a dataframe with information on key physicochemical and biological indicators related to methane yield
gamlss_data <- merge(CH4_yield_df, alpha_div_df, by = "row.names", all = TRUE) %>%
column_to_rownames("Row.names") %>%
merge(relat_abun_key_genera_df, by = "row.names", all = TRUE) %>%
column_to_rownames("Row.names")
# Create a vector with the potential microbial indicators (key alpha diversity indices and key genera)
micro_ind <- c("Aminobacterium","Chao1_sotus","Clostridium", "Corynebacterium",
"HA73", "Lactobacillus", "Methanosarcina", "pielou_genera",
"Prevotella","q0_sotus", "q1_genera", "Sedimentibacter",
"Shannon_genera", "Sphaerochaeta","T78", "W5")GAMLSS models
# Model 1 - Only microbial indicator
# Empty list to store the results
results_m1 <- list()
# Loop to create a model for each indicator
for (var in micro_ind) {
result <- tryCatch({
# Subset data for the model
ind_data <- gamlss_data %>%
dplyr::select("YCH4_mLgVS", all_of(var)) %>%
na.omit()
# Create GAMLSS model
model <- gamlss(
formula = as.formula(paste("YCH4_mLgVS ~ pb(", var, ")", sep = "")),
family = NO,
data = ind_data,
trace = FALSE
)
# Subset data for the model
R2 <- Rsq(model)
p_value <- capture.output(summary(model))
p_value <- p_value[8:26] # Show significant information
AIC_val <- AIC(model)
# Add prediction
pred_data <- ind_data %>%
mutate(prediction_CH4 = predict(model))
# Plot results
p <- ggplot(data = pred_data, aes_string(x = var, y = "YCH4_mLgVS")) +
geom_point(alpha = 0.2, color = "black") +
geom_smooth(aes_string(x = var, y = "prediction_CH4"), size = 1, color = "black") +
labs(title = var, x = "Relative abundance (%)", y = expression("Methane yield (mLCH"[4]*"/gVS)")) +
theme_bw() +
theme(legend.position = "bottom",
axis.line = element_line(color = "black"),
panel.grid.major = element_line(color = "black", linetype = "dashed"))
# Return results if everything went well
list(
R2 = R2,
p_value = p_value,
AIC = AIC_val,
figure = p
)
}, error = function(e) {
message(paste("Error with variable:", var, "->", e$message))
return(NULL) # Equivalent to "next"
})
# If the result is not NULL, save it
if (!is.null(result)) {
results_m1[[var]] <- result
}
}📈 Model validation for Aminobacterium
🔹 R-squared (pseudo)
R² = 0.112
🔹 AIC
AIC = 915.65
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 410.89 26.34 15.597 <2e-16 ***"
[7] "pb(Aminobacterium) -2204.64 1214.12 -1.816 0.074 . "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.15577 0.08513 60.57 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for Chao1_sotus
🔹 R-squared (pseudo)
R² = 0.082
🔹 AIC
AIC = 992.77
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 238.71010 48.42399 4.93 5.22e-06 ***"
[7] "pb(Chao1_sotus) 0.25167 0.09792 2.57 0.0123 * "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.2484 0.0822 63.85 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for Clostridium
🔹 R-squared (pseudo)
R² = 0.153
🔹 AIC
AIC = 911.21
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 416.19 23.02 18.077 < 2e-16 ***"
[7] "pb(Clostridium) -5397.25 1674.83 -3.223 0.00198 ** "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.13185 0.08513 60.29 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for Corynebacterium
🔹 R-squared (pseudo)
R² = 0.053
🔹 AIC
AIC = 918.92
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 374.09 22.18 16.868 <2e-16 ***"
[7] "pb(Corynebacterium) 61256.79 47068.56 1.301 0.198 "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.18791 0.08513 60.94 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for HA73
🔹 R-squared (pseudo)
R² = 0.062
🔹 AIC
AIC = 917.05
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 423.87 30.02 14.121 <2e-16 ***"
[7] "pb(HA73) -9622.33 4791.94 -2.008 0.0487 * "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.18285 0.08513 60.88 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for Lactobacillus
🔹 R-squared (pseudo)
R² = 0.033
🔹 AIC
AIC = 919.21
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 389.41 22.37 17.407 <2e-16 ***"
[7] "pb(Lactobacillus) -183430.78 120406.48 -1.523 0.132 "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.19849 0.08513 61.07 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for Methanosarcina
🔹 R-squared (pseudo)
R² = 0.055
🔹 AIC
AIC = 919.46
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 371.11 28.06 13.226 <2e-16 ***"
[7] "pb(Methanosarcina) 76.51 130.08 0.588 0.558 "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.18662 0.08513 60.93 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for pielou_genera
🔹 R-squared (pseudo)
R² = 0.024
🔹 AIC
AIC = 997.26
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|)"
[6] "(Intercept) 174.9 130.1 1.344 0.183"
[7] "pb(pielou_genera) 394.6 289.7 1.362 0.177"
[8] ""
[9] "------------------------------------------------------------------"
[10] "Sigma link function: log"
[11] "Sigma Coefficients:"
[12] " Estimate Std. Error t value Pr(>|t|) "
[13] "(Intercept) 5.2788 0.0822 64.22 <2e-16 ***"
[14] "---"
[15] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[16] ""
[17] "------------------------------------------------------------------"
[18] "NOTE: Additive smoothing terms exist in the formulas: "
[19] " i) Std. Error for smoothers are for the linear effect only. "
📈 Model validation for q0_sotus
🔹 R-squared (pseudo)
R² = 0.155
🔹 AIC
AIC = 990.74
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 243.1197 49.2556 4.936 5.33e-06 ***"
[7] "pb(q0_sotus) 0.2674 0.1118 2.392 0.0195 * "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.2072 0.0822 63.35 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for q1_genera
🔹 R-squared (pseudo)
R² = 0.075
🔹 AIC
AIC = 995.82
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 296.78 69.30 4.283 5.79e-05 ***"
[7] "pb(q1_genera) 10.15 12.66 0.802 0.425 "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.2524 0.0822 63.9 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for Sedimentibacter
🔹 R-squared (pseudo)
R² = 0.293
🔹 AIC
AIC = 901.79
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 404.76 22.12 18.301 < 2e-16 ***"
[7] "pb(Sedimentibacter) -10758.29 2033.37 -5.291 1.59e-06 ***"
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.04153 0.08513 59.23 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for Shannon_genera
🔹 R-squared (pseudo)
R² = 0.026
🔹 AIC
AIC = 997.69
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 223.12 110.55 2.018 0.0473 *"
[7] "pb(Shannon_genera) 79.40 68.01 1.168 0.2469 "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.2778 0.0822 64.21 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for Sphaerochaeta
🔹 R-squared (pseudo)
R² = 0.117
🔹 AIC
AIC = 914.36
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 348.86 24.41 14.291 <2e-16 ***"
[7] "pb(Sphaerochaeta) 80366.00 31200.62 2.576 0.0123 * "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.15298 0.08513 60.53 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for T78
🔹 R-squared (pseudo)
R² = 0.129
🔹 AIC
AIC = 911.93
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 417.06 23.44 17.795 <2e-16 ***"
[7] "pb(T78) -938.51 293.10 -3.202 0.0021 ** "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.14579 0.08513 60.45 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
📈 Model validation for W5
🔹 R-squared (pseudo)
R² = 0.005
🔹 AIC
AIC = 921.15
🔹 Model Summary
[1] ""
[2] "------------------------------------------------------------------"
[3] "Mu link function: identity"
[4] "Mu Coefficients:"
[5] " Estimate Std. Error t value Pr(>|t|) "
[6] "(Intercept) 380.15 22.26 17.076 <2e-16 ***"
[7] "pb(W5) 24694.90 42792.50 0.577 0.566 "
[8] "---"
[9] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[10] ""
[11] "------------------------------------------------------------------"
[12] "Sigma link function: log"
[13] "Sigma Coefficients:"
[14] " Estimate Std. Error t value Pr(>|t|) "
[15] "(Intercept) 5.21261 0.08513 61.23 <2e-16 ***"
[16] "---"
[17] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[18] ""
[19] "------------------------------------------------------------------"
# Model 2 - Physicochemical indicators + Microbial indicator
# Empty list to store the results
results_m2 <- list()
# Loop to create a model for each indicator
for (var in micro_ind) {
result <- tryCatch({
# Subset data for the model
ind_data_mix <- gamlss_data %>%
dplyr::select("YCH4_mLgVS", all_of(var),
"total_ammonia_nitrogen_mg_l",
"acetic_acid_mg_l") %>%
na.omit()
# Create GAMLSS model physicochemical indicators + microbial indicator
model_mix <- gamlss(
formula = as.formula(paste("YCH4_mLgVS ~ pb(", var, ") + pb(total_ammonia_nitrogen_mg_l) + pb(acetic_acid_mg_l)", sep = "")),
family = NO,
data = ind_data_mix,
trace = FALSE
)
# Model evaluation
R2 <- Rsq(model_mix)
p_value <- capture.output(summary(model_mix))
p_value <- p_value[8:26] # Show significant information
AIC_val <- AIC(model_mix)
# Add prediction
pred_data_mix <- ind_data_mix %>%
mutate(prediction_CH4 = predict(model_mix))
# Plot results
p <- ggplot(data = pred_data_mix, aes_string(x = var, y = "YCH4_mLgVS")) +
geom_point(alpha = 0.2, color = "black") +
geom_smooth(aes_string(x = var, y = "prediction_CH4"), size = 1, color = "black") +
labs(title = paste(var, "+ ammonia + acetic acid"), x = "Relative abundance (%)", y = expression("Methane yield (mLCH"[4]*"/gVS)")) +
theme_bw() +
theme(legend.position = "bottom",
axis.line = element_line(color = "black"),
panel.grid.major = element_line(color = "black", linetype = "dashed"))
# Return results if everything went well
list(
R2 = R2,
p_value = p_value,
AIC = AIC_val,
figure = p
)
}, error = function(e) {
message(paste("Error with variable:", var, "->", e$message))
return(NULL) # Equivalent to "next"
})
# If the result is not NULL, save it
if (!is.null(result)) {
results_m2[[var]] <- result
}
}📈 Model validation for Aminobacterium
🔹 R-squared (pseudo)
R² = 0.489
🔹 AIC
AIC = 609.19
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 542.54185 51.07780 10.622 2.12e-13 ***"
[8] "pb(Aminobacterium) 2297.89353 964.30028 2.383 0.02184 * "
[9] "pb(total_ammonia_nitrogen_mg_l) -0.02783 0.02317 -1.201 0.23656 "
[10] "pb(acetic_acid_mg_l) -0.07206 0.02264 -3.183 0.00276 ** "
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.643 0.101 45.97 <2e-16 ***"
[19] "---"
📈 Model validation for Chao1_sotus
🔹 R-squared (pseudo)
R² = 0.623
🔹 AIC
AIC = 555.03
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 859.15013 68.55108 12.533 1.31e-15 ***"
[8] "pb(Chao1_sotus) -0.39269 0.08205 -4.786 2.23e-05 ***"
[9] "pb(total_ammonia_nitrogen_mg_l) -0.06958 0.02084 -3.339 0.0018 ** "
[10] "pb(acetic_acid_mg_l) -0.10744 0.02183 -4.922 1.44e-05 ***"
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.5053 0.1043 43.21 <2e-16 ***"
[19] "---"
📈 Model validation for Clostridium
🔹 R-squared (pseudo)
R² = 0.308
🔹 AIC
AIC = 618.96
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 5.740e+02 8.555e+01 6.710 3.05e-08 ***"
[8] "pb(Clostridium) -9.351e+02 5.049e+04 -0.019 0.985 "
[9] "pb(total_ammonia_nitrogen_mg_l) -3.789e-02 2.771e-02 -1.367 0.179 "
[10] "pb(acetic_acid_mg_l) -5.550e-02 2.636e-02 -2.105 0.041 * "
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.795 0.101 47.47 <2e-16 ***"
[19] "---"
📈 Model validation for Corynebacterium
🔹 R-squared (pseudo)
R² = 0.531
🔹 AIC
AIC = 602.04
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 6.070e+02 4.954e+01 12.251 1.33e-15 ***"
[8] "pb(Corynebacterium) 2.185e+05 5.815e+04 3.758 0.000512 ***"
[9] "pb(total_ammonia_nitrogen_mg_l) -6.088e-02 2.294e-02 -2.654 0.011116 * "
[10] "pb(acetic_acid_mg_l) -4.669e-02 2.149e-02 -2.172 0.035396 * "
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.600 0.101 45.54 <2e-16 ***"
[19] "---"
📈 Model validation for HA73
🔹 R-squared (pseudo)
R² = 0.59
🔹 AIC
AIC = 602.09
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 525.68076 47.10056 11.161 8.52e-14 ***"
[8] "pb(HA73) 8148.66365 3368.30648 2.419 0.02024 * "
[9] "pb(total_ammonia_nitrogen_mg_l) -0.01734 0.02123 -0.817 0.41893 "
[10] "pb(acetic_acid_mg_l) -0.09713 0.02470 -3.932 0.00033 ***"
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.533 0.101 44.87 <2e-16 ***"
[19] "---"
📈 Model validation for Lactobacillus
🔹 R-squared (pseudo)
R² = 0.327
🔹 AIC
AIC = 617.59
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 5.821e+02 5.905e+01 9.857 1.04e-12 ***"
[8] "pb(Lactobacillus) -1.133e+05 9.607e+04 -1.179 0.245 "
[9] "pb(total_ammonia_nitrogen_mg_l) -4.472e-02 2.726e-02 -1.641 0.108 "
[10] "pb(acetic_acid_mg_l) -3.767e-02 2.985e-02 -1.262 0.214 "
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.781 0.101 47.33 <2e-16 ***"
[19] "---"
📈 Model validation for Methanosarcina
🔹 R-squared (pseudo)
R² = 0.667
🔹 AIC
AIC = 593.49
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 506.96124 42.79594 11.846 1.84e-14 ***"
[8] "pb(Methanosarcina) 333.13466 80.25877 4.151 0.000175 ***"
[9] "pb(total_ammonia_nitrogen_mg_l) -0.02316 0.01876 -1.235 0.224357 "
[10] "pb(acetic_acid_mg_l) -0.09577 0.01984 -4.827 2.18e-05 ***"
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.429 0.101 43.85 <2e-16 ***"
[19] "---"
📈 Model validation for pielou_genera
🔹 R-squared (pseudo)
R² = 0.511
🔹 AIC
AIC = 567.77
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 403.25472 98.91976 4.077 0.000207 ***"
[8] "pb(pielou_genera) 482.59823 194.93485 2.476 0.017561 * "
[9] "pb(total_ammonia_nitrogen_mg_l) -0.05902 0.02357 -2.504 0.016405 * "
[10] "pb(acetic_acid_mg_l) -0.05912 0.02239 -2.641 0.011674 * "
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.6356 0.1043 44.46 <2e-16 ***"
[19] "---"
📈 Model validation for q0_sotus
🔹 R-squared (pseudo)
R² = 0.608
🔹 AIC
AIC = 556.82
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 841.38867 68.62309 12.261 2.68e-15 ***"
[8] "pb(q0_sotus) -0.41266 0.09166 -4.502 5.47e-05 ***"
[9] "pb(total_ammonia_nitrogen_mg_l) -0.06741 0.02121 -3.179 0.00281 ** "
[10] "pb(acetic_acid_mg_l) -0.10398 0.02211 -4.704 2.89e-05 ***"
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.5248 0.1043 43.4 <2e-16 ***"
[19] "---"
📈 Model validation for q1_genera
🔹 R-squared (pseudo)
R² = 0.503
🔹 AIC
AIC = 567.83
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 495.98256 70.40817 7.044 1.43e-08 ***"
[8] "pb(q1_genera) 24.19261 9.74004 2.484 0.0172 * "
[9] "pb(total_ammonia_nitrogen_mg_l) -0.06101 0.02381 -2.562 0.0142 * "
[10] "pb(acetic_acid_mg_l) -0.06056 0.02256 -2.685 0.0104 * "
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.6444 0.1043 44.55 <2e-16 ***"
[19] "---"
📈 Model validation for Sedimentibacter
🔹 R-squared (pseudo)
R² = 0.43
🔹 AIC
AIC = 612.16
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 6.607e+02 5.891e+01 11.214 2.69e-14 ***"
[8] "pb(Sedimentibacter) -6.598e+03 1.599e+03 -4.125 0.000168 ***"
[9] "pb(total_ammonia_nitrogen_mg_l) -5.553e-02 2.509e-02 -2.213 0.032321 * "
[10] "pb(acetic_acid_mg_l) -4.140e-02 2.397e-02 -1.727 0.091403 . "
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.697 0.101 46.5 <2e-16 ***"
[19] "---"
📈 Model validation for Shannon_genera
🔹 R-squared (pseudo)
R² = 0.509
🔹 AIC
AIC = 567.28
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 412.78382 92.80872 4.448 6.49e-05 ***"
[8] "pb(Shannon_genera) 127.97448 49.09426 2.607 0.0127 * "
[9] "pb(total_ammonia_nitrogen_mg_l) -0.05974 0.02364 -2.527 0.0155 * "
[10] "pb(acetic_acid_mg_l) -0.05959 0.02243 -2.656 0.0112 * "
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.6385 0.1043 44.49 <2e-16 ***"
[19] "---"
📈 Model validation for T78
🔹 R-squared (pseudo)
R² = 0.468
🔹 AIC
AIC = 606.04
🔹 Model Summary
[1] "Fitting method: RS() "
[2] ""
[3] "------------------------------------------------------------------"
[4] "Mu link function: identity"
[5] "Mu Coefficients:"
[6] " Estimate Std. Error t value Pr(>|t|) "
[7] "(Intercept) 588.70224 52.30236 11.256 1.54e-14 ***"
[8] "pb(T78) 6196.83288 2002.24271 3.095 0.00342 ** "
[9] "pb(total_ammonia_nitrogen_mg_l) -0.04888 0.02394 -2.042 0.04722 * "
[10] "pb(acetic_acid_mg_l) -0.06139 0.02284 -2.687 0.01013 * "
[11] "---"
[12] "Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1"
[13] ""
[14] "------------------------------------------------------------------"
[15] "Sigma link function: log"
[16] "Sigma Coefficients:"
[17] " Estimate Std. Error t value Pr(>|t|) "
[18] "(Intercept) 4.663 0.101 46.16 <2e-16 ***"
[19] "---"
Complete matrices were constructed resulting in a total ammonia nitrogen (TAN) matrix (n=46), long chain fatty acids (LCFA) matrix (n=26), and volatile fatty acids (VFA) matrix (n=23). Each matrix was assembled to include the largest number of samples that assessed these metabolites. The variables used in the matrices may possess characteristics that challenge the assumptions of a statistical procedure, such as values that are not normally distributed, or may not be comparable with other variables owing to variations in scale. For the relative abundance data, we applied the Hellinger transformation. In contrast, for the concentrations of AD by-products and CH4 yield, logarithmic transformation was employed to reduce the skewness of the measurement variable and attain linearity.Then, Spearman’s correlation coefficients were determined.
# Load database with physicochemical information (part 1)
phy_df <- readRDS(file = here("rds","df_physicochemical_data_metanalysis.rds")) %>%
column_to_rownames("sample_id")
# Create a dataframe containing key physicochemical and microbial information
corr_df <- merge(phy_df, relat_abun_key_genera_df, by = "row.names", all = TRUE) %>%
column_to_rownames("Row.names")
# Filter the data
corr_df <- corr_df %>%
# Remove columns with only NA values
dplyr::select(where(~ !all(is.na(.)))) %>%
# Remove non-numeric columns
dplyr::select(where(is.numeric)) %>%
# Remove columns with variables not to be analyzed (low number of data, low information)
dplyr::select(-"temperature_c",-"organic_loading_rate_g_vs_l_d",
-"hrt_d", -"free_ammonia_nitrogen_mg_l",
-"volatile_fatty_acid_mg_l",-"caproic_acid_mg_l",
-"isobutyric_acid_mg_l",-"isovaleric_acid_mg_l",
-"formic_acid_mg_l",-"sulfate_mg_l",
-"pr_ac",-"tan_acetic" ) %>%
# Remove rows with no microbial information. Consider a column of a taxon (Aminobacterium) as a reference
dplyr::filter(!is.na(Aminobacterium))TAN correlation analysis
# Built TAN matrix
TAN_df <- corr_df %>%
dplyr::select("total_ammonia_nitrogen_mg_l", "acetic_acid_mg_l",
"propionic_acid_mg_l", "ch4_yield_m_lch4_g_v_sadded",
"Corynebacterium", "Lactobacillus", "Prevotella",
"Sphaerochaeta", "Sedimentibacter", "Aminobacterium",
"Clostridium", "HA73", "Methanoculleus", "Methanosarcina",
"Methanothermobacter", "Pelotomaculum", "Syntrophomonas",
"T78", "W5") %>%
na.omit() %>%
dplyr::select(where(~ !all(. == 0, na.rm = TRUE))) # Remove columns with only zeros
TAN_df <- TAN_df[!rownames(TAN_df) %in%
c("F25_7", "F17_5", "F17_3", "F17_6",
"F25_3", "F25_4", "F25_1"), ] # Samples without observed archaea# Pre-treated data
# Select relative abundance to be transformed using Hellinger Transformation
TAN_genera_df <- TAN_df %>%
dplyr::select(any_of(c("Corynebacterium", "Lactobacillus",
"Prevotella", "Sphaerochaeta",
"Sedimentibacter", "Aminobacterium",
"Clostridium", "HA73",
"Methanoculleus", "Methanosarcina",
"Methanothermobacter", "Pelotomaculum",
"Syntrophomonas", "T78", "W5")))
# Apply Hellinger transformation
TAN_genera_df <- decostand(TAN_genera_df, method = "hellinger")
# Select physicochemical parameters to be transformed using log Transformation
TAN_phy_df <- TAN_df %>%
dplyr::select(any_of(c("total_ammonia_nitrogen_mg_l", "acetic_acid_mg_l",
"propionic_acid_mg_l", "ch4_yield_m_lch4_g_v_sadded")))
# Apply Log transformation
TAN_phy_df <- decostand(TAN_phy_df, method = "log")
# Merge transformed data
TAN_transf_df <- merge(TAN_genera_df, TAN_phy_df, by=0, all=TRUE) %>%
column_to_rownames("Row.names")# Spearman correlations
cor_mat_TAN <- cor(TAN_transf_df, method = "spearman", use="complete.obs")
# Obtain significance
testRes_TAN <- cor.mtest(TAN_transf_df, method = "spearman", conf.level = 0.95)# Create a correlation matrix based on microorganisms vs environmental variables
cor_mat_TAN <- data.frame(cor_mat_TAN) %>%
dplyr::select("total_ammonia_nitrogen_mg_l", "acetic_acid_mg_l",
"propionic_acid_mg_l", "ch4_yield_m_lch4_g_v_sadded")
# Extract relevant column names for TAN variables
TAN_names <- data.frame(TAN_transf_df) %>%
dplyr::select(-"total_ammonia_nitrogen_mg_l", -"acetic_acid_mg_l",
-"propionic_acid_mg_l", -"ch4_yield_m_lch4_g_v_sadded") %>%
names()
# Reorder the correlation matrix to match the order of the TAN variables
cor_mat_TAN <- cor_mat_TAN[TAN_names, ]
# Convert the correlation matrix back into a matrix format
cor_mat_TAN <- as.matrix(cor_mat_TAN)
# Extract p-values for testing and filter for relevant columns
testRes_TAN <- testRes_TAN$p
testRes_TAN <- data.frame(testRes_TAN) %>%
dplyr::select("total_ammonia_nitrogen_mg_l", "acetic_acid_mg_l",
"propionic_acid_mg_l", "ch4_yield_m_lch4_g_v_sadded")
# Reorder the p-value matrix to match the TAN variables
testRes_TAN <- testRes_TAN[TAN_names, ]
# Convert the p-value matrix back into a matrix format
testRes_TAN <- as.matrix(testRes_TAN)
# Merge the correlation matrix with the p-values for significance testing
corr_test_TAN <- merge(cor_mat_TAN, testRes_TAN, by = "row.names") %>%
column_to_rownames("Row.names") %>%
mutate(across(where(is.numeric), ~ round(.x, 3))) # Round numerical values to three decimal places
# Display the final results as a data table (correlations and p-values)
datatable(corr_test_TAN)# Heatmap with Spearman correlations
breaksList = seq(-1, 1, by = 0.1)
pheatmap(cor_mat_TAN,
breaks = breaksList,
clustering_distance_cols = "euclidean",
clustering_distance_rows = "euclidean",
clustering_method = "ward.D2",
display_numbers = matrix(ifelse(testRes_TAN <= 0.05 & testRes_TAN > 0.01, "*",
ifelse(testRes_TAN <=0.01 & testRes_TAN >0.001, "**",
ifelse(testRes_TAN <=0.001, "***", ""))), nrow(testRes_TAN)),
fontsize = 10,
col = colorRampPalette(c("firebrick3", "white", "navy"))(20),
number_color = "black",
border_color = "black",
cellwidth = 10,
cellheight = 10,
legend_breaks = c(-1,-0.5,0,0.5,1))The heatmap illustrates the correlations between the key taxa and AD by-products from TAN matrix.
LCFA correlation analysis
# Built LCFA matrix
LCFA_df <- corr_df %>%
dplyr::select("total_ammonia_nitrogen_mg_l", "acetic_acid_mg_l",
"propionic_acid_mg_l", "ch4_yield_m_lch4_g_v_sadded",
"myristic_acid_mg_l", "palmitic_acid_mg_l",
"stearic_acid_mg_l", "oleic_acid_mg_l",
"linoleic_acid_mg_l",
"Corynebacterium", "Lactobacillus", "Prevotella",
"Sphaerochaeta","Sedimentibacter", "Aminobacterium",
"Clostridium", "HA73", "Methanoculleus", "Methanosarcina",
"Methanothermobacter", "Pelotomaculum", "Syntrophomonas",
"T78", "W5") %>%
na.omit() %>%
# Remove columns that only contain zero values
dplyr::select(where(~ !all(. == 0, na.rm = TRUE)))
# Excluding samples that do not show any observed archaea based on row names
LCFA_df <- LCFA_df[!rownames(LCFA_df) %in%
c("F25_7", "F17_5", "F17_3", "F17_6", "F25_3", "F25_4", "F25_1"), ]# Pre-treated data
# Select relative abundance to be transformed using Hellinger Transformation
LCFA_genera_df <- LCFA_df %>%
dplyr::select(any_of(c("Corynebacterium", "Lactobacillus",
"Prevotella", "Sphaerochaeta",
"Sedimentibacter", "Aminobacterium",
"Clostridium", "HA73", "Methanoculleus",
"Methanosarcina","Methanothermobacter",
"Pelotomaculum", "Syntrophomonas",
"T78", "W5")))
# Apply Hellinger transformation
LCFA_genera_df <- decostand(LCFA_genera_df, method = "hellinger")
# Select physicochemical parameters to be transformed using log Transformation
LCFA_phy_df <- LCFA_df %>%
dplyr::select(any_of(c("total_ammonia_nitrogen_mg_l",
"acetic_acid_mg_l","propionic_acid_mg_l",
"ch4_yield_m_lch4_g_v_sadded",
"myristic_acid_mg_l",
"palmitic_acid_mg_l",
"stearic_acid_mg_l","oleic_acid_mg_l",
"linoleic_acid_mg_l")))
# Apply Log transformation
LCFA_phy_df <- decostand(LCFA_phy_df, method = "log")
# Merge transformed data
LCFA_transf_df <- merge(LCFA_genera_df, LCFA_phy_df, by=0, all=TRUE) %>%
column_to_rownames("Row.names")# Spearman correlations
cor_mat_LCFA <- cor(LCFA_transf_df, method = "spearman", use="complete.obs")
# Obtain significance
testRes_LCFA <- cor.mtest(LCFA_transf_df, method = "spearman", conf.level = 0.95)# Create a correlation matrix based on microorganisms vs environmental variables
cor_mat_LCFA <- data.frame(cor_mat_LCFA) %>%
dplyr::select("total_ammonia_nitrogen_mg_l", "acetic_acid_mg_l",
"propionic_acid_mg_l",
"ch4_yield_m_lch4_g_v_sadded", "myristic_acid_mg_l",
"palmitic_acid_mg_l",
"stearic_acid_mg_l", "oleic_acid_mg_l", "linoleic_acid_mg_l")
# Extract relevant column names for LCFA variables
LCFA_names <- data.frame(LCFA_transf_df) %>%
dplyr::select(-"total_ammonia_nitrogen_mg_l", -"acetic_acid_mg_l",
-"propionic_acid_mg_l", -"ch4_yield_m_lch4_g_v_sadded",
-"myristic_acid_mg_l", -"palmitic_acid_mg_l",
-"stearic_acid_mg_l", -"oleic_acid_mg_l",
-"linoleic_acid_mg_l") %>%
names()
# Reorder the correlation matrix to match the order of the TAN variables
cor_mat_LCFA <- cor_mat_LCFA[LCFA_names, ]
# Convert the correlation matrix back into a matrix format
cor_mat_LCFA <-as.matrix(cor_mat_LCFA)
# Extract p-values for testing and filter for relevant columns
testRes_LCFA <- testRes_LCFA$p
testRes_LCFA <- data.frame(testRes_LCFA) %>%
dplyr::select("total_ammonia_nitrogen_mg_l", "acetic_acid_mg_l",
"propionic_acid_mg_l",
"ch4_yield_m_lch4_g_v_sadded", "myristic_acid_mg_l",
"palmitic_acid_mg_l",
"stearic_acid_mg_l", "oleic_acid_mg_l",
"linoleic_acid_mg_l")
# Reorder the p-value matrix to match the TAN variables
testRes_LCFA <- testRes_LCFA[LCFA_names, ]
# Convert the p-value matrix back into a matrix format
testRes_LCFA <-as.matrix(testRes_LCFA)
# Merge the correlation matrix with the p-values for significance testing
corr_test_LCFA <- merge(cor_mat_LCFA, testRes_LCFA, by = "row.names") %>%
column_to_rownames("Row.names") %>%
# Round numerical values to three decimal places
mutate(across(where(is.numeric), ~ round(.x, 3)))
# Display the final results as a data table (x = correlaciones y = p value)
datatable(corr_test_LCFA)# Heatmap with Spearman correlations
breaksList = seq(-1, 1, by = 0.1)
pheatmap(cor_mat_LCFA,
breaks = breaksList,
clustering_distance_cols = "euclidean",
clustering_distance_rows = "euclidean",
clustering_method = "ward.D2",
display_numbers = matrix(ifelse(testRes_LCFA <= 0.05 & testRes_LCFA > 0.01, "*",
ifelse(testRes_LCFA <=0.01 & testRes_LCFA >0.001, "**",
ifelse(testRes_LCFA <=0.001, "***", ""))), nrow(testRes_LCFA)),
fontsize = 10,
col = colorRampPalette(c("firebrick3", "white", "navy"))(20),
number_color = "black",
border_color = "black",
cellwidth = 10,
cellheight = 10,
legend_breaks = c(-1,-0.5,0,0.5,1))The heatmap illustrates the correlations between the key taxa and AD by-products from LCFA matrix.
VFA correlation analysis
# Built VFA matrix
VFA_df <- corr_df %>%
dplyr::select("acetic_acid_mg_l", "propionic_acid_mg_l", "butyric_acid_mg_l",
"valeric_acid_mg_l","ch4_yield_m_lch4_g_v_sadded",
"Corynebacterium", "Lactobacillus", "Prevotella",
"Sphaerochaeta",
"Sedimentibacter", "Aminobacterium", "Clostridium", "HA73",
"Methanoculleus", "Methanosarcina","Methanothermobacter",
"Pelotomaculum", "Syntrophomonas", "T78", "W5") %>%
na.omit() %>%
dplyr::select(where(~ !all(. == 0, na.rm = TRUE))) # Remove columns with only zeros
# Samples without observed archaea
VFA_df <- VFA_df[!rownames(VFA_df) %in% c("F25_7", "F17_5", "F17_3",
"F17_6", "F25_3", "F25_4", "F25_1"), ]# Pre-treated data
# Select relative abundance to be transformed using Hellinger Transformation
VFA_genera_df <- VFA_df %>%
dplyr::select(any_of(c("Corynebacterium", "Lactobacillus",
"Prevotella","Sphaerochaeta",
"Sedimentibacter",
"Aminobacterium", "Clostridium", "HA73",
"Methanoculleus", "Methanosarcina",
"Methanothermobacter", "Pelotomaculum",
"Syntrophomonas",
"T78", "W5")))
# Apply Hellinger transformation
VFA_genera_df <- decostand(VFA_genera_df, method = "hellinger")
# Select physicochemical parameters to be transformed using log Transformation
VFA_phy_df <- VFA_df %>%
dplyr::select(any_of(c("acetic_acid_mg_l", "propionic_acid_mg_l",
"butyric_acid_mg_l", "valeric_acid_mg_l",
"ch4_yield_m_lch4_g_v_sadded")))
# Apply Log transformation
VFA_phy_df <- decostand(VFA_phy_df, method = "log")
# Merge transformed data
VFA_transf_df <- merge(VFA_genera_df, VFA_phy_df, by=0, all=TRUE) %>%
column_to_rownames("Row.names")# Spearman correlations
cor_mat_VFA <- cor(VFA_transf_df, method = "spearman", use="complete.obs")
# Obtain significance
testRes_VFA <- cor.mtest(VFA_transf_df, method = "spearman", conf.level = 0.95)# Create a correlation matrix based on microorganisms vs environmental variables
cor_mat_VFA <- data.frame(cor_mat_VFA) %>%
dplyr::select("acetic_acid_mg_l", "propionic_acid_mg_l",
"butyric_acid_mg_l",
"valeric_acid_mg_l",
"ch4_yield_m_lch4_g_v_sadded")
# Extract relevant column names for VFA variables
VFA_names <- data.frame(VFA_transf_df) %>%
dplyr::select(-"acetic_acid_mg_l", -"propionic_acid_mg_l",
-"butyric_acid_mg_l",
-"valeric_acid_mg_l",
-"ch4_yield_m_lch4_g_v_sadded") %>%
names()
# Reorder the correlation matrix to match the order of the VFA variables
cor_mat_VFA <- cor_mat_VFA[VFA_names, ]
# Convert the correlation matrix back into a matrix format
cor_mat_VFA <-as.matrix(cor_mat_VFA)
# Extract p-values for testing and filter for relevant columns
testRes_VFA <- testRes_VFA$p
testRes_VFA <- data.frame(testRes_VFA) %>%
dplyr::select("acetic_acid_mg_l", "propionic_acid_mg_l",
"butyric_acid_mg_l", "valeric_acid_mg_l",
"ch4_yield_m_lch4_g_v_sadded")
# Reorder the p-value matrix to match the VFA variables
testRes_VFA <- testRes_VFA[VFA_names, ]
# Convert the p-value matrix back into a matrix format
testRes_VFA <-as.matrix(testRes_VFA)
# Merge the correlation matrix with the p-values for significance testing
corr_test_VFA <- merge(cor_mat_VFA, testRes_VFA, by = "row.names") %>%
column_to_rownames("Row.names") %>%
mutate(across(where(is.numeric), ~ round(.x, 3))) # Round numerical values to three decimal places
# Display the final results as a data table (x = correlaciones y = p value)
datatable(corr_test_VFA)# Heatmap with Spearman correlations
breaksList = seq(-1, 1, by = 0.1)
pheatmap(cor_mat_VFA,
breaks = breaksList,
clustering_distance_cols = "euclidean",
clustering_distance_rows = "euclidean",
clustering_method = "ward.D2",
display_numbers = matrix(ifelse(testRes_VFA <= 0.05 & testRes_VFA > 0.01, "*",
ifelse(testRes_VFA <=0.01 & testRes_VFA >0.001, "**",
ifelse(testRes_VFA <=0.001, "***", ""))), nrow(testRes_VFA)),
fontsize = 10,
col = colorRampPalette(c("firebrick3", "white", "navy"))(20),
number_color = "black",
border_color = "black",
cellwidth = 10,
cellheight = 10,
legend_breaks = c(-1,-0.5,0,0.5,1))The heatmap illustrates the correlations between the key taxa and AD by-products from VFA matrix.
2MMS
(..)
_O__( u )__0_