2. Data Loading and Preprocessing
# Load required libraries
suppressPackageStartupMessages({
library(tidyverse)
library(ggplot2)
library(gridExtra)
library(pROC)
library(PRROC)
library(corrplot)
library(knitr)
library(kableExtra)
library(DT)
library(plotly)
library(RColorBrewer)
library(viridis)
})
theme_set(theme_bw() +
theme(text = element_text(size = 12),
plot.title = element_text(face = "bold", size = 14),
axis.title = element_text(face = "bold"),
legend.position = "bottom"))
metric_colors <- c("counts" = "#1f77b4", "tpm" = "#ff7f0e", "rpkm" = "#2ca02c",
"relative" = "#d62728", "breadth" = "#9467bd", "coverage" = "#8c564b")
tool_colors <- c("ALDEx2" = "#1f77b4", "ANCOM" = "#ff7f0e", "basic" = "#2ca02c",
"corncob" = "#d62728", "dearseq" = "#9467bd", "edgeR" = "#8c564b",
"limma" = "#e377c2", "metagenomeSeq" = "#7f7f7f", "NOISeq" = "#bcbd22")
Genomic Sample Distribution
The genomes and their distribution was:
Sample Genome Distribution Across Synthetic Samples
genome
|
seq_type
|
C01
|
C02
|
C03
|
C04
|
C05
|
C06
|
C07
|
C08
|
C09
|
C10
|
T01
|
T02
|
T03
|
T04
|
T05
|
T06
|
T07
|
T08
|
T09
|
T10
|
GCA_000006785.2_ASM678v2.fa
|
chromosome
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
10
|
GCA_000013285.1_ASM1328v1.fa
|
chromosome
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
GCA_000013785.1_ASM1378v1.fa
|
chromosome
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
GCA_000015865.1_ASM1586v1.fa
|
chromosome
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
GCA_000022345.1_ASM2234v1.fa
|
chromosome
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
GCA_000023785.1_ASM2378v1.fa
|
chromosome
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
GCA_000025905.1_ASM2590v1.fa
|
chromosome
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
GCA_000092125.1_ASM9212v1.fa
|
chromosome
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
GCA_000092825.1_ASM9282v1.fa
|
chromosome
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
GCA_000092985.1_ASM9298v1.fa
|
chromosome
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
7
|
GCA_000143845.1_ASM14384v1.fa
|
chromosome
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
19
|
GCA_000143985.1_ASM14398v1.fa
|
chromosome
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
6
|
GCA_000227705.3_ASM22770v3.fa
|
chromosome
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
12
|
GCA_000231385.3_ASM23138v3.fa
|
chromosome
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
11
|
GCA_000233715.3_ASM23371v3.fa
|
chromosome
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
GCA_000235405.3_ASM23540v3.fa
|
chromosome
|
13
|
13
|
13
|
13
|
13
|
13
|
13
|
13
|
13
|
13
|
104
|
104
|
104
|
104
|
104
|
104
|
104
|
104
|
104
|
104
|
GCA_000242255.3_ASM24225v3.fa
|
chromosome
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
8
|
32
|
32
|
32
|
32
|
32
|
32
|
32
|
32
|
32
|
32
|
GCA_000255115.3_ASM25511v3.fa
|
chromosome
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
20
|
20
|
20
|
20
|
20
|
20
|
20
|
20
|
20
|
20
|
GCA_000265425.1_ASM26542v1.fa
|
chromosome
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
GCA_000325705.1_ASM32570v1.fa
|
chromosome
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
GCA_000439255.1_ASM43925v1.fa
|
chromosome
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
5
|
GCA_000445015.1_ASM44501v1.fa
|
chromosome
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
9
|
GCA_001692755.1_ASM169275v1.fa
|
chromosome
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
15
|
60
|
60
|
60
|
60
|
60
|
60
|
60
|
60
|
60
|
60
|
GCA_001877055.1_ASM187705v1.fa
|
chromosome
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
14
|
Ground Truth
# Load ground truth
truth_table <- read_csv('/home/mrahman/da_analysis/data/camisim_realistic/truth_table.csv') %>%
mutate(genome = str_replace(genome, "\\.fa$", "")) %>%
mutate(genome = str_replace_all(genome, "[^A-Za-z0-9]", "")) %>%
distinct(genome, .keep_all = TRUE)
cat("DA genomes:", sum(truth_table$is_DA), "\n")
## DA genomes: 4
cat("Non-DA genomes:", sum(!truth_table$is_DA), "\n")
## Non-DA genomes: 20
truth_table %>%
group_by(is_DA) %>%
summarise(
n_genomes = n(),
mean_log2FC = mean(true_log2FC, na.rm = TRUE),
min_log2FC = min(true_log2FC, na.rm = TRUE),
max_log2FC = max(true_log2FC, na.rm = TRUE),
.groups = 'drop'
) %>%
kable(caption = "Ground Truth Summary", digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Ground Truth Summary
is_DA
|
n_genomes
|
mean_log2FC
|
min_log2FC
|
max_log2FC
|
FALSE
|
20
|
0.00
|
0
|
0
|
TRUE
|
4
|
2.25
|
2
|
3
|
Ground Truth Table: True Differential Abundance Status
genome
|
is_DA
|
true_log2FC
|
baseline_weight
|
enriched_weight
|
GCA0000067852ASM678v2
|
FALSE
|
0
|
10
|
10
|
GCA0000132851ASM1328v1
|
FALSE
|
0
|
14
|
14
|
GCA0000137851ASM1378v1
|
FALSE
|
0
|
9
|
9
|
GCA0000158651ASM1586v1
|
FALSE
|
0
|
8
|
8
|
GCA0000223451ASM2234v1
|
FALSE
|
0
|
15
|
15
|
GCA0000237851ASM2378v1
|
FALSE
|
0
|
9
|
9
|
GCA0000259051ASM2590v1
|
FALSE
|
0
|
12
|
12
|
GCA0000921251ASM9212v1
|
FALSE
|
0
|
11
|
11
|
GCA0000928251ASM9282v1
|
FALSE
|
0
|
15
|
15
|
GCA0000929851ASM9298v1
|
FALSE
|
0
|
7
|
7
|
GCA0001438451ASM14384v1
|
FALSE
|
0
|
19
|
19
|
GCA0001439851ASM14398v1
|
FALSE
|
0
|
6
|
6
|
GCA0002277053ASM22770v3
|
FALSE
|
0
|
12
|
12
|
GCA0002313853ASM23138v3
|
FALSE
|
0
|
11
|
11
|
GCA0002337153ASM23371v3
|
FALSE
|
0
|
5
|
5
|
GCA0002354053ASM23540v3
|
TRUE
|
3
|
13
|
104
|
GCA0002422553ASM24225v3
|
TRUE
|
2
|
8
|
32
|
GCA0002551153ASM25511v3
|
TRUE
|
2
|
5
|
20
|
GCA0002654251ASM26542v1
|
FALSE
|
0
|
14
|
14
|
GCA0003257051ASM32570v1
|
FALSE
|
0
|
5
|
5
|
GCA0004392551ASM43925v1
|
FALSE
|
0
|
5
|
5
|
GCA0004450151ASM44501v1
|
FALSE
|
0
|
9
|
9
|
GCA0016927551ASM169275v1
|
TRUE
|
2
|
15
|
60
|
GCA0018770551ASM187705v1
|
FALSE
|
0
|
14
|
14
|
Loading DA Results
# Define available metrics
metrics <- c("counts", "tpm", "rpkm", "relative", "breadth", "coverage")
cat("Available metrics:", paste(metrics, collapse = ", "), "\n")
## Available metrics: counts, tpm, rpkm, relative, breadth, coverage
# load DA results for a specific metric
load_metric_results <- function(metric) {
da_dir <- paste0('/home/mrahman/da_analysis/data/da/', metric)
da_files <- list.files(da_dir, pattern = '^DA_.*\\.csv$', full.names = TRUE)
read_da_file <- function(path) {
tool <- basename(path) %>% str_replace_all('^DA_|\\.csv$', '')
df <- read_csv(path, col_names = FALSE, show_col_types = FALSE) %>%
rename(genome = X1) %>%
filter(genome != "genome") %>%
mutate(tool = tool,
metric = metric,
genome = str_replace(genome, "\\.fa$", ""),
genome = str_replace_all(genome, "[^A-Za-z0-9]", ""))
# Handle different column structures
if (ncol(df) >= 3) {
df <- df %>%
rename(rawP = X2, adjP = X3) %>%
mutate(rawP = as.numeric(rawP), adjP = as.numeric(adjP))
} else {
df <- df %>% mutate(rawP = NA_real_, adjP = NA_real_)
}
return(df)
}
result <- map_dfr(da_files, read_da_file) %>%
distinct(tool, genome, .keep_all = TRUE) %>%
group_by(tool, metric) %>%
mutate(adjP = p.adjust(rawP, method = "BH")) %>% # recompute
ungroup() %>%
mutate(adjP = replace_na(adjP, 1))
return(result)
}
all_metrics_results <- map_dfr(metrics, load_metric_results)
master_df <- all_metrics_results %>%
left_join(truth_table, by = 'genome') %>%
complete(genome, tool, metric, fill = list(adjP = 1)) %>%
mutate(score = -log10(adjP),
is_DA = replace_na(is_DA, FALSE))
cat("Total combinations:", nrow(master_df), "\n")
## Total combinations: 1296
# Show available combinations
combinations <- master_df %>%
count(metric, tool) %>%
arrange(metric, tool)
DT::datatable(combinations,
caption = "Available Tool-Metric Combinations",
options = list(pageLength = 10, scrollX = TRUE))
3. Performance Analysis Across Metrics
performance_metrics <- master_df %>%
group_by(tool, metric) %>%
summarise(
n_genomes = n(),
n_da_genomes = sum(is_DA, na.rm = TRUE),
TP = sum(adjP < 0.05 & is_DA, na.rm = TRUE),
FP = sum(adjP < 0.05 & !is_DA, na.rm = TRUE),
TN = sum(adjP >= 0.05 & !is_DA, na.rm = TRUE),
FN = sum(adjP >= 0.05 & is_DA, na.rm = TRUE),
.groups = 'drop'
) %>%
mutate(
Precision = ifelse(TP + FP > 0, TP / (TP + FP), 0),
Recall = ifelse(TP + FN > 0, TP / (TP + FN), 0),
F1_Score = ifelse(Precision + Recall > 0, 2 * Precision * Recall / (Precision + Recall), 0),
Specificity = ifelse(TN + FP > 0, TN / (TN + FP), 0),
NPV = ifelse(TN + FN > 0, TN / (TN + FN), 0)
) %>%
mutate(across(where(is.numeric), ~round(., 3)))
calculate_auc_metrics <- function(df) {
unique_combinations <- df %>% distinct(tool, metric)
auc_results <- data.frame(
tool = character(),
metric = character(),
auc_roc = numeric(),
auc_pr = numeric(),
stringsAsFactors = FALSE
)
pb <- txtProgressBar(min = 0, max = nrow(unique_combinations), style = 3)
for(i in 1:nrow(unique_combinations)) {
current_tool <- unique_combinations$tool[i]
current_metric <- unique_combinations$metric[i]
tool_data <- df %>%
filter(tool == current_tool, metric == current_metric, !is.na(score), !is.na(is_DA))
auc_roc_val <- NA_real_
auc_pr_val <- NA_real_
if (nrow(tool_data) > 5 && sum(tool_data$is_DA) > 0 && sum(!tool_data$is_DA) > 0) {
# Calculate AUC-ROC
tryCatch({
roc_obj <- pROC::roc(tool_data$is_DA, tool_data$score, quiet = TRUE)
auc_roc_val <- as.numeric(pROC::auc(roc_obj))
}, error = function(e) {
})
# Calculate AUC-PR
da_scores <- tool_data$score[tool_data$is_DA == TRUE]
non_da_scores <- tool_data$score[tool_data$is_DA == FALSE]
da_scores <- da_scores[!is.na(da_scores)]
non_da_scores <- non_da_scores[!is.na(non_da_scores)]
if (length(da_scores) > 0 && length(non_da_scores) > 0) {
tryCatch({
pr_obj <- PRROC::pr.curve(scores.class0 = da_scores,
scores.class1 = non_da_scores,
curve = FALSE)
auc_pr_val <- pr_obj$auc.integral
}, error = function(e) {
# Silent error handling
})
}
}
auc_results <- rbind(auc_results, data.frame(
tool = current_tool,
metric = current_metric,
auc_roc = auc_roc_val,
auc_pr = auc_pr_val,
stringsAsFactors = FALSE
))
setTxtProgressBar(pb, i)
}
close(pb)
return(auc_results)
}
auc_metrics <- calculate_auc_metrics(master_df) %>%
mutate(across(where(is.numeric), ~round(., 3)))
## | | | 0% | |= | 2% | |=== | 4% | |==== | 6% | |===== | 7% | |====== | 9% | |======== | 11% | |========= | 13%
## | |========== | 15% | |============ | 17% | |============= | 19% | |============== | 20%
## | |================ | 22% | |================= | 24% | |================== | 26% | |=================== | 28% | |===================== | 30% | |====================== | 31% | |======================= | 33% | |========================= | 35%
## | |========================== | 37% | |=========================== | 39% | |============================= | 41% | |============================== | 43%
## | |=============================== | 44%
## | |================================ | 46%
## | |================================== | 48%
## | |=================================== | 50%
## | |==================================== | 52%
## | |====================================== | 54%
## | |======================================= | 56% | |======================================== | 57% | |========================================= | 59% | |=========================================== | 61% | |============================================ | 63% | |============================================= | 65% | |=============================================== | 67% | |================================================ | 69% | |================================================= | 70% | |=================================================== | 72% | |==================================================== | 74% | |===================================================== | 76% | |====================================================== | 78%
## | |======================================================== | 80%
## | |========================================================= | 81%
## | |========================================================== | 83% | |============================================================ | 85%
## | |============================================================= | 87%
## | |============================================================== | 89%
## | |================================================================ | 91%
## | |================================================================= | 93%
## | |================================================================== | 94% | |=================================================================== | 96%
## | |===================================================================== | 98%
## | |======================================================================| 100%
# Combine all performance metrics
complete_performance <- performance_metrics %>%
left_join(auc_metrics, by = c("tool", "metric"))
# Display comprehensive performance table
DT::datatable(complete_performance %>%
select(tool, metric, F1_Score, auc_pr, Precision, Recall, auc_roc) %>%
arrange(metric, desc(F1_Score)),
caption = "Complete Performance Matrix",
options = list(pageLength = 15, scrollX = TRUE)) %>%
formatRound(columns = c("F1_Score", "auc_pr", "Precision", "Recall", "auc_roc"), digits = 3)