library(umap)
library(ggplot2)
# 1. Put your exact file names here
files <- c(
"control_01.report",
"control_02.report",
"treatment_01.report",
"treatment_02_report" # change if your real file name differs
)
# 2. Parse Kraken report at genus level
parse_kraken <- function(file) {
lines <- readLines(file)
parsed <- lapply(lines, function(line) {
parts <- strsplit(line, "\t")[[1]]
if (length(parts) < 6) return(NULL)
data.frame(
percent = as.numeric(parts[1]),
rank = parts[4],
name = trimws(parts[6]),
stringsAsFactors = FALSE
)
})
df <- do.call(rbind, parsed)
df <- df[df$rank == "G", c("name", "percent")]
colnames(df) <- c("Taxon", "Percent")
df
}
# 3. Gather data from all files
data_list <- lapply(seq_along(files), function(i) {
df <- parse_kraken(files[i])
colnames(df)[2] <- files[i]
df
})
# 4. Merge all samples by taxon
merged <- Reduce(function(x, y) merge(x, y, by = "Taxon", all = TRUE), data_list)
merged[is.na(merged)] <- 0
# 5. Make Taxon row names, but also keep a copy safely
merged_matrix <- merged
rownames(merged_matrix) <- merged_matrix$Taxon
merged_matrix$Taxon <- NULL
# 6. Extract top 2 taxa per sample
top2_df <- do.call(rbind, lapply(colnames(merged_matrix), function(sample) {
vec <- merged_matrix[, sample]
# preserve names explicitly
names(vec) <- rownames(merged_matrix)
vals <- sort(vec, decreasing = TRUE)[1:2]
data.frame(
sample = sample,
rank = c("1st", "2nd"),
taxon = names(vals),
percent = as.numeric(vals),
row.names = NULL,
stringsAsFactors = FALSE
)
}))
print(top2_df)
## sample rank taxon percent
## 1 control_01.report 1st Streptococcus 9.08
## 2 control_01.report 2nd Mediterraneibacter 4.15
## 3 control_02.report 1st Bacteroides 20.98
## 4 control_02.report 2nd Phocaeicola 13.11
## 5 treatment_01.report 1st Bacteroides 17.09
## 6 treatment_01.report 2nd Alistipes 3.90
## 7 treatment_02_report 1st Bacteroides 15.13
## 8 treatment_02_report 2nd Faecalibacterium 9.28
top2_df <- data.frame(
sample = c(
"control_01.report", "control_01.report",
"control_02.report", "control_02.report",
"treatment_01.report", "treatment_01.report",
"treatment_02_report", "treatment_02_report"
),
rank = c("1st","2nd","1st","2nd","1st","2nd","1st","2nd"),
taxon = c(
"Streptococcus", "Mediterraneibacter",
"Bacteroides", "Phocaeicola",
"Bacteroides", "Alistipes",
"Bacteroides", "Faecalibacterium"
),
percent = c(9.08, 4.15, 20.98, 13.11, 17.09, 3.90, 15.13, 9.28)
)
library(ggplot2)
ggplot(top2_df, aes(x = sample, y = percent, fill = taxon)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(
title = "Top 2 Taxa per Sample",
x = "Sample",
y = "Relative Abundance (%)"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)

library(ggplot2)
ggplot(top2_df, aes(x = taxon, y = percent, fill = sample)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(
title = "Abundance of Top Taxa Across Samples",
x = "Taxa",
y = "Relative Abundance (%)"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)

taxa_of_interest <- unique(top2_df$taxon)
subset_matrix <- merged_matrix[taxa_of_interest, , drop = FALSE]
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
plot_df <- subset_matrix %>%
as.data.frame() %>%
mutate(taxon = rownames(.)) %>%
pivot_longer(
cols = -taxon,
names_to = "sample",
values_to = "percent"
)
library(ggplot2)
ggplot(plot_df, aes(x = taxon, y = percent, fill = sample)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(
title = "Relative Abundance of Selected Taxa Across Samples",
x = "Taxa",
y = "Relative Abundance (%)"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)

plot_df <- plot_df %>%
mutate(
is_top2 = ifelse(paste(sample, taxon) %in%
paste(top2_df$sample, top2_df$taxon),
TRUE, FALSE)
)
p <- ggplot(plot_df, aes(x = taxon, y = percent, fill = sample)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(
title = "Relative Abundance of Taxa Across Samples",
x = "Taxa",
y = "Relative Abundance (%)"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p +
geom_bar(
data = subset(plot_df, is_top2),
aes(x = taxon, y = percent),
stat = "identity",
fill = NA,
color = "black",
linewidth = 1.2
)

plot_df <- plot_df %>%
mutate(
label = case_when(
paste(sample, taxon) %in% paste(top2_df$sample[top2_df$rank == "1st"],
top2_df$taxon[top2_df$rank == "1st"]) ~ "1",
paste(sample, taxon) %in% paste(top2_df$sample[top2_df$rank == "2nd"],
top2_df$taxon[top2_df$rank == "2nd"]) ~ "2",
TRUE ~ ""
)
)
ggplot(plot_df, aes(x = taxon, y = percent, fill = sample)) +
geom_bar(stat = "identity") +
geom_text(
aes(label = label),
position = position_stack(vjust = 0.5),
color = "black",
size = 5,
fontface = "bold"
) +
theme_minimal() +
labs(
title = "Relative Abundance of Taxa Across Samples",
x = "Taxa",
y = "Relative Abundance (%)"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)

top10_list <- lapply(colnames(merged_matrix), function(sample) {
vec <- merged_matrix[, sample]
names(vec) <- rownames(merged_matrix)
top10 <- sort(vec, decreasing = TRUE)[1:10]
data.frame(
sample = sample,
taxon = names(top10),
percent = as.numeric(top10),
stringsAsFactors = FALSE
)
})
top10_df <- do.call(rbind, top10_list)
library(ggplot2)
ggplot(top10_df, aes(x = sample, y = percent, fill = taxon)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(
title = "Top 10 Bacterial Genera per Sample",
x = "Sample",
y = "Relative Abundance (%)"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
