# =========================
# 1. LOAD PACKAGES
# =========================
#install.packages("umap")
#install.packages("ggplot2")
#install.packages("ggrepel")
library(umap)
library(ggplot2)
library(ggrepel)
# =========================
# 2. FILE NAMES
# =========================
files <- c(
"control_01.report",
"control_02.report",
"treatment_01.report",
"treatment_02_report"
)
# =========================
# 3. PARSE KRAKEN REPORTS (GENUS LEVEL)
# =========================
data_list <- lapply(files, function(f) {
lines <- readLines(f)
parsed <- lapply(lines, function(line) {
parts <- strsplit(line, "\t")[[1]]
if (length(parts) < 6) return(NULL)
data.frame(
percent = as.numeric(parts[1]),
rank = parts[4],
name = trimws(parts[6]),
stringsAsFactors = FALSE
)
})
df <- do.call(rbind, parsed)
# keep genus level
df <- df[df$rank == "G", ]
df <- df[, c("name", "percent")]
colnames(df) <- c("Taxon", f)
df
})
# =========================
# 4. MERGE INTO MATRIX
# =========================
merged <- Reduce(function(x, y) merge(x, y, by="Taxon", all=TRUE), data_list)
merged[is.na(merged)] <- 0
rownames(merged) <- merged$Taxon
merged$Taxon <- NULL
data_t <- t(merged)
# =========================
# 5. DEFINE ACVD TAXA (GENUS)
# =========================
acvd_taxa <- c(
"Escherichia", "Klebsiella", "Enterobacter",
"Streptococcus",
"Atopobium", "Solobacterium", "Lactobacillus",
"Oribacterium", "Parvimonas", "Peptostreptococcus",
"Fusobacterium", "Actinomyces", "Megasphaera",
"Eggerthella", "Ruminococcus", "Clostridium",
"Enterococcus",
"Lachnospiraceae", "Erysipelotrichaceae",
"Subdoligranulum", "Blautia", "Collinsella",
"Bifidobacterium", "Veillonella",
"Dialister", "Coprobacillus", "Anaerococcus",
"Weissella", "Desulfovibrio",
"Methanobrevibacter", "Akkermansia",
"Bacteroides"
)
# =========================
# 6. FILTER ONLY ACVD TAXA (FUZZY MATCH)
# =========================
keep <- sapply(colnames(data_t), function(x) {
any(sapply(acvd_taxa, function(t) grepl(t, x)))
})
data_filtered <- data_t[, keep]
# =========================
# 7. CLEAN DATA
# =========================
# remove zero-variance taxa
vars <- apply(data_filtered, 2, var)
data_filtered <- data_filtered[, vars > 0]
# scale data
data_filtered <- scale(data_filtered)
# =========================
# 8. RUN UMAP
# =========================
config <- umap.defaults
config$n_neighbors <- 2
config$min_dist <- 0.3
umap_res <- umap(data_filtered, config = config)
df <- as.data.frame(umap_res$layout)
colnames(df) <- c("UMAP1", "UMAP2")
# =========================
# 9. ADD METADATA
# =========================
df$group <- c("control", "control", "ACVD", "ACVD")
# =========================
# 10. PLOT (CLEAN + READABLE)
# =========================
ggplot(df, aes(x = UMAP1, y = UMAP2, color = group)) +
geom_point(size = 8, alpha = 0.9) +
stat_ellipse(level = 0.95, linewidth = 1.2) +
theme_minimal() +
coord_fixed() +
scale_x_continuous(expand = expansion(mult = 0.3)) +
scale_y_continuous(expand = expansion(mult = 0.3))
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_path()`).
