# =========================
# 1. LOAD PACKAGES
# =========================
#install.packages("umap")
#install.packages("ggplot2")
#install.packages("ggrepel")

library(umap)
library(ggplot2)
library(ggrepel)

# =========================
# 2. FILE NAMES
# =========================
files <- c(
  "control_01.report",
  "control_02.report",
  "treatment_01.report",
  "treatment_02_report"
)

# =========================
# 3. PARSE KRAKEN REPORTS (GENUS LEVEL)
# =========================
data_list <- lapply(files, function(f) {
  
  lines <- readLines(f)
  
  parsed <- lapply(lines, function(line) {
    parts <- strsplit(line, "\t")[[1]]
    
    if (length(parts) < 6) return(NULL)
    
    data.frame(
      percent = as.numeric(parts[1]),
      rank = parts[4],
      name = trimws(parts[6]),
      stringsAsFactors = FALSE
    )
  })
  
  df <- do.call(rbind, parsed)
  
  # keep genus level
  df <- df[df$rank == "G", ]
  
  df <- df[, c("name", "percent")]
  colnames(df) <- c("Taxon", f)
  
  df
})

# =========================
# 4. MERGE INTO MATRIX
# =========================
merged <- Reduce(function(x, y) merge(x, y, by="Taxon", all=TRUE), data_list)
merged[is.na(merged)] <- 0

rownames(merged) <- merged$Taxon
merged$Taxon <- NULL

data_t <- t(merged)

# =========================
# 5. DEFINE ACVD TAXA (GENUS)
# =========================
acvd_taxa <- c(
  "Escherichia", "Klebsiella", "Enterobacter",
  "Streptococcus",
  "Atopobium", "Solobacterium", "Lactobacillus",
  "Oribacterium", "Parvimonas", "Peptostreptococcus",
  "Fusobacterium", "Actinomyces", "Megasphaera",
  "Eggerthella", "Ruminococcus", "Clostridium",
  "Enterococcus",
  "Lachnospiraceae", "Erysipelotrichaceae",
  "Subdoligranulum", "Blautia", "Collinsella",
  "Bifidobacterium", "Veillonella",
  "Dialister", "Coprobacillus", "Anaerococcus",
  "Weissella", "Desulfovibrio",
  "Methanobrevibacter", "Akkermansia",
  "Bacteroides"
)

# =========================
# 6. FILTER ONLY ACVD TAXA (FUZZY MATCH)
# =========================
keep <- sapply(colnames(data_t), function(x) {
  any(sapply(acvd_taxa, function(t) grepl(t, x)))
})

data_filtered <- data_t[, keep]

# =========================
# 7. CLEAN DATA
# =========================
# remove zero-variance taxa
vars <- apply(data_filtered, 2, var)
data_filtered <- data_filtered[, vars > 0]

# scale data
data_filtered <- scale(data_filtered)

# =========================
# 8. RUN UMAP
# =========================
config <- umap.defaults

config$n_neighbors <- 2
config$min_dist <- 0.3

umap_res <- umap(data_filtered, config = config)
df <- as.data.frame(umap_res$layout)
colnames(df) <- c("UMAP1", "UMAP2")

# =========================
# 9. ADD METADATA
# =========================
df$group <- c("control", "control", "ACVD", "ACVD")

# =========================
# 10. PLOT (CLEAN + READABLE)
# =========================
ggplot(df, aes(x = UMAP1, y = UMAP2, color = group)) +
  geom_point(size = 8, alpha = 0.9) +
  stat_ellipse(level = 0.95, linewidth = 1.2) +
  theme_minimal() +
  coord_fixed() +
  scale_x_continuous(expand = expansion(mult = 0.3)) +
  scale_y_continuous(expand = expansion(mult = 0.3))
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_path()`).