# =========================
# Setup
# =========================
library(dplyr)
library(fgsea)
library(msigdbr)
library(ggplot2)
library(writexl)

set.seed(1)

# IMPORTANTE:
# El archivo ESAvsES.tsv debe estar en la MISMA carpeta que este .Rmd
# =========================
# Cargar datos y ranking
# =========================

comp <- "ESAvsES"
infile <- paste0(comp, ".tsv")

stopifnot(file.exists(infile))

data_dge <- read.delim(
  infile,
  header = TRUE, sep = "\t", check.names = FALSE
)

rank_list <- data_dge[, c(1, 2, 5)]  # GeneID, logFC, P.Value
colnames(rank_list) <- c("Gene", "logFC", "P.Value")

rank_list$fcSign <- sign(rank_list$logFC)
rank_list$logP   <- -log10(rank_list$P.Value)
rank_list$metric <- rank_list$logP / rank_list$fcSign

ranks <- rank_list$metric
names(ranks) <- rank_list$Gene

# =========================
# Gene sets
# =========================

msig_go_mouse <- msigdbr(
  species = "Mus musculus",
  collection = "M5",
  db_species = "MM"
)

msig_hallmark_mouse <- msigdbr(
  species = "Mus musculus",
  collection = "MH",
  db_species = "MM"
)

msigdb_gene_sets <- bind_rows(msig_go_mouse, msig_hallmark_mouse)

m_list <- msigdb_gene_sets %>%
  split(x = .$ensembl_gene, f = .$gs_name)

# =========================
# GSEA
# =========================

fgseaRes <- fgsea(
  m_list,
  ranks,
  minSize = 15,
  maxSize = 500
)

# =========================
# Top pathways + tabla GSEA
# =========================

topUp <- fgseaRes %>%
  filter(NES > 0) %>%
  slice_min(padj, n = 5)

topDown <- fgseaRes %>%
  filter(NES < 0) %>%
  slice_min(padj, n = 5)

topPathways <- bind_rows(topUp, topDown) %>%
  arrange(desc(NES))

plotGseaTable(
  pathways = m_list[topPathways$pathway],
  stats    = ranks,
  fgseaRes = fgseaRes,
  gseaParam = 0.5
)

pdf(paste0(comp, ".pdf"), width = 14, height = 14)
plotGseaTable(
  m_list[topPathways$pathway],
  ranks,
  fgseaRes,
  gseaParam = 0.5
)


# =========================
# Exportar resultados a Excel
# =========================

fgseaRes_export <- fgseaRes
fgseaRes_export$leadingEdge <- vapply(
  fgseaRes_export$leadingEdge,
  function(x) sprintf('c("%s")', paste(x, collapse = '", "')),
  character(1)
)

write_xlsx(
  fgseaRes_export,
  paste0(comp, "_fgsea_results.xlsx")
)
# =========================
# Dotplot
# =========================

fgsea_plot_df <- fgseaRes %>%
  filter(!grepl("^GOCC_", pathway)) %>%
  filter(padj <= 0.05) %>%
  mutate(
    Count = lengths(leadingEdge),
    GeneRatio = Count / size,
    GeneRatioPct = 100 * GeneRatio
  ) %>%
  filter(!is.na(GeneRatioPct), Count > 0) %>%
  arrange(padj, desc(abs(NES))) %>%
  slice_head(n = 20)

p <- ggplot(
  fgsea_plot_df,
  aes(x = GeneRatioPct, y = reorder(pathway, GeneRatioPct))
) +
  geom_point(aes(size = Count, color = padj), alpha = 0.9) +
  labs(
    x = "% de genes del set en leadingEdge (Count/size)",
    y = NULL,
    color = "p.adj",
    size = "Count"
  ) +
  theme_bw() +
  theme(
    panel.grid.major = element_line(color = "grey85", linewidth = 0.3),
    panel.grid.minor = element_line(color = "grey92", linewidth = 0.2),
    panel.grid.minor.y = element_blank()
  )

p

ggsave(
  filename = paste0(comp, "_dotplot.pdf"),
  plot = p,
  width = 10,
  height = 6,
  units = "in",
  device = cairo_pdf
)