Checking for Batch Effects

Given that we are aiming at replacing the OG S. cerevisiae data with the new yH545 data, I want to interrogate the batch effects present between the two runs.

To do this, I will run a joint normalization of TMM followed by Log2CPM as originally performed. The goal of the joint normalization is to ensure that the normalization factor is equal for the two runs. Otherwise, the two separately normalized runs would not be directly comparable.

library(here)
library(edgeR)
library(dplyr)
library(tibble)
library(tidyr)
library(stringr)
library(ggplot2)
library(ggprism)

base_dir <- here("14_DESeq2_062026data")
input_dir <- file.path(base_dir, "inputs")
output_dir <- file.path(base_dir, "outputs", "yH545")
figure_dir <- file.path(base_dir, "figures", "Counts-QC", "yH545")

new_counts_file <- file.path(input_dir, "counts_new", "yH545_salmon.merged.gene_counts.tsv")
og_counts_file <- file.path(input_dir, "counts_OG", "SCER-salmon.merged.gene_counts.tsv")

stopifnot(file.exists(new_counts_file), file.exists(og_counts_file))

save_fig <- function(plot, filename, width = 10, height = 8) {
  ggsave(file.path(figure_dir, filename), plot = plot,
         width = width, height = height, dpi = 300)
}

collapse_lanes <- function(counts_raw) {
  has_lane <- any(grepl("\\.L00[12]$", colnames(counts_raw)))
  if (!has_lane) return(as.matrix(counts_raw))
  sample_base <- gsub("\\.L00[12]$", "", colnames(counts_raw))
  unique_samples <- unique(sample_base)
  collapsed <- sapply(unique_samples, function(samp) {
    cols <- grep(paste0("^", samp, "\\.L00"), colnames(counts_raw), value = TRUE)
    rowSums(counts_raw[, cols, drop = FALSE])
  })
  rownames(collapsed) <- rownames(counts_raw)
  collapsed
}

extract_tp_min <- function(sample_names) {
  m <- as.integer(gsub("m", "", str_extract(sample_names, "[0-9]+m")))
  t <- as.integer(str_remove(str_extract(sample_names, "t[0-9]+"), "t"))
  ifelse(!is.na(m), m, t)
}

new_raw <- read.delim(new_counts_file, row.names = 1, check.names = FALSE)
og_raw <- read.delim(og_counts_file, row.names = 1, check.names = FALSE)

new_raw <- new_raw[, !colnames(new_raw) %in% "gene_name", drop = FALSE]
og_raw <- og_raw[, !colnames(og_raw) %in% "gene_name", drop = FALSE]

new_counts <- collapse_lanes(new_raw)
og_counts <- collapse_lanes(og_raw)

new_counts <- new_counts[, !grepl("mock", colnames(new_counts), ignore.case = TRUE), drop = FALSE]
og_counts <- og_counts[, !grepl("mock", colnames(og_counts), ignore.case = TRUE), drop = FALSE]

common_genes <- intersect(rownames(new_counts), rownames(og_counts))
new_counts <- new_counts[common_genes, , drop = FALSE]
og_counts <- og_counts[common_genes, , drop = FALSE]

colnames(new_counts) <- paste0("new__", colnames(new_counts))
colnames(og_counts) <- paste0("old__", colnames(og_counts))
combined <- cbind(og_counts, new_counts)
combined <- apply(combined, 2, function(x) as.integer(round(x)))
rownames(combined) <- common_genes

meta <- tibble(
  sample = colnames(combined),
  batch = ifelse(grepl("^new__", sample), "new (2026)", "old (2023)"),
  timepoint_min = extract_tp_min(sample)
) |>
  mutate(
    batch = factor(batch, levels = c("old (2023)", "new (2026)")),
    timepoint_min = factor(timepoint_min, levels = sort(unique(timepoint_min)))
  )

#keep unfiltered matrix for library size
combined_raw_full <- combined

keep <- rowSums(combined >= 10) >= 2
combined <- combined[keep, ]

dge <- DGEList(counts = combined)
dge <- calcNormFactors(dge, method = "TMM")
lcpm <- cpm(dge, log = TRUE, prior.count = 2)

write.csv(lcpm, file.path(output_dir, "01_combined_scer_jointTMM_log2cpm.csv"))

pca <- prcomp(t(lcpm), center = TRUE, scale. = FALSE)

var_explained <- (pca$sdev^2) / sum(pca$sdev^2) * 100

pca_df <- as.data.frame(pca$x[, 1:2]) |>
  rownames_to_column("sample") |>
  left_join(meta, by = "sample") |>
  mutate(timepoint_min = droplevels(timepoint_min))

stopifnot(nrow(pca_df) == ncol(lcpm), !any(is.na(pca_df$timepoint_min)))

shared_tps <- c(0, 30, 60, 90, 120, 180, 240, 360, 480)
pca_df <- pca_df |>
  mutate(shared = as.integer(as.character(timepoint_min)) %in% shared_tps)

write.csv(pca_df, file.path(output_dir, "01_pca_coordinates.csv"), row.names = FALSE)

x_lab <- sprintf("PC1 (%.1f%%)", var_explained[1])
y_lab <- sprintf("PC2 (%.1f%%)", var_explained[2])

p_batch <- ggplot(pca_df, aes(x = PC1, y = PC2, fill = batch)) +
  geom_point(shape = 21, size = 3.5, stroke = 0.8, colour = "grey20") +
  scale_fill_manual(values = c("old (2023)" = "#377EB8", "new (2026)" = "#E41A1C"),
                    name = NULL) +
  labs(x = x_lab, y = y_lab, title = "Combined S. cerevisiae PCA, by batch") +
  theme_prism(base_size = 12) +
  theme(legend.position = "bottom")

p_time <- ggplot(pca_df, aes(x = PC1, y = PC2, fill = timepoint_min, shape = batch)) +
  geom_point(size = 3.5, stroke = 0.8, colour = "grey20") +
  scale_shape_manual(values = c("old (2023)" = 21, "new (2026)" = 24), name = NULL) +
  scale_fill_viridis_d(option = "viridis", name = "timepoint (min)") +
  guides(fill = guide_legend(override.aes = list(shape = 21, colour = "grey20"))) +
  labs(x = x_lab, y = y_lab, title = "Combined S. cerevisiae PCA, by timepoint") +
  theme_prism(base_size = 12) +
  theme(legend.position = "right")

save_fig(p_batch, "01_pca_by_batch.png", width = 8, height = 7)
save_fig(p_time, "01_pca_by_timepoint.png", width = 9, height = 7)

p_batch

p_time

Notes

It appears that batch is not a dominant axis of variation.

PC1 seems to be the time axis. As you move from -PC1 to +PC1, you move across the timecourse, early to late. PC2 is some additional response axis. However, you cannot call it a batch. All together, this explains 73.5% variance.

extract_tp_min <- function(sample_names) {
  m <- as.integer(gsub("m", "", str_extract(sample_names, "[0-9]+m")))
  t <- as.integer(str_remove(str_extract(sample_names, "t[0-9]+"), "t"))
  ifelse(!is.na(m), m, t)
}

lcpm <- as.matrix(read.csv(
  file.path(output_dir, "01_combined_scer_jointTMM_log2cpm.csv"),
  row.names = 1, check.names = FALSE
))

sample_meta <- tibble(sample = colnames(lcpm)) |>
  mutate(
    batch = ifelse(grepl("^new__", sample), "new", "old"),
    timepoint_min = extract_tp_min(sample)
  )

shared_tps <- c(0, 30, 60, 90, 120, 180, 240, 360, 480)
lfc_tps <- setdiff(shared_tps, 0)

mean_profile <- function(mat, meta, b, t) {
  cols <- meta$sample[meta$batch == b & meta$timepoint_min == t]
  rowMeans(mat[, cols, drop = FALSE])
}

abs_conc <- bind_rows(lapply(shared_tps, function(t) {
  o <- mean_profile(lcpm, sample_meta, "old", t)
  n <- mean_profile(lcpm, sample_meta, "new", t)
  tibble(timepoint_min = t, gene = names(o), old_mean = o, new_mean = n)
})) |>
  mutate(tp = factor(timepoint_min, levels = shared_tps))

abs_cor <- abs_conc |>
  group_by(timepoint_min, tp) |>
  summarise(
    abs_pearson = cor(old_mean, new_mean, method = "pearson"),
    abs_spearman = cor(old_mean, new_mean, method = "spearman"),
    .groups = "drop"
  )

old_t0 <- mean_profile(lcpm, sample_meta, "old", 0)
new_t0 <- mean_profile(lcpm, sample_meta, "new", 0)

lfc_conc <- bind_rows(lapply(lfc_tps, function(t) {
  old_lfc <- mean_profile(lcpm, sample_meta, "old", t) - old_t0
  new_lfc <- mean_profile(lcpm, sample_meta, "new", t) - new_t0
  tibble(timepoint_min = t, gene = names(old_lfc), old_lfc = old_lfc, new_lfc = new_lfc)
})) |>
  mutate(tp = factor(timepoint_min, levels = lfc_tps))

lfc_cor <- lfc_conc |>
  group_by(timepoint_min, tp) |>
  summarise(
    lfc_pearson = cor(old_lfc, new_lfc, method = "pearson"),
    lfc_spearman = cor(old_lfc, new_lfc, method = "spearman"),
    .groups = "drop"
  )

summary_tbl <- abs_cor |>
  select(timepoint_min, abs_pearson, abs_spearman) |>
  left_join(select(lfc_cor, timepoint_min, lfc_pearson, lfc_spearman),
            by = "timepoint_min") |>
  arrange(timepoint_min)

write.csv(summary_tbl, file.path(output_dir, "02_concordance_correlation_summary.csv"),
          row.names = FALSE)

p_abs <- ggplot(abs_conc, aes(x = old_mean, y = new_mean)) +
  geom_point(size = 0.4, alpha = 0.25, colour = "grey30") +
  geom_abline(slope = 1, intercept = 0, colour = "#E41A1C", linewidth = 0.6) +
  geom_text(
    data = abs_cor,
    aes(x = -Inf, y = Inf, label = sprintf("r = %.3f", abs_pearson)),
    hjust = -0.15, vjust = 1.5, size = 3.4, inherit.aes = FALSE
  ) +
  facet_wrap(~ tp, ncol = 3) +
  labs(
    x = "old (2023) mean log2CPM",
    y = "new (2026) mean log2CPM",
    title = "Old vs new expression concordance, shared timepoints"
  ) +
  theme_prism(base_size = 11) +
  theme(strip.text = element_text(size = 10))

p_lfc <- ggplot(lfc_conc, aes(x = old_lfc, y = new_lfc)) +
  geom_point(size = 0.4, alpha = 0.25, colour = "grey30") +
  geom_hline(yintercept = 0, colour = "grey75", linewidth = 0.3) +
  geom_vline(xintercept = 0, colour = "grey75", linewidth = 0.3) +
  geom_abline(slope = 1, intercept = 0, colour = "#E41A1C", linewidth = 0.6) +
  geom_text(
    data = lfc_cor,
    aes(x = -Inf, y = Inf, label = sprintf("r = %.3f", lfc_pearson)),
    hjust = -0.15, vjust = 1.5, size = 3.4, inherit.aes = FALSE
  ) +
  facet_wrap(~ tp, ncol = 4) +
  labs(
    x = "old (2023) log2FC vs t0",
    y = "new (2026) log2FC vs t0",
    title = "Old vs new response concordance (log2FC), shared timepoints"
  ) +
  theme_prism(base_size = 11) +
  theme(strip.text = element_text(size = 10))

save_fig(p_abs, "02_concordance_scatter_absolute.png", width = 11, height = 10)
save_fig(p_lfc, "02_concordance_scatter_lfc.png", width = 12, height = 7)

p_abs

p_lfc

pho_ids <- c(
  "gene-YML123C" = "PHO84",
  "gene-YGR233C" = "PHO81",
  "gene-YOL001W" = "PHO80",
  "gene-YFR034C" = "PHO4"
)
stopifnot(all(names(pho_ids) %in% rownames(lcpm)))

batch_colors <- c("old (2023)" = "#377EB8", "new (2026)" = "#E41A1C")

plot_pho_gene <- function(id, symbol) {
  g_long <- tibble(
    sample = colnames(lcpm),
    expr = lcpm[id, ]
  ) |>
    left_join(sample_meta, by = "sample") |>
    filter(timepoint_min %in% shared_tps) |>
    mutate(batch = factor(ifelse(batch == "old", "old (2023)", "new (2026)"),
                          levels = c("old (2023)", "new (2026)")))

  g_t0 <- g_long |>
    filter(timepoint_min == 0) |>
    group_by(batch) |>
    summarise(t0_mean = mean(expr), .groups = "drop")

  g_long <- g_long |>
    left_join(g_t0, by = "batch") |>
    mutate(lfc = expr - t0_mean)

  g_plot <- g_long |>
    select(batch, timepoint_min, expr, lfc) |>
    pivot_longer(c(expr, lfc), names_to = "metric", values_to = "value") |>
    mutate(metric = factor(recode(metric,
                                  expr = "absolute (log2CPM)",
                                  lfc = "log2FC vs t0"),
                           levels = c("absolute (log2CPM)", "log2FC vs t0")))

  g_mean <- g_plot |>
    group_by(batch, timepoint_min, metric) |>
    summarise(value = mean(value), .groups = "drop")

  p <- ggplot(g_plot, aes(x = timepoint_min, y = value, colour = batch)) +
    geom_point(size = 2, alpha = 0.6) +
    geom_line(data = g_mean, aes(group = batch), linewidth = 1) +
    facet_wrap(~ metric, scales = "free_y") +
    scale_colour_manual(values = batch_colors, name = NULL) +
    scale_x_continuous(breaks = shared_tps) +
    labs(x = "Timepoint (min)", y = NULL,
         title = paste0(symbol, " expression, old vs new")) +
    theme_prism(base_size = 12) +
    theme(legend.position = "bottom")

  save_fig(p, paste0("02_", symbol, "_old_vs_new.png"), width = 11, height = 5)
  p
}

pho_plots <- Map(plot_pho_gene, names(pho_ids), pho_ids)
pho_plots

## $`gene-YML123C`

## 
## $`gene-YGR233C`

## 
## $`gene-YOL001W`

## 
## $`gene-YFR034C`

sd_meta <- tibble(sample = colnames(lcpm)) |>
  mutate(batch = ifelse(grepl("^new__", sample), "new (2026)", "old (2023)"),
         tp = extract_tp_min(sample))

rep_sd <- bind_rows(lapply(split(sd_meta, list(sd_meta$batch, sd_meta$tp), drop = TRUE), function(grp) {
  cols <- grp$sample
  if (length(cols) < 2) return(NULL)
  tibble(
    batch = grp$batch[1],
    tp = grp$tp[1],
    gene = rownames(lcpm),
    sd = matrixStats::rowSds(lcpm[, cols, drop = FALSE])
  )
})) |>
  mutate(
    batch = factor(batch, levels = c("old (2023)", "new (2026)")),
    tp = factor(tp, levels = sort(unique(tp)))
  )

sd_median <- rep_sd |>
  group_by(batch, tp) |>
  summarise(median_sd = median(sd), .groups = "drop")

write.csv(sd_median, file.path(output_dir, "03_replicate_sd_median.csv"), row.names = FALSE)

batch_colors <- c("old (2023)" = "#377EB8", "new (2026)" = "#E41A1C")

p_sd <- ggplot(rep_sd, aes(x = tp, y = sd, fill = batch)) +
  geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2, position = position_dodge(0.8)) +
  scale_fill_manual(values = batch_colors, name = NULL) +
  coord_cartesian(ylim = c(0, quantile(rep_sd$sd, 0.99))) +
  labs(x = "Timepoint (min)", y = "Within-replicate SD (log2CPM)",
       title = "Replicate spread per timepoint, old vs new") +
  theme_prism(base_size = 12) +
  theme(legend.position = "bottom")

save_fig(p_sd, "03_replicate_sd_boxplot.png", width = 11, height = 6)

p_sd

rle_meta <- tibble(sample = colnames(lcpm)) |>
  mutate(batch = ifelse(grepl("^new__", sample), "new (2026)", "old (2023)"),
         tp = extract_tp_min(sample))

#reference = within batch x timepoint median, so deviation reflects disagreement with sibling replicates
rle_mat <- lcpm
for (g in unique(interaction(rle_meta$batch, rle_meta$tp, drop = TRUE))) {
  cols <- rle_meta$sample[interaction(rle_meta$batch, rle_meta$tp, drop = TRUE) == g]
  if (length(cols) < 2) next
  ref <- matrixStats::rowMedians(lcpm[, cols, drop = FALSE])
  rle_mat[, cols] <- lcpm[, cols, drop = FALSE] - ref
}

rle_long <- as.data.frame(rle_mat) |>
  rownames_to_column("gene") |>
  pivot_longer(-gene, names_to = "sample", values_to = "rle") |>
  left_join(rle_meta, by = "sample") |>
  filter(tp %in% shared_tps) |>
  mutate(
    batch = factor(batch, levels = c("old (2023)", "new (2026)")),
    sample_lab = gsub("^(old__|new__)", "", sample),
    sample_lab = factor(sample_lab, levels = unique(sample_lab[order(tp, batch)]))
  )

batch_colors <- c("old (2023)" = "#377EB8", "new (2026)" = "#E41A1C")

p_rle <- ggplot(rle_long, aes(x = sample_lab, y = rle, fill = batch)) +
  geom_hline(yintercept = 0, colour = "grey60", linewidth = 0.3) +
  geom_boxplot(outlier.size = 0.2, outlier.alpha = 0.15) +
  scale_fill_manual(values = batch_colors, name = NULL) +
  coord_cartesian(ylim = c(-0.6, 0.6)) +
  labs(x = NULL, y = "RLE (log2CPM vs within-group median)",
       title = "Per-library RLE, old vs new") +
  theme_prism(base_size = 10) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
        legend.position = "bottom")

save_fig(p_rle, "03_rle_boxplot.png", width = 16, height = 6)

p_rle

#sample-to-sample euclidean distance, new data only
new_cols <- colnames(lcpm)[grepl("^new__", colnames(lcpm))]
lcpm_new <- lcpm[, new_cols]

new_meta <- tibble(sample = new_cols) |>
  mutate(tp = extract_tp_min(sample),
         sample_lab = gsub("^new__", "", sample))

sample_dists <- dist(t(lcpm_new))
dist_mat <- as.matrix(sample_dists)
dimnames(dist_mat) <- list(new_meta$sample_lab, new_meta$sample_lab)

annot <- new_meta |>
  mutate(timepoint = factor(tp, levels = sort(unique(tp)))) |>
  select(sample_lab, timepoint) |>
  column_to_rownames("sample_lab")

annot_colors <- list(
  timepoint = setNames(viridisLite::viridis(nlevels(annot$timepoint)),
                       levels(annot$timepoint))
)

ph <- pheatmap::pheatmap(
  dist_mat,
  clustering_distance_rows = sample_dists,
  clustering_distance_cols = sample_dists,
  annotation_row = annot,
  annotation_col = annot,
  annotation_colors = annot_colors,
  color = colorRampPalette(rev(RColorBrewer::brewer.pal(9, "Blues")))(255),
  fontsize_row = 7, fontsize_col = 7,
  main = "Sample-to-sample distance, new (2026) data (Euclidean, log2CPM)",
  silent = TRUE
)

png(file.path(figure_dir, "03_sample_distance_heatmap_new.png"),
    width = 10, height = 10, units = "in", res = 300)
grid::grid.newpage(); grid::grid.draw(ph$gtable)
dev.off()

## png 
##   2

grid::grid.newpage(); grid::grid.draw(ph$gtable)

#per-library log2CPM distribution
dist_meta <- tibble(sample = colnames(lcpm)) |>
  mutate(batch = ifelse(grepl("^new__", sample), "new (2026)", "old (2023)"),
         tp = extract_tp_min(sample),
         sample_lab = gsub("^(old__|new__)", "", sample))

dist_long <- as.data.frame(lcpm) |>
  pivot_longer(everything(), names_to = "sample", values_to = "log2cpm") |>
  left_join(dist_meta, by = "sample") |>
  mutate(
    batch = factor(batch, levels = c("old (2023)", "new (2026)")),
    sample_lab = factor(sample_lab, levels = unique(sample_lab[order(tp, batch)]))
  )

p_dist <- ggplot(dist_long, aes(x = sample_lab, y = log2cpm, fill = batch)) +
  geom_boxplot(outlier.size = 0.2, outlier.alpha = 0.15) +
  scale_fill_manual(values = batch_colors, name = NULL) +
  labs(x = NULL, y = "log2CPM",
       title = "Per-library count distribution, old vs new") +
  theme_prism(base_size = 10) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
        legend.position = "bottom")

save_fig(p_dist, "03_count_distribution_boxplot.png", width = 16, height = 6)

p_dist

libsize_meta <- tibble(sample = colnames(combined_raw_full)) |>
  mutate(batch = ifelse(grepl("^new__", sample), "new (2026)", "old (2023)"),
         tp = extract_tp_min(sample),
         sample_lab = gsub("^(old__|new__)", "", sample))

libsize_df <- tibble(
  sample = colnames(combined_raw_full),
  total_reads = colSums(combined_raw_full)
) |>
  left_join(libsize_meta, by = "sample") |>
  mutate(
    batch = factor(batch, levels = c("old (2023)", "new (2026)")),
    sample_lab = factor(sample_lab, levels = unique(sample_lab[order(tp, batch)]))
  )

write.csv(libsize_df |> select(sample, batch, tp, total_reads),
          file.path(output_dir, "03_library_size.csv"), row.names = FALSE)

p_libsize <- ggplot(libsize_df, aes(x = sample_lab, y = total_reads / 1e6, fill = batch)) +
  geom_col() +
  scale_fill_manual(values = batch_colors, name = NULL) +
  labs(x = NULL, y = "Total assigned reads (millions)",
       title = "Library size per sample, old vs new") +
  theme_prism(base_size = 10) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
        legend.position = "bottom")

save_fig(p_libsize, "03_library_size_barplot.png", width = 16, height = 6)

p_libsize

Counts QC, remaining species

The yH545 batch/concordance work above defines the count-level QC we want for every run. The other five runs have no 2023 counterpart, so the concordance analyses (per-timepoint correlation, LFC concordance, PHO old-vs-new) do not apply. What carries over is the single-batch count-integrity QC: library size, RLE, replicate spread, count distribution, sample distance, and PCA. Each run is normalized on its own (single-batch TMM), replicate siblings are grouped by condition x timepoint.

extract_condition <- function(sample_names) {
  vapply(strsplit(sample_names, "\\."), function(x) x[3], character(1))
}

run_counts_qc <- function(code, counts_file) {
  fig_dir <- file.path(base_dir, "figures", "Counts-QC", code)
  out_dir <- file.path(base_dir, "outputs", code)

  sf <- function(plot, filename, width = 10, height = 8) {
    ggsave(file.path(fig_dir, filename), plot = plot, width = width, height = height, dpi = 300)
  }

  raw <- read.delim(counts_file, row.names = 1, check.names = FALSE)
  raw <- raw[, !colnames(raw) %in% "gene_name", drop = FALSE]
  counts <- collapse_lanes(raw)
  gene_ids <- rownames(counts)
  counts <- apply(counts, 2, function(x) as.integer(round(x)))
  rownames(counts) <- gene_ids

  meta <- tibble(
    sample = colnames(counts),
    condition = extract_condition(sample),
    tp = extract_tp_min(sample)
  ) |>
    mutate(
      sample_lab = gsub("^[^.]+\\.[^.]+\\.", "", sample),
      condition = factor(condition, levels = sort(unique(condition)))
    ) |>
    arrange(condition, tp) |>
    mutate(
      sample_lab = factor(sample_lab, levels = unique(sample_lab)),
      tp_f = factor(tp, levels = sort(unique(tp))),
      group = paste(condition, tp, sep = "_")
    )

  conditions <- levels(meta$condition)
  cond_pal <- setNames(RColorBrewer::brewer.pal(max(3, length(conditions)), "Set1")[seq_along(conditions)], conditions)
  cond_shapes <- setNames(c(21, 24, 22, 23)[seq_along(conditions)], conditions)

  #library size on raw counts, before filtering
  libsize_df <- tibble(sample = colnames(counts), total_reads = colSums(counts)) |>
    left_join(meta, by = "sample")
  write.csv(libsize_df |> select(sample, condition, tp, total_reads),
            file.path(out_dir, "04_library_size.csv"), row.names = FALSE)
  p_libsize <- ggplot(libsize_df, aes(x = sample_lab, y = total_reads / 1e6, fill = condition)) +
    geom_col() +
    scale_fill_manual(values = cond_pal, name = NULL) +
    labs(x = NULL, y = "Total assigned reads (millions)",
         title = paste0(code, " library size per sample")) +
    theme_prism(base_size = 10) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
          legend.position = "bottom")
  sf(p_libsize, "04_library_size_barplot.png", width = 14, height = 6)
  print(p_libsize)

  #filter + single-batch TMM
  keep <- rowSums(counts >= 10) >= 2
  counts <- counts[keep, ]
  dge <- DGEList(counts = counts)
  dge <- calcNormFactors(dge, method = "TMM")
  lcpm <- cpm(dge, log = TRUE, prior.count = 2)
  write.csv(lcpm, file.path(out_dir, "04_log2cpm.csv"))

  #pca, fill by timepoint, shape by condition
  pca <- prcomp(t(lcpm), center = TRUE, scale. = FALSE)
  ve <- (pca$sdev^2) / sum(pca$sdev^2) * 100
  pca_df <- as.data.frame(pca$x[, 1:2]) |>
    rownames_to_column("sample") |>
    left_join(meta, by = "sample")
  write.csv(pca_df |> select(sample, condition, tp, PC1, PC2),
            file.path(out_dir, "04_pca_coordinates.csv"), row.names = FALSE)
  p_pca <- ggplot(pca_df, aes(x = PC1, y = PC2, fill = tp_f, shape = condition)) +
    geom_point(size = 3.5, stroke = 0.8, colour = "grey20") +
    scale_shape_manual(values = cond_shapes, name = NULL) +
    scale_fill_viridis_d(option = "viridis", name = "timepoint (min)") +
    guides(fill = guide_legend(override.aes = list(shape = 21, colour = "grey20"))) +
    labs(x = sprintf("PC1 (%.1f%%)", ve[1]), y = sprintf("PC2 (%.1f%%)", ve[2]),
         title = paste0(code, " PCA")) +
    theme_prism(base_size = 12) +
    theme(legend.position = "right")
  sf(p_pca, "04_pca.png", width = 9, height = 7)
  print(p_pca)

  #within-replicate sd per condition x timepoint
  rep_sd <- bind_rows(lapply(split(meta, meta$group, drop = TRUE), function(grp) {
    cols <- grp$sample
    if (length(cols) < 2) return(NULL)
    tibble(condition = grp$condition[1], tp = grp$tp[1], gene = rownames(lcpm),
           sd = matrixStats::rowSds(lcpm[, cols, drop = FALSE]))
  })) |>
    mutate(tp = factor(tp, levels = sort(unique(tp))))
  if (nrow(rep_sd) > 0) {
    write.csv(rep_sd |> group_by(condition, tp) |> summarise(median_sd = median(sd), .groups = "drop"),
              file.path(out_dir, "04_replicate_sd_median.csv"), row.names = FALSE)
    p_sd <- ggplot(rep_sd, aes(x = tp, y = sd, fill = condition)) +
      geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2, position = position_dodge(0.8)) +
      scale_fill_manual(values = cond_pal, name = NULL) +
      coord_cartesian(ylim = c(0, quantile(rep_sd$sd, 0.99))) +
      labs(x = "Timepoint (min)", y = "Within-replicate SD (log2CPM)",
           title = paste0(code, " replicate spread per timepoint")) +
      theme_prism(base_size = 12) +
      theme(legend.position = "bottom")
    sf(p_sd, "04_replicate_sd_boxplot.png", width = 11, height = 6)
    print(p_sd)
  }

  #per-library rle, reference = within condition x timepoint median
  rle_mat <- lcpm
  for (gp in unique(meta$group)) {
    cols <- meta$sample[meta$group == gp]
    if (length(cols) < 2) next
    ref <- matrixStats::rowMedians(lcpm[, cols, drop = FALSE])
    rle_mat[, cols] <- lcpm[, cols, drop = FALSE] - ref
  }
  rle_long <- as.data.frame(rle_mat) |>
    rownames_to_column("gene") |>
    pivot_longer(-gene, names_to = "sample", values_to = "rle") |>
    left_join(meta, by = "sample")
  p_rle <- ggplot(rle_long, aes(x = sample_lab, y = rle, fill = condition)) +
    geom_hline(yintercept = 0, colour = "grey60", linewidth = 0.3) +
    geom_boxplot(outlier.size = 0.2, outlier.alpha = 0.15) +
    scale_fill_manual(values = cond_pal, name = NULL) +
    coord_cartesian(ylim = c(-0.6, 0.6)) +
    labs(x = NULL, y = "RLE (log2CPM vs within-group median)",
         title = paste0(code, " per-library RLE")) +
    theme_prism(base_size = 10) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
          legend.position = "bottom")
  sf(p_rle, "04_rle_boxplot.png", width = 14, height = 6)
  print(p_rle)

  #per-library count distribution
  dist_long <- as.data.frame(lcpm) |>
    pivot_longer(everything(), names_to = "sample", values_to = "log2cpm") |>
    left_join(meta, by = "sample")
  p_dist <- ggplot(dist_long, aes(x = sample_lab, y = log2cpm, fill = condition)) +
    geom_boxplot(outlier.size = 0.2, outlier.alpha = 0.15) +
    scale_fill_manual(values = cond_pal, name = NULL) +
    labs(x = NULL, y = "log2CPM",
         title = paste0(code, " per-library count distribution")) +
    theme_prism(base_size = 10) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
          legend.position = "bottom")
  sf(p_dist, "04_count_distribution_boxplot.png", width = 14, height = 6)
  print(p_dist)

  #sample-to-sample distance
  sample_dists <- dist(t(lcpm))
  dist_mat <- as.matrix(sample_dists)
  lab_lookup <- setNames(as.character(meta$sample_lab), meta$sample)
  dimnames(dist_mat) <- list(lab_lookup[colnames(lcpm)], lab_lookup[colnames(lcpm)])
  annot <- meta |>
    select(sample_lab, condition, timepoint = tp) |>
    mutate(timepoint = factor(timepoint, levels = sort(unique(timepoint)))) |>
    as.data.frame()
  rownames(annot) <- annot$sample_lab
  annot$sample_lab <- NULL
  annot_colors <- list(
    condition = cond_pal,
    timepoint = setNames(viridisLite::viridis(nlevels(annot$timepoint)), levels(annot$timepoint))
  )
  ph <- pheatmap::pheatmap(
    dist_mat,
    clustering_distance_rows = sample_dists,
    clustering_distance_cols = sample_dists,
    annotation_row = annot, annotation_col = annot,
    annotation_colors = annot_colors,
    color = colorRampPalette(rev(RColorBrewer::brewer.pal(9, "Blues")))(255),
    fontsize_row = 7, fontsize_col = 7,
    main = paste0(code, " sample-to-sample distance (Euclidean, log2CPM)"),
    silent = TRUE
  )
  png(file.path(fig_dir, "04_sample_distance_heatmap.png"), width = 10, height = 10, units = "in", res = 300)
  grid::grid.newpage(); grid::grid.draw(ph$gtable)
  dev.off()
  grid::grid.newpage(); grid::grid.draw(ph$gtable)

  invisible(lcpm)
}

qc_codes <- c("yH001", "yH149", "yH181", "yH714", "yH1053")
qc_files <- file.path(input_dir, "counts_new", paste0(qc_codes, "_salmon.merged.gene_counts.tsv"))
invisible(Map(run_counts_qc, qc_codes, qc_files))

yH001 (C. glabrata, mock) counts QC notes

Set: 3 libraries: mock 30m rep1, mock 30m rep2, mock 60m rep1. Only 30m is replicated. 60m is n=1, dropped before sequencing

Library size barplot

Depth imbalance: 30m rep1 ~4M reads vs ~14–15M for the other two (3–4x shallower).Saw this in FastqC but flagged it here again

Count distribution boxplot

All three libraries equal median (~6) and spread after TMM.
Despite the raw depth gap, tmm normalization equalized the distributions. rep1’s shallowness does not permeate post-norm

PCA

Uninformative at n=3. Three points, replicates do not even group.
not reading into this

Replicate SD boxplot

Only 30m present (60m dropped, cannot compute SD from one library).
Single box, median SD ~0.16, reasonable. No trend possible with one replicated timepoint.

RLE boxplot

Two 30m libraries centered near zero. Expected: with n=2 the within-group median is their midpoint, so centering is automatic and not evidence of agreement.
60m box is meaningless, compared against itself (n=1).

Sample distance heatmap

The two 30m replicates are not nearest neighbors. 30m rep2 is closer to 60m rep1 than to 30m rep1.

MA plot, 30m rep1 vs rep2

Explains the heatmap. Scatter fans out at low A, tightens at high A
Disagreement is confined to low-count genes, well-expressed genes agree tightly.

Bottom line

30m replicate disagreement is depth-driven and confined to low-count genes.
rep1 is shallow but still usable. Low count genes deserve more scrutiny in conclusions drawn

yH149 (K. lactis, mock) counts QC notes

Set: 4 libraries: mock 30m rep1, mock 30m rep2, mock 60m rep1, mock 60m rep2. Both timepoints replicated (n=2 each).

Library size barplot

~2x spread: 30m rep1 lowest (~7M), 60m rep1 highest (~15M).
Milder than yH001. No library alarmingly shallow. Mild imbalance

Count distribution boxplot

All four aligned at median ~6, equal IQR after TMM.
Normalization worked.

PCA

n=4, limited, but better than yH001. PC1 separates timepoint: 30m left (−PC1), 60m right (+PC1).
Replicates spread on PC2 but stay on their timepoint’s side of PC1. Low n so I can’t make many conclusions here

Replicate SD boxplot

30m median SD ~0.11, 60m median SD ~0.19. 60m pair agrees less well.
Both in acceptable range. 60m is the noisier timepoint.

RLE boxplot

All four centered near zero. No library shifted off zero.
30m pair tighter boxes; 60m pair (rep1, rep2) visibly wider. Corroborates the SD story: 60m looser.

Sample distance heatmap

Replicates pair correctly by timepoint: 30m reps are nearest neighbors, 60m reps are nearest neighbors, and the two timepoints separate.

Bottom line

Clean set. Both timepoints replicated, replicates pair correctly by timepoint, normalization equalized libraries, no broken or shifted library.

yH181 (C. glabrata, casamino acids + glucose) counts QC notes

Set: 14 libraries, two conditions (casamino, glucose), three timepoints (0, 450, 690 min), mostly triplicate.

Library size barplot

~2x spread: casamino t0m rep3 tallest (~21M), glucose t690m rep3 shortest (~11M).
No library dangerously shallow. casamino t0 reps slightly higher depth. Mild imbalance, not a flag.

Count distribution boxplot

All 14 libraries aligned at median ~6, equal IQR, across both conditions.
Normalization clean

PCA

PC1 (62%) separates by both time and condition
casamino t0 (purple) far left, tight cluster of 3; 450 and 690 spread right. Replicates cluster within groups.
PC1 driven hard by t0 sitting far from everything
One replicate, 690rep3 more discordant from its others (Casamino)

Replicate SD boxplot

Resolved by condition x timepoint. Medians: casamino-0 ~0.15, casamino-450 ~0.12, glucose-450 ~0.14, casamino-690 ~0.22, glucose-690 ~0.08.
Standout: casamino-690 is the loosest group (~0.22); glucose-690 the tightest. casamino-690 replicates agree less well.
All within acceptable range. casamino-690 is the noisest librayr.

RLE boxplot

One flag: casamino.acids.t690m.rep3box is wider than every other library (~−0.25 to +0.22), median still near zero.
Its two siblings (t690m rep1, rep2) are normal-width. rep3 is what inflates the casamino-690 SD group.
Corroborates the PCA. For some reason, rep3 is more discordant at 690 casamino than the other two samples @ 690

Sample distance heatmap

Mostly clean. Replicates correctly group by condition x timepoint across the board: casamino t0 (tight dark 3x3), glucose 450, glucose 690, casamino 690, casamino 450 all cluster.
Biologically sensible: casamino t0 is the outgroup (baseline, most distinct).
Wrinkle: casamino t690m rep3 (the loose library) sits in its casamino-690 block but is slightly more distant than its siblings. Still clusters with rep1/rep2, so loose-but-not-displaced.

Bottom line

Strong set. Real two-condition three-timepoint structure, normalization clean, replicates correctly cluster by condition x timepoint in both PCA and distance heatmap, biologically sensible geometry (t0 baseline as outgroup).
One consistent flag across SD, RLE, and heatmap: casamino.acids.t690m.rep3, the loosest single library, inflating the casamino-690 group spread.

yH714 (C. albicans, mock) counts QC notes

Set: 4 libraries: mock 30m rep1, mock 30m rep2, mock 60m rep1, mock 60m rep2. Both timepoints replicated (n=2 each).

Library size barplot

Tight range ~13.5M to ~15.5M.
No imbalance, no flag.

Count distribution boxplot

All four aligned at median ~5.5, equal IQR after TMM.
Normalization clean.

PCA

n=4, limited. PC1 (55.7%) separates timepoint: 30m left (−PC1), 60m right (+PC1).

Replicate SD boxplot

30m median SD ~0.12, 60m median SD ~0.10. Both tight, comparable.
No noisy samples.

RLE boxplot

All four centered near zero with comparable, normal-width boxes.
No outlier library, no shift. Clean.

Sample distance heatmap

The key plot, and it is clean. Replicates pair correctly by timepoint: 30m reps are nearest neighbors (dark block), 60m reps are nearest neighbors (dark block), and the two timepoints separate (light cells between blocks).
Replicate coherence holds.

Bottom line

Cleanest of the minimal mock sets. Both timepoints replicated, replicates pair correctly by timepoint, normalization clean, no broken or shifted library, no noisy sample.

yH1053 (S. cerevisiae ppx1Δ ppn1Δ, noPi + mock) counts QC notes

Set: 26 libraries: noPi timecourse (0, 15, 30, 45, 60, 90, 120, 180, 240, 360, 480 min, 2 reps each) + mock (30m, 60m, 2 reps each).

Library size barplot

Range ~10M to ~20M. mock 30m rep1 is the tallest (~20M); most libraries cluster ~10–15M.
No dangerously shallow library.

Count distribution boxplot

All 26 libraries aligned at median ~6, equal IQR, across both mock and noPi.
Normalization clean

PCA

Informative at this n. PC1 (55.0%) is the time axis: noPi sweeps from t0 (purple, far left) through to t480 (yellow, far right) in temporal order.
mock samples (circles) sit among the early noPi timepoints, consistent with mock being a near-baseline condition.
Very interesting progression pattern; almost “worm” like
This genetic background seems to have a different trajectory than WT yH545 from this PCA view

Replicate SD boxplot

Most groups tight (median SD ~0.08–0.12).
noPi-15m is the loosest group (median ~0.23), clearly above the rest. Early timepoints (15, 30) noisier than mid/late.
No replicate variation that I would flag directly

RLE boxplot

All 26 libraries centered near zero, no shifted library.
noPi 15m rep1 and rep2 are the widest boxes, corroborating the SD plot: 15m is the noisy timepoint.
No single outlier library; the 15m looseness is shared across both 15m reps, so it reads as presumable early-timepoint variability, not one bad library.

Sample distance heatmap

Replicates pair correctly by condition x timepoint across the board: every timepoint’s two reps form a dark diagonal block (noPi 0, 15, 30, 45, 60, 90, 120, 180, 240, 360, 480 and mock 30, 60 all cluster).
Biologically sensible: t0 and the 360/480 late timepoints are the outgroups (most distinct from mid-course); mock 30/60 cluster near noPi early timepoints (mock is near-baseline).
FYI: Non-monotonic timepoint ordering in the dendrogram reflects transcriptome-state similarity, not a labeling error (same as yH545).

Bottom line

Strong set. Full noPi timecourse + mock, all replicated, replicates pair correctly by condition x timepoint, normalization clean, no broken or shifted library.
Early timepoints (noPi-15m especially, also 30m) are the noisy corner: higher SD, wider RLE, shared across both reps. Genuine early-timepoint variability, not one bad library. Same pattern as yH545.
PPX1/PPN1 ~0 counts confirmed (see plots)

#chunk 05: DESeq2 DEG analysis for the 2026 E009 reseq runs
#same thresholds as before; same design as befor e
#timecourse noPi-vs-t0 (own t0): yH545, yH1053
#mock-vs-original-t0 (no own t0): yH001 (Cgla), yH149 (Klac), yH714 (Calb)
#JA will do yH181 for his analyssi 

library(DESeq2)
library(tidyverse)
library(patchwork)

padj_threshold <- 0.01
lfc_threshold  <- 1

counts_new_dir <- here("14_DESeq2_062026data", "inputs", "counts_new")
counts_og_dir  <- here("14_DESeq2_062026data", "inputs", "counts_OG")
out_base       <- here("14_DESeq2_062026data", "outputs")
fig_dir        <- here("14_DESeq2_062026data", "figures", "DEG")
dir.create(fig_dir, recursive = TRUE, showWarnings = FALSE)

#helpers
clean_gene_id <- function(ids, prefix) if (nzchar(prefix)) sub(paste0("^", prefix), "", ids) else ids
parse_tp_new  <- function(x) as.integer(str_extract(x, "\\d+(?=m\\.rep)"))   #new naming: noPi.30m.rep1
parse_cond    <- function(x) str_extract(x, "mock|noPi")
parse_tp_og   <- function(x) {                                              #old naming: t0000 zero-pad, or Xm
  z1 <- str_extract(x, "(?<=t)\\d{3,4}")
  z2 <- str_extract(x, "\\d+(?=m)")
  ifelse(!is.na(z1), as.integer(z1), as.integer(z2))
}

save_fig <- function(p, fname, width, height) {
  ggsave(file.path(fig_dir, fname), plot = p, width = width, height = height,
         dpi = 300, bg = "white")
}

#shared universe/DEG summarization (factored out so both run functions stay parallel)
summarise_run <- function(results_list) {
  all_res <- bind_rows(results_list)
  universe <- all_res |>
    group_by(gene_id) |>
    dplyr::summarise(
      baseMean = mean(baseMean, na.rm = TRUE),
      max_abs_lfc = max(abs(log2FoldChange), na.rm = TRUE),
      min_padj = min(padj, na.rm = TRUE),
      .groups = "drop"
    )
  sig_any <- all_res |>
    dplyr::filter(!is.na(padj), padj < padj_threshold, abs(log2FoldChange) > lfc_threshold) |>
    dplyr::pull(gene_id) |>
    unique()
  degs <- all_res |>
    dplyr::filter(gene_id %in% sig_any, !is.na(padj)) |>
    group_by(gene_id) |>
    dplyr::slice_min(padj, n = 1, with_ties = FALSE) |>
    ungroup() |>
    mutate(direction = ifelse(log2FoldChange > 0, "up", "down")) |>
    dplyr::select(gene_id, baseMean, log2FoldChange, padj, timepoint, direction)
  degs_up <- degs |> dplyr::filter(direction == "up")
  degs_down <- degs |> dplyr::filter(direction == "down")
  message(sprintf("  universe: %d | DEGs: %d (up: %d, down: %d)",
                  nrow(universe), nrow(degs), nrow(degs_up), nrow(degs_down)))
  list(universe = universe, degs = degs, degs_up = degs_up, degs_down = degs_down)
}

#timecourse noPi-vs-t0, reference t0 lives inside the file
run_species_deseq <- function(run_code, counts_file, id_prefix, condition_keep = NULL) {
  message(sprintf("=== %s ===", run_code))
  counts_raw <- read_tsv(counts_file, show_col_types = FALSE)
  id_col <- names(counts_raw)[1]; name_col <- names(counts_raw)[2]
  gene_ids_clean <- clean_gene_id(counts_raw[[id_col]], id_prefix)
  sample_cols <- setdiff(names(counts_raw), c(id_col, name_col))

  if (!is.null(condition_keep)) {
    sample_cols <- sample_cols[parse_cond(sample_cols) %in% condition_keep]
  }

  count_mat <- as.matrix(counts_raw[, sample_cols])
  count_mat <- apply(count_mat, 2, function(x) as.integer(round(x)))
  rownames(count_mat) <- gene_ids_clean

  tp_min <- parse_tp_new(colnames(count_mat))
  stopifnot(!any(is.na(tp_min)))
  tp_levels <- as.character(sort(unique(tp_min)))
  col_data <- data.frame(
    sample = colnames(count_mat),
    timepoint = factor(as.character(tp_min), levels = tp_levels),
    row.names = colnames(count_mat)
  )
  ref_tp <- if ("0" %in% tp_levels) "0" else tp_levels[1]
  col_data$timepoint <- relevel(col_data$timepoint, ref = ref_tp)
  test_tps <- setdiff(tp_levels, ref_tp)
  message(sprintf("  ref: %s | test: %s", ref_tp, paste(test_tps, collapse = ", ")))

  keep <- rowSums(count_mat >= 10) >= 2
  count_mat <- count_mat[keep, ]
  message(sprintf("  genes after filter: %d", nrow(count_mat)))

  dds <- DESeqDataSetFromMatrix(countData = count_mat, colData = col_data, design = ~ timepoint)
  dds <- DESeq(dds, quiet = TRUE)

  results_list <- list()
  for (tp in test_tps) {
    keep_samp <- col_data$timepoint %in% c(ref_tp, tp)
    dds_sub <- dds[, keep_samp]
    colData(dds_sub)$timepoint <- droplevels(colData(dds_sub)$timepoint)
    colData(dds_sub)$timepoint <- relevel(colData(dds_sub)$timepoint, ref = ref_tp)
    dds_sub <- DESeq(dds_sub, quiet = TRUE)
    coef_name <- resultsNames(dds_sub)[2]
    res <- lfcShrink(dds_sub, coef = coef_name, type = "apeglm", quiet = TRUE)
    res_df <- as.data.frame(res) |>
      rownames_to_column("gene_id") |>
      mutate(timepoint = tp, contrast = paste0(tp, "_vs_", ref_tp))
    results_list[[tp]] <- res_df
  }

  out <- summarise_run(results_list)
  c(list(species = run_code, type = "timecourse", dds = dds, results_list = results_list), out)
}

#mock-vs-original-t0, baseline pulled from the original-species matrix and joined on common genes
run_mock_deseq <- function(run_code, mock_file, og_file, id_prefix) {
  message(sprintf("=== %s (mock vs original t0) ===", run_code))

  m_raw <- read_tsv(mock_file, show_col_types = FALSE)
  m_id <- names(m_raw)[1]; m_nm <- names(m_raw)[2]
  m_ids <- clean_gene_id(m_raw[[m_id]], id_prefix)
  m_samp <- setdiff(names(m_raw), c(m_id, m_nm))
  m_samp <- m_samp[parse_cond(m_samp) %in% "mock"]
  m_tp <- parse_tp_new(m_samp)
  stopifnot(!any(is.na(m_tp)))
  m_mat <- as.matrix(m_raw[, m_samp]); rownames(m_mat) <- m_ids

  o_raw <- read_tsv(og_file, show_col_types = FALSE)
  o_id <- names(o_raw)[1]; o_nm <- names(o_raw)[2]
  o_ids <- clean_gene_id(o_raw[[o_id]], id_prefix)
  o_samp <- setdiff(names(o_raw), c(o_id, o_nm))
  o_tp <- parse_tp_og(o_samp)
  message(sprintf("  original timepoints parsed: %s",
                  paste(sort(unique(o_tp)), collapse = ", ")))
  ref_samp <- o_samp[!is.na(o_tp) & o_tp == 0]
  stopifnot(length(ref_samp) >= 1)              #loud fail if original t0 not found
  o_mat <- as.matrix(o_raw[, ref_samp, drop = FALSE]); rownames(o_mat) <- o_ids

  common <- intersect(rownames(m_mat), rownames(o_mat))
  message(sprintf("  mock genes: %d | og genes: %d | common: %d",
                  nrow(m_mat), nrow(o_mat), length(common)))
  m_mat <- m_mat[common, , drop = FALSE]; o_mat <- o_mat[common, , drop = FALSE]

  count_mat <- cbind(o_mat, m_mat)
  count_mat <- apply(count_mat, 2, function(x) as.integer(round(x)))
  rownames(count_mat) <- common

  grp <- c(rep("t0", ncol(o_mat)), paste0("mock", m_tp))
  grp_levels <- c("t0", paste0("mock", sort(unique(m_tp))))
  col_data <- data.frame(
    sample = colnames(count_mat),
    group = factor(grp, levels = grp_levels),
    row.names = colnames(count_mat)
  )
  col_data$group <- relevel(col_data$group, ref = "t0")

  keep <- rowSums(count_mat >= 10) >= 2
  count_mat <- count_mat[keep, ]
  message(sprintf("  genes after filter: %d", nrow(count_mat)))

  dds <- DESeqDataSetFromMatrix(countData = count_mat, colData = col_data, design = ~ group)
  dds <- DESeq(dds, quiet = TRUE)

  test_grps <- setdiff(levels(col_data$group), "t0")
  results_list <- list()
  for (g in test_grps) {
    keep_samp <- col_data$group %in% c("t0", g)
    dds_sub <- dds[, keep_samp]
    colData(dds_sub)$group <- droplevels(colData(dds_sub)$group)
    colData(dds_sub)$group <- relevel(colData(dds_sub)$group, ref = "t0")
    dds_sub <- DESeq(dds_sub, quiet = TRUE)
    coef_name <- resultsNames(dds_sub)[2]
    res <- lfcShrink(dds_sub, coef = coef_name, type = "apeglm", quiet = TRUE)
    tp <- as.character(as.integer(sub("mock", "", g)))
    res_df <- as.data.frame(res) |>
      rownames_to_column("gene_id") |>
      mutate(timepoint = tp, contrast = paste0(g, "_vs_t0"))
    results_list[[tp]] <- res_df
  }

  out <- summarise_run(results_list)
  c(list(species = run_code, type = "mock", dds = dds, results_list = results_list), out)
}

deseq_results <- list()
deseq_results$yH545  <- run_species_deseq(
  "yH545",  file.path(counts_new_dir, "yH545_salmon.merged.gene_counts.tsv"),  "gene-")
deseq_results$yH1053 <- run_species_deseq(
  "yH1053", file.path(counts_new_dir, "yH1053_salmon.merged.gene_counts.tsv"), "gene-",
  condition_keep = "noPi")
deseq_results$yH001  <- run_mock_deseq(
  "yH001", file.path(counts_new_dir, "yH001_salmon.merged.gene_counts.tsv"),
  file.path(counts_og_dir, "CGLA-salmon.merged.gene_counts.tsv"), "gene-")
deseq_results$yH149  <- run_mock_deseq(
  "yH149", file.path(counts_new_dir, "yH149_salmon.merged.gene_counts.tsv"),
  file.path(counts_og_dir, "KLAC-salmon.merged.gene_counts.tsv"), "gene-")
deseq_results$yH714  <- run_mock_deseq(
  "yH714", file.path(counts_new_dir, "yH714_salmon.merged.gene_counts.tsv"),
  file.path(counts_og_dir, "CALB-salmon.merged.gene_counts.tsv"), "")

deg_summary <- tibble(
  run = names(deseq_results),
  universe = sapply(deseq_results, function(x) nrow(x$universe)),
  degs_total = sapply(deseq_results, function(x) nrow(x$degs)),
  degs_up = sapply(deseq_results, function(x) nrow(x$degs_up)),
  degs_down = sapply(deseq_results, function(x) nrow(x$degs_down))
)
print(deg_summary)

## # A tibble: 5 × 5
##   run    universe degs_total degs_up degs_down
##   <chr>     <int>      <int>   <int>     <int>
## 1 yH545      5905       2889    1437      1452
## 2 yH1053     5861       3247    1616      1631
## 3 yH001      5110       1731     955       776
## 4 yH149      5060       1922    1003       919
## 5 yH714      5843       2459    1340      1119

for (rc in names(deseq_results)) {
  od <- file.path(out_base, rc)
  dir.create(od, recursive = TRUE, showWarnings = FALSE)
  write_csv(deseq_results[[rc]]$universe,  file.path(od, sprintf("%s_universe.csv", rc)))
  write_csv(deseq_results[[rc]]$degs,      file.path(od, sprintf("%s_degs_union.csv", rc)))
  write_csv(deseq_results[[rc]]$degs_up,   file.path(od, sprintf("%s_degs_up.csv", rc)))
  write_csv(deseq_results[[rc]]$degs_down, file.path(od, sprintf("%s_degs_down.csv", rc)))
}

## DEG landscape
deg_counts <- bind_rows(lapply(names(deseq_results), function(rc) {
  rl <- deseq_results[[rc]]$results_list
  bind_rows(lapply(names(rl), function(tp) {
    df <- rl[[tp]]
    sig <- df |> dplyr::filter(!is.na(padj), padj < padj_threshold, abs(log2FoldChange) > lfc_threshold)
    tibble(run = rc, timepoint = tp,
           Up = sum(sig$log2FoldChange > 0), Down = sum(sig$log2FoldChange < 0))
  }))
}))

#absolute time on x, numeric order, shared across runs
all_tp_levels <- as.character(sort(unique(as.numeric(deg_counts$timepoint))))

deg_long <- deg_counts |>
  mutate(tp_label = factor(timepoint, levels = all_tp_levels)) |>
  pivot_longer(cols = c(Up, Down), names_to = "direction", values_to = "count")

run_display <- c(
  yH545  = "S. cerevisiae WT (noPi)",
  yH1053 = "S. cerevisiae ppx1\u0394 ppn1\u0394 (noPi)",
  yH001  = "C. glabrata (mock)",
  yH149  = "K. lactis (mock)",
  yH714  = "C. albicans (mock)"
)
deg_long$run_display <- factor(run_display[deg_long$run], levels = run_display)
deg_long$direction <- factor(deg_long$direction, levels = c("Down", "Up"))

#mock runs derived from analysis type, not hardcoded
mock_runs <- names(deseq_results)[sapply(deseq_results, function(x) x$type == "mock")]

dir_colors <- c("Down" = "#4A90D9", "Up" = "#C0392B")
run_colors <- c(
  "S. cerevisiae WT (noPi)"                    = "#377EB8",
  "S. cerevisiae ppx1\u0394 ppn1\u0394 (noPi)" = "#984EA3",
  "C. glabrata (mock)"                         = "#E41A1C",
  "K. lactis (mock)"                           = "#F8C434",
  "C. albicans (mock)"                         = "#74AC4C"
)
run_shapes <- c(
  "S. cerevisiae WT (noPi)"                    = 21,
  "S. cerevisiae ppx1\u0394 ppn1\u0394 (noPi)" = 24,
  "C. glabrata (mock)"                         = 22,
  "K. lactis (mock)"                           = 23,
  "C. albicans (mock)"                         = 25
)

dw <- 0.75
y_max <- max(deg_long$count, na.rm = TRUE) * 1.08

#lines + points only on timecourse runs; mocks get bars alone
line_df <- deg_long |> dplyr::filter(!run %in% mock_runs)

p_facet <- ggplot(deg_long, aes(x = tp_label, y = count, fill = direction, colour = direction, group = direction)) +
  geom_col(position = position_dodge(width = dw), width = dw * 0.85, alpha = 0.45, colour = NA) +
  geom_line(data = line_df, position = position_dodge(width = dw), linewidth = 1.1) +
  geom_point(data = line_df, position = position_dodge(width = dw), size = 2, shape = 21, fill = "white", stroke = 1) +
  facet_wrap(~run_display, ncol = 2, axes = "all") +
  scale_fill_manual(values = dir_colors, name = NULL) +
  scale_colour_manual(values = dir_colors, name = NULL) +
  scale_y_continuous(limits = c(0, y_max)) +
  guides(fill = guide_legend(override.aes = list(alpha = 1)),
         colour = guide_legend(override.aes = list(linewidth = 2))) +
  labs(x = "Time (min)", y = "DEG count (|LFC| > 1)") +
  theme_prism(base_size = 12) +
  theme(strip.text = element_text(face = "italic", size = 11),
        legend.position = "bottom",
        legend.key.width = unit(1.2, "cm"),
        axis.text.x = element_text(size = 9, angle = 0))

scer_runs <- c("S. cerevisiae WT (noPi)", "S. cerevisiae ppx1\u0394 ppn1\u0394 (noPi)")
make_overlay <- function(dir, panel_color, runs_in) {
  df <- deg_long |> dplyr::filter(direction == dir, run_display %in% runs_in)
  ggplot(df, aes(x = tp_label, y = count, colour = run_display, shape = run_display, group = run_display)) +
    geom_line(linewidth = 1.3) +
    geom_point(size = 3.5, fill = "white", stroke = 1.2) +
    scale_colour_manual(values = run_colors, name = NULL) +
    scale_shape_manual(values = run_shapes, name = NULL) +
    scale_y_continuous(limits = c(0, y_max)) +
    labs(title = paste0(ifelse(dir == "Down", "Downregulated", "Upregulated"), " DEGs (WT vs mutant)"),
         x = "Time (min)", y = if (dir == "Down") "DEG count (|LFC| > 1)" else NULL) +
    theme_prism(base_size = 12) +
    theme(plot.title = element_text(colour = panel_color, face = "bold", size = 13),
          legend.position = "bottom",
          legend.text = element_text(face = "italic", size = 10),
          axis.text.x = element_text(size = 9, angle = 0))
}

p_down <- make_overlay("Down", dir_colors["Down"], scer_runs)
p_up   <- make_overlay("Up",   dir_colors["Up"],   scer_runs)

p_combined <- p_facet /
  (p_down | p_up) +
  plot_layout(heights = c(1.2, 0.7)) +
  plot_annotation(tag_levels = "A",
                  theme = theme(plot.tag = element_text(size = 14, face = "bold")))

save_fig(p_combined, paste0(Sys.Date(), "_deg_landscape_combined.png"), width = 13, height = 13)
p_combined

save_fig(p_facet, paste0(Sys.Date(), "_deg_landscape_A_faceted.png"), width = 11, height = 9)
save_fig(p_down,  paste0(Sys.Date(), "_deg_landscape_B_down_overlay.png"), width = 7, height = 5)
save_fig(p_up,    paste0(Sys.Date(), "_deg_landscape_C_up_overlay.png"), width = 7, height = 5)

#individual per-run landscape into figures/DEG/<run>/
#mocks: bars only; timecourses: bars + line + points. own timepoints + own y-axis per plot
make_run_plot <- function(rc) {
  df <- deg_long |> dplyr::filter(run == rc)
  df$tp_label <- droplevels(df$tp_label)
  is_mock <- deseq_results[[rc]]$type == "mock"
  disp <- as.character(unique(df$run_display))
  run_ymax <- max(df$count, na.rm = TRUE) * 1.08

  p <- ggplot(df, aes(x = tp_label, y = count, fill = direction, colour = direction, group = direction)) +
    geom_col(position = position_dodge(width = dw), width = dw * 0.85, alpha = 0.45, colour = NA)
  if (!is_mock) {
    p <- p +
      geom_line(position = position_dodge(width = dw), linewidth = 1.1) +
      geom_point(position = position_dodge(width = dw), size = 2, shape = 21, fill = "white", stroke = 1)
  }
  p +
    scale_fill_manual(values = dir_colors, name = NULL) +
    scale_colour_manual(values = dir_colors, name = NULL) +
    scale_y_continuous(limits = c(0, run_ymax)) +
    guides(fill = guide_legend(override.aes = list(alpha = 1)),
           colour = if (is_mock) "none" else guide_legend(override.aes = list(linewidth = 2))) +
    labs(title = disp, x = "Time (min)", y = "DEG count (|LFC| > 1)") +
    theme_prism(base_size = 12) +
    theme(plot.title = element_text(face = "italic", size = 13),
          legend.position = "bottom",
          axis.text.x = element_text(size = 9, angle = 0))
}

for (rc in names(deseq_results)) {
  rd <- file.path(fig_dir, rc)
  dir.create(rd, recursive = TRUE, showWarnings = FALSE)
  ggsave(file.path(rd, sprintf("%s_%s_deg_landscape.png", Sys.Date(), rc)),
         plot = make_run_plot(rc), width = 7, height = 5, dpi = 300, bg = "white")
}

#mock-vs-noPi DEG overlap per mock species (handling-artifact isolation)
#shared (same-direction) DEGs = experimental-handling response, common to both arms
#noPi-unique = phosphate-specific response; canonical PHO regulon must land here, NOT in shared (pos contrl)
#same baseline both arms: original-timecourse t0. noPi restricted to 30/60 to match mock time depth
#re-runs original noPi-vs-t0 (30/60) on counts_OG for reproducibility
#scer_inrun (yH545/yH1053): mock-vs-t0 and noPi-vs-t0 both within-run, shared 2026 noPi t0
#mock DEGs reused

library(ggVennDiagram)

padj_threshold <- 0.01
lfc_threshold  <- 1
venn_tps <- c("30", "60")

counts_new_dir <- here("14_DESeq2_062026data", "inputs", "counts_new")
counts_og_dir  <- here("14_DESeq2_062026data", "inputs", "counts_OG")
pho_dir        <- here("14_DESeq2_062026data", "inputs")
out_base       <- here("14_DESeq2_062026data", "outputs")
fig_dir        <- here("14_DESeq2_062026data", "figures", "DEG")

#mock_only pulls mock DEGs from chunk 05 and re-runs the original noPi arm
run_cfg <- list(
  yH545  = list(type = "scer_inrun", new = "yH545_salmon.merged.gene_counts.tsv",
                prefix = "gene-", sp = "Scer", disp = "Scer WT"),
  yH1053 = list(type = "scer_inrun", new = "yH1053_salmon.merged.gene_counts.tsv",
                prefix = "gene-", sp = "Scer", disp = "Scer ppx1\u0394 ppn1\u0394"),
  yH001  = list(type = "mock_only", og = "CGLA-salmon.merged.gene_counts.tsv",
                prefix = "gene-", sp = "Cgla", disp = "Cgla"),
  yH149  = list(type = "mock_only", og = "KLAC-salmon.merged.gene_counts.tsv",
                prefix = "gene-", sp = "Klac", disp = "Klac"),
  yH714  = list(type = "mock_only", og = "CALB-salmon.merged.gene_counts.tsv",
                prefix = "",      sp = "Calb", disp = "Calb")
)

deg_set_from_results <- function(results_list, tps) {
  bind_rows(results_list[names(results_list) %in% tps]) |>
    dplyr::filter(!is.na(padj), padj < padj_threshold, abs(log2FoldChange) > lfc_threshold) |>
    group_by(gene_id) |>
    dplyr::slice_min(padj, n = 1, with_ties = FALSE) |>
    ungroup() |>
    mutate(direction = ifelse(log2FoldChange > 0, "up", "down")) |>
    dplyr::select(gene_id, direction)
}

run_og_nopi_deseq <- function(og_file, id_prefix, tps) {
  raw <- read_tsv(og_file, show_col_types = FALSE)
  id_col <- names(raw)[1]; name_col <- names(raw)[2]
  ids_clean <- clean_gene_id(raw[[id_col]], id_prefix)
  samp <- setdiff(names(raw), c(id_col, name_col))

  mat <- as.matrix(raw[, samp]); rownames(mat) <- ids_clean
  tp <- parse_tp_og(colnames(mat))
  stopifnot(!any(is.na(tp)))
  stopifnot(0 %in% tp)
  stopifnot(all(as.integer(tps) %in% tp))

  keep <- rowSums(mat >= 10) >= 2
  mat <- mat[keep, ]
  mat <- apply(mat, 2, function(x) as.integer(round(x)))
  rownames(mat) <- ids_clean[keep]

  col_data <- data.frame(
    sample = colnames(mat),
    timepoint = factor(as.character(tp), levels = as.character(sort(unique(tp)))),
    row.names = colnames(mat)
  )
  col_data$timepoint <- relevel(col_data$timepoint, ref = "0")

  results_list <- list()
  for (t in tps) {
    keep_samp <- col_data$timepoint %in% c("0", t)
    sub_mat <- mat[, keep_samp]
    cd <- droplevels(col_data[keep_samp, , drop = FALSE])
    cd$timepoint <- relevel(cd$timepoint, ref = "0")
    dds <- DESeqDataSetFromMatrix(sub_mat, cd, design = ~ timepoint)
    dds <- DESeq(dds, quiet = TRUE)
    res <- lfcShrink(dds, coef = resultsNames(dds)[2], type = "apeglm", quiet = TRUE)
    results_list[[t]] <- as.data.frame(res) |>
      rownames_to_column("gene_id") |>
      mutate(timepoint = t)
  }
  results_list
}

run_inrun_mock_deseq <- function(counts_file, id_prefix, tps) {
  raw <- read_tsv(counts_file, show_col_types = FALSE)
  id_col <- names(raw)[1]; name_col <- names(raw)[2]
  ids_clean <- clean_gene_id(raw[[id_col]], id_prefix)
  samp <- setdiff(names(raw), c(id_col, name_col))

  cond <- parse_cond(samp)
  tp   <- parse_tp_new(samp)
  t0_samp   <- samp[cond == "noPi" & tp == 0]
  mock_samp <- samp[cond == "mock" & tp %in% as.integer(tps)]
  stopifnot(length(t0_samp) >= 2)
  stopifnot(length(mock_samp) >= 1)

  keep_cols <- c(t0_samp, mock_samp)
  mat <- as.matrix(raw[, keep_cols])
  mat <- apply(mat, 2, function(x) as.integer(round(x)))
  rownames(mat) <- ids_clean

  mock_tp_vec <- tp[match(mock_samp, samp)]
  grp <- c(rep("t0", length(t0_samp)), paste0("mock", mock_tp_vec))
  grp_levels <- c("t0", paste0("mock", sort(unique(mock_tp_vec))))
  col_data <- data.frame(
    sample = keep_cols,
    group = factor(grp, levels = grp_levels),
    row.names = keep_cols
  )
  col_data$group <- relevel(col_data$group, ref = "t0")

  keep <- rowSums(mat >= 10) >= 2
  mat <- mat[keep, ]
  message(sprintf("  in-run mock: t0 libs %d | mock libs %d | genes after filter %d",
                  length(t0_samp), length(mock_samp), nrow(mat)))

  test_grps <- setdiff(levels(col_data$group), "t0")
  results_list <- list()
  for (g in test_grps) {
    keep_samp <- col_data$group %in% c("t0", g)
    sub_mat <- mat[, keep_samp]
    cd <- droplevels(col_data[keep_samp, , drop = FALSE])
    cd$group <- relevel(cd$group, ref = "t0")
    dds <- DESeqDataSetFromMatrix(sub_mat, cd, design = ~ group)
    dds <- DESeq(dds, quiet = TRUE)
    res <- lfcShrink(dds, coef = resultsNames(dds)[2], type = "apeglm", quiet = TRUE)
    t <- as.character(as.integer(sub("mock", "", g)))
    results_list[[t]] <- as.data.frame(res) |>
      rownames_to_column("gene_id") |>
      mutate(timepoint = t)
  }
  results_list
}

venn_summary <- list()
pho_report   <- list()

for (rc in names(run_cfg)) {
  cfg <- run_cfg[[rc]]
  message(sprintf("=== %s (%s) mock vs noPi (30/60) ===", rc, cfg$disp))

  if (cfg$type == "mock_only") {
    mock_deg  <- deg_set_from_results(deseq_results[[rc]]$results_list, venn_tps)
    og_res    <- run_og_nopi_deseq(file.path(counts_og_dir, cfg$og), cfg$prefix, venn_tps)
    nopi_deg  <- deg_set_from_results(og_res, venn_tps)
    univ_mock <- unique(bind_rows(deseq_results[[rc]]$results_list)$gene_id)
    univ_nopi <- unique(bind_rows(og_res)$gene_id)
  } else {
    mock_res  <- run_inrun_mock_deseq(file.path(counts_new_dir, cfg$new), cfg$prefix, venn_tps)
    mock_deg  <- deg_set_from_results(mock_res, venn_tps)
    nopi_deg  <- deg_set_from_results(deseq_results[[rc]]$results_list, venn_tps)
    univ_mock <- unique(bind_rows(mock_res)$gene_id)
    univ_nopi <- unique(bind_rows(deseq_results[[rc]]$results_list)$gene_id)
  }

  #common testable universe (genes tested in both arms) for a fair Venn
  common_univ <- intersect(univ_mock, univ_nopi)
  mock_deg <- mock_deg |> dplyr::filter(gene_id %in% common_univ)
  nopi_deg <- nopi_deg |> dplyr::filter(gene_id %in% common_univ)
  message(sprintf("  testable universe: mock %d | noPi %d | common %d",
                  length(univ_mock), length(univ_nopi), length(common_univ)))

  joined <- full_join(
    mock_deg |> dplyr::rename(dir_mock = direction),
    nopi_deg |> dplyr::rename(dir_nopi = direction),
    by = "gene_id"
  )
  shared_same <- joined |> dplyr::filter(!is.na(dir_mock), !is.na(dir_nopi), dir_mock == dir_nopi)
  shared_opp  <- joined |> dplyr::filter(!is.na(dir_mock), !is.na(dir_nopi), dir_mock != dir_nopi)
  mock_only   <- joined |> dplyr::filter(!is.na(dir_mock), is.na(dir_nopi))
  nopi_only   <- joined |> dplyr::filter(is.na(dir_mock), !is.na(dir_nopi))

  message(sprintf("  mock DEGs: %d | noPi DEGs: %d | shared(same-dir): %d | shared(opp-dir): %d",
                  nrow(mock_deg), nrow(nopi_deg), nrow(shared_same), nrow(shared_opp)))

  #Venn on same-direction membership; short labels + x-expansion so names don't clip
  venn_sets <- list(Mock = mock_deg$gene_id, noPi = nopi_deg$gene_id)
  p_venn <- ggVennDiagram(venn_sets, label = "count", label_alpha = 0,
                          set_color = c("#999999", "#377EB8")) +
    scale_fill_gradient(low = "#F7FBFF", high = "#C0392B") +
    scale_x_continuous(expand = expansion(mult = 0.2)) +
    labs(title = sprintf("%s: mock vs noPi DEGs (30/60, same-direction)", cfg$disp)) +
    theme(plot.title = element_text(face = "bold", size = 13),
          legend.position = "none")

  rd <- file.path(fig_dir, rc)
  dir.create(rd, recursive = TRUE, showWarnings = FALSE)
  ggsave(file.path(rd, sprintf("%s_%s_venn_mock_vs_nopi.png", Sys.Date(), rc)),
         plot = p_venn, width = 6, height = 5, dpi = 300, bg = "white")

  #region membership table out
  region_tbl <- bind_rows(
    shared_same |> dplyr::transmute(gene_id, region = "shared_same_dir", dir_mock, dir_nopi),
    shared_opp  |> dplyr::transmute(gene_id, region = "shared_opp_dir",  dir_mock, dir_nopi),
    mock_only   |> dplyr::transmute(gene_id, region = "mock_only",  dir_mock, dir_nopi),
    nopi_only   |> dplyr::transmute(gene_id, region = "nopi_only",  dir_mock, dir_nopi)
  )
  dir.create(file.path(out_base, rc), recursive = TRUE, showWarnings = FALSE)
  write_csv(region_tbl, file.path(out_base, rc, sprintf("%s_venn_regions.csv", rc)))

  ## PHO sanity check
  #yH1053: PPX1/PPN1 are deleted (~0 counts) so they drop from the universe and won't appear
  #in the PHO table; expected, not a control leak
  pho <- read_csv(file.path(pho_dir, sprintf("PHO_regulon_%s_mapped1.csv", cfg$sp)),
                  show_col_types = FALSE) |>
    dplyr::filter(!is.na(gene_id)) |>
    mutate(gene_id_clean = clean_gene_id(gene_id, cfg$prefix))

  #assert PHO genes actually reached the tested universe; loud if mapping/prefix mismatch
  pho_in_univ <- pho |> dplyr::filter(gene_id_clean %in% common_univ)
  if (nrow(pho_in_univ) < max(3, 0.3 * nrow(pho))) {
    warning(sprintf("[%s] only %d/%d PHO genes in tested universe; check ID prefix/mapping",
                    cfg$sp, nrow(pho_in_univ), nrow(pho)))
  }

  pho_regions <- region_tbl |>
    dplyr::inner_join(pho |> dplyr::select(gene_id_clean, scer_gene_name, functional_category),
                      by = c("gene_id" = "gene_id_clean"))

  message(sprintf("PHO genes: %d mapped, %d in universe, %d called in either arm",
                  nrow(pho), nrow(pho_in_univ), nrow(pho_regions)))
  pho_shared <- pho_regions |> dplyr::filter(region == "shared_same_dir")
  if (nrow(pho_shared) > 0) {
    message(sprintf("PHO genes in SHARED set (control leak?): %s",
                    paste(pho_shared$scer_gene_name, collapse = ", ")))
  } else {
    message("PHO check OK")
  }

  pho_report[[rc]] <- pho_regions |>
    dplyr::transmute(run = rc, species = cfg$sp, scer_gene_name, functional_category,
                     region, dir_mock, dir_nopi)
  write_csv(pho_report[[rc]],
            file.path(out_base, rc, sprintf("%s_PHO_venn_membership.csv", rc)))

  venn_summary[[rc]] <- tibble(
    run = rc, species = cfg$sp, type = cfg$type,
    mock_degs = nrow(mock_deg), nopi_degs = nrow(nopi_deg),
    shared_same = nrow(shared_same), shared_opp = nrow(shared_opp),
    mock_only = nrow(mock_only), nopi_only = nrow(nopi_only),
    pho_in_universe = nrow(pho_in_univ), pho_in_shared = nrow(pho_shared)
  )
}

venn_summary <- bind_rows(venn_summary)
print(venn_summary)

## # A tibble: 5 × 11
##   run    species type       mock_degs nopi_degs shared_same shared_opp mock_only
##   <chr>  <chr>   <chr>          <int>     <int>       <int>      <int>     <int>
## 1 yH545  Scer    scer_inrun       549       964         527          0        22
## 2 yH1053 Scer    scer_inrun       531      1524         478          6        47
## 3 yH001  Cgla    mock_only       1729      2186         588        570       571
## 4 yH149  Klac    mock_only       1916      1991        1329         65       522
## 5 yH714  Calb    mock_only       2458      2212        1822         34       602
## # ℹ 3 more variables: nopi_only <int>, pho_in_universe <int>,
## #   pho_in_shared <int>

pho_report_all <- bind_rows(pho_report)
print(pho_report_all |> dplyr::arrange(species, run, functional_category, scer_gene_name))

## # A tibble: 73 × 7
##    run   species scer_gene_name functional_category region     dir_mock dir_nopi
##    <chr> <chr>   <chr>          <chr>               <chr>      <chr>    <chr>   
##  1 yH714 Calb    DDP1           Phosphatases        mock_only  down     <NA>    
##  2 yH714 Calb    GDE1           Phosphatases        shared_sa… up       up      
##  3 yH714 Calb    HOR2           Phosphatases        shared_sa… down     down    
##  4 yH714 Calb    PHM8           Phosphatases        shared_sa… down     down    
##  5 yH714 Calb    PHO11          Phosphatases        nopi_only  <NA>     up      
##  6 yH714 Calb    PHO8           Phosphatases        shared_sa… down     down    
##  7 yH714 Calb    VTC1           Poly P Metabolism   nopi_only  <NA>     up      
##  8 yH714 Calb    VTC2           Poly P Metabolism   nopi_only  <NA>     up      
##  9 yH714 Calb    VTC4           Poly P Metabolism   nopi_only  <NA>     up      
## 10 yH714 Calb    PHO81          Regulatory          nopi_only  <NA>     up      
## # ℹ 63 more rows

E009 DEG landscape

Per-timepoint DEG counts (noPi-vs-t0, |LFC|>1, padj<0.01) across the full Scer timecourse, for yH545 (WT) and yH1053 (ppx1Δ ppn1Δ) as we did before. This is the shape and timing of the starvation program.

WT (yH545) has the same 60, 90 trough in new data

Early wave (15–45 min): large and immediate, ~1600 DEGs combined at 15 min (~870 down, ~810 up). The cell responds hard right away. We did not have this resolution before.
Trough (60–90 min): the defining feature we saw before. DEG counts collapse, down-regulated genes drop to ~45 at 60 and 90 min, up to _270/180. Luckily, in our new data, this is reproducible, even to similar magnitude compared to previous results
Late wave (120–480 min): the dominant, sustained program. Climbs out of the trough at 120, reaches ~960 down / ~900 up by 180, plateaus near _1000/1000 through 480. In our previous data, the 150m timepoint saw the late wave beginning.

Mutant (yH1053) DEG Landscape

Trough is delayed and shallower. WT bottoms out at 60–90 min (down ~45). The mutant’s trough is later and far less deep, down-DEGs only fall to ~240 at 90 min and the collapse doesn’t happen at 60.
Early wave blunted, late wave amplified. Mutant early response is smaller (~390 down, ~580 up at 15 vs WT’s _870/810). But the late wave overtakes WT: the curves cross around 240–300 min and the mutant ends highest at 480 (down ~1390 vs WT ~980; up ~1320 vs ~1040).
Peak timing shifts late. 75% of mutant up-genes (1215 of 1616) peak at 360–480 min, versus WT’s balanced early/late split. The whole program is shoved toward the late.

my interpretation: PPX1/PPN1 hydrolyze stored polyphosphate. Without that buffer, the mutant hits internal Pi limitation almost immediately on phosphate withdrawal, so Pho4 engages early and the PHO-specific program fires ahead of WT. In our 2026 data, the full regulon is induced by 30/60 min (PHO5 LFC ~5.5 by 60) where WT shows only its fastest genes. This early PHO induction is a new phenotype we didn’t see in WT.

This is separable from the genome-wide count. The mutant’s total early DEG count is lower than WT’s (the generic early stress/handling burst is smaller), but its phosphate-specific induction is earlier and stronger.

E009 mock-vs-noPi analysis

What this analysis is, and why it’s separate from the landscape.: This analysis performs a mock contrast analysis to our noPi data. The noPi arm = handling + phosphate starvation. The mock arm = handling alone (resuspended in Pi-replete medium). So genes that move in both arms are the handling artifact; genes in the noPi arm only are the phosphate-specific response.

Design. Per species, mock DEGs vs noPi DEGs at 30/60 only (mocks have just those two timepoints, so noPi is restricted to match, like-for-like). Both arms referenced to the same t0. Overlap is same-direction (a gene up in one arm and down in the other is not “shared”, it’s reported separately). Three comparative-species Venns (Cgla/Klac/Calb) plus the two Scer runs’ own mock arms. phosphate-specific genes were used as a “positive control” if you will as must land in noPi-only, not in the shared set.

Scer

WT (yH545): noPi 30/60 calls ~964 DEGs, of which 437 are noPi-only and 527 are shared with mock; mock-only is just 22. So the mock arm barely does anything on its own, almost everything mock does, noPi also does, meaning the noPi response is genuinely phosphate-driven, not handling.
Important caveat for WT: the shared (handling) block (527) is larger than the phosphate-specific block (437) at 30/60. WT’s early response is handling-dominated at these timepoints (consistent with the landscape’s early stress wave). So phosphate-specific conclusions for WT at 30/60 rest on the noPi-only slice, not on raw noPi DEG counts. The phosphate program proper is the late wave, past this window.
Mutant (yH1053): most phosphate-specific of all, 1040 noPi-only (68% of its noPi set), 484 shared, 47 mock-only.

Comparative species have an active mock arm

Cgla, Klac, Calb each show 520–600 mock-only DEGs, the medium swap itself drives a large response in these species. For Calb the mock arm (2458) is larger than noPi (2212).

Cgla weird result

Cgla PHO84 is up in both arms (in the shared set)
Not really sure where this is from

Cross-species PHO read

Generally, when we take the PHO set JA curated, we find that those genes are only induced in the noPi arm. The mock arm includes genes like HOR2 or CBF1 that respond similarly in certain species mock+nopi backgrounds. The one I cannot reconcile yet is the cgPHO84 described above.

library(edgeR)
#took this from 11212025 TMM processing 
counts_new_dir <- here("14_DESeq2_062026data", "inputs", "counts_new")
out_base       <- here("14_DESeq2_062026data", "outputs")

lcpm_runs <- list(
  yH545  = list(file = "yH545_salmon.merged.gene_counts.tsv",  prefix = "gene-"),
  yH1053 = list(file = "yH1053_salmon.merged.gene_counts.tsv", prefix = "gene-"),
  yH001  = list(file = "yH001_salmon.merged.gene_counts.tsv",  prefix = "gene-"),
  yH149  = list(file = "yH149_salmon.merged.gene_counts.tsv",  prefix = "gene-"),
  yH714  = list(file = "yH714_salmon.merged.gene_counts.tsv",  prefix = "")
)

make_lcpm <- function(run_code, counts_file, id_prefix) {
  raw <- read_tsv(counts_file, show_col_types = FALSE)
  id_col <- names(raw)[1]; name_col <- names(raw)[2]
  gene_ids <- clean_gene_id(raw[[id_col]], id_prefix)
  sample_cols <- setdiff(names(raw), c(id_col, name_col))

  count_mat <- as.matrix(raw[, sample_cols])
  count_mat <- apply(count_mat, 2, function(x) as.integer(round(x)))
  rownames(count_mat) <- gene_ids

  keep <- rowSums(count_mat >= 10) >= 2
  count_mat <- count_mat[keep, ]

  dge <- DGEList(counts = count_mat)
  dge <- calcNormFactors(dge, method = "TMM")
  lcpm <- edgeR::cpm(dge, log = TRUE)

  as.data.frame(lcpm) |>
    rownames_to_column("gene_id")
}

fmt_tp_label <- function(tp_min) {
  ifelse(tp_min < 60, paste0(tp_min, "min"), paste0(tp_min / 60, "h"))
}

build_sample_info <- function(samples) {
  si <- tibble(Sample = samples) |>
    mutate(
      Condition = parse_cond(Sample),
      tp_min    = parse_tp_new(Sample),
      Replicate = as.integer(str_extract(Sample, "(?<=rep)\\d+"))
    )
  stopifnot(!any(is.na(si$Condition)), !any(is.na(si$tp_min)), !any(is.na(si$Replicate)))
  si |>
    arrange(factor(Condition, levels = c("noPi", "mock")), tp_min, Replicate) |>
    group_by(Condition, tp_min) |>
    mutate(StandardizedReplicate = dense_rank(Replicate)) |>
    ungroup() |>
    transmute(Sample, Condition, Timepoint = fmt_tp_label(tp_min),
              Replicate, StandardizedReplicate)
}

for (rc in names(lcpm_runs)) {
  cfg <- lcpm_runs[[rc]]
  mat <- make_lcpm(rc, file.path(counts_new_dir, cfg$file), cfg$prefix)

  sample_info <- build_sample_info(setdiff(names(mat), "gene_id"))
  mat <- mat[, c("gene_id", sample_info$Sample)]

  lcpm_dir <- file.path(out_base, rc, "lcpm")
  dir.create(lcpm_dir, recursive = TRUE, showWarnings = FALSE)
  write_csv(mat, file.path(lcpm_dir, sprintf("%s_tmm_log2cpm.csv", rc)))
  write_csv(sample_info, file.path(lcpm_dir, sprintf("%s_sample_info.csv", rc)))
}

#processing Rlog here 
make_rlog <- function(run_code, counts_file, id_prefix) {
  raw <- read_tsv(counts_file, show_col_types = FALSE)
  id_col <- names(raw)[1]; name_col <- names(raw)[2]
  gene_ids <- clean_gene_id(raw[[id_col]], id_prefix)
  sample_cols <- setdiff(names(raw), c(id_col, name_col))

  count_mat <- as.matrix(raw[, sample_cols])
  count_mat <- apply(count_mat, 2, function(x) as.integer(round(x)))
  rownames(count_mat) <- gene_ids

  col_data <- data.frame(
    condition = factor(parse_cond(colnames(count_mat))),
    timepoint = factor(parse_tp_new(colnames(count_mat))),
    row.names = colnames(count_mat)
  )
  design <- if (nlevels(col_data$condition) > 1) ~ condition + timepoint else ~ timepoint

  dds <- DESeqDataSetFromMatrix(countData = count_mat, colData = col_data, design = design)
  dds <- DESeq(dds, quiet = TRUE)
  rld <- assay(rlog(dds, blind = FALSE))

  keep <- rowSums(count_mat) >= 10
  as.data.frame(rld[keep, ]) |>
    rownames_to_column("gene_id")
}

for (rc in names(lcpm_runs)) {
  cfg <- lcpm_runs[[rc]]
  mat <- make_rlog(rc, file.path(counts_new_dir, cfg$file), cfg$prefix)

  sample_info <- build_sample_info(setdiff(names(mat), "gene_id"))
  mat <- mat[, c("gene_id", sample_info$Sample)]

  rlog_dir <- file.path(out_base, rc, "rlog")
  dir.create(rlog_dir, recursive = TRUE, showWarnings = FALSE)
  write_csv(mat, file.path(rlog_dir, sprintf("%s_rlog.csv", rc)))
  write_csv(sample_info, file.path(rlog_dir, sprintf("%s_sample_info.csv", rc)))
}

060402026-raw-count-process_HE26069

Hubert Kicinski

2026-06-04

Checking for Batch Effects

Notes

Counts QC, remaining species

yH001 (C. glabrata, mock) counts QC notes

Library size barplot

Count distribution boxplot

PCA

Replicate SD boxplot

RLE boxplot

Sample distance heatmap

MA plot, 30m rep1 vs rep2

Bottom line

yH149 (K. lactis, mock) counts QC notes

Library size barplot

Count distribution boxplot

PCA

Replicate SD boxplot

RLE boxplot

Sample distance heatmap

Bottom line

yH181 (C. glabrata, casamino acids + glucose) counts QC notes

Library size barplot

Count distribution boxplot

PCA

Replicate SD boxplot

RLE boxplot

Sample distance heatmap

Bottom line

yH714 (C. albicans, mock) counts QC notes

Library size barplot

Count distribution boxplot

PCA

Replicate SD boxplot

RLE boxplot

Sample distance heatmap

Bottom line

yH1053 (S. cerevisiae ppx1Δ ppn1Δ, noPi + mock) counts QC notes

Library size barplot

Count distribution boxplot

PCA

Replicate SD boxplot

RLE boxplot

Sample distance heatmap

Bottom line

E009 DEG landscape

WT (yH545) has the same 60, 90 trough in new data

Mutant (yH1053) DEG Landscape

E009 mock-vs-noPi analysis

Scer

Comparative species have an active mock arm

Cgla weird result

Cross-species PHO read