Given that we are aiming at replacing the OG S. cerevisiae data with the new yH545 data, I want to interrogate the batch effects present between the two runs.
To do this, I will run a joint normalization of TMM followed by Log2CPM as originally performed. The goal of the joint normalization is to ensure that the normalization factor is equal for the two runs. Otherwise, the two separately normalized runs would not be directly comparable.
library(here)
library(edgeR)
library(dplyr)
library(tibble)
library(tidyr)
library(stringr)
library(ggplot2)
library(ggprism)
base_dir <- here("14_DESeq2_062026data")
input_dir <- file.path(base_dir, "inputs")
output_dir <- file.path(base_dir, "outputs", "yH545")
figure_dir <- file.path(base_dir, "figures", "Counts-QC", "yH545")
new_counts_file <- file.path(input_dir, "counts_new", "yH545_salmon.merged.gene_counts.tsv")
og_counts_file <- file.path(input_dir, "counts_OG", "SCER-salmon.merged.gene_counts.tsv")
stopifnot(file.exists(new_counts_file), file.exists(og_counts_file))
save_fig <- function(plot, filename, width = 10, height = 8) {
ggsave(file.path(figure_dir, filename), plot = plot,
width = width, height = height, dpi = 300)
}
collapse_lanes <- function(counts_raw) {
has_lane <- any(grepl("\\.L00[12]$", colnames(counts_raw)))
if (!has_lane) return(as.matrix(counts_raw))
sample_base <- gsub("\\.L00[12]$", "", colnames(counts_raw))
unique_samples <- unique(sample_base)
collapsed <- sapply(unique_samples, function(samp) {
cols <- grep(paste0("^", samp, "\\.L00"), colnames(counts_raw), value = TRUE)
rowSums(counts_raw[, cols, drop = FALSE])
})
rownames(collapsed) <- rownames(counts_raw)
collapsed
}
extract_tp_min <- function(sample_names) {
m <- as.integer(gsub("m", "", str_extract(sample_names, "[0-9]+m")))
t <- as.integer(str_remove(str_extract(sample_names, "t[0-9]+"), "t"))
ifelse(!is.na(m), m, t)
}
new_raw <- read.delim(new_counts_file, row.names = 1, check.names = FALSE)
og_raw <- read.delim(og_counts_file, row.names = 1, check.names = FALSE)
new_raw <- new_raw[, !colnames(new_raw) %in% "gene_name", drop = FALSE]
og_raw <- og_raw[, !colnames(og_raw) %in% "gene_name", drop = FALSE]
new_counts <- collapse_lanes(new_raw)
og_counts <- collapse_lanes(og_raw)
new_counts <- new_counts[, !grepl("mock", colnames(new_counts), ignore.case = TRUE), drop = FALSE]
og_counts <- og_counts[, !grepl("mock", colnames(og_counts), ignore.case = TRUE), drop = FALSE]
common_genes <- intersect(rownames(new_counts), rownames(og_counts))
new_counts <- new_counts[common_genes, , drop = FALSE]
og_counts <- og_counts[common_genes, , drop = FALSE]
colnames(new_counts) <- paste0("new__", colnames(new_counts))
colnames(og_counts) <- paste0("old__", colnames(og_counts))
combined <- cbind(og_counts, new_counts)
combined <- apply(combined, 2, function(x) as.integer(round(x)))
rownames(combined) <- common_genes
meta <- tibble(
sample = colnames(combined),
batch = ifelse(grepl("^new__", sample), "new (2026)", "old (2023)"),
timepoint_min = extract_tp_min(sample)
) |>
mutate(
batch = factor(batch, levels = c("old (2023)", "new (2026)")),
timepoint_min = factor(timepoint_min, levels = sort(unique(timepoint_min)))
)
#keep unfiltered matrix for library size
combined_raw_full <- combined
keep <- rowSums(combined >= 10) >= 2
combined <- combined[keep, ]
dge <- DGEList(counts = combined)
dge <- calcNormFactors(dge, method = "TMM")
lcpm <- cpm(dge, log = TRUE, prior.count = 2)
write.csv(lcpm, file.path(output_dir, "01_combined_scer_jointTMM_log2cpm.csv"))
pca <- prcomp(t(lcpm), center = TRUE, scale. = FALSE)
var_explained <- (pca$sdev^2) / sum(pca$sdev^2) * 100
pca_df <- as.data.frame(pca$x[, 1:2]) |>
rownames_to_column("sample") |>
left_join(meta, by = "sample") |>
mutate(timepoint_min = droplevels(timepoint_min))
stopifnot(nrow(pca_df) == ncol(lcpm), !any(is.na(pca_df$timepoint_min)))
shared_tps <- c(0, 30, 60, 90, 120, 180, 240, 360, 480)
pca_df <- pca_df |>
mutate(shared = as.integer(as.character(timepoint_min)) %in% shared_tps)
write.csv(pca_df, file.path(output_dir, "01_pca_coordinates.csv"), row.names = FALSE)
x_lab <- sprintf("PC1 (%.1f%%)", var_explained[1])
y_lab <- sprintf("PC2 (%.1f%%)", var_explained[2])
p_batch <- ggplot(pca_df, aes(x = PC1, y = PC2, fill = batch)) +
geom_point(shape = 21, size = 3.5, stroke = 0.8, colour = "grey20") +
scale_fill_manual(values = c("old (2023)" = "#377EB8", "new (2026)" = "#E41A1C"),
name = NULL) +
labs(x = x_lab, y = y_lab, title = "Combined S. cerevisiae PCA, by batch") +
theme_prism(base_size = 12) +
theme(legend.position = "bottom")
p_time <- ggplot(pca_df, aes(x = PC1, y = PC2, fill = timepoint_min, shape = batch)) +
geom_point(size = 3.5, stroke = 0.8, colour = "grey20") +
scale_shape_manual(values = c("old (2023)" = 21, "new (2026)" = 24), name = NULL) +
scale_fill_viridis_d(option = "viridis", name = "timepoint (min)") +
guides(fill = guide_legend(override.aes = list(shape = 21, colour = "grey20"))) +
labs(x = x_lab, y = y_lab, title = "Combined S. cerevisiae PCA, by timepoint") +
theme_prism(base_size = 12) +
theme(legend.position = "right")
save_fig(p_batch, "01_pca_by_batch.png", width = 8, height = 7)
save_fig(p_time, "01_pca_by_timepoint.png", width = 9, height = 7)
p_batch
p_time
It appears that batch is not a dominant axis of variation.
PC1 seems to be the time axis. As you move from -PC1 to +PC1, you move across the timecourse, early to late. PC2 is some additional response axis. However, you cannot call it a batch. All together, this explains 73.5% variance.
extract_tp_min <- function(sample_names) {
m <- as.integer(gsub("m", "", str_extract(sample_names, "[0-9]+m")))
t <- as.integer(str_remove(str_extract(sample_names, "t[0-9]+"), "t"))
ifelse(!is.na(m), m, t)
}
lcpm <- as.matrix(read.csv(
file.path(output_dir, "01_combined_scer_jointTMM_log2cpm.csv"),
row.names = 1, check.names = FALSE
))
sample_meta <- tibble(sample = colnames(lcpm)) |>
mutate(
batch = ifelse(grepl("^new__", sample), "new", "old"),
timepoint_min = extract_tp_min(sample)
)
shared_tps <- c(0, 30, 60, 90, 120, 180, 240, 360, 480)
lfc_tps <- setdiff(shared_tps, 0)
mean_profile <- function(mat, meta, b, t) {
cols <- meta$sample[meta$batch == b & meta$timepoint_min == t]
rowMeans(mat[, cols, drop = FALSE])
}
abs_conc <- bind_rows(lapply(shared_tps, function(t) {
o <- mean_profile(lcpm, sample_meta, "old", t)
n <- mean_profile(lcpm, sample_meta, "new", t)
tibble(timepoint_min = t, gene = names(o), old_mean = o, new_mean = n)
})) |>
mutate(tp = factor(timepoint_min, levels = shared_tps))
abs_cor <- abs_conc |>
group_by(timepoint_min, tp) |>
summarise(
abs_pearson = cor(old_mean, new_mean, method = "pearson"),
abs_spearman = cor(old_mean, new_mean, method = "spearman"),
.groups = "drop"
)
old_t0 <- mean_profile(lcpm, sample_meta, "old", 0)
new_t0 <- mean_profile(lcpm, sample_meta, "new", 0)
lfc_conc <- bind_rows(lapply(lfc_tps, function(t) {
old_lfc <- mean_profile(lcpm, sample_meta, "old", t) - old_t0
new_lfc <- mean_profile(lcpm, sample_meta, "new", t) - new_t0
tibble(timepoint_min = t, gene = names(old_lfc), old_lfc = old_lfc, new_lfc = new_lfc)
})) |>
mutate(tp = factor(timepoint_min, levels = lfc_tps))
lfc_cor <- lfc_conc |>
group_by(timepoint_min, tp) |>
summarise(
lfc_pearson = cor(old_lfc, new_lfc, method = "pearson"),
lfc_spearman = cor(old_lfc, new_lfc, method = "spearman"),
.groups = "drop"
)
summary_tbl <- abs_cor |>
select(timepoint_min, abs_pearson, abs_spearman) |>
left_join(select(lfc_cor, timepoint_min, lfc_pearson, lfc_spearman),
by = "timepoint_min") |>
arrange(timepoint_min)
write.csv(summary_tbl, file.path(output_dir, "02_concordance_correlation_summary.csv"),
row.names = FALSE)
p_abs <- ggplot(abs_conc, aes(x = old_mean, y = new_mean)) +
geom_point(size = 0.4, alpha = 0.25, colour = "grey30") +
geom_abline(slope = 1, intercept = 0, colour = "#E41A1C", linewidth = 0.6) +
geom_text(
data = abs_cor,
aes(x = -Inf, y = Inf, label = sprintf("r = %.3f", abs_pearson)),
hjust = -0.15, vjust = 1.5, size = 3.4, inherit.aes = FALSE
) +
facet_wrap(~ tp, ncol = 3) +
labs(
x = "old (2023) mean log2CPM",
y = "new (2026) mean log2CPM",
title = "Old vs new expression concordance, shared timepoints"
) +
theme_prism(base_size = 11) +
theme(strip.text = element_text(size = 10))
p_lfc <- ggplot(lfc_conc, aes(x = old_lfc, y = new_lfc)) +
geom_point(size = 0.4, alpha = 0.25, colour = "grey30") +
geom_hline(yintercept = 0, colour = "grey75", linewidth = 0.3) +
geom_vline(xintercept = 0, colour = "grey75", linewidth = 0.3) +
geom_abline(slope = 1, intercept = 0, colour = "#E41A1C", linewidth = 0.6) +
geom_text(
data = lfc_cor,
aes(x = -Inf, y = Inf, label = sprintf("r = %.3f", lfc_pearson)),
hjust = -0.15, vjust = 1.5, size = 3.4, inherit.aes = FALSE
) +
facet_wrap(~ tp, ncol = 4) +
labs(
x = "old (2023) log2FC vs t0",
y = "new (2026) log2FC vs t0",
title = "Old vs new response concordance (log2FC), shared timepoints"
) +
theme_prism(base_size = 11) +
theme(strip.text = element_text(size = 10))
save_fig(p_abs, "02_concordance_scatter_absolute.png", width = 11, height = 10)
save_fig(p_lfc, "02_concordance_scatter_lfc.png", width = 12, height = 7)
p_abs
p_lfc
pho_ids <- c(
"gene-YML123C" = "PHO84",
"gene-YGR233C" = "PHO81",
"gene-YOL001W" = "PHO80",
"gene-YFR034C" = "PHO4"
)
stopifnot(all(names(pho_ids) %in% rownames(lcpm)))
batch_colors <- c("old (2023)" = "#377EB8", "new (2026)" = "#E41A1C")
plot_pho_gene <- function(id, symbol) {
g_long <- tibble(
sample = colnames(lcpm),
expr = lcpm[id, ]
) |>
left_join(sample_meta, by = "sample") |>
filter(timepoint_min %in% shared_tps) |>
mutate(batch = factor(ifelse(batch == "old", "old (2023)", "new (2026)"),
levels = c("old (2023)", "new (2026)")))
g_t0 <- g_long |>
filter(timepoint_min == 0) |>
group_by(batch) |>
summarise(t0_mean = mean(expr), .groups = "drop")
g_long <- g_long |>
left_join(g_t0, by = "batch") |>
mutate(lfc = expr - t0_mean)
g_plot <- g_long |>
select(batch, timepoint_min, expr, lfc) |>
pivot_longer(c(expr, lfc), names_to = "metric", values_to = "value") |>
mutate(metric = factor(recode(metric,
expr = "absolute (log2CPM)",
lfc = "log2FC vs t0"),
levels = c("absolute (log2CPM)", "log2FC vs t0")))
g_mean <- g_plot |>
group_by(batch, timepoint_min, metric) |>
summarise(value = mean(value), .groups = "drop")
p <- ggplot(g_plot, aes(x = timepoint_min, y = value, colour = batch)) +
geom_point(size = 2, alpha = 0.6) +
geom_line(data = g_mean, aes(group = batch), linewidth = 1) +
facet_wrap(~ metric, scales = "free_y") +
scale_colour_manual(values = batch_colors, name = NULL) +
scale_x_continuous(breaks = shared_tps) +
labs(x = "Timepoint (min)", y = NULL,
title = paste0(symbol, " expression, old vs new")) +
theme_prism(base_size = 12) +
theme(legend.position = "bottom")
save_fig(p, paste0("02_", symbol, "_old_vs_new.png"), width = 11, height = 5)
p
}
pho_plots <- Map(plot_pho_gene, names(pho_ids), pho_ids)
pho_plots
## $`gene-YML123C`
##
## $`gene-YGR233C`
##
## $`gene-YOL001W`
##
## $`gene-YFR034C`
sd_meta <- tibble(sample = colnames(lcpm)) |>
mutate(batch = ifelse(grepl("^new__", sample), "new (2026)", "old (2023)"),
tp = extract_tp_min(sample))
rep_sd <- bind_rows(lapply(split(sd_meta, list(sd_meta$batch, sd_meta$tp), drop = TRUE), function(grp) {
cols <- grp$sample
if (length(cols) < 2) return(NULL)
tibble(
batch = grp$batch[1],
tp = grp$tp[1],
gene = rownames(lcpm),
sd = matrixStats::rowSds(lcpm[, cols, drop = FALSE])
)
})) |>
mutate(
batch = factor(batch, levels = c("old (2023)", "new (2026)")),
tp = factor(tp, levels = sort(unique(tp)))
)
sd_median <- rep_sd |>
group_by(batch, tp) |>
summarise(median_sd = median(sd), .groups = "drop")
write.csv(sd_median, file.path(output_dir, "03_replicate_sd_median.csv"), row.names = FALSE)
batch_colors <- c("old (2023)" = "#377EB8", "new (2026)" = "#E41A1C")
p_sd <- ggplot(rep_sd, aes(x = tp, y = sd, fill = batch)) +
geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2, position = position_dodge(0.8)) +
scale_fill_manual(values = batch_colors, name = NULL) +
coord_cartesian(ylim = c(0, quantile(rep_sd$sd, 0.99))) +
labs(x = "Timepoint (min)", y = "Within-replicate SD (log2CPM)",
title = "Replicate spread per timepoint, old vs new") +
theme_prism(base_size = 12) +
theme(legend.position = "bottom")
save_fig(p_sd, "03_replicate_sd_boxplot.png", width = 11, height = 6)
p_sd
rle_meta <- tibble(sample = colnames(lcpm)) |>
mutate(batch = ifelse(grepl("^new__", sample), "new (2026)", "old (2023)"),
tp = extract_tp_min(sample))
#reference = within batch x timepoint median, so deviation reflects disagreement with sibling replicates
rle_mat <- lcpm
for (g in unique(interaction(rle_meta$batch, rle_meta$tp, drop = TRUE))) {
cols <- rle_meta$sample[interaction(rle_meta$batch, rle_meta$tp, drop = TRUE) == g]
if (length(cols) < 2) next
ref <- matrixStats::rowMedians(lcpm[, cols, drop = FALSE])
rle_mat[, cols] <- lcpm[, cols, drop = FALSE] - ref
}
rle_long <- as.data.frame(rle_mat) |>
rownames_to_column("gene") |>
pivot_longer(-gene, names_to = "sample", values_to = "rle") |>
left_join(rle_meta, by = "sample") |>
filter(tp %in% shared_tps) |>
mutate(
batch = factor(batch, levels = c("old (2023)", "new (2026)")),
sample_lab = gsub("^(old__|new__)", "", sample),
sample_lab = factor(sample_lab, levels = unique(sample_lab[order(tp, batch)]))
)
batch_colors <- c("old (2023)" = "#377EB8", "new (2026)" = "#E41A1C")
p_rle <- ggplot(rle_long, aes(x = sample_lab, y = rle, fill = batch)) +
geom_hline(yintercept = 0, colour = "grey60", linewidth = 0.3) +
geom_boxplot(outlier.size = 0.2, outlier.alpha = 0.15) +
scale_fill_manual(values = batch_colors, name = NULL) +
coord_cartesian(ylim = c(-0.6, 0.6)) +
labs(x = NULL, y = "RLE (log2CPM vs within-group median)",
title = "Per-library RLE, old vs new") +
theme_prism(base_size = 10) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
legend.position = "bottom")
save_fig(p_rle, "03_rle_boxplot.png", width = 16, height = 6)
p_rle
#sample-to-sample euclidean distance, new data only
new_cols <- colnames(lcpm)[grepl("^new__", colnames(lcpm))]
lcpm_new <- lcpm[, new_cols]
new_meta <- tibble(sample = new_cols) |>
mutate(tp = extract_tp_min(sample),
sample_lab = gsub("^new__", "", sample))
sample_dists <- dist(t(lcpm_new))
dist_mat <- as.matrix(sample_dists)
dimnames(dist_mat) <- list(new_meta$sample_lab, new_meta$sample_lab)
annot <- new_meta |>
mutate(timepoint = factor(tp, levels = sort(unique(tp)))) |>
select(sample_lab, timepoint) |>
column_to_rownames("sample_lab")
annot_colors <- list(
timepoint = setNames(viridisLite::viridis(nlevels(annot$timepoint)),
levels(annot$timepoint))
)
ph <- pheatmap::pheatmap(
dist_mat,
clustering_distance_rows = sample_dists,
clustering_distance_cols = sample_dists,
annotation_row = annot,
annotation_col = annot,
annotation_colors = annot_colors,
color = colorRampPalette(rev(RColorBrewer::brewer.pal(9, "Blues")))(255),
fontsize_row = 7, fontsize_col = 7,
main = "Sample-to-sample distance, new (2026) data (Euclidean, log2CPM)",
silent = TRUE
)
png(file.path(figure_dir, "03_sample_distance_heatmap_new.png"),
width = 10, height = 10, units = "in", res = 300)
grid::grid.newpage(); grid::grid.draw(ph$gtable)
dev.off()
## png
## 2
grid::grid.newpage(); grid::grid.draw(ph$gtable)
#per-library log2CPM distribution
dist_meta <- tibble(sample = colnames(lcpm)) |>
mutate(batch = ifelse(grepl("^new__", sample), "new (2026)", "old (2023)"),
tp = extract_tp_min(sample),
sample_lab = gsub("^(old__|new__)", "", sample))
dist_long <- as.data.frame(lcpm) |>
pivot_longer(everything(), names_to = "sample", values_to = "log2cpm") |>
left_join(dist_meta, by = "sample") |>
mutate(
batch = factor(batch, levels = c("old (2023)", "new (2026)")),
sample_lab = factor(sample_lab, levels = unique(sample_lab[order(tp, batch)]))
)
p_dist <- ggplot(dist_long, aes(x = sample_lab, y = log2cpm, fill = batch)) +
geom_boxplot(outlier.size = 0.2, outlier.alpha = 0.15) +
scale_fill_manual(values = batch_colors, name = NULL) +
labs(x = NULL, y = "log2CPM",
title = "Per-library count distribution, old vs new") +
theme_prism(base_size = 10) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
legend.position = "bottom")
save_fig(p_dist, "03_count_distribution_boxplot.png", width = 16, height = 6)
p_dist
libsize_meta <- tibble(sample = colnames(combined_raw_full)) |>
mutate(batch = ifelse(grepl("^new__", sample), "new (2026)", "old (2023)"),
tp = extract_tp_min(sample),
sample_lab = gsub("^(old__|new__)", "", sample))
libsize_df <- tibble(
sample = colnames(combined_raw_full),
total_reads = colSums(combined_raw_full)
) |>
left_join(libsize_meta, by = "sample") |>
mutate(
batch = factor(batch, levels = c("old (2023)", "new (2026)")),
sample_lab = factor(sample_lab, levels = unique(sample_lab[order(tp, batch)]))
)
write.csv(libsize_df |> select(sample, batch, tp, total_reads),
file.path(output_dir, "03_library_size.csv"), row.names = FALSE)
p_libsize <- ggplot(libsize_df, aes(x = sample_lab, y = total_reads / 1e6, fill = batch)) +
geom_col() +
scale_fill_manual(values = batch_colors, name = NULL) +
labs(x = NULL, y = "Total assigned reads (millions)",
title = "Library size per sample, old vs new") +
theme_prism(base_size = 10) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
legend.position = "bottom")
save_fig(p_libsize, "03_library_size_barplot.png", width = 16, height = 6)
p_libsize
The yH545 batch/concordance work above defines the count-level QC we want for every run. The other five runs have no 2023 counterpart, so the concordance analyses (per-timepoint correlation, LFC concordance, PHO old-vs-new) do not apply. What carries over is the single-batch count-integrity QC: library size, RLE, replicate spread, count distribution, sample distance, and PCA. Each run is normalized on its own (single-batch TMM), replicate siblings are grouped by condition x timepoint.
extract_condition <- function(sample_names) {
vapply(strsplit(sample_names, "\\."), function(x) x[3], character(1))
}
run_counts_qc <- function(code, counts_file) {
fig_dir <- file.path(base_dir, "figures", "Counts-QC", code)
out_dir <- file.path(base_dir, "outputs", code)
sf <- function(plot, filename, width = 10, height = 8) {
ggsave(file.path(fig_dir, filename), plot = plot, width = width, height = height, dpi = 300)
}
raw <- read.delim(counts_file, row.names = 1, check.names = FALSE)
raw <- raw[, !colnames(raw) %in% "gene_name", drop = FALSE]
counts <- collapse_lanes(raw)
gene_ids <- rownames(counts)
counts <- apply(counts, 2, function(x) as.integer(round(x)))
rownames(counts) <- gene_ids
meta <- tibble(
sample = colnames(counts),
condition = extract_condition(sample),
tp = extract_tp_min(sample)
) |>
mutate(
sample_lab = gsub("^[^.]+\\.[^.]+\\.", "", sample),
condition = factor(condition, levels = sort(unique(condition)))
) |>
arrange(condition, tp) |>
mutate(
sample_lab = factor(sample_lab, levels = unique(sample_lab)),
tp_f = factor(tp, levels = sort(unique(tp))),
group = paste(condition, tp, sep = "_")
)
conditions <- levels(meta$condition)
cond_pal <- setNames(RColorBrewer::brewer.pal(max(3, length(conditions)), "Set1")[seq_along(conditions)], conditions)
cond_shapes <- setNames(c(21, 24, 22, 23)[seq_along(conditions)], conditions)
#library size on raw counts, before filtering
libsize_df <- tibble(sample = colnames(counts), total_reads = colSums(counts)) |>
left_join(meta, by = "sample")
write.csv(libsize_df |> select(sample, condition, tp, total_reads),
file.path(out_dir, "04_library_size.csv"), row.names = FALSE)
p_libsize <- ggplot(libsize_df, aes(x = sample_lab, y = total_reads / 1e6, fill = condition)) +
geom_col() +
scale_fill_manual(values = cond_pal, name = NULL) +
labs(x = NULL, y = "Total assigned reads (millions)",
title = paste0(code, " library size per sample")) +
theme_prism(base_size = 10) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
legend.position = "bottom")
sf(p_libsize, "04_library_size_barplot.png", width = 14, height = 6)
print(p_libsize)
#filter + single-batch TMM
keep <- rowSums(counts >= 10) >= 2
counts <- counts[keep, ]
dge <- DGEList(counts = counts)
dge <- calcNormFactors(dge, method = "TMM")
lcpm <- cpm(dge, log = TRUE, prior.count = 2)
write.csv(lcpm, file.path(out_dir, "04_log2cpm.csv"))
#pca, fill by timepoint, shape by condition
pca <- prcomp(t(lcpm), center = TRUE, scale. = FALSE)
ve <- (pca$sdev^2) / sum(pca$sdev^2) * 100
pca_df <- as.data.frame(pca$x[, 1:2]) |>
rownames_to_column("sample") |>
left_join(meta, by = "sample")
write.csv(pca_df |> select(sample, condition, tp, PC1, PC2),
file.path(out_dir, "04_pca_coordinates.csv"), row.names = FALSE)
p_pca <- ggplot(pca_df, aes(x = PC1, y = PC2, fill = tp_f, shape = condition)) +
geom_point(size = 3.5, stroke = 0.8, colour = "grey20") +
scale_shape_manual(values = cond_shapes, name = NULL) +
scale_fill_viridis_d(option = "viridis", name = "timepoint (min)") +
guides(fill = guide_legend(override.aes = list(shape = 21, colour = "grey20"))) +
labs(x = sprintf("PC1 (%.1f%%)", ve[1]), y = sprintf("PC2 (%.1f%%)", ve[2]),
title = paste0(code, " PCA")) +
theme_prism(base_size = 12) +
theme(legend.position = "right")
sf(p_pca, "04_pca.png", width = 9, height = 7)
print(p_pca)
#within-replicate sd per condition x timepoint
rep_sd <- bind_rows(lapply(split(meta, meta$group, drop = TRUE), function(grp) {
cols <- grp$sample
if (length(cols) < 2) return(NULL)
tibble(condition = grp$condition[1], tp = grp$tp[1], gene = rownames(lcpm),
sd = matrixStats::rowSds(lcpm[, cols, drop = FALSE]))
})) |>
mutate(tp = factor(tp, levels = sort(unique(tp))))
if (nrow(rep_sd) > 0) {
write.csv(rep_sd |> group_by(condition, tp) |> summarise(median_sd = median(sd), .groups = "drop"),
file.path(out_dir, "04_replicate_sd_median.csv"), row.names = FALSE)
p_sd <- ggplot(rep_sd, aes(x = tp, y = sd, fill = condition)) +
geom_boxplot(outlier.size = 0.3, outlier.alpha = 0.2, position = position_dodge(0.8)) +
scale_fill_manual(values = cond_pal, name = NULL) +
coord_cartesian(ylim = c(0, quantile(rep_sd$sd, 0.99))) +
labs(x = "Timepoint (min)", y = "Within-replicate SD (log2CPM)",
title = paste0(code, " replicate spread per timepoint")) +
theme_prism(base_size = 12) +
theme(legend.position = "bottom")
sf(p_sd, "04_replicate_sd_boxplot.png", width = 11, height = 6)
print(p_sd)
}
#per-library rle, reference = within condition x timepoint median
rle_mat <- lcpm
for (gp in unique(meta$group)) {
cols <- meta$sample[meta$group == gp]
if (length(cols) < 2) next
ref <- matrixStats::rowMedians(lcpm[, cols, drop = FALSE])
rle_mat[, cols] <- lcpm[, cols, drop = FALSE] - ref
}
rle_long <- as.data.frame(rle_mat) |>
rownames_to_column("gene") |>
pivot_longer(-gene, names_to = "sample", values_to = "rle") |>
left_join(meta, by = "sample")
p_rle <- ggplot(rle_long, aes(x = sample_lab, y = rle, fill = condition)) +
geom_hline(yintercept = 0, colour = "grey60", linewidth = 0.3) +
geom_boxplot(outlier.size = 0.2, outlier.alpha = 0.15) +
scale_fill_manual(values = cond_pal, name = NULL) +
coord_cartesian(ylim = c(-0.6, 0.6)) +
labs(x = NULL, y = "RLE (log2CPM vs within-group median)",
title = paste0(code, " per-library RLE")) +
theme_prism(base_size = 10) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
legend.position = "bottom")
sf(p_rle, "04_rle_boxplot.png", width = 14, height = 6)
print(p_rle)
#per-library count distribution
dist_long <- as.data.frame(lcpm) |>
pivot_longer(everything(), names_to = "sample", values_to = "log2cpm") |>
left_join(meta, by = "sample")
p_dist <- ggplot(dist_long, aes(x = sample_lab, y = log2cpm, fill = condition)) +
geom_boxplot(outlier.size = 0.2, outlier.alpha = 0.15) +
scale_fill_manual(values = cond_pal, name = NULL) +
labs(x = NULL, y = "log2CPM",
title = paste0(code, " per-library count distribution")) +
theme_prism(base_size = 10) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
legend.position = "bottom")
sf(p_dist, "04_count_distribution_boxplot.png", width = 14, height = 6)
print(p_dist)
#sample-to-sample distance
sample_dists <- dist(t(lcpm))
dist_mat <- as.matrix(sample_dists)
lab_lookup <- setNames(as.character(meta$sample_lab), meta$sample)
dimnames(dist_mat) <- list(lab_lookup[colnames(lcpm)], lab_lookup[colnames(lcpm)])
annot <- meta |>
select(sample_lab, condition, timepoint = tp) |>
mutate(timepoint = factor(timepoint, levels = sort(unique(timepoint)))) |>
as.data.frame()
rownames(annot) <- annot$sample_lab
annot$sample_lab <- NULL
annot_colors <- list(
condition = cond_pal,
timepoint = setNames(viridisLite::viridis(nlevels(annot$timepoint)), levels(annot$timepoint))
)
ph <- pheatmap::pheatmap(
dist_mat,
clustering_distance_rows = sample_dists,
clustering_distance_cols = sample_dists,
annotation_row = annot, annotation_col = annot,
annotation_colors = annot_colors,
color = colorRampPalette(rev(RColorBrewer::brewer.pal(9, "Blues")))(255),
fontsize_row = 7, fontsize_col = 7,
main = paste0(code, " sample-to-sample distance (Euclidean, log2CPM)"),
silent = TRUE
)
png(file.path(fig_dir, "04_sample_distance_heatmap.png"), width = 10, height = 10, units = "in", res = 300)
grid::grid.newpage(); grid::grid.draw(ph$gtable)
dev.off()
grid::grid.newpage(); grid::grid.draw(ph$gtable)
invisible(lcpm)
}
qc_codes <- c("yH001", "yH149", "yH181", "yH714", "yH1053")
qc_files <- file.path(input_dir, "counts_new", paste0(qc_codes, "_salmon.merged.gene_counts.tsv"))
invisible(Map(run_counts_qc, qc_codes, qc_files))
Set: 3 libraries: mock 30m rep1, mock 30m rep2, mock 60m rep1. Only 30m is replicated. 60m is n=1, dropped before sequencing
Set: 4 libraries: mock 30m rep1, mock 30m rep2, mock 60m rep1, mock 60m rep2. Both timepoints replicated (n=2 each).
Set: 14 libraries, two conditions (casamino, glucose), three timepoints (0, 450, 690 min), mostly triplicate.
Set: 4 libraries: mock 30m rep1, mock 30m rep2, mock 60m rep1, mock 60m rep2. Both timepoints replicated (n=2 each).
Set: 26 libraries: noPi timecourse (0, 15, 30, 45, 60, 90, 120, 180, 240, 360, 480 min, 2 reps each) + mock (30m, 60m, 2 reps each).
#chunk 05: DESeq2 DEG analysis for the 2026 E009 reseq runs
#same thresholds as before; same design as befor e
#timecourse noPi-vs-t0 (own t0): yH545, yH1053
#mock-vs-original-t0 (no own t0): yH001 (Cgla), yH149 (Klac), yH714 (Calb)
#JA will do yH181 for his analyssi
library(DESeq2)
library(tidyverse)
library(patchwork)
padj_threshold <- 0.01
lfc_threshold <- 1
counts_new_dir <- here("14_DESeq2_062026data", "inputs", "counts_new")
counts_og_dir <- here("14_DESeq2_062026data", "inputs", "counts_OG")
out_base <- here("14_DESeq2_062026data", "outputs")
fig_dir <- here("14_DESeq2_062026data", "figures", "DEG")
dir.create(fig_dir, recursive = TRUE, showWarnings = FALSE)
#helpers
clean_gene_id <- function(ids, prefix) if (nzchar(prefix)) sub(paste0("^", prefix), "", ids) else ids
parse_tp_new <- function(x) as.integer(str_extract(x, "\\d+(?=m\\.rep)")) #new naming: noPi.30m.rep1
parse_cond <- function(x) str_extract(x, "mock|noPi")
parse_tp_og <- function(x) { #old naming: t0000 zero-pad, or Xm
z1 <- str_extract(x, "(?<=t)\\d{3,4}")
z2 <- str_extract(x, "\\d+(?=m)")
ifelse(!is.na(z1), as.integer(z1), as.integer(z2))
}
save_fig <- function(p, fname, width, height) {
ggsave(file.path(fig_dir, fname), plot = p, width = width, height = height,
dpi = 300, bg = "white")
}
#shared universe/DEG summarization (factored out so both run functions stay parallel)
summarise_run <- function(results_list) {
all_res <- bind_rows(results_list)
universe <- all_res |>
group_by(gene_id) |>
dplyr::summarise(
baseMean = mean(baseMean, na.rm = TRUE),
max_abs_lfc = max(abs(log2FoldChange), na.rm = TRUE),
min_padj = min(padj, na.rm = TRUE),
.groups = "drop"
)
sig_any <- all_res |>
dplyr::filter(!is.na(padj), padj < padj_threshold, abs(log2FoldChange) > lfc_threshold) |>
dplyr::pull(gene_id) |>
unique()
degs <- all_res |>
dplyr::filter(gene_id %in% sig_any, !is.na(padj)) |>
group_by(gene_id) |>
dplyr::slice_min(padj, n = 1, with_ties = FALSE) |>
ungroup() |>
mutate(direction = ifelse(log2FoldChange > 0, "up", "down")) |>
dplyr::select(gene_id, baseMean, log2FoldChange, padj, timepoint, direction)
degs_up <- degs |> dplyr::filter(direction == "up")
degs_down <- degs |> dplyr::filter(direction == "down")
message(sprintf(" universe: %d | DEGs: %d (up: %d, down: %d)",
nrow(universe), nrow(degs), nrow(degs_up), nrow(degs_down)))
list(universe = universe, degs = degs, degs_up = degs_up, degs_down = degs_down)
}
#timecourse noPi-vs-t0, reference t0 lives inside the file
run_species_deseq <- function(run_code, counts_file, id_prefix, condition_keep = NULL) {
message(sprintf("=== %s ===", run_code))
counts_raw <- read_tsv(counts_file, show_col_types = FALSE)
id_col <- names(counts_raw)[1]; name_col <- names(counts_raw)[2]
gene_ids_clean <- clean_gene_id(counts_raw[[id_col]], id_prefix)
sample_cols <- setdiff(names(counts_raw), c(id_col, name_col))
if (!is.null(condition_keep)) {
sample_cols <- sample_cols[parse_cond(sample_cols) %in% condition_keep]
}
count_mat <- as.matrix(counts_raw[, sample_cols])
count_mat <- apply(count_mat, 2, function(x) as.integer(round(x)))
rownames(count_mat) <- gene_ids_clean
tp_min <- parse_tp_new(colnames(count_mat))
stopifnot(!any(is.na(tp_min)))
tp_levels <- as.character(sort(unique(tp_min)))
col_data <- data.frame(
sample = colnames(count_mat),
timepoint = factor(as.character(tp_min), levels = tp_levels),
row.names = colnames(count_mat)
)
ref_tp <- if ("0" %in% tp_levels) "0" else tp_levels[1]
col_data$timepoint <- relevel(col_data$timepoint, ref = ref_tp)
test_tps <- setdiff(tp_levels, ref_tp)
message(sprintf(" ref: %s | test: %s", ref_tp, paste(test_tps, collapse = ", ")))
keep <- rowSums(count_mat >= 10) >= 2
count_mat <- count_mat[keep, ]
message(sprintf(" genes after filter: %d", nrow(count_mat)))
dds <- DESeqDataSetFromMatrix(countData = count_mat, colData = col_data, design = ~ timepoint)
dds <- DESeq(dds, quiet = TRUE)
results_list <- list()
for (tp in test_tps) {
keep_samp <- col_data$timepoint %in% c(ref_tp, tp)
dds_sub <- dds[, keep_samp]
colData(dds_sub)$timepoint <- droplevels(colData(dds_sub)$timepoint)
colData(dds_sub)$timepoint <- relevel(colData(dds_sub)$timepoint, ref = ref_tp)
dds_sub <- DESeq(dds_sub, quiet = TRUE)
coef_name <- resultsNames(dds_sub)[2]
res <- lfcShrink(dds_sub, coef = coef_name, type = "apeglm", quiet = TRUE)
res_df <- as.data.frame(res) |>
rownames_to_column("gene_id") |>
mutate(timepoint = tp, contrast = paste0(tp, "_vs_", ref_tp))
results_list[[tp]] <- res_df
}
out <- summarise_run(results_list)
c(list(species = run_code, type = "timecourse", dds = dds, results_list = results_list), out)
}
#mock-vs-original-t0, baseline pulled from the original-species matrix and joined on common genes
run_mock_deseq <- function(run_code, mock_file, og_file, id_prefix) {
message(sprintf("=== %s (mock vs original t0) ===", run_code))
m_raw <- read_tsv(mock_file, show_col_types = FALSE)
m_id <- names(m_raw)[1]; m_nm <- names(m_raw)[2]
m_ids <- clean_gene_id(m_raw[[m_id]], id_prefix)
m_samp <- setdiff(names(m_raw), c(m_id, m_nm))
m_samp <- m_samp[parse_cond(m_samp) %in% "mock"]
m_tp <- parse_tp_new(m_samp)
stopifnot(!any(is.na(m_tp)))
m_mat <- as.matrix(m_raw[, m_samp]); rownames(m_mat) <- m_ids
o_raw <- read_tsv(og_file, show_col_types = FALSE)
o_id <- names(o_raw)[1]; o_nm <- names(o_raw)[2]
o_ids <- clean_gene_id(o_raw[[o_id]], id_prefix)
o_samp <- setdiff(names(o_raw), c(o_id, o_nm))
o_tp <- parse_tp_og(o_samp)
message(sprintf(" original timepoints parsed: %s",
paste(sort(unique(o_tp)), collapse = ", ")))
ref_samp <- o_samp[!is.na(o_tp) & o_tp == 0]
stopifnot(length(ref_samp) >= 1) #loud fail if original t0 not found
o_mat <- as.matrix(o_raw[, ref_samp, drop = FALSE]); rownames(o_mat) <- o_ids
common <- intersect(rownames(m_mat), rownames(o_mat))
message(sprintf(" mock genes: %d | og genes: %d | common: %d",
nrow(m_mat), nrow(o_mat), length(common)))
m_mat <- m_mat[common, , drop = FALSE]; o_mat <- o_mat[common, , drop = FALSE]
count_mat <- cbind(o_mat, m_mat)
count_mat <- apply(count_mat, 2, function(x) as.integer(round(x)))
rownames(count_mat) <- common
grp <- c(rep("t0", ncol(o_mat)), paste0("mock", m_tp))
grp_levels <- c("t0", paste0("mock", sort(unique(m_tp))))
col_data <- data.frame(
sample = colnames(count_mat),
group = factor(grp, levels = grp_levels),
row.names = colnames(count_mat)
)
col_data$group <- relevel(col_data$group, ref = "t0")
keep <- rowSums(count_mat >= 10) >= 2
count_mat <- count_mat[keep, ]
message(sprintf(" genes after filter: %d", nrow(count_mat)))
dds <- DESeqDataSetFromMatrix(countData = count_mat, colData = col_data, design = ~ group)
dds <- DESeq(dds, quiet = TRUE)
test_grps <- setdiff(levels(col_data$group), "t0")
results_list <- list()
for (g in test_grps) {
keep_samp <- col_data$group %in% c("t0", g)
dds_sub <- dds[, keep_samp]
colData(dds_sub)$group <- droplevels(colData(dds_sub)$group)
colData(dds_sub)$group <- relevel(colData(dds_sub)$group, ref = "t0")
dds_sub <- DESeq(dds_sub, quiet = TRUE)
coef_name <- resultsNames(dds_sub)[2]
res <- lfcShrink(dds_sub, coef = coef_name, type = "apeglm", quiet = TRUE)
tp <- as.character(as.integer(sub("mock", "", g)))
res_df <- as.data.frame(res) |>
rownames_to_column("gene_id") |>
mutate(timepoint = tp, contrast = paste0(g, "_vs_t0"))
results_list[[tp]] <- res_df
}
out <- summarise_run(results_list)
c(list(species = run_code, type = "mock", dds = dds, results_list = results_list), out)
}
deseq_results <- list()
deseq_results$yH545 <- run_species_deseq(
"yH545", file.path(counts_new_dir, "yH545_salmon.merged.gene_counts.tsv"), "gene-")
deseq_results$yH1053 <- run_species_deseq(
"yH1053", file.path(counts_new_dir, "yH1053_salmon.merged.gene_counts.tsv"), "gene-",
condition_keep = "noPi")
deseq_results$yH001 <- run_mock_deseq(
"yH001", file.path(counts_new_dir, "yH001_salmon.merged.gene_counts.tsv"),
file.path(counts_og_dir, "CGLA-salmon.merged.gene_counts.tsv"), "gene-")
deseq_results$yH149 <- run_mock_deseq(
"yH149", file.path(counts_new_dir, "yH149_salmon.merged.gene_counts.tsv"),
file.path(counts_og_dir, "KLAC-salmon.merged.gene_counts.tsv"), "gene-")
deseq_results$yH714 <- run_mock_deseq(
"yH714", file.path(counts_new_dir, "yH714_salmon.merged.gene_counts.tsv"),
file.path(counts_og_dir, "CALB-salmon.merged.gene_counts.tsv"), "")
deg_summary <- tibble(
run = names(deseq_results),
universe = sapply(deseq_results, function(x) nrow(x$universe)),
degs_total = sapply(deseq_results, function(x) nrow(x$degs)),
degs_up = sapply(deseq_results, function(x) nrow(x$degs_up)),
degs_down = sapply(deseq_results, function(x) nrow(x$degs_down))
)
print(deg_summary)
## # A tibble: 5 × 5
## run universe degs_total degs_up degs_down
## <chr> <int> <int> <int> <int>
## 1 yH545 5905 2889 1437 1452
## 2 yH1053 5861 3247 1616 1631
## 3 yH001 5110 1731 955 776
## 4 yH149 5060 1922 1003 919
## 5 yH714 5843 2459 1340 1119
for (rc in names(deseq_results)) {
od <- file.path(out_base, rc)
dir.create(od, recursive = TRUE, showWarnings = FALSE)
write_csv(deseq_results[[rc]]$universe, file.path(od, sprintf("%s_universe.csv", rc)))
write_csv(deseq_results[[rc]]$degs, file.path(od, sprintf("%s_degs_union.csv", rc)))
write_csv(deseq_results[[rc]]$degs_up, file.path(od, sprintf("%s_degs_up.csv", rc)))
write_csv(deseq_results[[rc]]$degs_down, file.path(od, sprintf("%s_degs_down.csv", rc)))
}
## DEG landscape
deg_counts <- bind_rows(lapply(names(deseq_results), function(rc) {
rl <- deseq_results[[rc]]$results_list
bind_rows(lapply(names(rl), function(tp) {
df <- rl[[tp]]
sig <- df |> dplyr::filter(!is.na(padj), padj < padj_threshold, abs(log2FoldChange) > lfc_threshold)
tibble(run = rc, timepoint = tp,
Up = sum(sig$log2FoldChange > 0), Down = sum(sig$log2FoldChange < 0))
}))
}))
#absolute time on x, numeric order, shared across runs
all_tp_levels <- as.character(sort(unique(as.numeric(deg_counts$timepoint))))
deg_long <- deg_counts |>
mutate(tp_label = factor(timepoint, levels = all_tp_levels)) |>
pivot_longer(cols = c(Up, Down), names_to = "direction", values_to = "count")
run_display <- c(
yH545 = "S. cerevisiae WT (noPi)",
yH1053 = "S. cerevisiae ppx1\u0394 ppn1\u0394 (noPi)",
yH001 = "C. glabrata (mock)",
yH149 = "K. lactis (mock)",
yH714 = "C. albicans (mock)"
)
deg_long$run_display <- factor(run_display[deg_long$run], levels = run_display)
deg_long$direction <- factor(deg_long$direction, levels = c("Down", "Up"))
#mock runs derived from analysis type, not hardcoded
mock_runs <- names(deseq_results)[sapply(deseq_results, function(x) x$type == "mock")]
dir_colors <- c("Down" = "#4A90D9", "Up" = "#C0392B")
run_colors <- c(
"S. cerevisiae WT (noPi)" = "#377EB8",
"S. cerevisiae ppx1\u0394 ppn1\u0394 (noPi)" = "#984EA3",
"C. glabrata (mock)" = "#E41A1C",
"K. lactis (mock)" = "#F8C434",
"C. albicans (mock)" = "#74AC4C"
)
run_shapes <- c(
"S. cerevisiae WT (noPi)" = 21,
"S. cerevisiae ppx1\u0394 ppn1\u0394 (noPi)" = 24,
"C. glabrata (mock)" = 22,
"K. lactis (mock)" = 23,
"C. albicans (mock)" = 25
)
dw <- 0.75
y_max <- max(deg_long$count, na.rm = TRUE) * 1.08
#lines + points only on timecourse runs; mocks get bars alone
line_df <- deg_long |> dplyr::filter(!run %in% mock_runs)
p_facet <- ggplot(deg_long, aes(x = tp_label, y = count, fill = direction, colour = direction, group = direction)) +
geom_col(position = position_dodge(width = dw), width = dw * 0.85, alpha = 0.45, colour = NA) +
geom_line(data = line_df, position = position_dodge(width = dw), linewidth = 1.1) +
geom_point(data = line_df, position = position_dodge(width = dw), size = 2, shape = 21, fill = "white", stroke = 1) +
facet_wrap(~run_display, ncol = 2, axes = "all") +
scale_fill_manual(values = dir_colors, name = NULL) +
scale_colour_manual(values = dir_colors, name = NULL) +
scale_y_continuous(limits = c(0, y_max)) +
guides(fill = guide_legend(override.aes = list(alpha = 1)),
colour = guide_legend(override.aes = list(linewidth = 2))) +
labs(x = "Time (min)", y = "DEG count (|LFC| > 1)") +
theme_prism(base_size = 12) +
theme(strip.text = element_text(face = "italic", size = 11),
legend.position = "bottom",
legend.key.width = unit(1.2, "cm"),
axis.text.x = element_text(size = 9, angle = 0))
scer_runs <- c("S. cerevisiae WT (noPi)", "S. cerevisiae ppx1\u0394 ppn1\u0394 (noPi)")
make_overlay <- function(dir, panel_color, runs_in) {
df <- deg_long |> dplyr::filter(direction == dir, run_display %in% runs_in)
ggplot(df, aes(x = tp_label, y = count, colour = run_display, shape = run_display, group = run_display)) +
geom_line(linewidth = 1.3) +
geom_point(size = 3.5, fill = "white", stroke = 1.2) +
scale_colour_manual(values = run_colors, name = NULL) +
scale_shape_manual(values = run_shapes, name = NULL) +
scale_y_continuous(limits = c(0, y_max)) +
labs(title = paste0(ifelse(dir == "Down", "Downregulated", "Upregulated"), " DEGs (WT vs mutant)"),
x = "Time (min)", y = if (dir == "Down") "DEG count (|LFC| > 1)" else NULL) +
theme_prism(base_size = 12) +
theme(plot.title = element_text(colour = panel_color, face = "bold", size = 13),
legend.position = "bottom",
legend.text = element_text(face = "italic", size = 10),
axis.text.x = element_text(size = 9, angle = 0))
}
p_down <- make_overlay("Down", dir_colors["Down"], scer_runs)
p_up <- make_overlay("Up", dir_colors["Up"], scer_runs)
p_combined <- p_facet /
(p_down | p_up) +
plot_layout(heights = c(1.2, 0.7)) +
plot_annotation(tag_levels = "A",
theme = theme(plot.tag = element_text(size = 14, face = "bold")))
save_fig(p_combined, paste0(Sys.Date(), "_deg_landscape_combined.png"), width = 13, height = 13)
p_combined
save_fig(p_facet, paste0(Sys.Date(), "_deg_landscape_A_faceted.png"), width = 11, height = 9)
save_fig(p_down, paste0(Sys.Date(), "_deg_landscape_B_down_overlay.png"), width = 7, height = 5)
save_fig(p_up, paste0(Sys.Date(), "_deg_landscape_C_up_overlay.png"), width = 7, height = 5)
#individual per-run landscape into figures/DEG/<run>/
#mocks: bars only; timecourses: bars + line + points. own timepoints + own y-axis per plot
make_run_plot <- function(rc) {
df <- deg_long |> dplyr::filter(run == rc)
df$tp_label <- droplevels(df$tp_label)
is_mock <- deseq_results[[rc]]$type == "mock"
disp <- as.character(unique(df$run_display))
run_ymax <- max(df$count, na.rm = TRUE) * 1.08
p <- ggplot(df, aes(x = tp_label, y = count, fill = direction, colour = direction, group = direction)) +
geom_col(position = position_dodge(width = dw), width = dw * 0.85, alpha = 0.45, colour = NA)
if (!is_mock) {
p <- p +
geom_line(position = position_dodge(width = dw), linewidth = 1.1) +
geom_point(position = position_dodge(width = dw), size = 2, shape = 21, fill = "white", stroke = 1)
}
p +
scale_fill_manual(values = dir_colors, name = NULL) +
scale_colour_manual(values = dir_colors, name = NULL) +
scale_y_continuous(limits = c(0, run_ymax)) +
guides(fill = guide_legend(override.aes = list(alpha = 1)),
colour = if (is_mock) "none" else guide_legend(override.aes = list(linewidth = 2))) +
labs(title = disp, x = "Time (min)", y = "DEG count (|LFC| > 1)") +
theme_prism(base_size = 12) +
theme(plot.title = element_text(face = "italic", size = 13),
legend.position = "bottom",
axis.text.x = element_text(size = 9, angle = 0))
}
for (rc in names(deseq_results)) {
rd <- file.path(fig_dir, rc)
dir.create(rd, recursive = TRUE, showWarnings = FALSE)
ggsave(file.path(rd, sprintf("%s_%s_deg_landscape.png", Sys.Date(), rc)),
plot = make_run_plot(rc), width = 7, height = 5, dpi = 300, bg = "white")
}
#mock-vs-noPi DEG overlap per mock species (handling-artifact isolation)
#shared (same-direction) DEGs = experimental-handling response, common to both arms
#noPi-unique = phosphate-specific response; canonical PHO regulon must land here, NOT in shared (pos contrl)
#same baseline both arms: original-timecourse t0. noPi restricted to 30/60 to match mock time depth
#re-runs original noPi-vs-t0 (30/60) on counts_OG for reproducibility
#scer_inrun (yH545/yH1053): mock-vs-t0 and noPi-vs-t0 both within-run, shared 2026 noPi t0
#mock DEGs reused
library(ggVennDiagram)
padj_threshold <- 0.01
lfc_threshold <- 1
venn_tps <- c("30", "60")
counts_new_dir <- here("14_DESeq2_062026data", "inputs", "counts_new")
counts_og_dir <- here("14_DESeq2_062026data", "inputs", "counts_OG")
pho_dir <- here("14_DESeq2_062026data", "inputs")
out_base <- here("14_DESeq2_062026data", "outputs")
fig_dir <- here("14_DESeq2_062026data", "figures", "DEG")
#mock_only pulls mock DEGs from chunk 05 and re-runs the original noPi arm
run_cfg <- list(
yH545 = list(type = "scer_inrun", new = "yH545_salmon.merged.gene_counts.tsv",
prefix = "gene-", sp = "Scer", disp = "Scer WT"),
yH1053 = list(type = "scer_inrun", new = "yH1053_salmon.merged.gene_counts.tsv",
prefix = "gene-", sp = "Scer", disp = "Scer ppx1\u0394 ppn1\u0394"),
yH001 = list(type = "mock_only", og = "CGLA-salmon.merged.gene_counts.tsv",
prefix = "gene-", sp = "Cgla", disp = "Cgla"),
yH149 = list(type = "mock_only", og = "KLAC-salmon.merged.gene_counts.tsv",
prefix = "gene-", sp = "Klac", disp = "Klac"),
yH714 = list(type = "mock_only", og = "CALB-salmon.merged.gene_counts.tsv",
prefix = "", sp = "Calb", disp = "Calb")
)
deg_set_from_results <- function(results_list, tps) {
bind_rows(results_list[names(results_list) %in% tps]) |>
dplyr::filter(!is.na(padj), padj < padj_threshold, abs(log2FoldChange) > lfc_threshold) |>
group_by(gene_id) |>
dplyr::slice_min(padj, n = 1, with_ties = FALSE) |>
ungroup() |>
mutate(direction = ifelse(log2FoldChange > 0, "up", "down")) |>
dplyr::select(gene_id, direction)
}
run_og_nopi_deseq <- function(og_file, id_prefix, tps) {
raw <- read_tsv(og_file, show_col_types = FALSE)
id_col <- names(raw)[1]; name_col <- names(raw)[2]
ids_clean <- clean_gene_id(raw[[id_col]], id_prefix)
samp <- setdiff(names(raw), c(id_col, name_col))
mat <- as.matrix(raw[, samp]); rownames(mat) <- ids_clean
tp <- parse_tp_og(colnames(mat))
stopifnot(!any(is.na(tp)))
stopifnot(0 %in% tp)
stopifnot(all(as.integer(tps) %in% tp))
keep <- rowSums(mat >= 10) >= 2
mat <- mat[keep, ]
mat <- apply(mat, 2, function(x) as.integer(round(x)))
rownames(mat) <- ids_clean[keep]
col_data <- data.frame(
sample = colnames(mat),
timepoint = factor(as.character(tp), levels = as.character(sort(unique(tp)))),
row.names = colnames(mat)
)
col_data$timepoint <- relevel(col_data$timepoint, ref = "0")
results_list <- list()
for (t in tps) {
keep_samp <- col_data$timepoint %in% c("0", t)
sub_mat <- mat[, keep_samp]
cd <- droplevels(col_data[keep_samp, , drop = FALSE])
cd$timepoint <- relevel(cd$timepoint, ref = "0")
dds <- DESeqDataSetFromMatrix(sub_mat, cd, design = ~ timepoint)
dds <- DESeq(dds, quiet = TRUE)
res <- lfcShrink(dds, coef = resultsNames(dds)[2], type = "apeglm", quiet = TRUE)
results_list[[t]] <- as.data.frame(res) |>
rownames_to_column("gene_id") |>
mutate(timepoint = t)
}
results_list
}
run_inrun_mock_deseq <- function(counts_file, id_prefix, tps) {
raw <- read_tsv(counts_file, show_col_types = FALSE)
id_col <- names(raw)[1]; name_col <- names(raw)[2]
ids_clean <- clean_gene_id(raw[[id_col]], id_prefix)
samp <- setdiff(names(raw), c(id_col, name_col))
cond <- parse_cond(samp)
tp <- parse_tp_new(samp)
t0_samp <- samp[cond == "noPi" & tp == 0]
mock_samp <- samp[cond == "mock" & tp %in% as.integer(tps)]
stopifnot(length(t0_samp) >= 2)
stopifnot(length(mock_samp) >= 1)
keep_cols <- c(t0_samp, mock_samp)
mat <- as.matrix(raw[, keep_cols])
mat <- apply(mat, 2, function(x) as.integer(round(x)))
rownames(mat) <- ids_clean
mock_tp_vec <- tp[match(mock_samp, samp)]
grp <- c(rep("t0", length(t0_samp)), paste0("mock", mock_tp_vec))
grp_levels <- c("t0", paste0("mock", sort(unique(mock_tp_vec))))
col_data <- data.frame(
sample = keep_cols,
group = factor(grp, levels = grp_levels),
row.names = keep_cols
)
col_data$group <- relevel(col_data$group, ref = "t0")
keep <- rowSums(mat >= 10) >= 2
mat <- mat[keep, ]
message(sprintf(" in-run mock: t0 libs %d | mock libs %d | genes after filter %d",
length(t0_samp), length(mock_samp), nrow(mat)))
test_grps <- setdiff(levels(col_data$group), "t0")
results_list <- list()
for (g in test_grps) {
keep_samp <- col_data$group %in% c("t0", g)
sub_mat <- mat[, keep_samp]
cd <- droplevels(col_data[keep_samp, , drop = FALSE])
cd$group <- relevel(cd$group, ref = "t0")
dds <- DESeqDataSetFromMatrix(sub_mat, cd, design = ~ group)
dds <- DESeq(dds, quiet = TRUE)
res <- lfcShrink(dds, coef = resultsNames(dds)[2], type = "apeglm", quiet = TRUE)
t <- as.character(as.integer(sub("mock", "", g)))
results_list[[t]] <- as.data.frame(res) |>
rownames_to_column("gene_id") |>
mutate(timepoint = t)
}
results_list
}
venn_summary <- list()
pho_report <- list()
for (rc in names(run_cfg)) {
cfg <- run_cfg[[rc]]
message(sprintf("=== %s (%s) mock vs noPi (30/60) ===", rc, cfg$disp))
if (cfg$type == "mock_only") {
mock_deg <- deg_set_from_results(deseq_results[[rc]]$results_list, venn_tps)
og_res <- run_og_nopi_deseq(file.path(counts_og_dir, cfg$og), cfg$prefix, venn_tps)
nopi_deg <- deg_set_from_results(og_res, venn_tps)
univ_mock <- unique(bind_rows(deseq_results[[rc]]$results_list)$gene_id)
univ_nopi <- unique(bind_rows(og_res)$gene_id)
} else {
mock_res <- run_inrun_mock_deseq(file.path(counts_new_dir, cfg$new), cfg$prefix, venn_tps)
mock_deg <- deg_set_from_results(mock_res, venn_tps)
nopi_deg <- deg_set_from_results(deseq_results[[rc]]$results_list, venn_tps)
univ_mock <- unique(bind_rows(mock_res)$gene_id)
univ_nopi <- unique(bind_rows(deseq_results[[rc]]$results_list)$gene_id)
}
#common testable universe (genes tested in both arms) for a fair Venn
common_univ <- intersect(univ_mock, univ_nopi)
mock_deg <- mock_deg |> dplyr::filter(gene_id %in% common_univ)
nopi_deg <- nopi_deg |> dplyr::filter(gene_id %in% common_univ)
message(sprintf(" testable universe: mock %d | noPi %d | common %d",
length(univ_mock), length(univ_nopi), length(common_univ)))
joined <- full_join(
mock_deg |> dplyr::rename(dir_mock = direction),
nopi_deg |> dplyr::rename(dir_nopi = direction),
by = "gene_id"
)
shared_same <- joined |> dplyr::filter(!is.na(dir_mock), !is.na(dir_nopi), dir_mock == dir_nopi)
shared_opp <- joined |> dplyr::filter(!is.na(dir_mock), !is.na(dir_nopi), dir_mock != dir_nopi)
mock_only <- joined |> dplyr::filter(!is.na(dir_mock), is.na(dir_nopi))
nopi_only <- joined |> dplyr::filter(is.na(dir_mock), !is.na(dir_nopi))
message(sprintf(" mock DEGs: %d | noPi DEGs: %d | shared(same-dir): %d | shared(opp-dir): %d",
nrow(mock_deg), nrow(nopi_deg), nrow(shared_same), nrow(shared_opp)))
#Venn on same-direction membership; short labels + x-expansion so names don't clip
venn_sets <- list(Mock = mock_deg$gene_id, noPi = nopi_deg$gene_id)
p_venn <- ggVennDiagram(venn_sets, label = "count", label_alpha = 0,
set_color = c("#999999", "#377EB8")) +
scale_fill_gradient(low = "#F7FBFF", high = "#C0392B") +
scale_x_continuous(expand = expansion(mult = 0.2)) +
labs(title = sprintf("%s: mock vs noPi DEGs (30/60, same-direction)", cfg$disp)) +
theme(plot.title = element_text(face = "bold", size = 13),
legend.position = "none")
rd <- file.path(fig_dir, rc)
dir.create(rd, recursive = TRUE, showWarnings = FALSE)
ggsave(file.path(rd, sprintf("%s_%s_venn_mock_vs_nopi.png", Sys.Date(), rc)),
plot = p_venn, width = 6, height = 5, dpi = 300, bg = "white")
#region membership table out
region_tbl <- bind_rows(
shared_same |> dplyr::transmute(gene_id, region = "shared_same_dir", dir_mock, dir_nopi),
shared_opp |> dplyr::transmute(gene_id, region = "shared_opp_dir", dir_mock, dir_nopi),
mock_only |> dplyr::transmute(gene_id, region = "mock_only", dir_mock, dir_nopi),
nopi_only |> dplyr::transmute(gene_id, region = "nopi_only", dir_mock, dir_nopi)
)
dir.create(file.path(out_base, rc), recursive = TRUE, showWarnings = FALSE)
write_csv(region_tbl, file.path(out_base, rc, sprintf("%s_venn_regions.csv", rc)))
## PHO sanity check
#yH1053: PPX1/PPN1 are deleted (~0 counts) so they drop from the universe and won't appear
#in the PHO table; expected, not a control leak
pho <- read_csv(file.path(pho_dir, sprintf("PHO_regulon_%s_mapped1.csv", cfg$sp)),
show_col_types = FALSE) |>
dplyr::filter(!is.na(gene_id)) |>
mutate(gene_id_clean = clean_gene_id(gene_id, cfg$prefix))
#assert PHO genes actually reached the tested universe; loud if mapping/prefix mismatch
pho_in_univ <- pho |> dplyr::filter(gene_id_clean %in% common_univ)
if (nrow(pho_in_univ) < max(3, 0.3 * nrow(pho))) {
warning(sprintf("[%s] only %d/%d PHO genes in tested universe; check ID prefix/mapping",
cfg$sp, nrow(pho_in_univ), nrow(pho)))
}
pho_regions <- region_tbl |>
dplyr::inner_join(pho |> dplyr::select(gene_id_clean, scer_gene_name, functional_category),
by = c("gene_id" = "gene_id_clean"))
message(sprintf("PHO genes: %d mapped, %d in universe, %d called in either arm",
nrow(pho), nrow(pho_in_univ), nrow(pho_regions)))
pho_shared <- pho_regions |> dplyr::filter(region == "shared_same_dir")
if (nrow(pho_shared) > 0) {
message(sprintf("PHO genes in SHARED set (control leak?): %s",
paste(pho_shared$scer_gene_name, collapse = ", ")))
} else {
message("PHO check OK")
}
pho_report[[rc]] <- pho_regions |>
dplyr::transmute(run = rc, species = cfg$sp, scer_gene_name, functional_category,
region, dir_mock, dir_nopi)
write_csv(pho_report[[rc]],
file.path(out_base, rc, sprintf("%s_PHO_venn_membership.csv", rc)))
venn_summary[[rc]] <- tibble(
run = rc, species = cfg$sp, type = cfg$type,
mock_degs = nrow(mock_deg), nopi_degs = nrow(nopi_deg),
shared_same = nrow(shared_same), shared_opp = nrow(shared_opp),
mock_only = nrow(mock_only), nopi_only = nrow(nopi_only),
pho_in_universe = nrow(pho_in_univ), pho_in_shared = nrow(pho_shared)
)
}
venn_summary <- bind_rows(venn_summary)
print(venn_summary)
## # A tibble: 5 × 11
## run species type mock_degs nopi_degs shared_same shared_opp mock_only
## <chr> <chr> <chr> <int> <int> <int> <int> <int>
## 1 yH545 Scer scer_inrun 549 964 527 0 22
## 2 yH1053 Scer scer_inrun 531 1524 478 6 47
## 3 yH001 Cgla mock_only 1729 2186 588 570 571
## 4 yH149 Klac mock_only 1916 1991 1329 65 522
## 5 yH714 Calb mock_only 2458 2212 1822 34 602
## # ℹ 3 more variables: nopi_only <int>, pho_in_universe <int>,
## # pho_in_shared <int>
pho_report_all <- bind_rows(pho_report)
print(pho_report_all |> dplyr::arrange(species, run, functional_category, scer_gene_name))
## # A tibble: 73 × 7
## run species scer_gene_name functional_category region dir_mock dir_nopi
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 yH714 Calb DDP1 Phosphatases mock_only down <NA>
## 2 yH714 Calb GDE1 Phosphatases shared_sa… up up
## 3 yH714 Calb HOR2 Phosphatases shared_sa… down down
## 4 yH714 Calb PHM8 Phosphatases shared_sa… down down
## 5 yH714 Calb PHO11 Phosphatases nopi_only <NA> up
## 6 yH714 Calb PHO8 Phosphatases shared_sa… down down
## 7 yH714 Calb VTC1 Poly P Metabolism nopi_only <NA> up
## 8 yH714 Calb VTC2 Poly P Metabolism nopi_only <NA> up
## 9 yH714 Calb VTC4 Poly P Metabolism nopi_only <NA> up
## 10 yH714 Calb PHO81 Regulatory nopi_only <NA> up
## # ℹ 63 more rows
Per-timepoint DEG counts (noPi-vs-t0, |LFC|>1, padj<0.01) across the full Scer timecourse, for yH545 (WT) and yH1053 (ppx1Δ ppn1Δ) as we did before. This is the shape and timing of the starvation program.
my interpretation: PPX1/PPN1 hydrolyze stored polyphosphate. Without that buffer, the mutant hits internal Pi limitation almost immediately on phosphate withdrawal, so Pho4 engages early and the PHO-specific program fires ahead of WT. In our 2026 data, the full regulon is induced by 30/60 min (PHO5 LFC ~5.5 by 60) where WT shows only its fastest genes. This early PHO induction is a new phenotype we didn’t see in WT.
This is separable from the genome-wide count. The mutant’s total early DEG count is lower than WT’s (the generic early stress/handling burst is smaller), but its phosphate-specific induction is earlier and stronger.
What this analysis is, and why it’s separate from the landscape.: This analysis performs a mock contrast analysis to our noPi data. The noPi arm = handling + phosphate starvation. The mock arm = handling alone (resuspended in Pi-replete medium). So genes that move in both arms are the handling artifact; genes in the noPi arm only are the phosphate-specific response.
Design. Per species, mock DEGs vs noPi DEGs at 30/60 only (mocks have just those two timepoints, so noPi is restricted to match, like-for-like). Both arms referenced to the same t0. Overlap is same-direction (a gene up in one arm and down in the other is not “shared”, it’s reported separately). Three comparative-species Venns (Cgla/Klac/Calb) plus the two Scer runs’ own mock arms. phosphate-specific genes were used as a “positive control” if you will as must land in noPi-only, not in the shared set.
library(edgeR)
#took this from 11212025 TMM processing
counts_new_dir <- here("14_DESeq2_062026data", "inputs", "counts_new")
out_base <- here("14_DESeq2_062026data", "outputs")
lcpm_runs <- list(
yH545 = list(file = "yH545_salmon.merged.gene_counts.tsv", prefix = "gene-"),
yH1053 = list(file = "yH1053_salmon.merged.gene_counts.tsv", prefix = "gene-"),
yH001 = list(file = "yH001_salmon.merged.gene_counts.tsv", prefix = "gene-"),
yH149 = list(file = "yH149_salmon.merged.gene_counts.tsv", prefix = "gene-"),
yH714 = list(file = "yH714_salmon.merged.gene_counts.tsv", prefix = "")
)
make_lcpm <- function(run_code, counts_file, id_prefix) {
raw <- read_tsv(counts_file, show_col_types = FALSE)
id_col <- names(raw)[1]; name_col <- names(raw)[2]
gene_ids <- clean_gene_id(raw[[id_col]], id_prefix)
sample_cols <- setdiff(names(raw), c(id_col, name_col))
count_mat <- as.matrix(raw[, sample_cols])
count_mat <- apply(count_mat, 2, function(x) as.integer(round(x)))
rownames(count_mat) <- gene_ids
keep <- rowSums(count_mat >= 10) >= 2
count_mat <- count_mat[keep, ]
dge <- DGEList(counts = count_mat)
dge <- calcNormFactors(dge, method = "TMM")
lcpm <- edgeR::cpm(dge, log = TRUE)
as.data.frame(lcpm) |>
rownames_to_column("gene_id")
}
fmt_tp_label <- function(tp_min) {
ifelse(tp_min < 60, paste0(tp_min, "min"), paste0(tp_min / 60, "h"))
}
build_sample_info <- function(samples) {
si <- tibble(Sample = samples) |>
mutate(
Condition = parse_cond(Sample),
tp_min = parse_tp_new(Sample),
Replicate = as.integer(str_extract(Sample, "(?<=rep)\\d+"))
)
stopifnot(!any(is.na(si$Condition)), !any(is.na(si$tp_min)), !any(is.na(si$Replicate)))
si |>
arrange(factor(Condition, levels = c("noPi", "mock")), tp_min, Replicate) |>
group_by(Condition, tp_min) |>
mutate(StandardizedReplicate = dense_rank(Replicate)) |>
ungroup() |>
transmute(Sample, Condition, Timepoint = fmt_tp_label(tp_min),
Replicate, StandardizedReplicate)
}
for (rc in names(lcpm_runs)) {
cfg <- lcpm_runs[[rc]]
mat <- make_lcpm(rc, file.path(counts_new_dir, cfg$file), cfg$prefix)
sample_info <- build_sample_info(setdiff(names(mat), "gene_id"))
mat <- mat[, c("gene_id", sample_info$Sample)]
lcpm_dir <- file.path(out_base, rc, "lcpm")
dir.create(lcpm_dir, recursive = TRUE, showWarnings = FALSE)
write_csv(mat, file.path(lcpm_dir, sprintf("%s_tmm_log2cpm.csv", rc)))
write_csv(sample_info, file.path(lcpm_dir, sprintf("%s_sample_info.csv", rc)))
}
#processing Rlog here
make_rlog <- function(run_code, counts_file, id_prefix) {
raw <- read_tsv(counts_file, show_col_types = FALSE)
id_col <- names(raw)[1]; name_col <- names(raw)[2]
gene_ids <- clean_gene_id(raw[[id_col]], id_prefix)
sample_cols <- setdiff(names(raw), c(id_col, name_col))
count_mat <- as.matrix(raw[, sample_cols])
count_mat <- apply(count_mat, 2, function(x) as.integer(round(x)))
rownames(count_mat) <- gene_ids
col_data <- data.frame(
condition = factor(parse_cond(colnames(count_mat))),
timepoint = factor(parse_tp_new(colnames(count_mat))),
row.names = colnames(count_mat)
)
design <- if (nlevels(col_data$condition) > 1) ~ condition + timepoint else ~ timepoint
dds <- DESeqDataSetFromMatrix(countData = count_mat, colData = col_data, design = design)
dds <- DESeq(dds, quiet = TRUE)
rld <- assay(rlog(dds, blind = FALSE))
keep <- rowSums(count_mat) >= 10
as.data.frame(rld[keep, ]) |>
rownames_to_column("gene_id")
}
for (rc in names(lcpm_runs)) {
cfg <- lcpm_runs[[rc]]
mat <- make_rlog(rc, file.path(counts_new_dir, cfg$file), cfg$prefix)
sample_info <- build_sample_info(setdiff(names(mat), "gene_id"))
mat <- mat[, c("gene_id", sample_info$Sample)]
rlog_dir <- file.path(out_base, rc, "rlog")
dir.create(rlog_dir, recursive = TRUE, showWarnings = FALSE)
write_csv(mat, file.path(rlog_dir, sprintf("%s_rlog.csv", rc)))
write_csv(sample_info, file.path(rlog_dir, sprintf("%s_sample_info.csv", rc)))
}