library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
# Input data (Table 3)
# Tumour_gained = peaks gained in tumour (tumour-enriched)
# Tumour_lost = peaks lost in tumour (margin-enriched)
df <- data.frame(
Sample = c("GBM10","GBM12","GBM21","GBM22","GBM24","GBM31","GBM32","GBM37","GBM38","GBM40"),
Tumour_gained = c(16313,17109,63350,22471,12577,11480,8519,37022,17987,1413),
Tumour_lost = c(28108, 9115,15953,27604, 2583,24420,8438,14139, 3509,6504)
)
# X-axis order for plotting
# (WT samples first, then IDH1-mutant samples)
sample_order <- c(
"GBM10","GBM12","GBM32","GBM37","GBM38",
"GBM21","GBM22","GBM24","GBM31","GBM40"
)
# Axis labels
labels <- setNames(sample_order, sample_order)
# Convert wide -> long format
# This creates two rows per sample:
# one for "gained" and one for "lost"
df_long <- df %>%
pivot_longer(
cols = c(Tumour_gained, Tumour_lost),
names_to = "Type",
values_to = "Count"
) %>%
mutate(
# apply the plotting order
Sample = factor(Sample, levels = sample_order),
# nicer legend text
Type = recode(
Type,
Tumour_gained = "Peaks gained in tumour",
Tumour_lost = "Peaks lost in tumour"
),
# force legend order (grey first, then red)
Type = factor(Type, levels = c("Peaks lost in tumour", "Peaks gained in tumour"))
)
# Plot side-by-side bars
ggplot(df_long, aes(x = Sample, y = Count, fill = Type)) +
geom_col(position = "dodge") +
# x-axis label mapping
scale_x_discrete(labels = labels) +
# colours for the two bar types
scale_fill_manual(values = c(
"Peaks lost in tumour" = "grey",
"Peaks gained in tumour" = "red"
)) +
# axis labels + legend title removed
labs(
x = "Patient (IDH1 status)",
y = "Number of peaks",
fill = NULL
) +
# simple theme + legend at top
theme_minimal() +
theme(legend.position = "top") +
# set visible y-range, but allow text/lines to sit outside the panel
coord_cartesian(ylim = c(0, 70000), clip = "off") +
# Group annotations (WT vs IDH1-mutant)
annotate("segment", x = 1, xend = 5, y = 50000, yend = 50000, colour = "red") +
annotate("text", x = 2.5, y = 52000, label = "Wild-type GBM", colour = "red") +
annotate("segment", x = 6, xend = 10, y = 65000, yend = 65000, colour = "red") +
annotate("text", x = 8, y = 63000, label = "IDH1-mutated GBM", colour = "red")
library(ggplot2)
library(readxl)
# Load the spreadsheet
file <- "/home/cathyuni28/data/Fig2_GenomicAnnotation_TumourEnriched_summary_with_GBM21.xlsx"
df <- read_excel(file, sheet = "PerSample_Percent")
# Columns that contain percent annotations
# (these are the stacked bar segments)
category_cols <- c(
"Promoter-TSS","Intron","Intergenic","Exon","5' UTR","3' UTR","TTS","Unassigned"
)
# Pool within each IDH1 group (MUT vs WT)
# Weighted average = sum(percent * Total_peaks) / sum(Total_peaks)
pooled <- df %>%
group_by(IDH1_status) %>%
summarise(
across(all_of(category_cols), ~ sum(.x * Total_peaks) / sum(Total_peaks)),
.groups = "drop"
) %>%
# Force x-axis order: mutant first, then wild-type
mutate(IDH1_status = factor(IDH1_status, levels = c("MUT", "WT")))
# Convert wide -> long for ggplot stacked bars
long <- pooled %>%
pivot_longer(
cols = all_of(category_cols),
names_to = "Category",
values_to = "Percent"
)
# Labels shown above each bar
top_labels <- data.frame(
IDH1_status = factor(c("MUT","WT"), levels = c("MUT","WT")),
label = c("IDH1-mutant","IDH1-wild-type"),
y = 102 # slightly above 100 so text sits above the bars
)
#Plot (stacked percent bars)
ggplot(long, aes(x = IDH1_status, y = Percent, fill = Category)) +
geom_col(width = 0.7) +
# Show 0–100 but allow text above the plot area
coord_cartesian(ylim = c(0, 100), clip = "off") +
labs(
x = NULL,
y = "Percent of peaks",
fill = "Category"
) +
theme_minimal() +
theme(
# hide MUT/WT at the bottom
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
legend.position = "right"
) +
# add the IDH1 labels above the bars (black text)
geom_text(
data = top_labels,
aes(x = IDH1_status, y = y, label = label),
inherit.aes = FALSE
)
# Load the pooled table
file <- "/home/cathyuni28/data/Fig2_GenomicAnnotation_Pooled_Tumour_vs_Margin_forPlot.xlsx"
df <- read_excel(file, sheet = "Combined_Pooled_2Bars")
# Convert wide -> long
# one row per Set x Category
long <- df %>%
pivot_longer(
cols = -Set,
names_to = "Category",
values_to = "Percent"
)
# Keep only the categories you used in the final plot
keep_cats <- c("Promoter-TSS", "Exon", "Intron", "Intergenic")
long <- long %>% filter(Category %in% keep_cats)
# Force bar order (left to right)
long <- long %>%
mutate(Set = factor(Set, levels = c("Tumour-enriched", "Margin-enriched")))
# 5) Labels shown above each bar
top_labels <- data.frame(
Set = factor(c("Tumour-enriched", "Margin-enriched"),
levels = c("Tumour-enriched", "Margin-enriched")),
label = c("Gained in tumour", "Lost in tumour"),
y = 102
)
# 6) Plot stacked percent bars
ggplot(long, aes(x = Set, y = Percent, fill = Category)) +
geom_col(width = 0.65) +
# Show 0–100 but allow text above 100
coord_cartesian(ylim = c(0, 100), clip = "off") +
labs(
x = NULL,
y = "Percent of peaks",
fill = "Annotation"
) +
theme_classic() +
theme(
# hide the bottom Set labels (we use the top labels instead)
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
legend.position = "right"
) +
# add the top labels (black text)
geom_text(
data = top_labels,
aes(x = Set, y = y, label = label),
inherit.aes = FALSE
)
###Figure 4. Shared and IDH1-specific GO Biological Process enrichment for tumour-gained ATAC-seq peaks
library(stringr)
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
library(grid)
# Load the Excel file
file <- "/home/cathyuni28/data/non-size MVT/UPDATED gene ontology GBM project (tumourvs margin non-size) (1).xlsx"
df <- read_excel(file, sheet = "Sheet1")
# Identify which columns are WT vs MUT
wt_cols <- names(df)[str_detect(names(df), "WT")]
mut_cols <- names(df)[str_detect(names(df), "MUT")]
# Collapse each group into one unique set of GO term names
wt_set <- unique(as.character(unlist(df[wt_cols])))
mut_set <- unique(as.character(unlist(df[mut_cols])))
# Draw the Venn diagram
grid.newpage()
venn_grob <- draw.pairwise.venn(
area1 = length(mut_set),
area2 = length(wt_set),
cross.area = length(intersect(mut_set, wt_set)),
category = c("", ""),
fill = c("#F4A6A6", "#A6C8F4"),
alpha = c(0.6, 0.6),
cex = 1.2,
cat.cex = 0
)
# Add labels inside the circles
grid.text("IDH1-mutant", x = 0.23, y = 0.78, gp = gpar(fontsize = 14, fontface = "bold"))
grid.text("IDH1-wild-type", x = 0.77, y = 0.78, gp = gpar(fontsize = 14, fontface = "bold"))
# Load the Excel sheet
file <- "/home/cathyuni28/data/non-size MVT/UPDATED gene ontology GBM project (tumourvs margin non-size) (1).xlsx"
df <- read_excel(file, sheet = "Sheet2")
## New names:
## • `` -> `...2`
## • `` -> `...4`
## • `` -> `...6`
## • `` -> `...8`
## • `` -> `...10`
## • `` -> `...12`
## • `` -> `...14`
## • `` -> `...16`
## • `` -> `...18`
## • `` -> `...20`
# Ensure column names are unique (Excel sometimes duplicates headers)
names(df) <- make.unique(names(df))
# Identify FDR vs Pathway columns
# odd columns = FDR, even columns = Pathway
fdr_cols <- seq(1, ncol(df), 2)
path_cols <- seq(2, ncol(df), 2)
# Convert wide -> long:
# for each sample, make rows of (Sample, Pathway, FDR)
long <- do.call(rbind, lapply(seq_along(fdr_cols), function(i) {
data.frame(
Sample = str_trim(names(df)[fdr_cols[i]]),
Pathway = str_trim(as.character(df[[path_cols[i]]])),
FDR = as.numeric(df[[fdr_cols[i]]]),
stringsAsFactors = FALSE
)
}))
## Warning in data.frame(Sample = str_trim(names(df)[fdr_cols[i]]), Pathway =
## str_trim(as.character(df[[path_cols[i]]])), : NAs introduced by coercion
## Warning in data.frame(Sample = str_trim(names(df)[fdr_cols[i]]), Pathway =
## str_trim(as.character(df[[path_cols[i]]])), : NAs introduced by coercion
## Warning in data.frame(Sample = str_trim(names(df)[fdr_cols[i]]), Pathway =
## str_trim(as.character(df[[path_cols[i]]])), : NAs introduced by coercion
## Warning in data.frame(Sample = str_trim(names(df)[fdr_cols[i]]), Pathway =
## str_trim(as.character(df[[path_cols[i]]])), : NAs introduced by coercion
## Warning in data.frame(Sample = str_trim(names(df)[fdr_cols[i]]), Pathway =
## str_trim(as.character(df[[path_cols[i]]])), : NAs introduced by coercion
## Warning in data.frame(Sample = str_trim(names(df)[fdr_cols[i]]), Pathway =
## str_trim(as.character(df[[path_cols[i]]])), : NAs introduced by coercion
## Warning in data.frame(Sample = str_trim(names(df)[fdr_cols[i]]), Pathway =
## str_trim(as.character(df[[path_cols[i]]])), : NAs introduced by coercion
## Warning in data.frame(Sample = str_trim(names(df)[fdr_cols[i]]), Pathway =
## str_trim(as.character(df[[path_cols[i]]])), : NAs introduced by coercion
## Warning in data.frame(Sample = str_trim(names(df)[fdr_cols[i]]), Pathway =
## str_trim(as.character(df[[path_cols[i]]])), : NAs introduced by coercion
## Warning in data.frame(Sample = str_trim(names(df)[fdr_cols[i]]), Pathway =
## str_trim(as.character(df[[path_cols[i]]])), : NAs introduced by coercion
# Clean and compute the colour value
# -log10(FDR): bigger = more significant enrichment
plot_df <- long %>%
filter(!is.na(Pathway), Pathway != "", !is.na(FDR)) %>%
mutate(negLog10FDR = -log10(FDR))
# Order samples: WT first, then MUT
# (based on sample name containing "WT" or "MUT")
sample_order <- c(
sort(unique(plot_df$Sample[str_detect(plot_df$Sample, "WT")])),
sort(unique(plot_df$Sample[str_detect(plot_df$Sample, "MUT")]))
)
plot_df$Sample <- factor(plot_df$Sample, levels = sample_order)
# Heatmap plot
ggplot(plot_df, aes(x = Sample, y = Pathway, fill = negLog10FDR)) +
geom_tile() +
scale_fill_gradient(
low = "white",
high = "red",
name = expression(-log[10]*"(FDR)")
) +
labs(
x = "Sample",
y = "Biological process pathway"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
install.packages("tidyverse")
## Installing package into '/home/cathyuni28/R/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ readr 2.2.0
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# File locations
dir_120 <- "~/data/TSS_profiles_120-180"
dir_ns <- "~/data/TSS_profiles_nonSize"
# Function: read one HOMER TSS file
# - Finds the header line "Distance from Center"
# - Reads the table from there
# - Keeps only: position + signal
read_homer_tss <- function(path) {
lines <- readLines(path)
start_idx <- which(grepl("^Distance from Center", lines))[1]
tab <- read_tsv(path, skip = start_idx - 1, show_col_types = FALSE)
names(tab)[1] <- "pos"
tab %>%
select(pos, signal = 2) %>%
mutate(file = basename(path))
}
# list .txt files in each directory
files_120 <- list.files(dir_120, pattern = "\\.txt$", full.names = TRUE)
files_ns <- list.files(dir_ns, pattern = "\\.txt$", full.names = TRUE)
# read + label size-selected
tss_120 <- map_dfr(files_120, read_homer_tss) %>%
mutate(
tissue = ifelse(str_detect(tolower(file), "tumour"), "Tumour", "Margin"),
type = "Size-selected (120–180bp)"
)
# read + label non-size-selected
tss_ns <- map_dfr(files_ns, read_homer_tss) %>%
mutate(
tissue = ifelse(str_detect(tolower(file), "tumour"), "Tumour", "Margin"),
type = "Non-size-selected"
)
# combine into one table
tss_all <- bind_rows(tss_120, tss_ns)
# Summarise across files:
# mean signal and standard error at each position
tss_summary <- tss_all %>%
group_by(type, tissue, pos) %>%
summarise(
mean_signal = mean(signal, na.rm = TRUE),
se_signal = sd(signal, na.rm = TRUE) / sqrt(n()),
.groups = "drop"
)
ggplot(tss_summary, aes(x = pos, y = mean_signal, colour = tissue, fill = tissue, group = tissue)) +
geom_vline(xintercept = 0, linetype = "dashed") +
geom_line() +
geom_ribbon(aes(ymin = mean_signal - se_signal, ymax = mean_signal + se_signal),
alpha = 0.2, colour = NA) +
facet_wrap(~ type, ncol = 1, scales = "free_y") +
scale_colour_manual(values = c("Tumour" = "red", "Margin" = "black"), name = NULL) +
scale_fill_manual(values = c("Tumour" = "red", "Margin" = "black"), name = NULL) +
labs(x = "Distance from TSS (bp)", y = "ATAC-seq signal") +
theme_bw() +
theme(legend.position = "top")
# File + sheet names
file <- "/home/cathyuni28/data/ARNT2.xlsx"
sheet_margin <- "ARNT2 Margin"
sheet_mutant <- "ARNT2 Mutant"
sheet_wt <- "ARNT2 Wildtype"
# Read one sheet -> long format
# first column is Distance
# other columns ending in "Coverage" are samples
# extract GBM number from sample name
tidy_homer_sheet <- function(path, sheet, group_label) {
df <- read_excel(path, sheet = sheet)
distance_col <- names(df)[1]
df %>%
select(Distance = all_of(distance_col), matches("Coverage$")) %>%
mutate(Distance = as.numeric(Distance)) %>%
pivot_longer(cols = -Distance, names_to = "Sample", values_to = "Coverage") %>%
mutate(
Group = group_label,
GBM = str_extract(Sample, "GBM\\d+"),
GBM = as.integer(str_remove(GBM, "GBM"))
) %>%
filter(!is.na(GBM)) %>%
select(Distance, Coverage, Group, GBM)
}
# Load the three groups
margin_long <- tidy_homer_sheet(file, sheet_margin, "Margin")
mutant_long <- tidy_homer_sheet(file, sheet_mutant, "Tumour IDH1 Mutant")
wt_long <- tidy_homer_sheet(file, sheet_wt, "Tumour IDH1 Wildtype")
# Plot A: paired Margin vs IDH1-mutant tumour
# (only GBMs present in the mutant sheet are kept)
paired_mutant <- bind_rows(margin_long, mutant_long) %>%
filter(GBM %in% unique(mutant_long$GBM)) %>%
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1 Mutant")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_mutant, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(values = c("Margin" = "black", "Tumour IDH1 Mutant" = "red3"), name = NULL) +
labs(
x = "Distance from ARNT2 binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
# Plot B: paired Margin vs IDH1-wild-type tumour
# (only GBMs present in the WT sheet are kept)
paired_wt <- bind_rows(margin_long, wt_long) %>%
filter(GBM %in% unique(wt_long$GBM)) %>%
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1 Wildtype")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_wt, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(values = c("Margin" = "black", "Tumour IDH1 Wildtype" = "red3"), name = NULL) +
labs(
x = "Distance from ARNT2 binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
# Plot C: pooled Margin vs pooled Tumour (Mutant + WT together)
# compute mean + SD at each Distance
# show mean line + SD ribbon
tumour_long <- bind_rows(mutant_long, wt_long) %>%
mutate(Group = "Tumour")
all_long <- bind_rows(margin_long, tumour_long)
summary_df <- all_long %>%
group_by(Group, Distance) %>%
summarise(
mean = mean(Coverage, na.rm = TRUE),
sd = sd(Coverage, na.rm = TRUE),
.groups = "drop"
)
ggplot(summary_df, aes(x = Distance)) +
geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, fill = Group), alpha = 0.2, colour = NA) +
geom_line(aes(y = mean, color = Group)) +
scale_color_manual(values = c("Margin" = "black", "Tumour" = "red3"), name = NULL) +
scale_fill_manual(values = c("Margin" = "black", "Tumour" = "red3"), name = NULL) +
labs(
x = "Distance from ARNT2 binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
print
## function (x, ...)
## UseMethod("print")
## <bytecode: 0x558696995f58>
## <environment: namespace:base>
# File + sheet names
file <- "/home/cathyuni28/data/BMAL1 (1).xlsx"
sheet_margin <- "BMAL1 margin"
sheet_mutant <- "BMAL1 Mutant"
sheet_wt <- "BMAL1 WT"
# Read one sheet -> long format
# first column is Distance
# other columns ending in "Coverage" are samples
# extract GBM number from sample name
tidy_homer_sheet <- function(path, sheet, group_label) {
df <- read_excel(path, sheet = sheet)
distance_col <- names(df)[1]
df %>%
select(Distance = all_of(distance_col), matches("Coverage$")) %>%
mutate(Distance = as.numeric(Distance)) %>%
pivot_longer(cols = -Distance, names_to = "Sample", values_to = "Coverage") %>%
mutate(
Group = group_label,
GBM = str_extract(Sample, "GBM\\d+"),
GBM = as.integer(str_remove(GBM, "GBM"))
) %>%
filter(!is.na(GBM)) %>%
select(Distance, Coverage, Group, GBM)
}
# Load the three groups
margin_long <- tidy_homer_sheet(file, sheet_margin, "Margin")
mutant_long <- tidy_homer_sheet(file, sheet_mutant, "Tumour IDH1 Mutant")
wt_long <- tidy_homer_sheet(file, sheet_wt, "Tumour IDH1 Wildtype")
# Plot A: paired Margin vs IDH1-mutant tumour
# (only GBMs present in the mutant sheet are kept)
paired_mutant <- bind_rows(margin_long, mutant_long) %>%
filter(GBM %in% unique(mutant_long$GBM)) %>%
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1 Mutant")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_mutant, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(values = c("Margin" = "black", "Tumour IDH1 Mutant" = "red3"), name = NULL) +
labs(
x = "Distance from BMAL1 binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
# Plot B: paired Margin vs IDH1-wild-type tumour
# (only GBMs present in the WT sheet are kept)
paired_wt <- bind_rows(margin_long, wt_long) %>%
filter(GBM %in% unique(wt_long$GBM)) %>%
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1 Wildtype")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_wt, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(values = c("Margin" = "black", "Tumour IDH1 Wildtype" = "red3"), name = NULL) +
labs(
x = "Distance from BMAL1 binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
# Plot C: pooled Margin vs pooled Tumour (Mutant + WT together)
# compute mean + SD at each Distance
# show mean line + SD ribbon
tumour_long <- bind_rows(mutant_long, wt_long) %>%
mutate(Group = "Tumour")
all_long <- bind_rows(margin_long, tumour_long)
summary_df <- all_long %>%
group_by(Group, Distance) %>%
summarise(
mean = mean(Coverage, na.rm = TRUE),
sd = sd(Coverage, na.rm = TRUE),
.groups = "drop"
)
ggplot(summary_df, aes(x = Distance)) +
geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, fill = Group), alpha = 0.2, colour = NA) +
geom_line(aes(y = mean, color = Group)) +
scale_color_manual(values = c("Margin" = "black", "Tumour" = "red3"), name = NULL) +
scale_fill_manual(values = c("Margin" = "black", "Tumour" = "red3"), name = NULL) +
labs(
x = "Distance from BMAL1 binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
print
## function (x, ...)
## UseMethod("print")
## <bytecode: 0x558696995f58>
## <environment: namespace:base>
# File + sheet names
file <- "/home/cathyuni28/data/MAX.xlsx"
sheet_margin <- "MAX Margin"
sheet_mutant <- "MAX Mutant"
sheet_wt <- "MAX Wildtype"
# Read one sheet -> long format
# first column is Distance
# other columns ending in "Coverage" are samples
# extract GBM number from sample name
tidy_homer_sheet <- function(path, sheet, group_label) {
df <- read_excel(path, sheet = sheet)
distance_col <- names(df)[1]
df %>%
select(Distance = all_of(distance_col), matches("Coverage$")) %>%
mutate(Distance = as.numeric(Distance)) %>%
pivot_longer(cols = -Distance, names_to = "Sample", values_to = "Coverage") %>%
mutate(
Group = group_label,
GBM = str_extract(Sample, "GBM\\d+"),
GBM = as.integer(str_remove(GBM, "GBM"))
) %>%
filter(!is.na(GBM)) %>%
select(Distance, Coverage, Group, GBM)
}
# Load the three groups
margin_long <- tidy_homer_sheet(file, sheet_margin, "Margin")
mutant_long <- tidy_homer_sheet(file, sheet_mutant, "Tumour IDH1 Mutant")
wt_long <- tidy_homer_sheet(file, sheet_wt, "Tumour IDH1 Wildtype")
# Plot A: paired Margin vs IDH1-mutant tumour
# (only GBMs present in the mutant sheet are kept)
paired_mutant <- bind_rows(margin_long, mutant_long) %>%
filter(GBM %in% unique(mutant_long$GBM)) %>%
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1 Mutant")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_mutant, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(values = c("Margin" = "black", "Tumour IDH1 Mutant" = "red3"), name = NULL) +
labs(
x = "Distance from MAX binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
# Plot B: paired Margin vs IDH1-wild-type tumour
# (only GBMs present in the WT sheet are kept)
paired_wt <- bind_rows(margin_long, wt_long) %>%
filter(GBM %in% unique(wt_long$GBM)) %>%
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1 Wildtype")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_wt, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(values = c("Margin" = "black", "Tumour IDH1 Wildtype" = "red3"), name = NULL) +
labs(
x = "Distance from MAX binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
# Plot C: pooled Margin vs pooled Tumour (Mutant + WT together)
# compute mean + SD at each Distance
# show mean line + SD ribbon
tumour_long <- bind_rows(mutant_long, wt_long) %>%
mutate(Group = "Tumour")
all_long <- bind_rows(margin_long, tumour_long)
summary_df <- all_long %>%
group_by(Group, Distance) %>%
summarise(
mean = mean(Coverage, na.rm = TRUE),
sd = sd(Coverage, na.rm = TRUE),
.groups = "drop"
)
ggplot(summary_df, aes(x = Distance)) +
geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, fill = Group), alpha = 0.2, colour = NA) +
geom_line(aes(y = mean, color = Group)) +
scale_color_manual(values = c("Margin" = "black", "Tumour" = "red3"), name = NULL) +
scale_fill_manual(values = c("Margin" = "black", "Tumour" = "red3"), name = NULL) +
labs(
x = "Distance from MAX binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
print
## function (x, ...)
## UseMethod("print")
## <bytecode: 0x558696995f58>
## <environment: namespace:base>
# File + sheet names
file <- "/home/cathyuni28/data/MED1.xlsx"
sheet_margin <- "MED1 Margin"
sheet_mutant <- "MED1 Mutant"
sheet_wt <- "MED1 Wildtype"
# Read one sheet -> long format
# first column is Distance
# other columns ending in "Coverage" are samples
# extract GBM number from sample name
tidy_homer_sheet <- function(path, sheet, group_label) {
df <- read_excel(path, sheet = sheet)
distance_col <- names(df)[1]
df %>%
select(Distance = all_of(distance_col), matches("Coverage$")) %>%
mutate(Distance = as.numeric(Distance)) %>%
pivot_longer(cols = -Distance, names_to = "Sample", values_to = "Coverage") %>%
mutate(
Group = group_label,
GBM = str_extract(Sample, "GBM\\d+"),
GBM = as.integer(str_remove(GBM, "GBM"))
) %>%
filter(!is.na(GBM)) %>%
select(Distance, Coverage, Group, GBM)
}
# Load the three groups
margin_long <- tidy_homer_sheet(file, sheet_margin, "Margin")
mutant_long <- tidy_homer_sheet(file, sheet_mutant, "Tumour IDH1 Mutant")
wt_long <- tidy_homer_sheet(file, sheet_wt, "Tumour IDH1 Wildtype")
# Plot A: paired Margin vs IDH1-mutant tumour
# (only GBMs present in the mutant sheet are kept)
paired_mutant <- bind_rows(margin_long, mutant_long) %>%
filter(GBM %in% unique(mutant_long$GBM)) %>%
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1 Mutant")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_mutant, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(values = c("Margin" = "black", "Tumour IDH1 Mutant" = "red3"), name = NULL) +
labs(
x = "Distance from MED1 binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
# Plot B: paired Margin vs IDH1-wild-type tumour
# (only GBMs present in the WT sheet are kept)
paired_wt <- bind_rows(margin_long, wt_long) %>%
filter(GBM %in% unique(wt_long$GBM)) %>%
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1 Wildtype")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_wt, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(values = c("Margin" = "black", "Tumour IDH1 Wildtype" = "red3"), name = NULL) +
labs(
x = "Distance from MED1 binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
# Plot C: pooled Margin vs pooled Tumour (Mutant + WT together)
# compute mean + SD at each Distance
# show mean line + SD ribbon
tumour_long <- bind_rows(mutant_long, wt_long) %>%
mutate(Group = "Tumour")
all_long <- bind_rows(margin_long, tumour_long)
summary_df <- all_long %>%
group_by(Group, Distance) %>%
summarise(
mean = mean(Coverage, na.rm = TRUE),
sd = sd(Coverage, na.rm = TRUE),
.groups = "drop"
)
ggplot(summary_df, aes(x = Distance)) +
geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, fill = Group), alpha = 0.2, colour = NA) +
geom_line(aes(y = mean, color = Group)) +
scale_color_manual(values = c("Margin" = "black", "Tumour" = "red3"), name = NULL) +
scale_fill_manual(values = c("Margin" = "black", "Tumour" = "red3"), name = NULL) +
labs(
x = "Distance from MED1 binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
print
## function (x, ...)
## UseMethod("print")
## <bytecode: 0x558696995f58>
## <environment: namespace:base>
# File + sheet names
file <- "/home/cathyuni28/data/MYC.xlsx"
sheet_margin <- "MYC Margin"
sheet_mutant <- "MYC Mutant"
sheet_wt <- "MYC Wildtype"
# Read one sheet -> long format
# first column is Distance
# other columns ending in "Coverage" are samples
# extract GBM number from sample name
tidy_homer_sheet <- function(path, sheet, group_label) {
df <- read_excel(path, sheet = sheet)
distance_col <- names(df)[1]
df %>%
select(Distance = all_of(distance_col), matches("Coverage$")) %>%
mutate(Distance = as.numeric(Distance)) %>%
pivot_longer(cols = -Distance, names_to = "Sample", values_to = "Coverage") %>%
mutate(
Group = group_label,
GBM = str_extract(Sample, "GBM\\d+"),
GBM = as.integer(str_remove(GBM, "GBM"))
) %>%
filter(!is.na(GBM)) %>%
select(Distance, Coverage, Group, GBM)
}
# Load the three groups
margin_long <- tidy_homer_sheet(file, sheet_margin, "Margin")
mutant_long <- tidy_homer_sheet(file, sheet_mutant, "Tumour IDH1 Mutant")
wt_long <- tidy_homer_sheet(file, sheet_wt, "Tumour IDH1 Wildtype")
# Plot A: paired Margin vs IDH1-mutant tumour
# (only GBMs present in the mutant sheet are kept)
paired_mutant <- bind_rows(margin_long, mutant_long) %>%
filter(GBM %in% unique(mutant_long$GBM)) %>%
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1 Mutant")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_mutant, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(values = c("Margin" = "black", "Tumour IDH1 Mutant" = "red3"), name = NULL) +
labs(
x = "Distance from MYC binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
# Plot B: paired Margin vs IDH1-wild-type tumour
# (only GBMs present in the WT sheet are kept)
paired_wt <- bind_rows(margin_long, wt_long) %>%
filter(GBM %in% unique(wt_long$GBM)) %>%
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1 Wildtype")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_wt, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(values = c("Margin" = "black", "Tumour IDH1 Wildtype" = "red3"), name = NULL) +
labs(
x = "Distance from MYC binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
# Plot C: pooled Margin vs pooled Tumour (Mutant + WT together)
# compute mean + SD at each Distance
# show mean line + SD ribbon
tumour_long <- bind_rows(mutant_long, wt_long) %>%
mutate(Group = "Tumour")
all_long <- bind_rows(margin_long, tumour_long)
summary_df <- all_long %>%
group_by(Group, Distance) %>%
summarise(
mean = mean(Coverage, na.rm = TRUE),
sd = sd(Coverage, na.rm = TRUE),
.groups = "drop"
)
ggplot(summary_df, aes(x = Distance)) +
geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, fill = Group), alpha = 0.2, colour = NA) +
geom_line(aes(y = mean, color = Group)) +
scale_color_manual(values = c("Margin" = "black", "Tumour" = "red3"), name = NULL) +
scale_fill_manual(values = c("Margin" = "black", "Tumour" = "red3"), name = NULL) +
labs(
x = "Distance from MYC binding site (bp)",
y = "ATAC-seq signal"
) +
theme_bw() +
theme(legend.position = "top")
print
## function (x, ...)
## UseMethod("print")
## <bytecode: 0x558696995f58>
## <environment: namespace:base>
# Per-sheet density plots
# # File
file <- "/home/cathyuni28/data/120-260_frag_his.xlsx"
# X-axis window to plot
x_min <- 0
x_max <- 500
# Highlight band (e.g. nucleosome-sized fragments)
highlight_from <- 120
highlight_to <- 260
# Y-axis:
# - NULL = auto shared max across sheets (recommended)
# - number = fixed y max (e.g. 0.010)
fixed_y_max <- NULL
# Get sheets
sheets <- excel_sheets(file)
# Find a shared Y max
# We do this so every sheet uses the same y-axis scale.
y_max <- 0
if (is.null(fixed_y_max)) {
for (sheet in sheets) {
# Read sheet
df <- read_excel(file, sheet = sheet)
names(df) <- tolower(names(df))
# Pull x/y (expect columns called breaks and density)
x <- as.numeric(df$breaks)
y <- as.numeric(df$density)
# Keep only valid points in range
keep <- !is.na(x) & !is.na(y) & x >= x_min & x <= x_max
# Update max density if we have any points
if (any(keep)) y_max <- max(y_max, max(y[keep]))
}
# Small headroom so lines don’t clip
y_max <- y_max * 1.05
} else {
y_max <- fixed_y_max
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
# Plot each sheet
for (sheet in sheets) {
# Read sheet
df <- read_excel(file, sheet = sheet)
names(df) <- tolower(names(df))
# Pull x/y
x <- as.numeric(df$breaks)
y <- as.numeric(df$density)
# Keep only valid points in range
keep <- !is.na(x) & !is.na(y) & x >= x_min & x <= x_max
x <- x[keep]
y <- y[keep]
# Skip if not enough data
if (length(x) < 2) next
# Basic margins so labels fit
par(mar = c(6, 6, 3, 2))
# Make an empty plot (we add band + line after)
plot(
x, y,
type = "n",
xlim = c(x_min, x_max),
ylim = c(0, y_max),
xlab = "DNA fragment size, bp",
ylab = "Density",
main = sheet
)
# Highlight band (120–260 bp)
usr <- par("usr")
rect(highlight_from, 0, highlight_to, usr[4],
col = rgb(0.7, 0.7, 0.7, 0.20),
border = NA)
# Density curve
lines(x, y, col = "red", lwd = 2)
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
# Overlay density plot
# Input file
file <- "/home/cathyuni28/data/120-260_frag_his.xlsx"
# Settings
x_min <- 0
x_max <- 500
highlight_from <- 120
highlight_to <- 260
# Fixed y-axis for the overlay plot
fixed_y_max <- 0.008
# Get sheets
sheets <- excel_sheets(file)
# Read all sheets into lists
# Each element becomes a small data frame with breaks + density.
data_list <- lapply(sheets, function(sheet) {
df <- read_excel(file, sheet = sheet)
names(df) <- tolower(names(df))
x <- as.numeric(df$breaks)
y <- as.numeric(df$density)
keep <- !is.na(x) & !is.na(y) & x >= x_min & x <= x_max
d <- data.frame(breaks = x[keep], density = y[keep])
# Sort so lines draw smoothly
d[order(d$breaks), ]
})
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
# plot
par(mar = c(6, 6, 2, 10)) # extra right margin for legend
plot(NA,
xlim = c(x_min, x_max),
ylim = c(0, fixed_y_max),
xlab = "DNA fragment size, bp",
ylab = "Density")
# Highlight band (120–260 bp)
usr <- par("usr")
rect(highlight_from, 0, highlight_to, usr[4],
col = rgb(0.7, 0.7, 0.7, 0.25),
border = NA)
# Simple colour set (recycled if you have more than 5 sheets)
cols <- c("#4E79A7", "#59A14F", "#E15759", "#9C755F", "#B07AA1")
cols <- rep(cols, length.out = length(data_list))
# Draw each line
for (i in seq_along(data_list)) {
d <- data_list[[i]]
lines(d$breaks, d$density, lwd = 2.5, col = cols[i])
}
# Legend
legend("topright", inset = c(-0.32, 0),
legend = sheets,
col = cols,
lwd = 2.5,
bty = "n",
xpd = TRUE)
file <- "/home/cathyuni28/data/120-260 BMAL1.xlsx"
# Set these to match your workbook sheet names exactly
sheet_margin <- "Margin BMAL1"
sheet_wt <- "Wildtype BMAL1"
# read 1 sheet and convert to tidy long format
tidy_homer_sheet <- function(path, sheet, group_label) {
df <- read_excel(path, sheet = sheet)
# First column is Distance (in your file the column name is long)
distance_col <- names(df)[1]
df %>%
select(
Distance = all_of(distance_col),
matches("Coverage$")
) %>%
mutate(Distance = as.numeric(Distance)) %>% # ensure numeric x-axis
pivot_longer(
cols = -Distance,
names_to = "Sample",
values_to = "Coverage"
) %>%
mutate(
Group = group_label,
GBM = str_extract(Sample, "GBM\\d+"),
GBM = str_remove(GBM, "GBM"),
GBM = as.integer(GBM)
) %>%
filter(!is.na(GBM)) %>%
select(Distance, Coverage, Group, GBM)
}
margin_long <- tidy_homer_sheet(file, sheet_margin, "Margin")
wt_long <- tidy_homer_sheet(file, sheet_wt, "Tumour IDH1-Wildtype")
# Quick sanity check: which GBMs are present?
list(
margin_gbms = sort(unique(margin_long$GBM)),
wt_gbms = sort(unique(wt_long$GBM))
)
## $margin_gbms
## [1] 12
##
## $wt_gbms
## [1] 12
# Paired Margin vs WT tumour (print only)
paired_wt <- bind_rows(margin_long, wt_long) %>%
filter(GBM %in% unique(wt_long$GBM)) %>% # ensures pairs exist
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1-Wildtype")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_wt, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(values = c(
"Margin" = "black",
"Tumour IDH1-Wildtype" = "red3"
), name = NULL) +
labs(
,
x = "Distance from BMAL1 binding site (bp)",
y = "ATAC-seq occupancy (Coverage)"
) +
theme_bw() +
theme(legend.position = "top")
file <- "/home/cathyuni28/data/120-260 MYC.xlsx"
# Set these to match your workbook sheet names exactly
sheet_margin <- "Margin MYC"
sheet_wt <- "Wildtype MYC"
# read 1 sheet and convert to tidy long format
tidy_homer_sheet <- function(path, sheet, group_label) {
df <- read_excel(path, sheet = sheet)
# First column is Distance (in your file the column name is long)
distance_col <- names(df)[1]
df %>%
select(
Distance = all_of(distance_col),
matches("Coverage$")
) %>%
mutate(Distance = as.numeric(Distance)) %>% # ensure numeric x-axis
pivot_longer(
cols = -Distance,
names_to = "Sample",
values_to = "Coverage"
) %>%
mutate(
Group = group_label,
GBM = str_extract(Sample, "GBM\\d+"),
GBM = str_remove(GBM, "GBM"),
GBM = as.integer(GBM)
) %>%
filter(!is.na(GBM)) %>%
select(Distance, Coverage, Group, GBM)
}
margin_long <- tidy_homer_sheet(file, sheet_margin, "Margin")
wt_long <- tidy_homer_sheet(file, sheet_wt, "Tumour IDH1-Wildtype")
# Quick sanity check: which GBMs are present?
list(
margin_gbms = sort(unique(margin_long$GBM)),
wt_gbms = sort(unique(wt_long$GBM))
)
## $margin_gbms
## [1] 12
##
## $wt_gbms
## [1] 12
# Paired Margin vs WT tumour
paired_wt <- bind_rows(margin_long, wt_long) %>%
filter(GBM %in% unique(wt_long$GBM)) %>% # ensures pairs exist
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1-Wildtype")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_wt, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(
values = c("Margin" = "black", "Tumour IDH1-Wildtype" = "red3"),
name = NULL
) +
labs(
title = "Aggregate ATAC-seq profiles around TF binding sites of MYC (paired WT samples)",
x = "Distance from MYC binding site (bp)",
y = "ATAC-seq occupancy (Coverage)"
) +
theme_bw() +
theme(legend.position = "top")
file <- "/home/cathyuni28/data/120-260 MED1.xlsx"
# Set these to match your workbook sheet names exactly
sheet_margin <- "Margin MED1"
sheet_wt <- "Wildtype MED1"
# read 1 sheet and convert to tidy long format
tidy_homer_sheet <- function(path, sheet, group_label) {
df <- read_excel(path, sheet = sheet)
# First column is Distance (in your file the column name is long)
distance_col <- names(df)[1]
df %>%
select(
Distance = all_of(distance_col),
matches("Coverage$")
) %>%
mutate(Distance = as.numeric(Distance)) %>% # ensure numeric x-axis
pivot_longer(
cols = -Distance,
names_to = "Sample",
values_to = "Coverage"
) %>%
mutate(
Group = group_label,
GBM = str_extract(Sample, "GBM\\d+"),
GBM = str_remove(GBM, "GBM"),
GBM = as.integer(GBM)
) %>%
filter(!is.na(GBM)) %>%
select(Distance, Coverage, Group, GBM)
}
margin_long <- tidy_homer_sheet(file, sheet_margin, "Margin")
wt_long <- tidy_homer_sheet(file, sheet_wt, "Tumour IDH1-Wildtype")
# Quick sanity check: which GBMs are present?
list(
margin_gbms = sort(unique(margin_long$GBM)),
wt_gbms = sort(unique(wt_long$GBM))
)
## $margin_gbms
## [1] 12
##
## $wt_gbms
## [1] 12
# Paired Margin vs WT tumour
paired_wt <- bind_rows(margin_long, wt_long) %>%
filter(GBM %in% unique(wt_long$GBM)) %>% # ensures pairs exist
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1-Wildtype")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_wt, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(
values = c("Margin" = "black", "Tumour IDH1-Wildtype" = "red3"),
name = NULL
) +
labs(
title = "Aggregate ATAC-seq profiles around TF binding sites of MED1 (paired WT samples)",
x = "Distance from MED1 binding site (bp)",
y = "ATAC-seq occupancy (Coverage)"
) +
theme_bw() +
theme(legend.position = "top")
file <- "/home/cathyuni28/data/120-260 MAX.xlsx"
# Set these to match your workbook sheet names exactly
sheet_margin <- "Margin MAX"
sheet_wt <- "Wildtype MAX"
# read 1 sheet and convert to tidy long format
tidy_homer_sheet <- function(path, sheet, group_label) {
df <- read_excel(path, sheet = sheet)
# First column is Distance (in your file the column name is long)
distance_col <- names(df)[1]
df %>%
select(
Distance = all_of(distance_col),
matches("Coverage$")
) %>%
mutate(Distance = as.numeric(Distance)) %>% # ensure numeric x-axis
pivot_longer(
cols = -Distance,
names_to = "Sample",
values_to = "Coverage"
) %>%
mutate(
Group = group_label,
GBM = str_extract(Sample, "GBM\\d+"),
GBM = str_remove(GBM, "GBM"),
GBM = as.integer(GBM)
) %>%
filter(!is.na(GBM)) %>%
select(Distance, Coverage, Group, GBM)
}
margin_long <- tidy_homer_sheet(file, sheet_margin, "Margin")
wt_long <- tidy_homer_sheet(file, sheet_wt, "Tumour IDH1-Wildtype")
# Quick sanity check: which GBMs are present?
list(
margin_gbms = sort(unique(margin_long$GBM)),
wt_gbms = sort(unique(wt_long$GBM))
)
## $margin_gbms
## [1] 12
##
## $wt_gbms
## [1] 12
# Paired Margin vs WT tumour (print only)
paired_wt <- bind_rows(margin_long, wt_long) %>%
filter(GBM %in% unique(wt_long$GBM)) %>% # ensures pairs exist
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1-Wildtype")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_wt, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(
values = c("Margin" = "black", "Tumour IDH1-Wildtype" = "red3"),
name = NULL
) +
labs(
title = "Aggregate ATAC-seq profiles around TF binding sites of MAX (paired WT samples)",
x = "Distance from MAX binding site (bp)",
y = "ATAC-seq occupancy (Coverage)"
) +
theme_bw() +
theme(legend.position = "top")
file <- "/home/cathyuni28/data/120-260 ARNT2.xlsx"
# Set these to match your workbook sheet names exactly
sheet_margin <- "Margin ARNT2"
sheet_wt <- "Wildtype ARNT2"
# read 1 sheet and convert to tidy long format
tidy_homer_sheet <- function(path, sheet, group_label) {
df <- read_excel(path, sheet = sheet)
# First column is Distance (in your file the column name is long)
distance_col <- names(df)[1]
df %>%
select(
Distance = all_of(distance_col),
matches("Coverage$")
) %>%
mutate(Distance = as.numeric(Distance)) %>% # ensure numeric x-axis
pivot_longer(
cols = -Distance,
names_to = "Sample",
values_to = "Coverage"
) %>%
mutate(
Group = group_label,
GBM = str_extract(Sample, "GBM\\d+"),
GBM = str_remove(GBM, "GBM"),
GBM = as.integer(GBM)
) %>%
filter(!is.na(GBM)) %>%
select(Distance, Coverage, Group, GBM)
}
margin_long <- tidy_homer_sheet(file, sheet_margin, "Margin")
wt_long <- tidy_homer_sheet(file, sheet_wt, "Tumour IDH1-Wildtype")
# Quick sanity check: which GBMs are present?
list(
margin_gbms = sort(unique(margin_long$GBM)),
wt_gbms = sort(unique(wt_long$GBM))
)
## $margin_gbms
## [1] 12
##
## $wt_gbms
## [1] 12
# Paired Margin vs WT tumour (print only)
paired_wt <- bind_rows(margin_long, wt_long) %>%
filter(GBM %in% unique(wt_long$GBM)) %>% # ensures pairs exist
mutate(
Group = factor(Group, levels = c("Margin", "Tumour IDH1-Wildtype")),
GBM = paste0("GBM", GBM)
)
ggplot(paired_wt, aes(x = Distance, y = Coverage, color = Group)) +
geom_line() +
facet_wrap(~ GBM, scales = "free_y") +
scale_color_manual(
values = c("Margin" = "black", "Tumour IDH1-Wildtype" = "red3"),
name = NULL
) +
labs(
title = "Aggregate ATAC-seq profiles around TF binding sites of ARNT2 (paired WT samples)",
x = "Distance from ARNT2 binding site (bp)",
y = "ATAC-seq occupancy (Coverage)"
) +
theme_bw() +
theme(legend.position = "top")
# DepMap GBM Chronos: TABLE + BOXPLOT
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, isoyear, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# File paths
model_path <- "/home/cathyuni28/data/Model.csv"
gene_effect_path <- "/home/cathyuni28/data/CRISPRGeneEffect.csv"
# Genes to include
genes <- c(
"MED1","MAX","MYC","ARNT2","BMAL1",
"FOSL1","FOSL2","FOS","JUN","JUNB","ATF3",
"BACH2","NFI","SOX3","BHLHE40"
)
# Load model metadata + filter to GBM
model <- fread(model_path)
gbm_ids <- model %>%
filter(
OncotreePrimaryDisease == "Diffuse Glioma",
OncotreeSubtype == "Glioblastoma"
) %>%
pull(ModelID) %>%
unique()
if (length(gbm_ids) == 0) stop("GBM filter returned 0 models. Check Oncotree labels.")
message("GBM models found: ", length(gbm_ids))
## GBM models found: 67
# Map gene symbols -> exact CRISPRGeneEffect column names
ge_header <- names(fread(gene_effect_path, nrows = 0))
# Model ID column in this file is typically called "V1"
id_col <- "V1"
if (!(id_col %in% ge_header)) stop("Expected CRISPRGeneEffect.csv to have 'V1' as the ID column.")
# Match each gene to its column (e.g. "MYC (...)")
gene_cols <- sapply(genes, function(g) {
hit <- ge_header[grepl(paste0("^", g, " \\("), ge_header)]
if (length(hit) == 0) NA_character_ else hit[1]
})
names(gene_cols) <- genes
# Warn if anything wasn't found
missing <- names(gene_cols)[is.na(gene_cols)]
if (length(missing) > 0) {
message("WARNING: These genes were not found in CRISPRGeneEffect columns: ",
paste(missing, collapse = ", "))
}
## WARNING: These genes were not found in CRISPRGeneEffect columns: NFI
# Keep only genes that exist
found_genes <- names(gene_cols)[!is.na(gene_cols)]
found_cols <- c(id_col, unname(gene_cols[found_genes]))
message("Genes found: ", length(found_genes), " / ", length(genes))
## Genes found: 14 / 15
# Read ONLY needed columns + filter to GBM
gene_effect <- fread(gene_effect_path, select = found_cols)
setnames(gene_effect, "V1", "ModelID")
gbm_effect <- gene_effect %>%
filter(ModelID %in% gbm_ids)
message("Rows after GBM filter: ", nrow(gbm_effect))
## Rows after GBM filter: 50
# Long format for table + plot
long_df <- bind_rows(lapply(found_genes, function(g) {
data.frame(
ModelID = gbm_effect$ModelID,
Gene = g,
GeneEffect = gbm_effect[[ gene_cols[g] ]]
)
})) %>%
filter(!is.na(GeneEffect))
# Summary TABLE
summ <- long_df %>%
group_by(Gene) %>%
summarise(
N_GBM = n(),
Mean_GeneEffect = mean(GeneEffect),
Median_GeneEffect = median(GeneEffect),
IQR_GeneEffect = IQR(GeneEffect),
.groups = "drop"
) %>%
arrange(Mean_GeneEffect)
# Print the table nicely in the knitted document
summ
## # A tibble: 14 × 5
## Gene N_GBM Mean_GeneEffect Median_GeneEffect IQR_GeneEffect
## <chr> <int> <dbl> <dbl> <dbl>
## 1 MYC 50 -1.42 -1.41 0.628
## 2 JUN 50 -0.636 -0.577 0.449
## 3 MAX 50 -0.626 -0.696 0.334
## 4 FOSL1 50 -0.460 -0.438 0.483
## 5 MED1 50 -0.419 -0.317 0.426
## 6 JUNB 50 -0.201 -0.186 0.219
## 7 BMAL1 50 -0.120 -0.113 0.166
## 8 FOSL2 50 -0.104 -0.0853 0.192
## 9 FOS 50 -0.0794 -0.0853 0.117
## 10 SOX3 50 -0.0668 -0.0105 0.111
## 11 ARNT2 50 0.0119 0.0141 0.120
## 12 ATF3 50 0.0123 0.0114 0.142
## 13 BACH2 50 0.0544 0.0512 0.0888
## 14 BHLHE40 50 0.0979 0.115 0.101
# Order genes in the boxplot by the same ranking
long_df$Gene <- factor(long_df$Gene, levels = summ$Gene)
# Boxplot
p <- ggplot(long_df, aes(x = Gene, y = GeneEffect)) +
geom_boxplot(outlier.shape = 16) +
coord_flip() +
labs(
title = "DepMap CRISPR (Chronos): Gene effect distributions in GBM cell line models",
subtitle = "Filtered: OncotreePrimaryDisease = Diffuse Glioma; OncotreeSubtype = Glioblastoma",
x = "Gene",
y = "Chronos gene effect (more negative = more essential)"
) +
theme_minimal(base_size = 12)
print(p)