library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(aplot)
# Create simulated data
set.seed(123)
n_genes <- 1000
geneList <- sort(rnorm(n_genes), decreasing = TRUE)
names(geneList) <- paste0("gene", 1:n_genes)
# Return to random selection
# pathway_size <- 15
# pathway_genes <- sample(names(geneList), pathway_size)
pathway_genes <- names(geneList)[c(1:20, 50:60, 80:90)] # Modified to show clear pattern
pathway_size <- length(pathway_genes)
# Calculate running score
make_gsdata <- function() {
x <- 1:n_genes
runningScore <- cumsum(ifelse(names(geneList) %in% pathway_genes,
1/pathway_size,
-1/(n_genes-pathway_size)))
runningScore <- runningScore/max(abs(runningScore))
data.frame(
x = x,
runningScore = runningScore,
geneList = geneList,
position = ifelse(names(geneList) %in% pathway_genes, 1, 0),
id = "Pathway1"
)
}
gsdata <- make_gsdata()
gsdata1 <- gsdata %>%
filter(position == 1) %>%
mutate(gene_name = names(geneList)[position == 1])
# Calculate dynamic breaks based on gene positions
tmp_positions <- gsdata$position
v <- seq(1, sum(tmp_positions), length.out = 9)
inv <- findInterval(rev(cumsum(tmp_positions)), v)
if (min(inv) == 0) inv <- inv + 1
# Create result data frame for heatmap
result_df <- data.frame(
Interval = 1:length(unique(inv)),
xmin = which(!duplicated(inv)),
xmax = c(which(!duplicated(inv))[-1], length(inv)),
ymin = 0,
ymax = 0.3,
Mean_LogFC = quantile(geneList, probs = seq(0, 1, length.out = length(unique(inv)))),
id = "Pathway1"
)
# Plot parameters
curveCol <- c("#76BA99", "#EB4747")
htCol <- c("#08519C", "#A50F15")
lineSize <- 1.2
dim(gsdata); head(gsdata); summary(gsdata$geneList)
## [1] 1000 5
## x runningScore geneList position id
## gene1 1 0.02506541 3.241040 1 Pathway1
## gene2 2 0.05013082 2.691714 1 Pathway1
## gene3 3 0.07519623 2.684859 1 Pathway1
## gene4 4 0.10026164 2.575450 1 Pathway1
## gene5 5 0.12532705 2.571458 1 Pathway1
## gene6 6 0.15039246 2.553026 1 Pathway1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.80978 -0.62832 0.00921 0.01613 0.66460 3.24104
unique(gsdata$id)
## [1] "Pathway1"
unique(gsdata$position)
## [1] 1 0
table(gsdata$position)
##
## 0 1
## 958 42
head(geneList); summary(geneList)
## gene1 gene2 gene3 gene4 gene5 gene6
## 3.241040 2.691714 2.684859 2.575450 2.571458 2.553026
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.80978 -0.62832 0.00921 0.01613 0.66460 3.24104
# 1. Enrichment Score Curve
pcurve <- ggplot(gsdata, aes(x = x, y = runningScore)) +
geom_line(aes(color = runningScore), size = lineSize) +
scale_color_gradient2(
low = curveCol[1],
mid = "yellow",
high = curveCol[2],
midpoint = mean(range(gsdata$runningScore))
) +
geom_hline(yintercept = 0, size = lineSize/2, color = "black", lty = "dashed") +
scale_x_continuous(expand = c(0, 0), limits = c(1, n_genes)) +
theme_bw(base_size = 12) +
theme(
legend.position = "none",
panel.grid = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.line.x = element_blank(),
axis.title.x = element_blank(),
plot.margin = margin(t = .2, r = .2, b = 0, l = .2, unit = "cm")
) +
ylab("Running Enrichment Score")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# 2. Gene Position Ticks with Heatmap
pseg_ht <- ggplot(gsdata, aes(x = x, y = runningScore)) +
geom_segment(
data = gsdata1,
aes(x = x, xend = x, y = 0, yend = 1),
color = "black",
size = 0.5,
show.legend = FALSE
) +
geom_rect(
data = result_df,
aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = Mean_LogFC),
inherit.aes = FALSE,
alpha = 0.8,
show.legend = FALSE
) +
scale_fill_gradient2(
low = htCol[1],
mid = "white",
high = htCol[2],
midpoint = 0
) +
scale_x_continuous(expand = c(0, 0), limits = c(1, n_genes)) +
scale_y_continuous(expand = c(0, 0)) +
theme_bw(base_size = 12) +
theme(
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_blank(),
panel.grid = element_blank(),
plot.margin = margin(t = -.1, r = .2, b = 0, l = .2, unit = "cm")
)
# 3. Ranked List Metric
prank <- ggplot(data.frame(x = 1:length(geneList), y = geneList), aes(x = x, y = y)) +
geom_col(aes(fill = y), width = 1, show.legend = FALSE) +
scale_fill_gradient2(
low = htCol[2],
mid = "white",
high = htCol[1],
midpoint = 0
) +
geom_hline(yintercept = 0, size = 0.5, color = "black", lty = "dashed") +
scale_x_continuous(expand = c(0, 0), limits = c(1, n_genes),
breaks = seq(0, n_genes, 200)) +
theme_bw(base_size = 12) +
theme(
panel.grid = element_blank(),
plot.margin = margin(t = -.1, r = .2, b = .2, l = .2, unit = "cm")
) +
coord_cartesian(expand = 0) +
ylab("Ranked List Metric") +
xlab("Rank in Ordered Dataset")
# Combine plots
combined_plot <- aplot::plot_list(
gglist = list(pcurve, pseg_ht, prank),
ncol = 1,
heights = c(0.5, 0.2, 0.3)
)
print(combined_plot)
## Warning: Removed 2 rows containing missing values (`geom_col()`).
