gsea_plot.R

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(aplot)

# Create simulated data
set.seed(123)
n_genes <- 1000
geneList <- sort(rnorm(n_genes), decreasing = TRUE)
names(geneList) <- paste0("gene", 1:n_genes)

# Return to random selection
# pathway_size <- 15
# pathway_genes <- sample(names(geneList), pathway_size)

 pathway_genes <- names(geneList)[c(1:20, 50:60, 80:90)]  # Modified to show clear pattern
 pathway_size <- length(pathway_genes)
# Calculate running score
make_gsdata <- function() {
  x <- 1:n_genes
  runningScore <- cumsum(ifelse(names(geneList) %in% pathway_genes, 
                                1/pathway_size, 
                                -1/(n_genes-pathway_size)))
  runningScore <- runningScore/max(abs(runningScore))
  
  data.frame(
    x = x,
    runningScore = runningScore,
    geneList = geneList,
    position = ifelse(names(geneList) %in% pathway_genes, 1, 0),
    id = "Pathway1"
  )
}

gsdata <- make_gsdata()
gsdata1 <- gsdata %>%
  filter(position == 1) %>%
  mutate(gene_name = names(geneList)[position == 1])

# Calculate dynamic breaks based on gene positions
tmp_positions <- gsdata$position
v <- seq(1, sum(tmp_positions), length.out = 9)
inv <- findInterval(rev(cumsum(tmp_positions)), v)
if (min(inv) == 0) inv <- inv + 1

# Create result data frame for heatmap
result_df <- data.frame(
  Interval = 1:length(unique(inv)),
  xmin = which(!duplicated(inv)),
  xmax = c(which(!duplicated(inv))[-1], length(inv)),
  ymin = 0,
  ymax = 0.3,
  Mean_LogFC = quantile(geneList, probs = seq(0, 1, length.out = length(unique(inv)))),
  id = "Pathway1"
)

# Plot parameters
curveCol <- c("#76BA99", "#EB4747")
htCol <- c("#08519C", "#A50F15")
lineSize <- 1.2
dim(gsdata); head(gsdata); summary(gsdata$geneList)

## [1] 1000    5

##       x runningScore geneList position       id
## gene1 1   0.02506541 3.241040        1 Pathway1
## gene2 2   0.05013082 2.691714        1 Pathway1
## gene3 3   0.07519623 2.684859        1 Pathway1
## gene4 4   0.10026164 2.575450        1 Pathway1
## gene5 5   0.12532705 2.571458        1 Pathway1
## gene6 6   0.15039246 2.553026        1 Pathway1

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.80978 -0.62832  0.00921  0.01613  0.66460  3.24104

unique(gsdata$id)

## [1] "Pathway1"

unique(gsdata$position)

## [1] 1 0

table(gsdata$position)

## 
##   0   1 
## 958  42

head(geneList); summary(geneList)

##    gene1    gene2    gene3    gene4    gene5    gene6 
## 3.241040 2.691714 2.684859 2.575450 2.571458 2.553026

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.80978 -0.62832  0.00921  0.01613  0.66460  3.24104

# 1. Enrichment Score Curve
pcurve <- ggplot(gsdata, aes(x = x, y = runningScore)) +
  geom_line(aes(color = runningScore), size = lineSize) +
  scale_color_gradient2(
    low = curveCol[1],
    mid = "yellow",
    high = curveCol[2],
    midpoint = mean(range(gsdata$runningScore))
  ) +
  geom_hline(yintercept = 0, size = lineSize/2, color = "black", lty = "dashed") +
  scale_x_continuous(expand = c(0, 0), limits = c(1, n_genes)) +
  theme_bw(base_size = 12) +
  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    axis.line.x = element_blank(),
    axis.title.x = element_blank(),
    plot.margin = margin(t = .2, r = .2, b = 0, l = .2, unit = "cm")
  ) +
  ylab("Running Enrichment Score")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# 2. Gene Position Ticks with Heatmap
pseg_ht <- ggplot(gsdata, aes(x = x, y = runningScore)) +
  geom_segment(
    data = gsdata1,
    aes(x = x, xend = x, y = 0, yend = 1),
    color = "black",
    size = 0.5,
    show.legend = FALSE
  ) +
  geom_rect(
    data = result_df,
    aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax, fill = Mean_LogFC),
    inherit.aes = FALSE,
    alpha = 0.8,
    show.legend = FALSE
  ) +
  scale_fill_gradient2(
    low = htCol[1],
    mid = "white",
    high = htCol[2],
    midpoint = 0
  ) +
  scale_x_continuous(expand = c(0, 0), limits = c(1, n_genes)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw(base_size = 12) +
  theme(
    axis.ticks = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    plot.margin = margin(t = -.1, r = .2, b = 0, l = .2, unit = "cm")
  )

# 3. Ranked List Metric
prank <- ggplot(data.frame(x = 1:length(geneList), y = geneList), aes(x = x, y = y)) +
  geom_col(aes(fill = y), width = 1, show.legend = FALSE) +
  scale_fill_gradient2(
    low = htCol[2],
    mid = "white",
    high = htCol[1],
    midpoint = 0
  ) +
  geom_hline(yintercept = 0, size = 0.5, color = "black", lty = "dashed") +
  scale_x_continuous(expand = c(0, 0), limits = c(1, n_genes), 
                     breaks = seq(0, n_genes, 200)) +
  theme_bw(base_size = 12) +
  theme(
    panel.grid = element_blank(),
    plot.margin = margin(t = -.1, r = .2, b = .2, l = .2, unit = "cm")
  ) +
  coord_cartesian(expand = 0) +
  ylab("Ranked List Metric") +
  xlab("Rank in Ordered Dataset")

# Combine plots
combined_plot <- aplot::plot_list(
  gglist = list(pcurve, pseg_ht, prank),
  ncol = 1,
  heights = c(0.5, 0.2, 0.3)
)

print(combined_plot)

## Warning: Removed 2 rows containing missing values (`geom_col()`).

gsea_plot.R

xut2

2025-01-17