# Load required libraries
library(DOSE)
## Warning: package 'DOSE' was built under R version 4.4.1
##
## DOSE v4.0.0 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an
## R/Bioconductor package for Disease Ontology Semantic and Enrichment
## analysis. Bioinformatics. 2015, 31(4):608-609
library(enrichplot)
## Warning: package 'enrichplot' was built under R version 4.4.2
## enrichplot v1.26.6 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang,
## W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
## multiomics data. Nature Protocols. 2024, 19(11):3292-3320
library(ggplot2)
library(aplot)
## Warning: package 'aplot' was built under R version 4.4.2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
Get GSEA information for plotting @param object GSEA result object @param geneSetID Gene set ID @return A data frame with running score information
gsInfo <- function(object, geneSetID) {
# Get gene set
geneSet <- object@geneSets[[geneSetID]]
# Get ranked gene list
geneList <- object@geneList
# Get ranks
rnk <- rank(-geneList)
ord <- order(rnk)
statsAdj <- geneList[ord]
statsAdj <- sign(statsAdj) * (abs(statsAdj))
# Get gene positions in ordered list
geneRanks <- match(geneSet, names(statsAdj))
geneRanks <- geneRanks[!is.na(geneRanks)]
# Calculate running sum
xs <- 1:length(statsAdj)
nGenes <- length(statsAdj)
nGeneSet <- length(geneSet)
# Calculate enrichment score
p <- 1
NR <- sum(abs(statsAdj[geneRanks])^p)
rSum <- 0
runningScore <- rep(0, length(xs))
for(i in xs) {
if (i %in% geneRanks) {
rSum <- rSum + (abs(statsAdj[i])^p/NR)
} else {
rSum <- rSum - (1/(nGenes - nGeneSet))
}
runningScore[i] <- rSum
}
# Return result
data.frame(
x = xs,
runningScore = runningScore
)
}
Create a complete GSEA visualization @param gsea_result GSEA result object @param pathway_id Pathway ID to visualize @return A list containing three plots
createCompleteGsea <- function(gsea_result, pathway_id) {
# Common parameters
lineSize <- 1.2
base_size <- 12
curveCol <- c("red", "green3") # Red to green gradient for running score
htCol <- c("#08519C", "#F0F0F0", "#A50F15") # Blue-White-Red for heatmap
htHeight <- 0.15
# Get required data
gsdata <- gsInfo(gsea_result, geneSetID = pathway_id)
gsdata$id <- pathway_id
# Get gene list and data
geneList <- gsea_result@geneList
data_ga <- as.data.frame(gsea_result) %>%
dplyr::filter(ID == pathway_id)
# Process title
tit <- unlist(strsplit(pathway_id, split = "_"))
niceTit <- if(length(tit) == 1) {
paste(str_to_title(tit), collapse = " ")
} else {
paste(str_to_title(tit[2:length(tit)]), collapse = " ")
}
# Create p1 (Running Score Plot)
p1 <- ggplot(gsdata, aes(x = x, y = runningScore)) +
geom_line(aes(color = runningScore), size = lineSize) +
scale_color_gradient(
low = curveCol[1], # Red
high = curveCol[2], # Green
guide = "none"
) +
geom_hline(yintercept = 0, size = lineSize * 0.5,
color = "black", linetype = "dashed") +
theme_bw(base_size = base_size) +
scale_x_continuous(expand = c(0, 0)) +
theme(
legend.position = "none",
plot.title = element_text(hjust = 0.5),
panel.grid = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.line.x = element_blank(),
axis.title.x = element_blank(),
plot.margin = margin(t = .2, r = .2, b = 0, l = .2, unit = "cm")
) +
ylab("Running Enrichment Score") +
ggtitle(niceTit)
# Add statistics
pLabel <- paste0(
"NES: ", round(data_ga$NES, digits = 2), "\n",
"Pvalue: ", ifelse(data_ga$pvalue < 0.001, "< 0.001",
round(data_ga$pvalue, digits = 2)), "\n",
"Adjusted Pvalue: ", ifelse(data_ga$p.adjust < 0.001, "< 0.001",
round(data_ga$p.adjust, digits = 2))
)
px <- 0.9 * nrow(gsdata)
py <- 0.9 * sum(abs(range(gsdata$runningScore))) + min(gsdata$runningScore)
p1 <- p1 +
annotate(geom = "text",
x = px,
y = py,
label = pLabel,
size = 4,
color = "grey0",
fontface = "italic",
hjust = 1)
# Create p2 (Heatmap)
# Get gene list and set
geneSet <- gsea_result@geneSets[[pathway_id]]
stats <- geneList[names(geneList) %in% names(geneList)]
stats <- sort(stats, decreasing = TRUE)
# Calculate gene positions
positions <- match(geneSet, names(stats))
positions <- positions[!is.na(positions)]
positions <- sort(positions)
# Create position data for black lines
position_df <- data.frame(
x = positions,
y = rep(0, length(positions)),
yend = rep(1, length(positions))
)
# Process ranks for interval segmentation
ranks <- stats
n_breaks <- 8 # Number of intervals
# Calculate quantiles for breaks
breaks <- quantile(ranks, probs = seq(0, 1, length.out = n_breaks + 1))
# Create interval data
interval_data <- data.frame()
for(i in 1:n_breaks) {
if(i == 1) {
subset <- ranks >= breaks[i] & ranks <= breaks[i + 1]
} else {
subset <- ranks > breaks[i] & ranks <= breaks[i + 1]
}
if(any(subset)) {
interval_data <- rbind(interval_data, data.frame(
xmin = which(subset)[1],
xmax = tail(which(subset), 1),
interval = i,
rank_mean = mean(ranks[subset])
))
}
}
# Create p2
p2 <- ggplot() +
# Add intervals
geom_rect(data = interval_data,
aes(xmin = xmin, xmax = xmax,
ymin = 0, ymax = htHeight,
fill = rank_mean),
color = NA) +
# Add gene position lines
geom_segment(data = position_df,
aes(x = x, xend = x, y = y, yend = yend),
color = "black",
size = 0.5) +
scale_fill_gradient2(
low = htCol[1], # Blue
mid = htCol[2], # White
high = htCol[3], # Red
midpoint = 0,
guide = "none"
) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
theme_bw(base_size = base_size) +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
panel.grid = element_blank(),
panel.border = element_rect(colour = "black", fill = NA),
plot.margin = margin(t = 0, r = 0.2, b = 0.2, l = 0.2, unit = "cm"),
legend.position = "none"
)
# Create p3 (Rank Plot)
df <- data.frame(
pos = seq_along(stats),
fc = stats
)
p3 <- ggplot(df) +
geom_col(aes(x = pos, y = fc, fill = fc),
width = 1) +
scale_fill_gradient2(
low = htCol[1], # Blue
mid = htCol[2], # White
high = htCol[3], # Red
midpoint = 0,
guide = "none"
) +
geom_hline(yintercept = 0,
size = 0.5,
color = "black",
linetype = "dashed") +
scale_x_continuous(breaks = seq(0, nrow(gsdata), 5000),
expand = c(0, 0)) +
scale_y_continuous(expand = c(0.1, 0.1)) +
theme_bw(base_size = base_size) +
theme(
panel.grid = element_blank(),
plot.margin = margin(t = -.1,
r = .2,
b = .2,
l = .2,
unit = "cm")
) +
labs(y = "Ranked List",
x = "Rank in Ordered Dataset")
# Return all plots
return(list(
running_score = p1,
heatmap = p2,
rank_plot = p3
))
}
# Example usage:
gsea_result <- readRDS("human_gsea_data.rds")
pathway_id <- gsea_result@result$ID[1]
plots <- createCompleteGsea(gsea_result, pathway_id)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Combine plots using aplot
final_plot <- plots$running_score %>%
insert_bottom(plots$heatmap, height = 0.15) %>%
insert_bottom(plots$rank_plot, height = 0.3)
print(final_plot)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_col()`).