Single Sample Geneset Enrichment Analysis The ssGSEA method is an extension of the GSEA method, working at the level of a single sample rather than a sample population as in the original GSEA application. The score derived from ssGSEA reflects the degree to which the input gene signature is coordinately up- or downregulated within a sample.
Load the packages to read, average and plot the data
library(matrixStats)
library(circlize)
## ========================================
## circlize version 0.4.13
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.8.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(data.table)
SSGEA function. Copy this function as it is without making any changes to it.
ssgsea = function(X, gene_sets, alpha = 0.25, scale = T, norm = F, single = T) {
row_names = rownames(X)
num_genes = nrow(X)
gene_sets = lapply(gene_sets, function(genes) {which(row_names %in% genes)})
# Ranks for genes
R = matrixStats::colRanks(X, preserveShape = T, ties.method = 'average')
# Calculate enrichment score (es) for each sample (column)
es = apply(R, 2, function(R_col) {
gene_ranks = order(R_col, decreasing = TRUE)
# Calc es for each gene set
es_sample = sapply(gene_sets, function(gene_set_idx) {
# pos: match (within the gene set)
# neg: non-match (outside the gene set)
indicator_pos = gene_ranks %in% gene_set_idx
indicator_neg = !indicator_pos
rank_alpha = (R_col[gene_ranks] * indicator_pos) ^ alpha
step_cdf_pos = cumsum(rank_alpha) / sum(rank_alpha)
step_cdf_neg = cumsum(indicator_neg) / sum(indicator_neg)
step_cdf_diff = step_cdf_pos - step_cdf_neg
# Normalize by gene number
if (scale) step_cdf_diff = step_cdf_diff / num_genes
# Use ssGSEA or not
if (single) {
sum(step_cdf_diff)
} else {
step_cdf_diff[which.max(abs(step_cdf_diff))]
}
})
unlist(es_sample)
})
if (length(gene_sets) == 1) es = matrix(es, nrow = 1)
# Normalize by absolute diff between max and min
if (norm) es = es / diff(range(es))
# Prepare output
rownames(es) = names(gene_sets)
colnames(es) = colnames(X)
return(es)
}
Read the log2 normalized data into an object for further analysis. Also please do check if your gene symbols are in proper format as to not get any errors during analysis.
data = readRDS("input1.rds")
data[1:5,1:5]
## CGGA_1001 CGGA_1006 CGGA_1007 CGGA_1011 CGGA_1015
## A1BG 12.64 7.03 30.09 6.64 1.83
## A1BG-AS1 2.12 1.13 6.64 4.32 1.39
## A2M 452.92 106.54 206.70 707.17 824.32
## A2M-AS1 3.30 0.13 0.63 1.61 1.34
## A2ML1 0.04 0.33 4.96 1.59 0.00
data = as.matrix(data)
Read the gene set matrix into an object. A gene set matrix should always have each gene set into an individual column.
gene_set = read.csv("pancreasGeneset.csv")
head(gene_set)
## HOXB6_TARGET_GENES KEGG_PANCREATIC_CANCER GOBP_PANCREAS_DEVELOPMENT
## 1 AANAT AKT1 AKT1
## 2 AATF AKT2 ALDH1A2
## 3 ABCA7 AKT3 ANXA1
## 4 ABCF2 ARAF ARNTL
## 5 ABHD2 ARHGEF6 BAD
## 6 ABITRAM BAD BAK1
## HALLMARK_PANCREAS_BETA_CELLS
## 1 ABCC8
## 2 AKT3
## 3 CHGA
## 4 DCX
## 5 DPP4
## 6 ELP4
gene_sets = as.list(as.data.frame(gene_set))
print("genes set ready")
## [1] "genes set ready"
system.time(assign('res', ssgsea(data, gene_sets, scale = TRUE, norm = FALSE)))
## user system elapsed
## 6.916 0.116 7.039
#transpose the res object for viewing purposes
res1 = t(res)
head(res1)
## HOXB6_TARGET_GENES KEGG_PANCREATIC_CANCER GOBP_PANCREAS_DEVELOPMENT
## CGGA_1001 0.2160751 0.2592720 0.02114531
## CGGA_1006 0.2401519 0.2610575 0.07829509
## CGGA_1007 0.2317348 0.2673150 0.04259967
## CGGA_1011 0.2249323 0.2562896 0.04252924
## CGGA_1015 0.2298090 0.2598015 0.04840821
## CGGA_1019 0.2244457 0.2137503 0.06291690
## HALLMARK_PANCREAS_BETA_CELLS
## CGGA_1001 0.05509396
## CGGA_1006 0.09985475
## CGGA_1007 0.07050209
## CGGA_1011 0.06281068
## CGGA_1015 0.02278613
## CGGA_1019 0.08758888
#zscore the ssgsea output for comparative analysis
mat = (res - rowMeans(res))/(rowSds(as.matrix(res)))[row(res)]
#read the info file for Heatmap annotations
info = read.csv("sampleInfoCGGA.csv")
rownames(info) = info[,1]
info = info[,-1]
#order both the objects for sample alignment. If the number of samples vary in your data, please subset the data frames and then order them.
mat = mat[, order(colnames(mat))]
info = info[order(rownames(info)), ]
#move on for further analysis, only if the statement is TRUE
identical(rownames(info), colnames(mat))
## [1] TRUE
#subset the classifier from the info matrix for heatmap annotation. For this dataset, we have used the "Grade" classifier, please chose one based on your data.
df = as.data.frame(info$Grade)
colnames(df) = "Grade"
ha = HeatmapAnnotation(df = df)
Heatmap(mat, top_annotation = ha, col = colorRamp2(c(-2,0,2), c("orangered", "white", "purple")))
#In case of high number of samples, printing the sample IDs does not look visually appeasing. Hence we can remove the sample IDs using the following command:
Heatmap(mat, top_annotation = ha, col = colorRamp2(c(-2,0,2), c("orangered", "white", "purple")), column_names_gp = gpar(fontsize = 0))