SSGSEA

Single Sample Geneset Enrichment Analysis The ssGSEA method is an extension of the GSEA method, working at the level of a single sample rather than a sample population as in the original GSEA application. The score derived from ssGSEA reflects the degree to which the input gene signature is coordinately up- or downregulated within a sample.

Load the packages to read, average and plot the data

library(matrixStats)
library(circlize)

## ========================================
## circlize version 0.4.13
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================

library(ComplexHeatmap)

## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.8.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================

library(data.table)

SSGEA function. Copy this function as it is without making any changes to it.

ssgsea = function(X, gene_sets, alpha = 0.25, scale = T, norm = F, single = T) {
    row_names = rownames(X)
    num_genes = nrow(X)
    gene_sets = lapply(gene_sets, function(genes) {which(row_names %in% genes)})

    # Ranks for genes
    R = matrixStats::colRanks(X, preserveShape = T, ties.method = 'average')

    # Calculate enrichment score (es) for each sample (column)
    es = apply(R, 2, function(R_col) {
        gene_ranks = order(R_col, decreasing = TRUE)

        # Calc es for each gene set
        es_sample = sapply(gene_sets, function(gene_set_idx) {
            # pos: match (within the gene set)
            # neg: non-match (outside the gene set)
            indicator_pos = gene_ranks %in% gene_set_idx
            indicator_neg = !indicator_pos

            rank_alpha  = (R_col[gene_ranks] * indicator_pos) ^ alpha

            step_cdf_pos = cumsum(rank_alpha)    / sum(rank_alpha)
            step_cdf_neg = cumsum(indicator_neg) / sum(indicator_neg)

            step_cdf_diff = step_cdf_pos - step_cdf_neg

            # Normalize by gene number
            if (scale) step_cdf_diff = step_cdf_diff / num_genes

            # Use ssGSEA or not
            if (single) {
                sum(step_cdf_diff)
            } else {
                step_cdf_diff[which.max(abs(step_cdf_diff))]
            }
        })
        unlist(es_sample)
    })

    if (length(gene_sets) == 1) es = matrix(es, nrow = 1)

    # Normalize by absolute diff between max and min
    if (norm) es = es / diff(range(es))

    # Prepare output
    rownames(es) = names(gene_sets)
    colnames(es) = colnames(X)
    return(es)
}

Read the log2 normalized data into an object for further analysis. Also please do check if your gene symbols are in proper format as to not get any errors during analysis.

data = readRDS("input1.rds")
data[1:5,1:5]

##          CGGA_1001 CGGA_1006 CGGA_1007 CGGA_1011 CGGA_1015
## A1BG         12.64      7.03     30.09      6.64      1.83
## A1BG-AS1      2.12      1.13      6.64      4.32      1.39
## A2M         452.92    106.54    206.70    707.17    824.32
## A2M-AS1       3.30      0.13      0.63      1.61      1.34
## A2ML1         0.04      0.33      4.96      1.59      0.00

data = as.matrix(data)

Read the gene set matrix into an object. A gene set matrix should always have each gene set into an individual column.

gene_set = read.csv("pancreasGeneset.csv")
head(gene_set)

##   HOXB6_TARGET_GENES KEGG_PANCREATIC_CANCER GOBP_PANCREAS_DEVELOPMENT
## 1              AANAT                   AKT1                      AKT1
## 2               AATF                   AKT2                   ALDH1A2
## 3              ABCA7                   AKT3                     ANXA1
## 4              ABCF2                   ARAF                     ARNTL
## 5              ABHD2                ARHGEF6                       BAD
## 6            ABITRAM                    BAD                      BAK1
##   HALLMARK_PANCREAS_BETA_CELLS
## 1                        ABCC8
## 2                         AKT3
## 3                         CHGA
## 4                          DCX
## 5                         DPP4
## 6                         ELP4

gene_sets = as.list(as.data.frame(gene_set))
print("genes set ready")

## [1] "genes set ready"

system.time(assign('res', ssgsea(data, gene_sets, scale = TRUE, norm = FALSE)))

##    user  system elapsed 
##   6.916   0.116   7.039

#transpose the res object for viewing purposes
res1 = t(res)
head(res1)

##           HOXB6_TARGET_GENES KEGG_PANCREATIC_CANCER GOBP_PANCREAS_DEVELOPMENT
## CGGA_1001          0.2160751              0.2592720                0.02114531
## CGGA_1006          0.2401519              0.2610575                0.07829509
## CGGA_1007          0.2317348              0.2673150                0.04259967
## CGGA_1011          0.2249323              0.2562896                0.04252924
## CGGA_1015          0.2298090              0.2598015                0.04840821
## CGGA_1019          0.2244457              0.2137503                0.06291690
##           HALLMARK_PANCREAS_BETA_CELLS
## CGGA_1001                   0.05509396
## CGGA_1006                   0.09985475
## CGGA_1007                   0.07050209
## CGGA_1011                   0.06281068
## CGGA_1015                   0.02278613
## CGGA_1019                   0.08758888

#zscore the ssgsea output for comparative analysis
mat = (res - rowMeans(res))/(rowSds(as.matrix(res)))[row(res)]

#read the info file for Heatmap annotations
info = read.csv("sampleInfoCGGA.csv")
rownames(info) = info[,1]
info = info[,-1]

#order both the objects for sample alignment. If the number of samples vary in your data, please subset the data frames and then order them.

mat = mat[, order(colnames(mat))]
info = info[order(rownames(info)), ]

#move on for further analysis, only if the statement is TRUE
identical(rownames(info), colnames(mat))

## [1] TRUE

#subset the classifier from the info matrix for heatmap annotation. For this dataset, we have used the "Grade" classifier, please chose one based on your data.

df = as.data.frame(info$Grade)
colnames(df) = "Grade"
ha = HeatmapAnnotation(df = df)

Heatmap(mat, top_annotation = ha, col = colorRamp2(c(-2,0,2), c("orangered", "white", "purple")))

#In case of high number of samples, printing the sample IDs does not look visually appeasing. Hence we can remove the sample IDs using the following command:

Heatmap(mat, top_annotation = ha, col = colorRamp2(c(-2,0,2), c("orangered", "white", "purple")),  column_names_gp = gpar(fontsize = 0))

SSGSEA

Pranali

24/07/2021