library(Seurat)

## Loading required package: SeuratObject

## Loading required package: sp

## 
## Attaching package: 'SeuratObject'

## The following objects are masked from 'package:base':
## 
##     intersect, t

library(pheatmap)
library(tibble)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(purrr)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(gtable)
library(grid)
library(viridis)

## Loading required package: viridisLite

library(RColorBrewer)
library(fgsea)
library(reshape2)
library(ggplot2)
library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:reshape2':
## 
##     smiths

library(grid)
library(this.path)

setwd(this.path::here())

GSEA of exhausted vs. resident memory CD8+ T cells in LCMV and Pan Cancer

Calculate the gene set enrichment of exhausted T cells vs. TRM in LCMV bulk RNA-seq and Pan Cancer scRNA-seq datasets. This analysis focuses on gene sets corresponding to biologically relevant GO terms like protein stability and leukocyte cell-cell adhesion. Subsets of these gene sets are taken by intersecting them with the TF communities from Taiji.

inputs: custom_gene_sets from SubGenesets_TRM_TEX_TFcommunity_20240419.csv, Seurat object from Pancancer_CD8_simple_7cluster_220518.rds for the Pan Cancer analysis, and bulk RNA-seq counts from Taiji_counts_matrix.tsv for the LCMV analysis.

outputs: dot plot comparing GSEA results of Pan Cancer vs. LCMV for the queried GO subsets

First, we will initialize the gene sets to be queried:

GO_term_to_description = list(GO_term = c("GO:0031331", "GO:0031647",   "GO:1903052",   "GO:0043161",   "GO:0061919",   "GO:0097193",   "GO:0032623",   "GO:0032743",   "GO:0032663",   "GO:0071559",   "GO:0007015",   "GO:2000112",   "GO:0043087",   "GO:0007159"),
                              Description = c('Catabolism', 'Protein stability', 'Proteolysis', 'UPS', 'Autophagy', 'Intrinsic apoptosis', 'IL-2 production', 'Pos. reg. of IL-2 production', 'Reg. of IL-2 production', 'TGF-Beta', 'Actin', 'Reg. of cellular macromolecule biosynthetic process', 'Reg. of GTPase activity', 'Leukocyte cell-cell adhesion'))

signatures = read.table('SubGenesets_TRM_TEX_TFcommunity_20240419.csv', header=FALSE, sep=',')

rownames(signatures) = signatures$V1
signatures = signatures[-1]
custom_gene_sets = list()

for (r in rownames(signatures)) {
  tmp = as.character(signatures[r,])
  tmp = tmp[tmp != ""]
  custom_gene_sets[[r]] = toupper(tmp)
}

custom_gene_sets = lapply(custom_gene_sets, function (x) x[x != "NA"])
gene_set_order = names(custom_gene_sets)

Perform GSEA for exhausted (Tex) CD8+ T cells vs. resident memory (TRM)

sobj = readRDS('Pancancer_CD8_simple_7cluster_220518.rds')

de_results = list()
background_cells = WhichCells(sobj, idents = 'TRM')
query_cells = WhichCells(sobj, idents = 'Tex')

de_res = FindMarkers(sobj, ident.1 = query_cells, ident.2 = background_cells, min.pct = 0.1, assay = 'RNA', slot = 'data')

## For a (much!) faster implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the presto package
## --------------------------------------------
## install.packages('devtools')
## devtools::install_github('immunogenomics/presto')
## --------------------------------------------
## After installation of presto, Seurat will automatically use the more 
## efficient implementation (no further action necessary).
## This message will be shown once per session

de_res = de_res[order(de_res$avg_log2FC, decreasing = TRUE), ]

gene_ranks = setNames(de_res$avg_log2FC, rownames(de_res))
gsea_res = fgsea(pathways=custom_gene_sets, stats=gene_ranks, minSize=0, maxSize=5000)
# note: GSEA typically does not benefit from filtering

nes_vec = setNames(gsea_res$NES, gsea_res$pathway)
padj_vec = setNames(gsea_res$padj, gsea_res$pathway)
tmp_df = as.data.frame(cbind(padj_vec, nes_vec))
colnames(tmp_df) = c('padj', 'NES')
tmp_df$Condition = "PanCancer"

tmp_df$padj_size = (-log10(tmp_df$padj))^0.5
tmp_df = tmp_df[gene_set_order, ]
tmp_df$GeneSet = rownames(tmp_df)
tmp_df$GeneSet = sapply(tmp_df$GeneSet, function(x) paste0(x, ' (', GO_term_to_description$Description[GO_term_to_description$GO_term == x], ')'))
tmp_df$GeneSet = factor(tmp_df$GeneSet, levels = unique(tmp_df$GeneSet))
rownames(tmp_df) = NULL

tmp_df1 = tmp_df

Next, perform GSEA for the LCMV bulk RNA-seq datasets used in the Taiji analysis.

library(edgeR)

## Warning: package 'edgeR' was built under R version 4.4.1

## Loading required package: limma

## Warning: package 'limma' was built under R version 4.4.1

cts_mtx = read.csv('Taiji_counts_matrix.tsv', sep='\t', row.names=1)
exclude_idx = grep('*lung*', colnames(cts_mtx))
cts_mtx = cts_mtx[, -exclude_idx]

cell_type = factor(unname(sapply(colnames(cts_mtx), function(x) strsplit(x, '_')[[1]][[1]])))
experiment = factor(unname(sapply(colnames(cts_mtx), function(x) strsplit(x, '_')[[1]][[2]])))

dge = DGEList(counts = cts_mtx)
dge$samples$group = relevel(cell_type, 'TRM')
dge$samples$experiment = experiment

des = model.matrix(~1 + group, data=dge$samples)
# des = model.matrix(~0 + group, data=dge$samples)
# contrast = makeContrasts(groupTexTerm - groupTRM, levels = des)

dge = calcNormFactors(dge)
print(dim(dge))

## [1] 46403    12

keep = filterByExpr(dge, des)
dge = dge[keep, , keep.lib.sizes = FALSE]
print(dim(dge))

## [1] 10145    12

dge = estimateDisp(dge, des)

fit = glmQLFit(dge, des)
glm_res = glmQLFTest(fit)
# glm_res = glmQLFTest(fit, contrast=contrast)

adj_pvals = p.adjust(glm_res$table$PValue, method='BY')

res_table = as.data.frame(topTags(glm_res, n=Inf)) # n=Inf to include all genes
res_table = res_table[order(res_table$logFC, decreasing=TRUE), ]

gene_ranks = setNames(res_table$logFC, toupper(rownames(res_table)))

gsea_res = fgsea(pathways=custom_gene_sets, stats=gene_ranks, minSize=0, maxSize=5000)

## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.

nes_vec = setNames(gsea_res$NES, gsea_res$pathway)
padj_vec = setNames(gsea_res$padj, gsea_res$pathway)
tmp_df = as.data.frame(cbind(padj_vec, nes_vec))
colnames(tmp_df) = c('padj', 'NES')
tmp_df$Condition = "LCMV"

tmp_df$padj_size = (-log10(tmp_df$padj))^0.5
tmp_df = tmp_df[gene_set_order, ]
tmp_df$GeneSet = rownames(tmp_df)
tmp_df$GeneSet = sapply(tmp_df$GeneSet, function(x) paste0(x, ' (', GO_term_to_description$Description[GO_term_to_description$GO_term == x], ')'))
tmp_df$GeneSet = factor(tmp_df$GeneSet, levels = unique(tmp_df$GeneSet))
rownames(tmp_df) = NULL

tmp_df2 = tmp_df

Now combine the results from both GSEAs into a single composite figure:

Figure2h

2024-09-19

GSEA of exhausted vs. resident memory CD8+ T cells in LCMV and Pan Cancer