library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
##
## Attaching package: 'SeuratObject'
## The following objects are masked from 'package:base':
##
## intersect, t
library(pheatmap)
library(tibble)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(gtable)
library(grid)
library(viridis)
## Loading required package: viridisLite
library(RColorBrewer)
library(fgsea)
library(reshape2)
library(ggplot2)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(grid)
library(this.path)
setwd(this.path::here())
Calculate the gene set enrichment of exhausted T cells vs. TRM in LCMV bulk RNA-seq and Pan Cancer scRNA-seq datasets. This analysis focuses on gene sets corresponding to biologically relevant GO terms like protein stability and leukocyte cell-cell adhesion. Subsets of these gene sets are taken by intersecting them with the TF communities from Taiji.
inputs: custom_gene_sets from SubGenesets_TRM_TEX_TFcommunity_20240419.csv, Seurat object from Pancancer_CD8_simple_7cluster_220518.rds for the Pan Cancer analysis, and bulk RNA-seq counts from Taiji_counts_matrix.tsv for the LCMV analysis.
outputs: dot plot comparing GSEA results of Pan Cancer vs. LCMV for the queried GO subsets
First, we will initialize the gene sets to be queried:
GO_term_to_description = list(GO_term = c("GO:0031331", "GO:0031647", "GO:1903052", "GO:0043161", "GO:0061919", "GO:0097193", "GO:0032623", "GO:0032743", "GO:0032663", "GO:0071559", "GO:0007015", "GO:2000112", "GO:0043087", "GO:0007159"),
Description = c('Catabolism', 'Protein stability', 'Proteolysis', 'UPS', 'Autophagy', 'Intrinsic apoptosis', 'IL-2 production', 'Pos. reg. of IL-2 production', 'Reg. of IL-2 production', 'TGF-Beta', 'Actin', 'Reg. of cellular macromolecule biosynthetic process', 'Reg. of GTPase activity', 'Leukocyte cell-cell adhesion'))
signatures = read.table('SubGenesets_TRM_TEX_TFcommunity_20240419.csv', header=FALSE, sep=',')
rownames(signatures) = signatures$V1
signatures = signatures[-1]
custom_gene_sets = list()
for (r in rownames(signatures)) {
tmp = as.character(signatures[r,])
tmp = tmp[tmp != ""]
custom_gene_sets[[r]] = toupper(tmp)
}
custom_gene_sets = lapply(custom_gene_sets, function (x) x[x != "NA"])
gene_set_order = names(custom_gene_sets)
Perform GSEA for exhausted (Tex) CD8+ T cells vs. resident memory (TRM)
sobj = readRDS('Pancancer_CD8_simple_7cluster_220518.rds')
de_results = list()
background_cells = WhichCells(sobj, idents = 'TRM')
query_cells = WhichCells(sobj, idents = 'Tex')
de_res = FindMarkers(sobj, ident.1 = query_cells, ident.2 = background_cells, min.pct = 0.1, assay = 'RNA', slot = 'data')
## For a (much!) faster implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the presto package
## --------------------------------------------
## install.packages('devtools')
## devtools::install_github('immunogenomics/presto')
## --------------------------------------------
## After installation of presto, Seurat will automatically use the more
## efficient implementation (no further action necessary).
## This message will be shown once per session
de_res = de_res[order(de_res$avg_log2FC, decreasing = TRUE), ]
gene_ranks = setNames(de_res$avg_log2FC, rownames(de_res))
gsea_res = fgsea(pathways=custom_gene_sets, stats=gene_ranks, minSize=0, maxSize=5000)
# note: GSEA typically does not benefit from filtering
nes_vec = setNames(gsea_res$NES, gsea_res$pathway)
padj_vec = setNames(gsea_res$padj, gsea_res$pathway)
tmp_df = as.data.frame(cbind(padj_vec, nes_vec))
colnames(tmp_df) = c('padj', 'NES')
tmp_df$Condition = "PanCancer"
tmp_df$padj_size = (-log10(tmp_df$padj))^0.5
tmp_df = tmp_df[gene_set_order, ]
tmp_df$GeneSet = rownames(tmp_df)
tmp_df$GeneSet = sapply(tmp_df$GeneSet, function(x) paste0(x, ' (', GO_term_to_description$Description[GO_term_to_description$GO_term == x], ')'))
tmp_df$GeneSet = factor(tmp_df$GeneSet, levels = unique(tmp_df$GeneSet))
rownames(tmp_df) = NULL
tmp_df1 = tmp_df
Next, perform GSEA for the LCMV bulk RNA-seq datasets used in the Taiji analysis.
library(edgeR)
## Warning: package 'edgeR' was built under R version 4.4.1
## Loading required package: limma
## Warning: package 'limma' was built under R version 4.4.1
cts_mtx = read.csv('Taiji_counts_matrix.tsv', sep='\t', row.names=1)
exclude_idx = grep('*lung*', colnames(cts_mtx))
cts_mtx = cts_mtx[, -exclude_idx]
cell_type = factor(unname(sapply(colnames(cts_mtx), function(x) strsplit(x, '_')[[1]][[1]])))
experiment = factor(unname(sapply(colnames(cts_mtx), function(x) strsplit(x, '_')[[1]][[2]])))
dge = DGEList(counts = cts_mtx)
dge$samples$group = relevel(cell_type, 'TRM')
dge$samples$experiment = experiment
des = model.matrix(~1 + group, data=dge$samples)
# des = model.matrix(~0 + group, data=dge$samples)
# contrast = makeContrasts(groupTexTerm - groupTRM, levels = des)
dge = calcNormFactors(dge)
print(dim(dge))
## [1] 46403 12
keep = filterByExpr(dge, des)
dge = dge[keep, , keep.lib.sizes = FALSE]
print(dim(dge))
## [1] 10145 12
dge = estimateDisp(dge, des)
fit = glmQLFit(dge, des)
glm_res = glmQLFTest(fit)
# glm_res = glmQLFTest(fit, contrast=contrast)
adj_pvals = p.adjust(glm_res$table$PValue, method='BY')
res_table = as.data.frame(topTags(glm_res, n=Inf)) # n=Inf to include all genes
res_table = res_table[order(res_table$logFC, decreasing=TRUE), ]
gene_ranks = setNames(res_table$logFC, toupper(rownames(res_table)))
gsea_res = fgsea(pathways=custom_gene_sets, stats=gene_ranks, minSize=0, maxSize=5000)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.02% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
nes_vec = setNames(gsea_res$NES, gsea_res$pathway)
padj_vec = setNames(gsea_res$padj, gsea_res$pathway)
tmp_df = as.data.frame(cbind(padj_vec, nes_vec))
colnames(tmp_df) = c('padj', 'NES')
tmp_df$Condition = "LCMV"
tmp_df$padj_size = (-log10(tmp_df$padj))^0.5
tmp_df = tmp_df[gene_set_order, ]
tmp_df$GeneSet = rownames(tmp_df)
tmp_df$GeneSet = sapply(tmp_df$GeneSet, function(x) paste0(x, ' (', GO_term_to_description$Description[GO_term_to_description$GO_term == x], ')'))
tmp_df$GeneSet = factor(tmp_df$GeneSet, levels = unique(tmp_df$GeneSet))
rownames(tmp_df) = NULL
tmp_df2 = tmp_df
Now combine the results from both GSEAs into a single composite figure: