Introduction

The purpose of this analysis is to perform GSEA and GSVA to evaluate the enrichment of differential gene expression data (DESeq2 output) for publicly available human and murine gene sets in the Molecular Signatures Database (MSigDB) and Gene Ontology terms using the package clusterProfiler.

GSEA

Data

Input: DESeq2 results in .CSV format for various comparisons of interest.

setwd("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation")
# Read in data from deseq2
df1 = read.csv("Cohort_1/Output/CD4_PTCLvsCD4_CTRL_DESeq2res.csv", header=TRUE)
df2 = read.csv("Cohort_2/Output/CD4_PTCLvsCD4_LN_CTRL_DESeq2res.csv", header=TRUE)

Rank genes

There is no consensus for the what ranking metric should be used for GSEA. Popular options from the DESeq2 output include ranking by the ‘log2FoldChange’ column or the ‘stat’ column, which is the Wald test statistic. By default, this script ranks genes by the Wald test statistic. To rank by log2 fold change instead, replace ‘stat’ with ‘log2FoldChange’.

# Rank genes by stat column of deseq2 result
deg_genes_list1 <- df1$stat # Creates a vector of the stat column from DESeq2 output
names(deg_genes_list1) <- df1$gene_name # Names vector with gene symbols
deg_genes_list1 <- na.omit(deg_genes_list1) # Omits any NA values
deg_genes_list1 = sort(deg_genes_list1, decreasing = TRUE) # Sorts in decreasing order
deg_names1 <- names(deg_genes_list1) # Makes a vector of just the ranked gene names

deg_genes_list2 <- df2$stat # Creates a vector of the stat column from DESeq2 output
names(deg_genes_list2) <- df2$gene_name # Names vector with gene symbols
deg_genes_list2 <- na.omit(deg_genes_list2) # Omits any NA values
deg_genes_list2 = sort(deg_genes_list2, decreasing = TRUE) # Sorts in decreasing order
deg_names2 <- names(deg_genes_list2) # Makes a vector of just the ranked gene names

Retrieve the collections of interest from MSigDB:

# H = hallmark gene sets, C1 = positional gene sets, C2 = curated gene sets, C3 = motif gene sets, C4 = computational gene sets, C5 = GO gene sets, C6 = oncogenic signatures, C7 = immunologic signatures.
m_t2g_C5 <- msigdbr(species = "Homo sapiens", category = "C5") %>%
  dplyr::select(gs_name, gene_symbol)

m_t2g_H <- msigdbr(species = "Homo sapiens", category = "H") %>%
  dplyr::select(gs_name, gene_symbol)

m_t2g_all <- msigdbr(species = "Homo sapiens") %>%
  dplyr::select(gs_name, gene_symbol)

GSEA using all gene sets in the C5 (gene ontology) MSigDB collection

gse1_msigdb_C5 <- GSEA(deg_genes_list1,
                  exponent = 1,
                  nPerm = 10000,
                  minGSSize = 1,
                  maxGSSize = 800,
                  pvalueCutoff = 0.05,
                  pAdjustMethod = "BH",
                  TERM2GENE = m_t2g_C5,
                  verbose = TRUE,
                  by = "fgsea")

gse2_msigdb_C5 <- GSEA(deg_genes_list2,
                  exponent = 1,
                  nPerm = 10000,
                  minGSSize = 1,
                  maxGSSize = 800,
                  pvalueCutoff = 0.05,
                  pAdjustMethod = "BH",
                  TERM2GENE = m_t2g_C5,
                  verbose = TRUE,
                  by = "fgsea")

gse1_msigdb_C5 %>%
  dotplot(showCategory = 10, x = "NES", split=".sign") +
  scale_colour_viridis_c(name = "Adjusted\nP-value",
                         option="H") +
  scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) + # replaces underscores with spaces to allow wrapping of long gene set names
  geom_vline(xintercept = 0, linetype=2) +
  labs(title="GSEA, Cohort 1, MSigDB Gene Sets\nC5 Collection (Gene Ontology)", y = "Gene Set") +
  theme(plot.title = element_text(hjust = 0.5))

gse2_msigdb_C5 %>%
  dotplot(showCategory = 10, x = "NES", split=".sign") +
  scale_colour_viridis_c(name = "Adjusted\nP-value",
                         option="H") +
  scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) + # replaces underscores with spaces to allow wrapping of long gene set names
  geom_vline(xintercept = 0, linetype=2) +
  labs(title="GSEA, Cohort 2, MSigDB Gene Sets\nC5 Collection (Gene Ontology)", y = "Gene Set") +
  theme(plot.title = element_text(hjust = 0.5))

GSEA using all gene sets in the H (hallmark) MSigDB collection

gse1_msigdb_H <- GSEA(deg_genes_list1,
                  exponent = 1,
                  nPerm = 10000,
                  minGSSize = 1,
                  maxGSSize = 800,
                  pvalueCutoff = 0.05,
                  pAdjustMethod = "BH",
                  TERM2GENE = m_t2g_H,
                  verbose = TRUE,
                  by = "fgsea")

gse2_msigdb_H <- GSEA(deg_genes_list2,
                  exponent = 1,
                  nPerm = 10000,
                  minGSSize = 1,
                  maxGSSize = 800,
                  pvalueCutoff = 0.05,
                  pAdjustMethod = "BH",
                  TERM2GENE = m_t2g_H,
                  verbose = TRUE,
                  by = "fgsea")

gse1_msigdb_H %>%
  dotplot(showCategory = 15, x = "NES", split=".sign") +
  scale_colour_viridis_c(name = "Adjusted\nP-value",
                         option="H") +
  scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) + # replaces underscores with spaces to allow wrapping of long gene set names
  geom_vline(xintercept = 0, linetype=2) +
  labs(title="GSEA, Cohort 1, MSigDB Gene Sets\n Hallmark Collection", y = "Gene Set") +
  theme(plot.title = element_text(hjust = 0.5))

gse2_msigdb_H %>%
  dotplot(showCategory = 15, x = "NES", split=".sign") +
  scale_colour_viridis_c(name = "Adjusted\nP-value",
                         option="H") +
  scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) + # replaces underscores with spaces to allow wrapping of long gene set names
  geom_vline(xintercept = 0, linetype=2) +
  labs(title="GSEA, Cohort 2, MSigDB Gene Sets\n Hallmark Collection", y = "Gene Set") +
  theme(plot.title = element_text(hjust = 0.5))

GSEA using all gene sets from all MSigDB collections

gse1_msigdb_all <- GSEA(deg_genes_list1, # Use the sorted (ranked) list of gene symbols *and* stat ranking column
                  exponent = 1,
                  nPerm = 10000,
                  minGSSize = 1,
                  maxGSSize = 800,
                  pvalueCutoff = 0.05,
                  pAdjustMethod = "BH",
                  TERM2GENE = m_t2g_all,
                  verbose = TRUE,
                  by = "fgsea")

gse2_msigdb_all <- GSEA(deg_genes_list2, # Use the sorted (ranked) list of gene symbols *and* stat ranking column
                  exponent = 1,
                  nPerm = 10000,
                  minGSSize = 1,
                  maxGSSize = 800,
                  pvalueCutoff = 0.05,
                  pAdjustMethod = "BH",
                  TERM2GENE = m_t2g_all,
                  verbose = TRUE,
                  by = "fgsea")

gse1_msigdb_all %>%
  dotplot(showCategory = 15, x = "NES", split=".sign") +
  scale_colour_viridis_c(name = "Adjusted\nP-value",
                         option="H") +
  scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) +
  geom_vline(xintercept = 0, linetype=2) +
  labs(title="GSEA, Cohort 1, MSigDB Gene Sets\n All Collections", y = "Gene Set") +
  theme(plot.title = element_text(hjust = 0.5))

gse2_msigdb_all %>%
  dotplot(showCategory = 15, x = "NES", split=".sign") +
  scale_colour_viridis_c(name = "Adjusted\nP-value",
                         option="H") +
  scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) +
  geom_vline(xintercept = 0, linetype=2) +
  labs(title="GSEA, Cohort 2, MSigDB Gene Sets\n All Collections", y = "Gene Set") +
  theme(plot.title = element_text(hjust = 0.5))

Export GSEA results for all significant results in all MSigDB categories as CSV files:

gse1_msigdb_df <- as.data.frame(gse1_msigdb_all)
write.csv(gse1_msigdb_df, file="Cohort_1/Output/Cohort1_gsea_MSigDBGeneSets.csv")
gse2_msigdb_df <- as.data.frame(gse2_msigdb_all)
write.csv(gse2_msigdb_df, file="Cohort_2/Output/Cohort2_gsea_MSigDBGeneSets.csv")

GSEA for specific gene sets of interest

Cell signaling pathways enriched in human PTCL

# import gmt
signaling_gmt <- read.gmt("CombinedSignalingPathways.gmt")

## Perform GSEA with order ranked gene list and TERM2GENE = the merged data frames of our annotated gene sets of interest.
### Cohort 1
gse_signaling1 <- GSEA(deg_genes_list1, # Use the sorted list of gene symbols *and* stat ranking column
                   exponent = 1,
                   pvalueCutoff = 0.05,
                   pAdjustMethod = "BH",
                   TERM2GENE = signaling_gmt,
                   verbose = TRUE,
                   by = "fgsea")

### Cohort 2
gse_signaling2 <- GSEA(deg_genes_list2, # Use the sorted list of gene symbols *and* stat ranking column
                   exponent = 1,
                   pvalueCutoff = 0.05,
                   pAdjustMethod = "BH",
                   TERM2GENE = signaling_gmt,
                   verbose = TRUE,
                   by = "fgsea")

gse_signaling1 %>%
  dotplot(showCategory = 30, x = "NES") +
  scale_colour_viridis_c(name = "Adjusted\nP-value",
                         option="H") +
  scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) + # replaces underscores with spaces to allow wrapping of long gene set names
  geom_vline(xintercept = 0, linetype=2) +
  labs(title="GSEA, Cohort 1, Common PTCL Signaling Pathways", y = "Gene Set") +
  theme(plot.title = element_text(hjust = 0.5))

gseaplot2(gse_signaling1, 
          geneSetID = c("MTOR_UP.V1_UP",
                        "PTEN_DN.V1_UP",
                        "AKT_UP.V1_UP",
                        "HALLMARK_MTORC1_SIGNALING",
                        "WP_PI3KAKT_SIGNALING_PATHWAY"
                        ),
          title = "Effector Sets - Cohort 1",
          color = c("#1F77B4", "#D62728"))

gse_signaling2 %>%
  dotplot(showCategory = 30, x = "NES") +
  scale_colour_viridis_c(name = "Adjusted\nP-value",
                         option="H") +
  scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) + # replaces underscores with spaces to allow wrapping of long gene set names
  geom_vline(xintercept = 0, linetype=2) +
  labs(title="GSEA, Cohort 2, Common PTCL Signaling Pathways", y = "Gene Set") +
  theme(plot.title = element_text(hjust = 0.5))

gseaplot2(gse_signaling2, 
          geneSetID = c("MTOR_UP.V1_UP",
                        "PTEN_DN.V1_UP",
                        "AKT_UP.V1_UP",
                        "HALLMARK_MTORC1_SIGNALING",
                        "WP_PI3KAKT_SIGNALING_PATHWAY"
                        ),
          title = "Effector Sets - Cohort 2",
          color = c("#1F77B4", "#D62728")
)

T-cell development and differentiation gene signatures

# import gmt
cellorigin_gmt <- read.gmt("thymusTCellAndTCRGeneSets.gmt")

### Cohort 1
gse_origin1 <- GSEA(deg_genes_list1, # Use the sorted list of gene symbols *and* stat ranking column
                   exponent = 1,
                   pvalueCutoff = 0.05,
                   pAdjustMethod = "BH",
                   TERM2GENE = cellorigin_gmt,
                   verbose = TRUE,
                   by = "fgsea")
write.csv(gse_origin1, file = "cellOrigin_gsea_cohort1.csv")

### Cohort 2
gse_origin2 <- GSEA(deg_genes_list2, # Use the sorted list of gene symbols *and* stat ranking column
                   exponent = 1,
                   pvalueCutoff = 0.05,
                   pAdjustMethod = "BH",
                   TERM2GENE = cellorigin_gmt,
                   verbose = TRUE,
                   by = "fgsea")
write.csv(gse_origin2, file = "cellOrigin_gsea_cohort2.csv")

gse_origin1 %>%
  dotplot(showCategory = 15, x = "NES") +
  scale_colour_viridis_c(name = "Adjusted\nP-value",
                         option="H") +
  scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) + # replaces underscores with spaces to allow wrapping of long gene set names
  geom_vline(xintercept = 0, linetype=2) +
  labs(title="GSEA, Cohort 1, Signatures of T-cell Development and Differentiation", y = "Gene Set") +
  theme(plot.title = element_text(hjust = 0.5))

gseaplot2(gse_origin1, 
          geneSetID = c("LEE_EARLY_T_LYMPHOCYTE_UP",
                        "GSE139242_CD4_THYMOCYTES_VS_ADULT_BLOOD_TOP250UP",
                        "GSE139242_CD4_THYMOCYTES_VS_INFANT_BLOOD_TOP250UP",
                        "GSE1460_INTRATHYMIC_T_PROGENITOR_VS_NAIVE_CD4_TCELL_ADULT_BLOOD_UP",
                        "GSE1460_CD4_THYMOCYTE_VS_NAIVE_CD4_TCELL_ADULT_BLOOD_UP",
                        "BIOCARTA_TCR_PATHWAY",
                        "WP_T_CELL_RECEPTOR_AND_COSTIMULATORY_SIGNALING"),
          title = "Effector Sets - Cohort 1",
          color = c("red", "purple", "darkgreen", "orange2", "brown", "hotpink", "skyblue1")
)

gse_origin2 %>%
  dotplot(showCategory = 30, x = "NES") +
  scale_colour_viridis_c(name = "Adjusted\nP-value",
                         option="H") +
  scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) + # replaces underscores with spaces to allow wrapping of long gene set names
  geom_vline(xintercept = 0, linetype=2) +
  labs(title="GSEA, Cohort 2, Signatures of T-cell Development and Differentiation", y = "Gene Set") +
  theme(plot.title = element_text(hjust = 0.5))

gseaplot2(gse_origin2, 
          geneSetID = c("LEE_EARLY_T_LYMPHOCYTE_UP",
                        "GSE139242_CD4_THYMOCYTES_VS_ADULT_BLOOD_TOP250UP",
                        "GSE139242_CD4_THYMOCYTES_VS_INFANT_BLOOD_TOP250UP",
                        "GSE1460_INTRATHYMIC_T_PROGENITOR_VS_NAIVE_CD4_TCELL_ADULT_BLOOD_UP",
                        "GSE1460_CD4_THYMOCYTE_VS_NAIVE_CD4_TCELL_ADULT_BLOOD_UP",
                        "BIOCARTA_TCR_PATHWAY",
                        "WP_T_CELL_RECEPTOR_AND_COSTIMULATORY_SIGNALING"),
          title = "Effector Sets - Cohort 2",
          color = c("red", "purple", "darkgreen", "orange2", "brown", "hotpink", "skyblue1")
)

GSVA

Data

# variance stabilized transformed count matrices
vsd_cohort1 <- read.csv("Cohort_1/Output/VST_NormalizedCounts.csv")
vsd_cohort2 <- read.csv("Cohort_2/Output/VST_NormalizedCounts.csv")

# metadata
metadata1 <- read.table(file = "Cohort_1/Input/metadata.txt", header = FALSE)
colnames(metadata1) <- c("avery_num", "sample_name", "phenotype")
rownames(metadata1) <- metadata1$sample_name
metadata1 <- dplyr::select(metadata1, c("phenotype"))
metadata1 <- metadata1[row.names(metadata1) != "CF21", , drop = FALSE]

metadata2 <- read.table(file = "Cohort_2/Input/metadata.txt", header = FALSE)
colnames(metadata2) <- c("avery_num", "sample_name", "phenotype")
rownames(metadata2) <- metadata2$sample_name
metadata2 <- dplyr::select(metadata2, c("phenotype"))

# gmt file
gmt <- getGmt("thymusTCellAndTCRGeneSets.gmt", geneIdType = SymbolIdentifier())

# prepare vst data for GSVA

vsd_cohort1 <- vsd_cohort1[!duplicated(vsd_cohort1$gene_name), ]
rownames(vsd_cohort1) <- vsd_cohort1$gene_name
vsd_cohort1 <- dplyr::select(vsd_cohort1, -c("X", "probe_id", "gene_name", "description"))
vsd_cohort1_matrix <- data.matrix(vsd_cohort1) # Convert to a matrix prior to analysis.

## keep only CD4 PTCL and CD4 CTRL samples from Cohort 1
keepGroups <- c("CD4_PTCL", "CD4_CTRL")
metadata1 <- metadata1 %>%
  filter(phenotype %in% keepGroups)
keepList <- rownames(metadata1)
vsd_cohort1_matrix <- vsd_cohort1_matrix[, colnames(vsd_cohort1_matrix) %in% keepList]

vsd_cohort2 <- vsd_cohort2[!duplicated(vsd_cohort2$gene_name), ]
rownames(vsd_cohort2) <- vsd_cohort2$gene_name
vsd_cohort2 <- dplyr::select(vsd_cohort2, -c("X", "probe_id", "gene_name", "description"))
vsd_cohort2_matrix <- data.matrix(vsd_cohort2) # Convert to a matrix prior to analysis.

Analysis

##### Cohort 1 #####
# set parameters for GSVA
gsva_param1 <- gsvaParam(
  vsd_cohort1_matrix,
  gmt,
  kcdf = "Gaussian", # Compute Gaussian-distributed scores
  minSize = 10, # Minimum gene set size
  maxSize = 500, # Maximum gene set size
  maxDiff = TRUE,
  absRanking = FALSE
)

# perform GSVA
gsva_results1 <- gsva(
  gsva_param1,
  verbose=FALSE,   # Don't print out the progress bar
)

## No annotation package name available in the input data object.
## Attempting to directly match identifiers in data to gene sets.

gsva_heatmap1 <- pheatmap::pheatmap(gsva_results1,
                                      annotation_col = metadata1,
                                      show_colnames = TRUE,
                                      show_rownames = TRUE,
                                      fontsize_row = 10,
                                      cluster_cols = TRUE, 
                                      cluster_rows = TRUE,
                                      cutree_rows = 3,
                                      cutree_cols = 3,
                                      main = "GSVA. Input: Vst transformed normalized DESeq2 counts, Cohort 1\nClustering: Ward, Distance: Euclidean",
                                      clustering_distance_rows = "euclidean",
                                      clustering_distance_cols = "euclidean",
                                      clustering_method = "ward.D2") + 
 scale_fill_gradient(c('dodgerblue', 'black', "yellow"))# Shrink the pathway labels a tad

# set parameters for GSVA
gsva_param2 <- gsvaParam(
  vsd_cohort2_matrix,
  gmt,
  kcdf = "Gaussian", # Compute Gaussian-distributed scores
  minSize = 10, # Minimum gene set size
  maxSize = 500, # Maximum gene set size
  maxDiff = TRUE,
  absRanking = FALSE
)

# perform GSVA
gsva_results2 <- gsva(
  gsva_param2,
  verbose=FALSE,   # Don't print out the progress bar
)

## No annotation package name available in the input data object.
## Attempting to directly match identifiers in data to gene sets.

gsva_heatmap2 <- pheatmap::pheatmap(gsva_results2,
                                      annotation_col = metadata2,
                                      show_colnames = TRUE,
                                      show_rownames = TRUE,
                                      fontsize_row = 10,
                                      cluster_cols = TRUE, 
                                      cluster_rows = TRUE,
                                      cutree_rows = 3,
                                      cutree_cols = 4,
                                      main = "GSVA. Input: Vst transformed normalized DESeq2 counts, Cohort 2\nClustering: Ward, Distance: Euclidean",
                                      clustering_distance_rows = "euclidean",
                                      clustering_distance_cols = "euclidean",
                                      clustering_method = "ward.D2") + 
 scale_fill_gradient(c('dodgerblue', 'black', "yellow"))# Shrink the pathway labels a tad

Statistical analysis of GSVA scores

# Convert metadata file to annotated data frame
annot_metadata1 <- AnnotatedDataFrame(data=metadata1) 
annot_metadata2 <- AnnotatedDataFrame(data=metadata2) 

# Create ExpressionSet object from gsva results matrix and the new metadata. Colnames of gsva_results must match rownames of annot_metadata
gsva_exp1 <- ExpressionSet(assayData = gsva_results1, phenoData = annot_metadata1)
gsva_exp2 <- ExpressionSet(assayData = gsva_results2, phenoData = annot_metadata2)

# Perform differential expression anlaysis using limma package
mod1 <- model.matrix(~ factor(gsva_exp1$phenotype))
fit1 <- lmFit(gsva_results1, mod1)
fit1 <- eBayes(fit1)
res1 <- decideTests(fit1, p.value=0.05)
summary(res1)

##        (Intercept) factor(gsva_exp1$phenotype)CD4_PTCL
## Down            19                                  15
## NotSig          10                                  10
## Up              13                                  17

# set factor levels for Cohort 2 since there are more than 2 groups
cohort2_groups <- factor(gsva_exp2$phenotype, levels = c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL"))
mod2 <- model.matrix(~ cohort2_groups)
fit2 <- lmFit(gsva_results2, mod2)
fit2 <- eBayes(fit2)
res2 <- decideTests(fit2, p.value=0.05)
summary(res2)

##        (Intercept) cohort2_groupsCD4_LN_CTRL cohort2_groupsCD4THYM_CTRL
## Down            26                        14                          7
## NotSig          16                        12                         22
## Up               0                        16                         13

# Export
fit1_df <- as.data.frame(fit1)
rownames(fit1_df) <- rownames(fit1)
write.csv(fit1_df, file="Cohort_1/Output/GSVAlimma_CD4PTCLvsCD4CTRL.csv")

fit2_df <- as.data.frame(fit2)
rownames(fit2_df) <- rownames(fit2)
write.csv(fit2_df, file="Cohort_2/Output/GSVAlimma_CD4PTCLvsCD4CTRL.csv")

Heatmap of only differentially expressed pathways in CD4 PTCL vs CD4 CTRL

# Cohort 1
tt1 <- topTable(fit1, coef=2, n=Inf) # Coefficient is the column of the comparison of interest.
DEpwys1 <- rownames(tt1)[tt1$adj.P.Val <= 0.05] # Extract only those gene sets differentially expressed with p > 0.05
DEpwys_es1 <- exprs(gsva_exp1[DEpwys1, ]) # Index the Expression Set for only those significantly differentially expressed gene sets.

# Cohort 2
## CD4_PTCL vs CD4_LN_CTRL
tt2 <- topTable(fit2, coef="cohort2_groupsCD4_LN_CTRL", n=Inf) 
DEpwys2 <- rownames(tt2)[tt2$adj.P.Val <= 0.05]
DEpwys_es2 <- exprs(gsva_exp2[DEpwys2, ])

## CD4_PTCL vs CD4_THYM_CTRL
tt3 <- topTable(fit2, coef="cohort2_groupsCD4THYM_CTRL", n=Inf) 
DEpwys3 <- rownames(tt3)[tt3$adj.P.Val <= 0.05]
DEpwys_es3 <- exprs(gsva_exp2[DEpwys3, ])

# Export
write.csv(tt1, file="Cohort_1/Output/GSVAlimma_CD4PTCLvCD4CTRL_toptable.csv")
write.csv(tt2, file="Cohort_2/Output/GSVAlimma_CD4PTCLvCD4_LN_CTRL_toptable.csv")
write.csv(tt3, file="Cohort_2/Output/GSVAlimma_CD4PTCLvCD4_THYM_CTRL_toptable.csv")

# Draw  heatmaps
## Cohort 1
### Set phenotype labels for display on heatmap
colorLegend <- c("mediumaquamarine", "orange1") 
names(colorLegend) <- c("CD4_PTCL", "CD4_CTRL")
sample.color.map <- colorLegend[pData(gsva_exp1)[, "phenotype"]] 
names(sample.color.map) <- colnames(DEpwys_es1)

### Reset graphics settings (this heatmap is large and could throw margin size errors)
#graphics.off()

### Perform hierarchical clustering of GSVA scores
sampleClustering1 <- hclust(as.dist(1-cor(DEpwys_es1, method="spearman")),
                           method="ward.D2")
geneSetClustering1 <- hclust(as.dist(1-cor(t(DEpwys_es1), method="pearson")),
                            method="ward.D2")

### Plot
colors <- colorRampPalette(c("dodgerblue4", "aliceblue", "firebrick1"))(256)
par(mar=c(1,10,1,10))

heatmap.2(DEpwys_es1,
          scale="row",
          col=colors,
          ColSideColors=sample.color.map,
          xlab="",
          ylab="",
          main="Gene Set Variation Analysis, Cohort 1",
          trace="none",
          keysize=1,
          density.info="none",
          margins=c(8,35),
          labRow=rownames(DEpwys_es1),
          labCol=sampleClustering1$labels,
          Colv=as.dendrogram(sampleClustering1),
          Rowv=as.dendrogram(geneSetClustering1))
legend("topright", names(colorLegend), fill=colorLegend, inset=0.1, bty="n")

## Cohort 2
### Set phenotype labels for display on heatmap
colorLegend <- c("mediumaquamarine", "orange1", "steelblue") 
names(colorLegend) <- c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")
sample.color.map <- colorLegend[pData(gsva_exp2)[, "phenotype"]] 
names(sample.color.map) <- colnames(DEpwys_es2)

### Reset graphics settings (this heatmap is large and could throw margin size errors)
#graphics.off()

### Perform hierarchical clustering of GSVA scores
sampleClustering2 <- hclust(as.dist(1-cor(DEpwys_es2, method="spearman")),
                           method="ward.D2")
geneSetClustering2 <- hclust(as.dist(1-cor(t(DEpwys_es2), method="pearson")),
                            method="ward.D2")

### Plot
colors <- colorRampPalette(c("dodgerblue4", "aliceblue", "firebrick1"))(256)
par(oma = c(0, 0, 0, 0))  # Reduce outer margins
par(mar = c(4, 4, 2, 1))  # Fine-tune inner margins

heatmap.2(DEpwys_es2,
          scale="row",
          col=colors,
          ColSideColors=sample.color.map,
          xlab="",
          ylab="",
          main="Gene Set Variation Analysis, Cohort 2",
          trace="none",
          keysize=0.6,
          density.info="none",
          margins=c(3,40),
          labRow=rownames(DEpwys_es2),
          labCol=sampleClustering2$labels,
          Colv=as.dendrogram(sampleClustering2),
          Rowv=as.dendrogram(geneSetClustering2))
legend("topright", names(colorLegend), fill=colorLegend, inset=0.05, bty="n")

Citations

sessionInfo()

## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] gplots_3.2.0                limma_3.60.6               
##  [3] GSVA_1.52.3                 knitr_1.49                 
##  [5] RColorBrewer_1.1-3          GSEABase_1.66.0            
##  [7] graph_1.82.0                annotate_1.82.0            
##  [9] XML_3.99-0.18               AnnotationDbi_1.66.0       
## [11] lubridate_1.9.4             forcats_1.0.0              
## [13] stringr_1.5.1               purrr_1.0.2                
## [15] readr_2.1.5                 tidyr_1.3.1                
## [17] tibble_3.2.1                tidyverse_2.0.0            
## [19] msigdbr_7.5.1               ggplot2_3.5.1              
## [21] enrichplot_1.24.4           pathview_1.44.0            
## [23] dplyr_1.1.4                 clusterProfiler_4.12.6     
## [25] DESeq2_1.44.0               SummarizedExperiment_1.34.0
## [27] Biobase_2.64.0              MatrixGenerics_1.16.0      
## [29] matrixStats_1.5.0           GenomicRanges_1.56.2       
## [31] GenomeInfoDb_1.40.1         IRanges_2.38.1             
## [33] S4Vectors_0.42.1            BiocGenerics_0.50.0        
## [35] BiocManager_1.30.25        
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.4.0               bitops_1.0-9               
##   [3] ggplotify_0.1.2             R.oo_1.27.0                
##   [5] polyclip_1.10-7             lifecycle_1.0.4            
##   [7] httr2_1.1.0                 lattice_0.22-6             
##   [9] MASS_7.3-60.2               magrittr_2.0.3             
##  [11] sass_0.4.9                  rmarkdown_2.29             
##  [13] jquerylib_0.1.4             yaml_2.3.10                
##  [15] cowplot_1.1.3               DBI_1.2.3                  
##  [17] abind_1.4-8                 zlibbioc_1.50.0            
##  [19] R.utils_2.12.3              ggraph_2.2.1               
##  [21] RCurl_1.98-1.16             yulab.utils_0.2.0          
##  [23] tweenr_2.0.3                rappdirs_0.3.3             
##  [25] GenomeInfoDbData_1.2.12     ggrepel_0.9.6              
##  [27] irlba_2.3.5.1               tidytree_0.4.6             
##  [29] pheatmap_1.0.12             codetools_0.2-20           
##  [31] DelayedArray_0.30.1         DOSE_3.30.5                
##  [33] ggforce_0.4.2               tidyselect_1.2.1           
##  [35] aplot_0.2.4                 UCSC.utils_1.0.0           
##  [37] farver_2.1.2                ScaledMatrix_1.12.0        
##  [39] viridis_0.6.5               jsonlite_1.8.9             
##  [41] tidygraph_1.3.1             tools_4.4.0                
##  [43] treeio_1.28.0               snow_0.4-4                 
##  [45] Rcpp_1.0.14                 glue_1.8.0                 
##  [47] gridExtra_2.3               SparseArray_1.4.8          
##  [49] xfun_0.49                   qvalue_2.36.0              
##  [51] HDF5Array_1.32.1            withr_3.0.2                
##  [53] fastmap_1.2.0               rhdf5filters_1.16.0        
##  [55] caTools_1.18.3              digest_0.6.35              
##  [57] rsvd_1.0.5                  timechange_0.3.0           
##  [59] R6_2.5.1                    gridGraphics_0.5-1         
##  [61] colorspace_2.1-1            GO.db_3.19.1               
##  [63] gtools_3.9.5                RSQLite_2.3.9              
##  [65] R.methodsS3_1.8.2           generics_0.1.3             
##  [67] data.table_1.16.4           graphlayouts_1.2.2         
##  [69] httr_1.4.7                  S4Arrays_1.4.1             
##  [71] scatterpie_0.2.4            pkgconfig_2.0.3            
##  [73] gtable_0.3.6                blob_1.2.4                 
##  [75] SingleCellExperiment_1.26.0 XVector_0.44.0             
##  [77] shadowtext_0.1.4            htmltools_0.5.8.1          
##  [79] fgsea_1.30.0                scales_1.3.0               
##  [81] png_0.1-8                   SpatialExperiment_1.14.0   
##  [83] ggfun_0.1.8                 rstudioapi_0.17.1          
##  [85] tzdb_0.4.0                  reshape2_1.4.4             
##  [87] rjson_0.2.23                nlme_3.1-164               
##  [89] org.Hs.eg.db_3.19.1         cachem_1.1.0               
##  [91] rhdf5_2.48.0                KernSmooth_2.23-22         
##  [93] parallel_4.4.0              pillar_1.10.1              
##  [95] grid_4.4.0                  vctrs_0.6.5                
##  [97] BiocSingular_1.20.0         beachmat_2.20.0            
##  [99] xtable_1.8-4                Rgraphviz_2.48.0           
## [101] evaluate_1.0.3              KEGGgraph_1.64.0           
## [103] magick_2.8.5                cli_3.6.2                  
## [105] locfit_1.5-9.10             compiler_4.4.0             
## [107] rlang_1.1.3                 crayon_1.5.3               
## [109] labeling_0.4.3              plyr_1.8.9                 
## [111] fs_1.6.5                    stringi_1.8.4              
## [113] viridisLite_0.4.2           BiocParallel_1.38.0        
## [115] babelgene_22.9              munsell_0.5.1              
## [117] Biostrings_2.72.1           lazyeval_0.2.2             
## [119] GOSemSim_2.30.2             Matrix_1.7-0               
## [121] hms_1.1.3                   patchwork_1.3.0            
## [123] sparseMatrixStats_1.16.0    bit64_4.6.0-1              
## [125] Rhdf5lib_1.26.0             KEGGREST_1.44.1            
## [127] statmod_1.5.0               igraph_2.1.4               
## [129] memoise_2.0.1               bslib_0.8.0                
## [131] ggtree_3.12.0               fastmatch_1.1-6            
## [133] bit_4.5.0.1                 ape_5.8-1                  
## [135] gson_0.1.0

citation()

## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.

Gene Set Enrichment and Variation Analyses of Canine CD4+ PTCL

Eileen Owens

2025-02-04

Introduction

GSEA

Data

Rank genes

GSEA using all gene sets in the C5 (gene ontology) MSigDB collection

GSEA using all gene sets in the H (hallmark) MSigDB collection

GSEA using all gene sets from all MSigDB collections

GSEA for specific gene sets of interest

Cell signaling pathways enriched in human PTCL

T-cell development and differentiation gene signatures

GSVA

Data

Analysis

Statistical analysis of GSVA scores

Heatmap of only differentially expressed pathways in CD4 PTCL vs CD4 CTRL

Citations