Introduction

The purpose of this script it to utilize gene set variation analysis (GSVA) and module scoring to compare the gene expression profile of canine CD4+ PTCL to the gene signatures of normal canine thymocytes and nodal T cells at various stages of development derived from a single-cell transcriptomic atlas.

Software

Gene Set Enrichment and Variation Analysis of Monocle 3 Gene Modules

Monocle 3 finds sets of genes that vary across clusters and groups them into modules. To do this, Monocle 3 essentially runs UMAP on the genes (as opposed to the cells) and groups them into modules using Louvain community analysis.

The analyses below will evaluate the enrichment of PTCL for gene modules identified across a UMAP of thymocytes and T cells from normal canine thymus and lymph node samples.

Data

Input: Bulk RNA-seq counts normalized and variance stabilized transformed by DESeq2, DESeq2 differential expression analysis results, and a gmt file of gene modules enriched at various stages of development (based on monocle3::find_gene_modules()).

setwd("C:/Users/edlarsen/Documents/240828_scRNAseq/PTCL_GSVA_GSEA")

# GMT file
gsva_mods_gmt <- getGmt("Gene_Module_Signatures.gmt", geneIdType = SymbolIdentifier())
gsea_mods_gmt <- read.gmt("Gene_Module_Signatures.gmt")

# VST counts
## Cohort 1
vsd1 <- read.csv("VST_NormalizedCounts_CD4sOnly_Cohort1.csv")
vsd1 <- vsd1[!duplicated(vsd1$gene_name), ] # Remove rows with duplicate gene symbols.
rownames(vsd1) <- vsd1$gene_name # make gene name rownames
vsd1 <- dplyr::select(vsd1, -c("X", "probe_id", "gene_name", "description")) # Remove all columns that don't contain counts
vsd1_matrix <- data.matrix(vsd1) # Convert to a matrix prior to analysis.

## Cohort 2
vsd2 <- read.csv("VST_NormalizedCounts_Cohort2.csv")
vsd2 <- vsd2[!duplicated(vsd2$gene_name), ] # Remove rows with duplicate gene symbols.
rownames(vsd2) <- vsd2$gene_name # make gene name rownames
vsd2 <- dplyr::select(vsd2, -c("X", "probe_id", "gene_name", "description")) # Remove all columns that don't contain counts
vsd2_matrix <- data.matrix(vsd2) # Convert to a matrix prior to analysis.

# metadata
## Cohort 1
metadata1 <- read.table("metadata_Cohort1.txt")
colnames(metadata1) <- c("avery_number", "sample_name", "phenotype")
rownames(metadata1) <- metadata1$sample_name # make sample names the rownames
metadata1 <- dplyr::select(metadata1, c("phenotype")) # Select columns in metadata1 to be annotated on the GSVA heatmap
metadata1 <- metadata1[row.names(metadata1) != "CF21", , drop = FALSE]

## keep only CD4 PTCL and CD4 CTRL samples from Cohort 1
keepGroups <- c("CD4_PTCL", "CD4_CTRL")
metadata1 <- metadata1 %>%
  filter(phenotype %in% keepGroups)
keepList <- rownames(metadata1)
vsd1_matrix <- vsd1_matrix[, colnames(vsd1_matrix) %in% keepList]

## Cohort 2
metadata2 <- read.table("metadata_Cohort2.txt")
colnames(metadata2) <- c("avery_number", "sample_name", "phenotype")
rownames(metadata2) <- metadata2$sample_name # make sample names the rownames
metadata2 <- dplyr::select(metadata2, c("phenotype")) # Select columns in metadata2 to be annotated on the GSVA heatmap

# DESeq2 results
df1 = read.csv("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation/Cohort_1/Output/CD4_PTCLvsCD4_CTRL_DESeq2res.csv", header=TRUE)
df2 = read.csv("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation/Cohort_2/Output/CD4_PTCLvsCD4_LN_CTRL_DESeq2res.csv", header=TRUE)

## Rank genes by stat column of deseq2 result
deg_genes_list1 <- df1$stat # Creates a vector of the stat column from DESeq2 output
names(deg_genes_list1) <- df1$gene_name # Names vector with gene symbols
deg_genes_list1 <- na.omit(deg_genes_list1) # Omits any NA values
deg_genes_list1 = sort(deg_genes_list1, decreasing = TRUE) # Sorts in decreasing order
deg_names1 <- names(deg_genes_list1) # Makes a vector of just the ranked gene names

deg_genes_list2 <- df2$stat # Creates a vector of the stat column from DESeq2 output
names(deg_genes_list2) <- df2$gene_name # Names vector with gene symbols
deg_genes_list2 <- na.omit(deg_genes_list2) # Omits any NA values
deg_genes_list2 = sort(deg_genes_list2, decreasing = TRUE) # Sorts in decreasing order
deg_names2 <- names(deg_genes_list2) # Makes a vector of just the ranked gene names

GSEA

Enrichment for gene modules (monocle3::find_gene_modules())

gse1 <- GSEA(deg_genes_list1,
                  exponent = 1,
                  nPerm = 10000,
                  minGSSize = 1,
                  maxGSSize = 800,
                  pvalueCutoff = 0.05,
                  pAdjustMethod = "BH",
                  TERM2GENE = gsea_mods_gmt,
                  verbose = TRUE,
                  by = "fgsea")

gse2 <- GSEA(deg_genes_list2,
                  exponent = 1,
                  nPerm = 10000,
                  minGSSize = 1,
                  maxGSSize = 800,
                  pvalueCutoff = 0.05,
                  pAdjustMethod = "BH",
                  TERM2GENE = gsea_mods_gmt,
                  verbose = TRUE,
                  by = "fgsea")

gse1 %>%
  dotplot(showCategory = 10, x = "NES", split=".sign") +
  scale_colour_viridis_c(name = "Adjusted\nP-value",
                         option="H") +
  scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) + # replaces underscores with spaces to allow wrapping of long gene set names
  geom_vline(xintercept = 0, linetype=2) +
  labs(title="GSEA, Cohort 1, Monocle3 Gene Modules", y = "Gene Set") +
  theme(plot.title = element_text(hjust = 0.5))

gse2 %>%
  dotplot(showCategory = 10, x = "NES", split=".sign") +
  scale_colour_viridis_c(name = "Adjusted\nP-value",
                         option="H") +
  scale_y_discrete(labels = ~ str_wrap(gsub('_', ' ', .x), 40)) + # replaces underscores with spaces to allow wrapping of long gene set names
  geom_vline(xintercept = 0, linetype=2) +
  labs(title="GSEA, Cohort 2, Monocle3 Gene Modules", y = "Gene Set") +
  theme(plot.title = element_text(hjust = 0.5))

GSVA

# set parameters for GSVA
gsva_param1 <- gsvaParam(
  vsd1_matrix,
  gsva_mods_gmt,
  kcdf = "Gaussian", # Compute Gaussian-distributed scores
  minSize = 10, # Minimum gene set size
  maxSize = 500, # Maximum gene set size
  maxDiff = TRUE,
  absRanking = FALSE
)

gsva_param2 <- gsvaParam(
  vsd2_matrix,
  gsva_mods_gmt,
  kcdf = "Gaussian", # Compute Gaussian-distributed scores
  minSize = 10, # Minimum gene set size
  maxSize = 500, # Maximum gene set size
  maxDiff = TRUE,
  absRanking = FALSE
)

# perform GSVA
gsva_results1 <- gsva(
  gsva_param1,
  verbose=FALSE,   # Don't print out the progress bar
)

gsva_results2 <- gsva(
  gsva_param2,
  verbose=FALSE,   # Don't print out the progress bar
)

gsva_heatmap1 <- pheatmap::pheatmap(gsva_results1,
                                      annotation_col = metadata1,
                                      show_colnames = TRUE,
                                      show_rownames = TRUE,
                                      fontsize_row = 10,
                                      cluster_cols = TRUE, 
                                      cluster_rows = TRUE,
                                      cutree_rows = 2,
                                      cutree_cols = 3,
                                      main = "GSVA, Monocle3 Gene Modules. Input: Vst transformed normalized DESeq2 counts, Cohort 1\nClustering: Ward, Distance: Euclidean",
                                      clustering_distance_rows = "euclidean",
                                      clustering_distance_cols = "euclidean",
                                      clustering_method = "ward.D2") + 
 scale_fill_gradient(c('dodgerblue', 'black', "yellow"))# Shrink the pathway labels a tad

gsva_heatmap2 <- pheatmap::pheatmap(gsva_results2,
                                      annotation_col = metadata2,
                                      show_colnames = TRUE,
                                      show_rownames = TRUE,
                                      fontsize_row = 10,
                                      cluster_cols = TRUE, 
                                      cluster_rows = TRUE,
                                      cutree_rows = 3,
                                      cutree_cols = 3,
                                      main = "GSVA, Monocle3 Gene Modules. Input: Vst transformed normalized DESeq2 counts, Cohort 2\nClustering: Ward, Distance: Euclidean",
                                      clustering_distance_rows = "euclidean",
                                      clustering_distance_cols = "euclidean",
                                      clustering_method = "ward.D2") + 
 scale_fill_gradient(c('dodgerblue', 'black', "yellow"))# Shrink the pathway labels a tad

Expression of key changing genes across pseudotime in CD4+ PTCL

# Up in ETP/Double negatives
etp_dn_genes <- c("PAK3", "FBN2", "DPP4", "TBC1D9", "MRC1", "BAHCC1", "ALPL", "PTPRF")

# Up in DP
dp_genes <- c("NEDD4L", "CD1C", "ADAM12", "DHDDS", "DNTT", "FSTL4", "MAML3", "ARPP21")

# Up in SP
sp_genes <- c("INPP4B", "SCML4", "TOX2", "BCL2", "RIPOR2", "TGFBR2")

# Up in naive
naive_genes <- c("FAU", "RPL13", "RPL19", "RPS11", "RPS15A", "SELL", "LTB", "SCML4", "DLA-DRA")

# Up in mature/activated
mature_T_genes <- c("GIMAP4", "FAU", "RPL13", "RPS15A", "MAF", "PARP8", "CBLB", "LTB", "B2M", "DLA-DRA", "HLA-DQB2")

######################## COHORT 1 ########################
normalized_counts <- read.csv("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation/Cohort_1/Output/Cohort1_NormalizedCounts.csv")
normalized_counts <- normalized_counts[!duplicated(normalized_counts$gene_name), ] # Remove rows with duplicate gene symbols.
rownames(normalized_counts) <- normalized_counts$gene_name # make gene name rownames
normalized_counts <- dplyr::select(normalized_counts, -c("X", "probe_id", "gene_name", "description")) # Remove all columns that don't contain counts
normalized_counts <- normalized_counts[, colnames(normalized_counts) %in% keepList] # for cohort 1, keep only CD4 PTCL and CD4 CTRL phenotypes
normalized_counts$gene_name <- row.names(normalized_counts) # add gene_name column back

metadata1$sample_name <- row.names(metadata1)

############ GENE LIST 1 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset1 <- normalized_counts %>%
  filter(gene_name %in% etp_dn_genes)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset1 <- countsSubset1 %>%
  tidyr::gather(colnames(countsSubset1)[1:29], key = "sample_name", value="normalized_counts")  # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset1$normalized_counts <- as.numeric(as.character(gathered_countsSubset1$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group. This will merge the 2 data frames with respect to the "sample_name" column (i.e., a column with the same column name in both data frames)
gathered_countsSubset1 <- inner_join(metadata1, gathered_countsSubset1)


############ GENE LIST 2 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset2 <- normalized_counts %>%
  filter(gene_name %in% dp_genes)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset2 <- countsSubset2 %>%
  tidyr::gather(colnames(countsSubset2)[1:29], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset2$normalized_counts <- as.numeric(as.character(gathered_countsSubset2$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset2 <- inner_join(metadata1, gathered_countsSubset2)


############ GENE LIST 3 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset3 <- normalized_counts %>%
  filter(gene_name %in% sp_genes)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset3 <- countsSubset3 %>%
  tidyr::gather(colnames(countsSubset3)[1:29], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset3$normalized_counts <- as.numeric(as.character(gathered_countsSubset3$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset3 <- inner_join(metadata1, gathered_countsSubset3)

############ GENE LIST 4 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset4 <- normalized_counts %>%
  filter(gene_name %in% naive_genes)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset4 <- countsSubset4 %>%
  tidyr::gather(colnames(countsSubset4)[1:29], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset4$normalized_counts <- as.numeric(as.character(gathered_countsSubset4$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset4 <- inner_join(metadata1, gathered_countsSubset4)

############ GENE LIST 5 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset5 <- normalized_counts %>%
  filter(gene_name %in% mature_T_genes)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset5 <- countsSubset5 %>%
  tidyr::gather(colnames(countsSubset5)[1:29], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset5$normalized_counts <- as.numeric(as.character(gathered_countsSubset5$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset5 <- inner_join(metadata1, gathered_countsSubset5)

############ COHORT 1, GENE LIST 1 ############
ggplot(gathered_countsSubset1, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of Variable Genes Across Pseudotime: Cohort 1") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

############ COHORT 1, GENE LIST 2 ############
ggplot(gathered_countsSubset2, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of Variable Genes Across Pseudotime: Cohort 1") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

############ COHORT 1, GENE LIST 3 ############
ggplot(gathered_countsSubset3, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of Variable Genes Across Pseudotime: Cohort 1") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

############ COHORT 1, GENE LIST 4 ############
ggplot(gathered_countsSubset4, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of Variable Genes Across Pseudotime: Cohort 1") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

############ COHORT 1, GENE LIST 5 ############
ggplot(gathered_countsSubset5, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of Variable Genes Across Pseudotime: Cohort 1") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

######################## COHORT 2 ########################
normalized_counts <- read.csv("C:/Users/edlarsen/Documents/Jan2025 PTCLRNASeq for Dissertation/Cohort_2/Output/Cohort2_NormalizedCounts.csv")

metadata2$sample_name <- row.names(metadata2) # add sample_name column to metadata

############ GENE LIST 1 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset1 <- normalized_counts %>%
  filter(gene_name %in% etp_dn_genes)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset1 <- countsSubset1 %>%
  tidyr::gather(colnames(countsSubset1)[2:104], key = "sample_name", value="normalized_counts")  # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset1$normalized_counts <- as.numeric(as.character(gathered_countsSubset1$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group. This will merge the 2 data frames with respect to the "sample_name" column (i.e., a column with the same column name in both data frames)
gathered_countsSubset1 <- inner_join(metadata2, gathered_countsSubset1)


############ GENE LIST 2 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset2 <- normalized_counts %>%
  filter(gene_name %in% dp_genes)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset2 <- countsSubset2 %>%
  tidyr::gather(colnames(countsSubset2)[2:104], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset2$normalized_counts <- as.numeric(as.character(gathered_countsSubset2$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset2 <- inner_join(metadata2, gathered_countsSubset2)


############ GENE LIST 3 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset3 <- normalized_counts %>%
  filter(gene_name %in% sp_genes)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset3 <- countsSubset3 %>%
  tidyr::gather(colnames(countsSubset3)[2:104], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset3$normalized_counts <- as.numeric(as.character(gathered_countsSubset3$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset3 <- inner_join(metadata2, gathered_countsSubset3)

############ GENE LIST 4 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset4 <- normalized_counts %>%
  filter(gene_name %in% naive_genes)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset4 <- countsSubset4 %>%
  tidyr::gather(colnames(countsSubset4)[2:104], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset4$normalized_counts <- as.numeric(as.character(gathered_countsSubset4$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset4 <- inner_join(metadata2, gathered_countsSubset4)

############ GENE LIST 5 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset5 <- normalized_counts %>%
  filter(gene_name %in% mature_T_genes)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset5 <- countsSubset5 %>%
  tidyr::gather(colnames(countsSubset5)[2:104], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset5$normalized_counts <- as.numeric(as.character(gathered_countsSubset5$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset5 <- inner_join(metadata2, gathered_countsSubset5)

############ COHORT 2, GENE LIST 1 ############
ggplot(gathered_countsSubset1, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of Variable Genes Across Pseudotime: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

############ COHORT 2, GENE LIST 2 ############
ggplot(gathered_countsSubset2, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of Variable Genes Across Pseudotime: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

############ COHORT 2, GENE LIST 3 ############
ggplot(gathered_countsSubset3, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of Variable Genes Across Pseudotime: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

############ COHORT 2, GENE LIST 4 ############
ggplot(gathered_countsSubset4, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of Variable Genes Across Pseudotime: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

############ COHORT 2, GENE LIST 5 ############
ggplot(gathered_countsSubset5, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of Variable Genes Across Pseudotime: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

Citations

sessionInfo()

## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] stringr_1.5.1               clusterProfiler_4.12.6     
##  [3] SeuratWrappers_0.4.0        ggpubr_0.6.0               
##  [5] presto_1.0.0                data.table_1.16.4          
##  [7] Rcpp_1.0.14                 monocle3_1.3.7             
##  [9] SingleCellExperiment_1.26.0 Seurat_5.2.1               
## [11] SeuratObject_5.0.2          sp_2.1-4                   
## [13] gplots_3.2.0                limma_3.60.6               
## [15] GSEABase_1.66.0             graph_1.82.0               
## [17] annotate_1.82.0             XML_3.99-0.18              
## [19] AnnotationDbi_1.66.0        GSVA_1.52.3                
## [21] ggplot2_3.5.1               knitr_1.49                 
## [23] readr_2.1.5                 dplyr_1.1.4                
## [25] pheatmap_1.0.12             RColorBrewer_1.1-3         
## [27] DESeq2_1.44.0               SummarizedExperiment_1.34.0
## [29] Biobase_2.64.0              MatrixGenerics_1.16.0      
## [31] matrixStats_1.5.0           GenomicRanges_1.56.2       
## [33] GenomeInfoDb_1.40.1         IRanges_2.38.1             
## [35] S4Vectors_0.42.1            BiocGenerics_0.50.0        
## 
## loaded via a namespace (and not attached):
##   [1] fs_1.6.5                 spatstat.sparse_3.1-0    bitops_1.0-9            
##   [4] enrichplot_1.24.4        httr_1.4.7               tools_4.4.0             
##   [7] sctransform_0.4.1        backports_1.5.0          R6_2.5.1                
##  [10] HDF5Array_1.32.1         lazyeval_0.2.2           uwot_0.2.2              
##  [13] rhdf5filters_1.16.0      withr_3.0.2              gridExtra_2.3           
##  [16] progressr_0.15.1         cli_3.6.2                spatstat.explore_3.3-4  
##  [19] fastDummies_1.7.5        scatterpie_0.2.4         labeling_0.4.3          
##  [22] sass_0.4.9               spatstat.data_3.1-4      ggridges_0.5.6          
##  [25] pbapply_1.7-2            yulab.utils_0.2.0        gson_0.1.0              
##  [28] DOSE_3.30.5              R.utils_2.12.3           parallelly_1.42.0       
##  [31] rstudioapi_0.17.1        RSQLite_2.3.9            gridGraphics_0.5-1      
##  [34] generics_0.1.3           gtools_3.9.5             ica_1.0-3               
##  [37] spatstat.random_3.3-2    car_3.1-3                GO.db_3.19.1            
##  [40] Matrix_1.7-0             abind_1.4-8              R.methodsS3_1.8.2       
##  [43] lifecycle_1.0.4          yaml_2.3.10              carData_3.0-5           
##  [46] qvalue_2.36.0            rhdf5_2.48.0             SparseArray_1.4.8       
##  [49] Rtsne_0.17               grid_4.4.0               blob_1.2.4              
##  [52] promises_1.3.2           crayon_1.5.3             miniUI_0.1.1.1          
##  [55] lattice_0.22-6           beachmat_2.20.0          cowplot_1.1.3           
##  [58] KEGGREST_1.44.1          magick_2.8.5             pillar_1.10.1           
##  [61] fgsea_1.30.0             rjson_0.2.23             boot_1.3-30             
##  [64] future.apply_1.11.3      codetools_0.2-20         fastmatch_1.1-6         
##  [67] glue_1.8.0               ggfun_0.1.8              spatstat.univar_3.1-1   
##  [70] remotes_2.5.0            treeio_1.28.0            vctrs_0.6.5             
##  [73] png_0.1-8                spam_2.11-1              Rdpack_2.6.2            
##  [76] gtable_0.3.6             cachem_1.1.0             xfun_0.49               
##  [79] rbibutils_2.3            S4Arrays_1.4.1           mime_0.12               
##  [82] tidygraph_1.3.1          reformulas_0.4.0         survival_3.5-8          
##  [85] statmod_1.5.0            fitdistrplus_1.2-2       ROCR_1.0-11             
##  [88] nlme_3.1-164             ggtree_3.12.0            bit64_4.6.0-1           
##  [91] RcppAnnoy_0.0.22         bslib_0.8.0              irlba_2.3.5.1           
##  [94] KernSmooth_2.23-22       colorspace_2.1-1         DBI_1.2.3               
##  [97] tidyselect_1.2.1         bit_4.5.0.1              compiler_4.4.0          
## [100] httr2_1.1.0              DelayedArray_0.30.1      plotly_4.10.4           
## [103] shadowtext_0.1.4         scales_1.3.0             caTools_1.18.3          
## [106] lmtest_0.9-40            rappdirs_0.3.3           SpatialExperiment_1.14.0
## [109] digest_0.6.35            goftest_1.2-3            spatstat.utils_3.1-2    
## [112] minqa_1.2.8              rmarkdown_2.29           XVector_0.44.0          
## [115] htmltools_0.5.8.1        pkgconfig_2.0.3          lme4_1.1-36             
## [118] sparseMatrixStats_1.16.0 fastmap_1.2.0            rlang_1.1.3             
## [121] htmlwidgets_1.6.4        UCSC.utils_1.0.0         shiny_1.10.0            
## [124] farver_2.1.2             jquerylib_0.1.4          zoo_1.8-12              
## [127] jsonlite_1.8.9           BiocParallel_1.38.0      GOSemSim_2.30.2         
## [130] R.oo_1.27.0              BiocSingular_1.20.0      magrittr_2.0.3          
## [133] ggplotify_0.1.2          Formula_1.2-5            GenomeInfoDbData_1.2.12 
## [136] dotCall64_1.2            patchwork_1.3.0          Rhdf5lib_1.26.0         
## [139] munsell_0.5.1            ape_5.8-1                viridis_0.6.5           
## [142] reticulate_1.40.0        stringi_1.8.4            ggraph_2.2.1            
## [145] zlibbioc_1.50.0          MASS_7.3-60.2            plyr_1.8.9              
## [148] parallel_4.4.0           listenv_0.9.1            ggrepel_0.9.6           
## [151] deldir_2.0-4             graphlayouts_1.2.2       Biostrings_2.72.1       
## [154] splines_4.4.0            tensor_1.5               hms_1.1.3               
## [157] locfit_1.5-9.10          igraph_2.1.4             spatstat.geom_3.3-5     
## [160] ggsignif_0.6.4           RcppHNSW_0.6.0           reshape2_1.4.4          
## [163] ScaledMatrix_1.12.0      evaluate_1.0.3           BiocManager_1.30.25     
## [166] tweenr_2.0.3             nloptr_2.1.1             tzdb_0.4.0              
## [169] httpuv_1.6.15            RANN_2.6.2               tidyr_1.3.1             
## [172] purrr_1.0.2              polyclip_1.10-7          future_1.34.0           
## [175] scattermore_1.2          ggforce_0.4.2            rsvd_1.0.5              
## [178] broom_1.0.7              xtable_1.8-4             tidytree_0.4.6          
## [181] RSpectra_0.16-2          rstatix_0.7.2            later_1.4.1             
## [184] viridisLite_0.4.2        snow_0.4-4               tibble_3.2.1            
## [187] aplot_0.2.4              memoise_2.0.1            cluster_2.1.6           
## [190] globals_0.16.3

citation()

## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.

Comparing the GEP of Canine CD4+ PTCL to the Monocle 3 Atlas of Normal Canine Thymocytes and T cells

Eileen Owens

2025-02-03