Introduction

The purpose of this script is to perform a differential gene expression analysis on RNA-seq data with DESeq2. The DESeq2 documentation is available here: https://www.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html

Software

if (!require("BiocManager", quietly = TRUE))
        install.packages("BiocManager")
BiocManager::install(version = "3.18", force=TRUE)

options(BioC_mirror = "http://bioconductor.org")

BiocManager::install("DESeq2")
install.packages("corrplot")
install.packages("RColorBrewer")
install.packages("pheatmap")
BiocManager::install("apeglm")
install.packages("tidyverse")
install.packages("rlang")
BiocManager::install("EnhancedVolcano")
install.packages("ggplot2")
install.packages("stringr")
install.packages("ggpubr")

library(DESeq2)
library(corrplot)
library(RColorBrewer)
library(pheatmap)
library(apeglm)
library(tidyverse)
library(rlang)
library(EnhancedVolcano)
library(ggplot2)
library(stringr)
library(ggpubr)

Data

### Set the appropriate working directory ###
setwd("C:/Users/edlarsen/Documents/PTCLRNASeq")

### featureCounts file ###
countsData <- read.table(file = "Cohort_2/Input/230602_CanFam31Alignment_feature_counts.txt", header = FALSE, row.names = 1, skip = 2)

### metadata file ###
metadata <- read.table(file = "Cohort_2/Input/metadata.txt", header = FALSE)
colnames(metadata) <- c("avery_num", "sample_name", "phenotype")

# Give column names to countsData:
colnames(countsData) <- c("chr", "start", "stop", "strand", "length", as.vector(metadata$sample_name))

Count matrix (cts) file for DESeq2

# Save columns containing read counts as an object called cts:
cts <- as.matrix(countsData[,6:108])

coldata file for DESeq2

rownames(metadata)<- metadata$sample_name
coldata <- metadata[, c("avery_num", "phenotype"), drop=FALSE] # drop=FALSE keeps row names
coldata$phenotype <- as.factor(coldata$phenotype) # set phenotype as factor

# Ensure that the rownames of coldata exactly match the colnames of cts:
all(rownames(coldata) == colnames(cts)) # Should return TRUE

## [1] TRUE

Differential expression analysis with DESeq2

Study parameters

dds <- DESeqDataSetFromMatrix(countData = cts,
                              colData = coldata,
                              design = ~ phenotype)

Filter for present genes

Excludes all samples that have less than 10 reads.

keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

DESeq2 analysis

dds <- DESeq(dds)

Dispersion estimates

A simple helper function that plots the per-gene dispersion estimates together with the fitted mean-dispersion relationship. This is a good way to double check your data is a good fit for the DESeq2 model. The data should generally scatter around the curve, with dispersion increasing with increasing mean expression levels.

plotDispEsts(dds)

Principal component analysis

Perform a variance stabilized transformation of the normalized count data and perform the plotPCA function.

vsd <- vst(dds, blind=TRUE)
vsd_pcaData <- plotPCA(vsd, intgroup="phenotype", returnData=FALSE) +
  geom_text(aes(label=colnames(vsd)), position = position_nudge(y = -2)) +
  scale_color_manual(values = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue"))
vsd_pcaData + ggtitle("PCA: Cohort 2") + theme(plot.title = element_text(hjust = 0.5))

Differential expression analysis

Calculate the statistically significant differences between conditions. Specify the coefficient or contrast you want to build a results table for. Follow up with log fold change shrinkage for visualization and gene ranking.

res <- results(dds)

#### CD4 PTCL vs CD4 Control Nodal Lymphocytes ####
CD4_PTCLvsCD4_LN_CTRL_results <- results(dds, contrast=c("phenotype", "CD4_PTCL", "CD4_LN_CTRL")) # Replace CD4_PTCL and CD4_LN_CTRL with phenotypes from metadata.
CD4_PTCLvsCD4_LN_CTRL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "CD4_PTCL", "CD4_LN_CTRL"), res=CD4_PTCLvsCD4_LN_CTRL_results, type="normal")

#### CD4 PTCL vs CD4 Control Thymocytes ####
CD4_PTCLvsCD4THYM_CTRL_results <- results(dds, contrast=c("phenotype", "CD4_PTCL", "CD4THYM_CTRL")) # Replace CD4_PTCL and CD4THYM_CTRL with phenotypes from metadata.
CD4_PTCLvsCD4THYM_CTRL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "CD4_PTCL", "CD4THYM_CTRL"), res=CD4_PTCLvsCD4THYM_CTRL_results, type="normal")

#### CD4 Control Thymocytes vs CD4 Control Nodal Lymphocytes ####
CD4THYM_CTRLvsCD4_LN_CTRL_results <- results(dds, contrast=c("phenotype", "CD4THYM_CTRL", "CD4_LN_CTRL")) # Replace CD4THYM_CTRL and CD4_LN_CTRL with phenotypes from metadata.
CD4THYM_CTRLvsCD4_LN_CTRL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "CD4THYM_CTRL", "CD4_LN_CTRL"), res=CD4THYM_CTRLvsCD4_LN_CTRL_results, type="normal")

Export DESeq2 results tables

genenames <- read.csv('EnsemblCanFam31GeneDescription.csv') # csv file corresponding ensembl gene IDs to HNGC symbols

#### CD4 PTCL vs CD4 Control Nodal Lymphocytes ####
CD4_PTCLvsCD4_LN_CTRL_shrink$probe_id <- rownames(CD4_PTCLvsCD4_LN_CTRL_shrink)
CD4_PTCLvsCD4_LN_CTRL_final_res <- merge(as(CD4_PTCLvsCD4_LN_CTRL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE) # all.x=TRUE includes rows of DESeq result even if there is no matching row of the genenames table.
rownames(CD4_PTCLvsCD4_LN_CTRL_final_res) <- CD4_PTCLvsCD4_LN_CTRL_final_res$probe_id
CD4_PTCLvsCD4_LN_CTRL_final_res <- CD4_PTCLvsCD4_LN_CTRL_final_res[,-1]
write.csv(CD4_PTCLvsCD4_LN_CTRL_final_res, file="Cohort_2/Output/CD4_PTCLvsCD4_LN_CTRL_DESeq2res.csv")

#### CD4 PTCL vs CD4 Control Thymocytes ####
CD4_PTCLvsCD4THYM_CTRL_shrink$probe_id <- rownames(CD4_PTCLvsCD4THYM_CTRL_shrink)
CD4_PTCLvsCD4THYM_CTRL_final_res <- merge(as(CD4_PTCLvsCD4THYM_CTRL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE)
rownames(CD4_PTCLvsCD4THYM_CTRL_final_res) <- CD4_PTCLvsCD4THYM_CTRL_final_res$probe_id
CD4_PTCLvsCD4THYM_CTRL_final_res <- CD4_PTCLvsCD4THYM_CTRL_final_res[,-1]
write.csv(CD4_PTCLvsCD4THYM_CTRL_final_res, file="Cohort_2/Output/CD4_PTCLvsCD4THYM_CTRL_DESeq2res.csv")

#### CD4 Control Thymocytes vs CD4 Control Nodal Lymphocytes ####
CD4THYM_CTRLvsCD4_LN_CTRL_shrink$probe_id <- rownames(CD4THYM_CTRLvsCD4_LN_CTRL_shrink)
CD4THYM_CTRLvsCD4_LN_CTRL_final_res <- merge(as(CD4THYM_CTRLvsCD4_LN_CTRL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE)
rownames(CD4THYM_CTRLvsCD4_LN_CTRL_final_res) <- CD4THYM_CTRLvsCD4_LN_CTRL_final_res$probe_id
CD4THYM_CTRLvsCD4_LN_CTRL_final_res <- CD4THYM_CTRLvsCD4_LN_CTRL_final_res[,-1]
write.csv(CD4THYM_CTRLvsCD4_LN_CTRL_final_res, file="Cohort_2/Output/CD4THYM_CTRLvsCD4_LN_CTRL_DESeq2res.csv")

Export normalized count data

sizeFactors(dds)
normalized_counts <- as.data.frame(counts(dds, normalized=TRUE))
normalized_counts$probe_id <- rownames(normalized_counts)
normalized_counts <- merge(as(normalized_counts,"data.frame"), genenames, by="probe_id", all.x=TRUE) # Add column with gene names.
write.csv(normalized_counts, file="Cohort_2/Output/Cohort2_NormalizedCounts.csv")

MA plot

Shows the log2 fold changes attributable to a given variable over the mean of normalized counts for all the samples in the DESeqDataSet.

plotMA(CD4_PTCLvsCD4_LN_CTRL_shrink, main="CD4_PTCL vs CD4_LN_CTRL, Cohort 2", ylim = c(-7,7), 
       ylab = "log fold change",
       xlab = "means of normalized counts")

Plot gene counts

Plot the normalized counts for genes of interest between sample phenotypes.

# Define the lists of genes to plot
geneList1 <- c("GATA3", "TBX21", "TNFRSF8", "BCL6")
geneList2 <- c("CD34", "KIT", "CCR9", "DNTT")
geneList3 <- c("ZAP70", "LCK", "ITK", "IL2RB")
geneList4 <- c("IL2RA", "DLA-DQA1", "DLA-DRA", "HLA-DQB1")

Extract the normalized counts and variance stabilized transformed counts for the provided lists of genes.

############ GENE LIST 1 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset1 <- normalized_counts %>%
  filter(gene_name %in% geneList1)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset1 <- countsSubset1 %>%
  gather(colnames(countsSubset1)[2:104], key = "sample_name", value="normalized_counts")  # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset1$normalized_counts <- as.numeric(as.character(gathered_countsSubset1$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group. This will merge the 2 data frames with respect to the "sample_name" column (i.e., a column with the same column name in both data frames)
gathered_countsSubset1 <- inner_join(metadata, gathered_countsSubset1)


############ GENE LIST 2 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset2 <- normalized_counts %>%
  filter(gene_name %in% geneList2)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset2 <- countsSubset2 %>%
  gather(colnames(countsSubset2)[2:104], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset2$normalized_counts <- as.numeric(as.character(gathered_countsSubset2$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset2 <- inner_join(metadata, gathered_countsSubset2)


############ GENE LIST 3 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset3 <- normalized_counts %>%
  filter(gene_name %in% geneList3)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset3 <- countsSubset3 %>%
  gather(colnames(countsSubset3)[2:104], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset3$normalized_counts <- as.numeric(as.character(gathered_countsSubset3$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset3 <- inner_join(metadata, gathered_countsSubset3)


############ GENE LIST 4 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset4 <- normalized_counts %>%
  filter(gene_name %in% geneList4)

# Gather the columns to get the normalized counts for each sample in a single column. 
gathered_countsSubset4 <- countsSubset4 %>%
  gather(colnames(countsSubset4)[2:104], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset3$normalized_counts <- as.numeric(as.character(gathered_countsSubset3$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.

# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset4 <- inner_join(metadata, gathered_countsSubset4)

Check data distribution

Visually inspect the distribution of your count data. T-tests can be used for data that fits a normal Gaussian distribution, but a Wilcoxon test is preferred for comparing groups when the data is not normally distributed.

Density plots

############ GENE LIST 1 ############
ggdensity(gathered_countsSubset1$normalized_counts,
          main = "Count data density: GENELIST1",
          xlab = "Normalized counts")

############ GENE LIST 2 ############
ggdensity(gathered_countsSubset2$normalized_counts,
          main = "Count data density: GENELIST2",
          xlab = "Normalized counts")

############ GENE LIST 3 ############
ggdensity(gathered_countsSubset3$normalized_counts,
          main = "Count data density: GENELIST3",
          xlab = "Normalized counts")

############ GENE LIST 4 ############
ggdensity(gathered_countsSubset4$normalized_counts,
          main = "Count data density: GENELIST4",
          xlab = "Normalized counts")

Q-Q plots

############ GENE LIST 1 ############
ggqqplot(gathered_countsSubset1$normalized_counts, main = "Q-Q plot for normal distribution of count data: GENELIST1")

############ GENE LIST 2 ############
ggqqplot(gathered_countsSubset2$normalized_counts, main = "Q-Q plot for normal distribution of count data: GENELIST2")

############ GENE LIST 3 ############
ggqqplot(gathered_countsSubset3$normalized_counts, main = "Q-Q plot for normal distribution of count data: GENELIST3")

############ GENE LIST 4 ############
ggqqplot(gathered_countsSubset4$normalized_counts, main = "Q-Q plot for normal distribution of count data: GENELIST4")

Dotplots

############ GENE LIST 1 ############
ggplot(gathered_countsSubset1, aes(x=gene_name, y=normalized_counts, color=phenotype, shape=phenotype)) +
  #geom_boxplot(position=position_dodge(0.3)) + # Un-comment this line to overlay box plots
  geom_jitter(size=3, position=position_dodge(0.5)) +
  scale_color_manual(values = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue")) + # replace "Phenotype1" etc. with appropriate phenotypes from your metadata file
  scale_y_log10() +
  
  # set axis labels and plot title
  xlab("Genes") +
  ylab("log10 Normalized Counts") +
  ggtitle("Gene Expression: Cohort 2") +
  
  # display stats; replace wilcox_test with t_test based on results of normalization test
  geom_pwc(aes(group=phenotype), method = "wilcox_test", dodge=0.6, tip.length=0, hide.ns = TRUE, label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  coord_cartesian(clip = "off") +
  theme(axis.text.x = element_text(size=14, face="bold", angle=45, hjust=1),
        plot.title = element_text(hjust=0.5),
        legend.text = element_text(size=12),
        legend.title = element_text(size=14))

############ GENE LIST 2 ############
ggplot(gathered_countsSubset2, aes(x=gene_name, y=normalized_counts, color=phenotype, shape=phenotype)) +
  #geom_boxplot(position=position_dodge(0.3)) + # Un-comment this line to overlay box plots
  geom_jitter(size=3, position=position_dodge(0.5)) +
  scale_color_manual(values = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue")) + # replace "Phenotype1" etc. with appropriate phenotypes from your metadata file
  scale_y_log10() +
  
  # set axis labels and plot title
  xlab("Genes") +
  ylab("log10 Normalized Counts") +
  ggtitle("Expression of Markers of Immaturity: Cohort 2") +
  
  # display stats; replace wilcox_test with t_test based on results of normalization test
  geom_pwc(aes(group=phenotype), method = "wilcox_test", dodge=0.6, tip.length=0, hide.ns = TRUE, label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  coord_cartesian(clip = "off") +
  theme(axis.text.x = element_text(size=14, face="bold", angle=45, hjust=1),
        plot.title = element_text(hjust=0.5),
        legend.text = element_text(size=12),
        legend.title = element_text(size=14))

############ GENE LIST 3 ############
ggplot(gathered_countsSubset3, aes(x=gene_name, y=normalized_counts, color=phenotype, shape=phenotype)) +
  #geom_boxplot(position=position_dodge(0.3)) + # Un-comment this line to overlay box plots
  geom_jitter(size=3, position=position_dodge(0.5)) +
  scale_color_manual(values = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue")) + # replace "Phenotype1" etc. with appropriate phenotypes from your metadata file
  scale_y_log10() +
  
  # set axis labels and plot title
  xlab("Genes") +
  ylab("log10 Normalized Counts") +
  ggtitle("Expression of TCR Signaling Genes: Cohort 2") +
  
  # display stats; replace wilcox_test with t_test based on results of normalization test
  geom_pwc(aes(group=phenotype), method = "wilcox_test", dodge=0.6, tip.length=0, hide.ns = TRUE, label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  coord_cartesian(clip = "off") +
  theme(axis.text.x = element_text(size=14, face="bold", angle=45, hjust=1),
        plot.title = element_text(hjust=0.5),
        legend.text = element_text(size=12),
        legend.title = element_text(size=14))

############ GENE LIST 4 ############
ggplot(gathered_countsSubset4, aes(x=gene_name, y=normalized_counts, color=phenotype, shape=phenotype)) +
  #geom_boxplot(position=position_dodge(0.3)) + # Un-comment this line to overlay box plots
  geom_jitter(size=3, position=position_dodge(0.5)) +
  scale_color_manual(values = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue")) + # replace "Phenotype1" etc. with appropriate phenotypes from your metadata file
  scale_y_log10() +
  
  # set axis labels and plot title
  xlab("Genes") +
  ylab("log10 Normalized Counts") +
  ggtitle("Expression of CD25 and MHC Class II Genes: Cohort 2") +
  
  # display stats; replace wilcox_test with t_test based on results of normalization test
  geom_pwc(aes(group=phenotype), method = "wilcox_test", dodge=0.6, tip.length=0, hide.ns = TRUE, label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  coord_cartesian(clip = "off") +
  theme(axis.text.x = element_text(size=14, face="bold", angle=45, hjust=1),
        plot.title = element_text(hjust=0.5),
        legend.text = element_text(size=12),
        legend.title = element_text(size=14))

Faceted dotplots

############ GENE LIST 1 ############
ggplot(gathered_countsSubset1, aes(x=phenotype, y=normalized_counts, color=phenotype, shape=phenotype)) +
  geom_jitter(size=2, position=position_dodge(0.3)) +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Gene Expression: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

############ GENE LIST 2 ############
ggplot(gathered_countsSubset2, aes(x=phenotype, y=normalized_counts, color=phenotype, shape=phenotype)) +
  geom_jitter(size=2, position=position_dodge(0.3)) +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of Markers of Immaturity: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

############ GENE LIST 3 ############
ggplot(gathered_countsSubset3, aes(x=phenotype, y=normalized_counts, color=phenotype, shape=phenotype)) +
  geom_jitter(size=2, position=position_dodge(0.3)) +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of TCR Signaling Genes: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

############ GENE LIST 4 ############
ggplot(gathered_countsSubset4, aes(x=phenotype, y=normalized_counts, color=phenotype, shape=phenotype)) +
  geom_jitter(size=2, position=position_dodge(0.3)) +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of CD25 and MHC Class II Genes: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")), 
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title= element_text(hjust = 0.5))

Box plots

############ GENE LIST 1 ############
ggplot(gathered_countsSubset1, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Gene Expression: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")),
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

############ GENE LIST 2 ############
ggplot(gathered_countsSubset2, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of Markers of Immaturity: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")),
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

############ GENE LIST 3 ############
ggplot(gathered_countsSubset3, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of TCR Signaling Genes: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")),
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

############ GENE LIST 4 ############
ggplot(gathered_countsSubset4, aes(phenotype, normalized_counts, fill=phenotype)) +
  geom_boxplot() +
  scale_y_log10() +
  facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
  scale_x_discrete(limits=c("CD4_PTCL", "CD4_LN_CTRL", "CD4THYM_CTRL")) + # change the order of items along the X axis
  
  # set axis labels and plot title
  labs(x="Group",
       y="log10 Normalized Counts",
       fill="Group",
       title="Expression of CD25 and MHC Class II Genes: Cohort 2") +
  
  # display stats
  stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_LN_CTRL"), c("CD4_LN_CTRL", "CD4THYM_CTRL"), c("CD4_PTCL", "CD4THYM_CTRL")),
                     tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
  
  # set style preferences
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

Correlation Matrix

Evaluate the sample-to-sample correlation by calculating the distances (dissimilarities) between each sample and creating a matrix of distance values. Since the majority of genes are not differentially expressed, samples generally have high correlations with each other (>0.80). Samples with low correlation may indicate outliers or sample contamination.

sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$phenotype, colnames(vsd), sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)

Heatmap grouping samples based on their gene expression similarity. The color blocks indicate substructure in the data, so replicates should cluster together as a block for each sample group.

p <- pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors,
         main="Correlation Matrix: Cohort 2")

Volcano Plots

Volcano plots display the fold change against the p-value.

volcano_data1 <- CD4_PTCLvsCD4_LN_CTRL_final_res

EnhancedVolcano(volcano_data1, 
                lab = volcano_data1$gene_name,
                x = 'log2FoldChange',
                y = 'padj',
                title = 'CD4_PTCL vs. CD4_LN_CTRL, Cohort 2',
                FCcutoff = 1.5,
                pCutoff = 0.05,
                pointSize = 2.0,
                labSize = 4.0,
                labCol = 'black',
                labFace = 'bold',
                colAlpha = 4/5,
                legendPosition = 'right',
                legendLabSize = 12,
                legendIconSize = 6.0,
                drawConnectors = TRUE,
                widthConnectors = 0.2,
                colConnectors = 'black'
                )

# Repeat for as many comparisons as desired
volcano_data2 <- CD4_PTCLvsCD4THYM_CTRL_final_res

EnhancedVolcano(volcano_data2, 
                lab = volcano_data2$gene_name,
                x = 'log2FoldChange',
                y = 'padj',
                title = 'CD4_PTCL vs. CD4THYM_CTRL, Cohort 2',
                FCcutoff = 1.5,
                pCutoff = 0.05,
                pointSize = 2.0,
                labSize = 4.0,
                labCol = 'black',
                labFace = 'bold',
                colAlpha = 4/5,
                legendPosition = 'right',
                legendLabSize = 12,
                legendIconSize = 6.0,
                drawConnectors = TRUE,
                widthConnectors = 0.2,
                colConnectors = 'black'
                )

volcano_data3 <- CD4THYM_CTRLvsCD4_LN_CTRL_final_res

EnhancedVolcano(volcano_data3, 
                lab = volcano_data2$gene_name,
                x = 'log2FoldChange',
                y = 'padj',
                title = 'CD4THYM_CTRL vs. CD4_LN_CTRL, Cohort 2',
                FCcutoff = 1.5,
                pCutoff = 0.05,
                pointSize = 2.0,
                labSize = 4.0,
                labCol = 'black',
                labFace = 'bold',
                colAlpha = 4/5,
                legendPosition = 'right',
                legendLabSize = 12,
                legendIconSize = 6.0,
                drawConnectors = TRUE,
                widthConnectors = 0.2,
                colConnectors = 'black'
                )

Final data export

DESeq2 dds object

saveRDS(dds, file="Cohort_2/Output/Cohort2_dds.Rdata")

Export matrix of vst transformed normalized counts:

vsd <- assay(vst(dds))
vsd <- as.data.frame(vsd, drop=FALSE) # Convert to a data frame.
vsd$probe_id <- rownames(vsd) # Add probe ID as a column
vsd <- merge(as(vsd,"data.frame"), genenames, by="probe_id", all.x=TRUE) # Adds a column of gene names for the associated probe ID
write.csv(vsd, file="Cohort_2/Output/VST_NormalizedCounts.csv") # Export as csv

Citations

sessionInfo()

## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggpubr_0.6.0                EnhancedVolcano_1.22.0     
##  [3] ggrepel_0.9.6               rlang_1.1.3                
##  [5] lubridate_1.9.4             forcats_1.0.0              
##  [7] stringr_1.5.1               dplyr_1.1.4                
##  [9] purrr_1.0.2                 readr_2.1.5                
## [11] tidyr_1.3.1                 tibble_3.2.1               
## [13] ggplot2_3.5.1               tidyverse_2.0.0            
## [15] apeglm_1.26.1               pheatmap_1.0.12            
## [17] RColorBrewer_1.1-3          corrplot_0.95              
## [19] DESeq2_1.44.0               SummarizedExperiment_1.34.0
## [21] Biobase_2.64.0              MatrixGenerics_1.16.0      
## [23] matrixStats_1.5.0           GenomicRanges_1.56.2       
## [25] GenomeInfoDb_1.40.1         IRanges_2.38.1             
## [27] S4Vectors_0.42.1            BiocGenerics_0.50.0        
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1        farver_2.1.2            fastmap_1.2.0          
##  [4] digest_0.6.35           timechange_0.3.0        lifecycle_1.0.4        
##  [7] magrittr_2.0.3          compiler_4.4.0          sass_0.4.9             
## [10] tools_4.4.0             yaml_2.3.10             ggsignif_0.6.4         
## [13] knitr_1.49              labeling_0.4.3          S4Arrays_1.4.1         
## [16] DelayedArray_0.30.1     plyr_1.8.9              abind_1.4-8            
## [19] BiocParallel_1.38.0     withr_3.0.2             numDeriv_2016.8-1.1    
## [22] grid_4.4.0              colorspace_2.1-1        scales_1.3.0           
## [25] MASS_7.3-60.2           bbmle_1.0.25.1          cli_3.6.2              
## [28] mvtnorm_1.3-3           rmarkdown_2.29          crayon_1.5.3           
## [31] generics_0.1.3          rstudioapi_0.17.1       httr_1.4.7             
## [34] tzdb_0.4.0              bdsmatrix_1.3-7         cachem_1.1.0           
## [37] zlibbioc_1.50.0         parallel_4.4.0          XVector_0.44.0         
## [40] vctrs_0.6.5             Matrix_1.7-0            carData_3.0-5          
## [43] jsonlite_1.8.9          car_3.1-3               hms_1.1.3              
## [46] rstatix_0.7.2           Formula_1.2-5           locfit_1.5-9.10        
## [49] jquerylib_0.1.4         glue_1.8.0              emdbook_1.3.13         
## [52] codetools_0.2-20        stringi_1.8.4           gtable_0.3.6           
## [55] UCSC.utils_1.0.0        munsell_0.5.1           pillar_1.10.1          
## [58] htmltools_0.5.8.1       GenomeInfoDbData_1.2.12 R6_2.5.1               
## [61] evaluate_1.0.3          lattice_0.22-6          backports_1.5.0        
## [64] broom_1.0.7             bslib_0.8.0             Rcpp_1.0.14            
## [67] coda_0.19-4.1           SparseArray_1.4.8       xfun_0.49              
## [70] pkgconfig_2.0.3

citation()

## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.

Differential Expression Analysis with DESeq2: Cohort 2 (CanFam3.1)

Eileen Owens

2025-02-25