The purpose of this script is to perform a differential gene expression analysis on RNA-seq data with DESeq2. The DESeq2 documentation is available here: https://www.bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(version = "3.18", force=TRUE)
options(BioC_mirror = "http://bioconductor.org")
BiocManager::install("DESeq2")
install.packages("corrplot")
install.packages("RColorBrewer")
install.packages("pheatmap")
BiocManager::install("apeglm")
install.packages("tidyverse")
install.packages("rlang")
BiocManager::install("EnhancedVolcano")
install.packages("ggplot2")
install.packages("stringr")
install.packages("ggpubr")
library(DESeq2)
library(corrplot)
library(RColorBrewer)
library(pheatmap)
library(apeglm)
library(tidyverse)
library(rlang)
library(EnhancedVolcano)
library(ggplot2)
library(stringr)
library(ggpubr)
### Set the appropriate working directory ###
setwd("C:/Users/edlarsen/Documents/PTCLRNASeq")
### featureCounts file ###
countsData <- read.table(file = "Cohort_1/Input/220628_featureCounts_withCanFam31Alignment.txt", header = FALSE, row.names = 1, skip = 2)
### metadata file ###
metadata <- read.table(file = "Cohort_1/Input/metadata.txt", header = FALSE)
colnames(metadata) <- c("avery_num", "sample_name", "phenotype")
# Give column names to countsData:
colnames(countsData) <- c("chr", "start", "stop", "strand", "length", as.vector(metadata$sample_name))
# Save columns containing read counts as an object called cts:
cts <- as.matrix(countsData[,6:45])
rownames(metadata)<- metadata$sample_name
coldata <- metadata[, c("avery_num", "phenotype"), drop=FALSE] # drop=FALSE keeps row names
coldata$phenotype <- as.factor(coldata$phenotype) # set phenotype as factor
# Ensure that the rownames of coldata exactly match the colnames of cts:
all(rownames(coldata) == colnames(cts)) # Should return TRUE
## [1] TRUE
dds <- DESeqDataSetFromMatrix(countData = cts,
colData = coldata,
design = ~ phenotype)
Excludes all samples that have less than 10 reads.
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds <- dds[,-21] # Remove column number that contains the outlier.
dds <- DESeq(dds)
A simple helper function that plots the per-gene dispersion estimates together with the fitted mean-dispersion relationship. This is a good way to double check your data is a good fit for the DESeq2 model. The data should generally scatter around the curve, with dispersion increasing with increasing mean expression levels.
plotDispEsts(dds)
Perform a variance stabilized transformation of the normalized count data and perform the plotPCA function.
vsd <- vst(dds, blind=TRUE)
vsd_pcaData <- plotPCA(vsd, intgroup="phenotype", returnData=FALSE) +
geom_text(aes(label=colnames(vsd)), position = position_nudge(y = -2))
vsd_pcaData + ggtitle("PCA: Cohort 1") + theme(plot.title = element_text(hjust = 0.5))
Calculate the statistically significant differences between conditions. Specify the coefficient or contrast you want to build a results table for. Follow up with log fold change shrinkage for visualization and gene ranking.
res <- results(dds)
#### CD4 PTCL vs CD4 Control Nodal Lymphocytes ####
CD4_PTCLvsCD4_CTRL_results <- results(dds, contrast=c("phenotype", "CD4_PTCL", "CD4_CTRL")) # Replace CD4_PTCL and CD4_CTRL with phenotypes from metadata.
CD4_PTCLvsCD4_CTRL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "CD4_PTCL", "CD4_CTRL"), res=CD4_PTCLvsCD4_CTRL_results, type="normal")
#### CD4 PTCL vs CD8 Control Nodal Lymphocytes ####
CD4_PTCLvsCD8_CTRL_results <- results(dds, contrast=c("phenotype", "CD4_PTCL", "CD8_CTRL")) # Replace CD4_PTCL and CD8_CTRL with phenotypes from metadata.
CD4_PTCLvsCD8_CTRL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "CD4_PTCL", "CD8_CTRL"), res=CD4_PTCLvsCD8_CTRL_results, type="normal")
#### CD4 PTCL vs CD8 PTCL ####
CD4_PTCLvsCD8_PTCL_results <- results(dds, contrast=c("phenotype", "CD4_PTCL", "CD8_PTCL")) # Replace CD4_PTCL and CD8_PTCL with phenotypes from metadata.
CD4_PTCLvsCD8_PTCL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "CD4_PTCL", "CD8_PTCL"), res=CD4_PTCLvsCD8_PTCL_results, type="normal")
#### CD4 PTCL vs DN PTCL ####
CD4_PTCLvsDN_PTCL_results <- results(dds, contrast=c("phenotype", "CD4_PTCL", "DN_PTCL")) # Replace CD4_PTCL and DN_PTCL with phenotypes from metadata.
CD4_PTCLvsDN_PTCL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "CD4_PTCL", "DN_PTCL"), res=CD4_PTCLvsDN_PTCL_results, type="normal")
#### CD8 PTCL vs CD8 Control Nodal Lymphocytes ####
CD8_PTCLvsCD8_CTRL_results <- results(dds, contrast=c("phenotype", "CD8_PTCL", "CD8_CTRL")) # Replace CD8_PTCL and CD8_CTRL with phenotypes from metadata.
CD8_PTCLvsCD8_CTRL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "CD8_PTCL", "CD8_CTRL"), res=CD8_PTCLvsCD8_CTRL_results, type="normal")
#### CD8 PTCL vs CD4 Control Nodal Lymphocytes ####
CD8_PTCLvsCD4_CTRL_results <- results(dds, contrast=c("phenotype", "CD8_PTCL", "CD4_CTRL")) # Replace CD8_PTCL and CD4_CTRL with phenotypes from metadata.
CD8_PTCLvsCD4_CTRL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "CD8_PTCL", "CD4_CTRL"), res=CD8_PTCLvsCD4_CTRL_results, type="normal")
#### CD8 PTCL vs DN PTCL ####
CD8_PTCLvsDN_PTCL_results <- results(dds, contrast=c("phenotype", "CD8_PTCL", "DN_PTCL")) # Replace CD8_PTCL and DN_PTCL with phenotypes from metadata.
CD8_PTCLvsDN_PTCL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "CD8_PTCL", "DN_PTCL"), res=CD8_PTCLvsDN_PTCL_results, type="normal")
#### DN PTCL vs CD8 Control Nodal Lymphocytes ####
DN_PTCLvsCD8_CTRL_results <- results(dds, contrast=c("phenotype", "DN_PTCL", "CD8_CTRL")) # Replace DN_PTCL and CD8_CTRL with phenotypes from metadata.
DN_PTCLvsCD8_CTRL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "DN_PTCL", "CD8_CTRL"), res=DN_PTCLvsCD8_CTRL_results, type="normal")
#### DN PTCL vs CD4 Control Nodal Lymphocytes ####
DN_PTCLvsCD4_CTRL_results <- results(dds, contrast=c("phenotype", "DN_PTCL", "CD4_CTRL")) # Replace DN_PTCL and CD4_CTRL with phenotypes from metadata.
DN_PTCLvsCD4_CTRL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "DN_PTCL", "CD4_CTRL"), res=DN_PTCLvsCD4_CTRL_results, type="normal")
#### CD4 Control Nodal Lymphocytes vs CD8 Control Nodal Lymphocytes ####
CD4_CTRLvsCD8_CTRL_results <- results(dds, contrast=c("phenotype", "CD4_CTRL", "CD8_CTRL")) # Replace CD4_CTRL and CD8_CTRL with phenotypes from metadata.
CD4_CTRLvsCD8_CTRL_shrink <- lfcShrink(dds=dds, contrast=c("phenotype", "CD4_CTRL", "CD8_CTRL"), res=CD4_CTRLvsCD8_CTRL_results, type="normal")
genenames <- read.csv('EnsemblCanFam31GeneDescription.csv') # csv file corresponding ensembl gene IDs to HNGC symbols
#### CD4 PTCL vs CD4 Control Nodal Lymphocytes ####
CD4_PTCLvsCD4_CTRL_shrink$probe_id <- rownames(CD4_PTCLvsCD4_CTRL_shrink)
CD4_PTCLvsCD4_CTRL_final_res <- merge(as(CD4_PTCLvsCD4_CTRL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE) # all.x=TRUE includes rows of DESeq result even if there is no matching row of the genenames table.
rownames(CD4_PTCLvsCD4_CTRL_final_res) <- CD4_PTCLvsCD4_CTRL_final_res$probe_id
CD4_PTCLvsCD4_CTRL_final_res <- CD4_PTCLvsCD4_CTRL_final_res[,-1]
write.csv(CD4_PTCLvsCD4_CTRL_final_res, file="Cohort_1/Output/CD4_PTCLvsCD4_CTRL_DESeq2res.csv")
#### CD4 PTCL vs CD8 Control Nodal Lymphocytes ####
CD4_PTCLvsCD8_CTRL_shrink$probe_id <- rownames(CD4_PTCLvsCD8_CTRL_shrink)
CD4_PTCLvsCD8_CTRL_final_res <- merge(as(CD4_PTCLvsCD8_CTRL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE)
rownames(CD4_PTCLvsCD8_CTRL_final_res) <- CD4_PTCLvsCD8_CTRL_final_res$probe_id
CD4_PTCLvsCD8_CTRL_final_res <- CD4_PTCLvsCD8_CTRL_final_res[,-1]
write.csv(CD4_PTCLvsCD8_CTRL_final_res, file="Cohort_1/Output/CD4_PTCLvsCD8_CTRL_DESeq2res.csv")
#### CD4 PTCL vs CD8 PTCL ####
CD4_PTCLvsCD8_PTCL_shrink$probe_id <- rownames(CD4_PTCLvsCD8_PTCL_shrink)
CD4_PTCLvsCD8_PTCL_final_res <- merge(as(CD4_PTCLvsCD8_PTCL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE) # all.x=TRUE includes rows of DESeq result even if there is no matching row of the genenames table.
rownames(CD4_PTCLvsCD8_PTCL_final_res) <- CD4_PTCLvsCD8_PTCL_final_res$probe_id
CD4_PTCLvsCD8_PTCL_final_res <- CD4_PTCLvsCD8_PTCL_final_res[,-1]
write.csv(CD4_PTCLvsCD8_PTCL_final_res, file="Cohort_1/Output/CD4_PTCLvsCD8_PTCL_DESeq2res.csv")
#### CD4 PTCL vs DN PTCL ####
CD4_PTCLvsDN_PTCL_shrink$probe_id <- rownames(CD4_PTCLvsDN_PTCL_shrink)
CD4_PTCLvsDN_PTCL_final_res <- merge(as(CD4_PTCLvsDN_PTCL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE) # all.x=TRUE includes rows of DESeq result even if there is no matching row of the genenames table.
rownames(CD4_PTCLvsDN_PTCL_final_res) <- CD4_PTCLvsDN_PTCL_final_res$probe_id
CD4_PTCLvsDN_PTCL_final_res <- CD4_PTCLvsDN_PTCL_final_res[,-1]
write.csv(CD4_PTCLvsDN_PTCL_final_res, file="Cohort_1/Output/CD4_PTCLvsDN_PTCL_DESeq2res.csv")
#### CD8 PTCL vs CD8 Control Nodal Lymphocytes ####
CD8_PTCLvsCD8_CTRL_shrink$probe_id <- rownames(CD8_PTCLvsCD8_CTRL_shrink)
CD8_PTCLvsCD8_CTRL_final_res <- merge(as(CD8_PTCLvsCD8_CTRL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE)
rownames(CD8_PTCLvsCD8_CTRL_final_res) <- CD8_PTCLvsCD8_CTRL_final_res$probe_id
CD8_PTCLvsCD8_CTRL_final_res <- CD8_PTCLvsCD8_CTRL_final_res[,-1]
write.csv(CD8_PTCLvsCD8_CTRL_final_res, file="Cohort_1/Output/CD8_PTCLvsCD8_CTRL_DESeq2res.csv")
#### CD8 PTCL vs CD4 Control Nodal Lymphocytes ####
CD8_PTCLvsCD4_CTRL_shrink$probe_id <- rownames(CD8_PTCLvsCD4_CTRL_shrink)
CD8_PTCLvsCD4_CTRL_final_res <- merge(as(CD8_PTCLvsCD4_CTRL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE)
rownames(CD8_PTCLvsCD4_CTRL_final_res) <- CD8_PTCLvsCD4_CTRL_final_res$probe_id
CD8_PTCLvsCD4_CTRL_final_res <- CD8_PTCLvsCD4_CTRL_final_res[,-1]
write.csv(CD8_PTCLvsCD4_CTRL_final_res, file="Cohort_1/Output/CD8_PTCLvsCD4_CTRL_DESeq2res.csv")
#### CD8 PTCL vs DN PTCL ####
CD8_PTCLvsDN_PTCL_shrink$probe_id <- rownames(CD8_PTCLvsDN_PTCL_shrink)
CD8_PTCLvsDN_PTCL_final_res <- merge(as(CD8_PTCLvsDN_PTCL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE) # all.x=TRUE includes rows of DESeq result even if there is no matching row of the genenames table.
rownames(CD8_PTCLvsDN_PTCL_final_res) <- CD8_PTCLvsDN_PTCL_final_res$probe_id
CD8_PTCLvsDN_PTCL_final_res <- CD8_PTCLvsDN_PTCL_final_res[,-1]
write.csv(CD8_PTCLvsDN_PTCL_final_res, file="Cohort_1/Output/CD8_PTCLvsDN_PTCL_DESeq2res.csv")
#### DN PTCL vs CD8 Control Nodal Lymphocytes ####
DN_PTCLvsCD8_CTRL_shrink$probe_id <- rownames(DN_PTCLvsCD8_CTRL_shrink)
DN_PTCLvsCD8_CTRL_final_res <- merge(as(DN_PTCLvsCD8_CTRL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE)
rownames(DN_PTCLvsCD8_CTRL_final_res) <- DN_PTCLvsCD8_CTRL_final_res$probe_id
DN_PTCLvsCD8_CTRL_final_res <- DN_PTCLvsCD8_CTRL_final_res[,-1]
write.csv(DN_PTCLvsCD8_CTRL_final_res, file="Cohort_1/Output/DN_PTCLvsCD8_CTRL_DESeq2res.csv")
#### DN PTCL vs CD4 Control Nodal Lymphocytes ####
DN_PTCLvsCD4_CTRL_shrink$probe_id <- rownames(DN_PTCLvsCD4_CTRL_shrink)
DN_PTCLvsCD4_CTRL_final_res <- merge(as(DN_PTCLvsCD4_CTRL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE)
rownames(DN_PTCLvsCD4_CTRL_final_res) <- DN_PTCLvsCD4_CTRL_final_res$probe_id
DN_PTCLvsCD4_CTRL_final_res <- DN_PTCLvsCD4_CTRL_final_res[,-1]
write.csv(DN_PTCLvsCD4_CTRL_final_res, file="Cohort_1/Output/DN_PTCLvsCD4_CTRL_DESeq2res.csv")
#### CD4 Control Nodal Lymphocytes vs CD8 Control Nodal Lymphocytes ####
CD4_CTRLvsCD8_CTRL_shrink$probe_id <- rownames(CD4_CTRLvsCD8_CTRL_shrink)
CD4_CTRLvsCD8_CTRL_final_res <- merge(as(CD4_CTRLvsCD8_CTRL_shrink,"data.frame"), genenames, by="probe_id", all.x=TRUE)
rownames(CD4_CTRLvsCD8_CTRL_final_res) <- CD4_CTRLvsCD8_CTRL_final_res$probe_id
CD4_CTRLvsCD8_CTRL_final_res <- CD4_CTRLvsCD8_CTRL_final_res[,-1]
write.csv(CD4_CTRLvsCD8_CTRL_final_res, file="Cohort_1/Output/CD4_CTRLvsCD8_CTRL_DESeq2res.csv")
sizeFactors(dds)
normalized_counts <- as.data.frame(counts(dds, normalized=TRUE))
normalized_counts$probe_id <- rownames(normalized_counts)
normalized_counts <- merge(as(normalized_counts,"data.frame"), genenames, by="probe_id", all.x=TRUE) # Add column with gene names.
write.csv(normalized_counts, file="Cohort_1/Output/Cohort1_NormalizedCounts.csv")
Shows the log2 fold changes attributable to a given variable over the mean of normalized counts for all the samples in the DESeqDataSet.
plotMA(CD4_PTCLvsCD4_CTRL_shrink, main="CD4_PTCL vs CD4_CTRL, Cohort 1", ylim = c(-7,7),
ylab = "log fold change",
xlab = "means of normalized counts")
Plot the normalized counts for genes of interest between sample phenotypes.
# Define the lists of genes to plot
geneList1 <- c("GATA3", "TBX21", "TNFRSF8", "BCL6")
geneList2 <- c("CD34", "KIT", "CCR9", "DNTT")
geneList3 <- c("ZAP70", "LCK", "ITK", "IL2RB")
geneList4 <- c("IL2RA", "DLA-DQA1", "DLA-DRA", "HLA-DQB1")
Extract the normalized counts and variance stabilized transformed counts for the provided lists of genes.
############ GENE LIST 1 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset1 <- normalized_counts %>%
filter(gene_name %in% geneList1)
# Gather the columns to get the normalized counts for each sample in a single column.
gathered_countsSubset1 <- countsSubset1 %>%
gather(colnames(countsSubset1)[2:40], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset1$normalized_counts <- as.numeric(as.character(gathered_countsSubset1$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.
# Combine with metadata to allow coloring of counts by sample group. This will merge the 2 data frames with respect to the "sample_name" column (i.e., a column with the same column name in both data frames)
gathered_countsSubset1 <- inner_join(metadata, gathered_countsSubset1)
############ GENE LIST 2 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset2 <- normalized_counts %>%
filter(gene_name %in% geneList2)
# Gather the columns to get the normalized counts for each sample in a single column.
gathered_countsSubset2 <- countsSubset2 %>%
gather(colnames(countsSubset2)[2:40], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset2$normalized_counts <- as.numeric(as.character(gathered_countsSubset2$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.
# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset2 <- inner_join(metadata, gathered_countsSubset2)
############ GENE LIST 3 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset3 <- normalized_counts %>%
filter(gene_name %in% geneList3)
# Gather the columns to get the normalized counts for each sample in a single column.
gathered_countsSubset3 <- countsSubset3 %>%
gather(colnames(countsSubset3)[2:40], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset3$normalized_counts <- as.numeric(as.character(gathered_countsSubset3$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.
# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset3 <- inner_join(metadata, gathered_countsSubset3)
############ GENE LIST 4 ############
# Subset the normalized count matrix to include only the genes in this list
countsSubset4 <- normalized_counts %>%
filter(gene_name %in% geneList4)
# Gather the columns to get the normalized counts for each sample in a single column.
gathered_countsSubset4 <- countsSubset4 %>%
gather(colnames(countsSubset4)[2:40], key = "sample_name", value="normalized_counts") # ***Adjust the numbers in the brackets to index on just the sample columns containing counts.***
gathered_countsSubset3$normalized_counts <- as.numeric(as.character(gathered_countsSubset3$normalized_counts)) # Ensures the normalized count data are in the "numeric" class.
# Combine with metadata to allow coloring of counts by sample group
gathered_countsSubset4 <- inner_join(metadata, gathered_countsSubset4)
Visually inspect the distribution of your count data. T-tests can be used for data that fits a normal Gaussian distribution, but a Wilcoxon test is preferred for comparing groups when the data is not normally distributed.
############ GENE LIST 1 ############
ggdensity(gathered_countsSubset1$normalized_counts,
main = "Count data density: GENELIST1",
xlab = "Normalized counts")
############ GENE LIST 2 ############
ggdensity(gathered_countsSubset2$normalized_counts,
main = "Count data density: GENELIST2",
xlab = "Normalized counts")
############ GENE LIST 3 ############
ggdensity(gathered_countsSubset3$normalized_counts,
main = "Count data density: GENELIST3",
xlab = "Normalized counts")
############ GENE LIST 4 ############
ggdensity(gathered_countsSubset4$normalized_counts,
main = "Count data density: GENELIST4",
xlab = "Normalized counts")
############ GENE LIST 1 ############
ggqqplot(gathered_countsSubset1$normalized_counts, main = "Q-Q plot for normal distribution of count data: GENELIST1")
############ GENE LIST 2 ############
ggqqplot(gathered_countsSubset2$normalized_counts, main = "Q-Q plot for normal distribution of count data: GENELIST2")
############ GENE LIST 3 ############
ggqqplot(gathered_countsSubset3$normalized_counts, main = "Q-Q plot for normal distribution of count data: GENELIST3")
############ GENE LIST 4 ############
ggqqplot(gathered_countsSubset4$normalized_counts, main = "Q-Q plot for normal distribution of count data: GENELIST4")
############ GENE LIST 1 ############
ggplot(gathered_countsSubset1, aes(x=gene_name, y=normalized_counts, color=phenotype, shape=phenotype)) +
#geom_boxplot(position=position_dodge(0.3)) + # Un-comment this line to overlay box plots
geom_jitter(size=3, position=position_dodge(0.5)) +
scale_color_manual(values = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral")) + # replace "Phenotype1" etc. with appropriate phenotypes from your metadata file
scale_y_log10() +
# set axis labels and plot title
xlab("Genes") +
ylab("log10 Normalized Counts") +
ggtitle("Gene Expression: Cohort 1") +
# display stats; replace wilcox_test with t_test based on results of normalization test
geom_pwc(aes(group=phenotype), method = "wilcox_test", dodge=0.6, tip.length=0, hide.ns = TRUE, label = "p.signif") +
# set style preferences
theme_bw() +
coord_cartesian(clip = "off") +
theme(axis.text.x = element_text(size=14, face="bold", angle=45, hjust=1),
plot.title = element_text(hjust=0.5),
legend.text = element_text(size=12),
legend.title = element_text(size=14))
############ GENE LIST 2 ############
ggplot(gathered_countsSubset2, aes(x=gene_name, y=normalized_counts, color=phenotype, shape=phenotype)) +
#geom_boxplot(position=position_dodge(0.3)) + # Un-comment this line to overlay box plots
geom_jitter(size=3, position=position_dodge(0.5)) +
scale_color_manual(values = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral")) + # replace "Phenotype1" etc. with appropriate phenotypes from your metadata file
scale_y_log10() +
# set axis labels and plot title
xlab("Genes") +
ylab("log10 Normalized Counts") +
ggtitle("Expression of Markers of Immaturity: Cohort 1") +
# display stats; replace wilcox_test with t_test based on results of normalization test
geom_pwc(aes(group=phenotype), method = "wilcox_test", dodge=0.6, tip.length=0, hide.ns = TRUE, label = "p.signif") +
# set style preferences
theme_bw() +
coord_cartesian(clip = "off") +
theme(axis.text.x = element_text(size=14, face="bold", angle=45, hjust=1),
plot.title = element_text(hjust=0.5),
legend.text = element_text(size=12),
legend.title = element_text(size=14))
############ GENE LIST 3 ############
ggplot(gathered_countsSubset3, aes(x=gene_name, y=normalized_counts, color=phenotype, shape=phenotype)) +
#geom_boxplot(position=position_dodge(0.3)) + # Un-comment this line to overlay box plots
geom_jitter(size=3, position=position_dodge(0.5)) +
scale_color_manual(values = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral")) + # replace "Phenotype1" etc. with appropriate phenotypes from your metadata file
scale_y_log10() +
# set axis labels and plot title
xlab("Genes") +
ylab("log10 Normalized Counts") +
ggtitle("Expression of TCR Signaling Genes: Cohort 1") +
# display stats; replace wilcox_test with t_test based on results of normalization test
geom_pwc(aes(group=phenotype), method = "wilcox_test", dodge=0.6, tip.length=0, hide.ns = TRUE, label = "p.signif") +
# set style preferences
theme_bw() +
coord_cartesian(clip = "off") +
theme(axis.text.x = element_text(size=14, face="bold", angle=45, hjust=1),
plot.title = element_text(hjust=0.5),
legend.text = element_text(size=12),
legend.title = element_text(size=14))
############ GENE LIST 4 ############
ggplot(gathered_countsSubset4, aes(x=gene_name, y=normalized_counts, color=phenotype, shape=phenotype)) +
#geom_boxplot(position=position_dodge(0.3)) + # Un-comment this line to overlay box plots
geom_jitter(size=3, position=position_dodge(0.5)) +
scale_color_manual(values = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "khaki3", "CD4_CTRL" = "coral")) + # replace "Phenotype1" etc. with appropriate phenotypes from your metadata file
scale_y_log10() +
# set axis labels and plot title
xlab("Genes") +
ylab("log10 Normalized Counts") +
ggtitle("Expression of CD25 and MHC Class II Genes: Cohort 1") +
# display stats; replace wilcox_test with t_test based on results of normalization test
geom_pwc(aes(group=phenotype), method = "wilcox_test", dodge=0.6, tip.length=0, hide.ns = TRUE, label = "p.signif") +
# set style preferences
theme_bw() +
coord_cartesian(clip = "off") +
theme(axis.text.x = element_text(size=14, face="bold", angle=45, hjust=1),
plot.title = element_text(hjust=0.5),
legend.text = element_text(size=12),
legend.title = element_text(size=14))
############ GENE LIST 1 ############
ggplot(gathered_countsSubset1, aes(x=phenotype, y=normalized_counts, color=phenotype, shape=phenotype)) +
geom_jitter(size=2, position=position_dodge(0.3)) +
scale_y_log10() +
facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL", "CD8_PTCL", "CD8_CTRL", "DN_PTCL")) + # change the order of items along the X axis
# set axis labels and plot title
labs(x="Group",
y="log10 Normalized Counts",
fill="Group",
title="Gene Expression: Cohort 1") +
# display stats
stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL"), c("CD8_PTCL", "CD8_CTRL"), c("DN_PTCL", "CD4_CTRL"), c("DN_PTCL", "CD8_CTRL")),
tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
# set style preferences
theme_bw() +
theme(plot.title= element_text(hjust = 0.5))
############ GENE LIST 2 ############
ggplot(gathered_countsSubset2, aes(x=phenotype, y=normalized_counts, color=phenotype, shape=phenotype)) +
geom_jitter(size=2, position=position_dodge(0.3)) +
scale_y_log10() +
facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL", "CD8_PTCL", "CD8_CTRL", "DN_PTCL")) + # change the order of items along the X axis
# set axis labels and plot title
labs(x="Group",
y="log10 Normalized Counts",
fill="Group",
title="Expression of Markers of Immaturity: Cohort 1") +
# display stats
stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL"), c("CD8_PTCL", "CD8_CTRL"), c("DN_PTCL", "CD4_CTRL"), c("DN_PTCL", "CD8_CTRL")),
tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
# set style preferences
theme_bw() +
theme(plot.title= element_text(hjust = 0.5))
############ GENE LIST 3 ############
ggplot(gathered_countsSubset3, aes(x=phenotype, y=normalized_counts, color=phenotype, shape=phenotype)) +
geom_jitter(size=2, position=position_dodge(0.3)) +
scale_y_log10() +
facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL", "CD8_PTCL", "CD8_CTRL", "DN_PTCL")) + # change the order of items along the X axis
# set axis labels and plot title
labs(x="Group",
y="log10 Normalized Counts",
fill="Group",
title="Expression of TCR Signaling Genes: Cohort 1") +
# display stats
stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL"), c("CD8_PTCL", "CD8_CTRL"), c("DN_PTCL", "CD4_CTRL"), c("DN_PTCL", "CD8_CTRL")),
tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
# set style preferences
theme_bw() +
theme(plot.title= element_text(hjust = 0.5))
############ GENE LIST 4 ############
ggplot(gathered_countsSubset4, aes(x=phenotype, y=normalized_counts, color=phenotype, shape=phenotype)) +
geom_jitter(size=2, position=position_dodge(0.3)) +
scale_y_log10() +
facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL", "CD8_PTCL", "CD8_CTRL", "DN_PTCL")) + # change the order of items along the X axis
# set axis labels and plot title
labs(x="Group",
y="log10 Normalized Counts",
fill="Group",
title="Expression of CD25 and MHC Class II Genes: Cohort 1") +
# display stats
stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL"), c("CD8_PTCL", "CD8_CTRL"), c("DN_PTCL", "CD4_CTRL"), c("DN_PTCL", "CD8_CTRL")),
tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
# set style preferences
theme_bw() +
theme(plot.title= element_text(hjust = 0.5))
############ GENE LIST 1 ############
ggplot(gathered_countsSubset1, aes(phenotype, normalized_counts, fill=phenotype)) +
geom_boxplot() +
scale_y_log10() +
facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL", "CD8_PTCL", "CD8_CTRL", "DN_PTCL")) + # change the order of items along the X axis
# set axis labels and plot title
labs(x="Group",
y="log10 Normalized Counts",
fill="Group",
title="Gene Expression: Cohort 1") +
# display stats
stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL"), c("CD8_PTCL", "CD8_CTRL"), c("DN_PTCL", "CD4_CTRL"), c("DN_PTCL", "CD8_CTRL")),
tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
# set style preferences
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
############ GENE LIST 2 ############
ggplot(gathered_countsSubset2, aes(phenotype, normalized_counts, fill=phenotype)) +
geom_boxplot() +
scale_y_log10() +
facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL", "CD8_PTCL", "CD8_CTRL", "DN_PTCL")) + # change the order of items along the X axis
# set axis labels and plot title
labs(x="Group",
y="log10 Normalized Counts",
fill="Group",
title="Expression of Markers of Immaturity: Cohort 1") +
# display stats
stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL"), c("CD8_PTCL", "CD8_CTRL"), c("DN_PTCL", "CD4_CTRL"), c("DN_PTCL", "CD8_CTRL")),
tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
# set style preferences
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
############ GENE LIST 3 ############
ggplot(gathered_countsSubset3, aes(phenotype, normalized_counts, fill=phenotype)) +
geom_boxplot() +
scale_y_log10() +
facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL", "CD8_PTCL", "CD8_CTRL", "DN_PTCL")) + # change the order of items along the X axis
# set axis labels and plot title
labs(x="Group",
y="log10 Normalized Counts",
fill="Group",
title="Expression of TCR Signaling Genes: Cohort 1") +
# display stats
stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL"), c("CD8_PTCL", "CD8_CTRL"), c("DN_PTCL", "CD4_CTRL"), c("DN_PTCL", "CD8_CTRL")),
tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
# set style preferences
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
############ GENE LIST 4 ############
ggplot(gathered_countsSubset4, aes(phenotype, normalized_counts, fill=phenotype)) +
geom_boxplot() +
scale_y_log10() +
facet_wrap(~gene_name, scales="free") + # free axis scales (as opposed to fixed) are preferred as count data may vary between the individual plots
scale_x_discrete(limits=c("CD4_PTCL", "CD4_CTRL", "CD8_PTCL", "CD8_CTRL", "DN_PTCL")) + # change the order of items along the X axis
# set axis labels and plot title
labs(x="Group",
y="log10 Normalized Counts",
fill="Group",
title="Expression of CD25 and MHC Class II Genes: Cohort 1") +
# display stats
stat_compare_means(comparisons = list(c("CD4_PTCL", "CD4_CTRL"), c("CD8_PTCL", "CD8_CTRL"), c("DN_PTCL", "CD4_CTRL"), c("DN_PTCL", "CD8_CTRL")),
tip.length=0, size=3, method = "wilcox.test", label = "p.signif") +
# set style preferences
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
Evaluate the sample-to-sample correlation by calculating the distances (dissimilarities) between each sample and creating a matrix of distance values. Since the majority of genes are not differentially expressed, samples generally have high correlations with each other (>0.80). Samples with low correlation may indicate outliers or sample contamination.
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$phenotype, colnames(vsd), sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
Heatmap grouping samples based on their gene expression similarity. The color blocks indicate substructure in the data, so replicates should cluster together as a block for each sample group.
p <- pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors,
main="Correlation Matrix: Cohort 1")
Volcano plots display the fold change against the p-value.
volcano_data1 <- CD4_PTCLvsCD4_CTRL_final_res
EnhancedVolcano(volcano_data1,
lab = volcano_data1$gene_name,
x = 'log2FoldChange',
y = 'padj',
title = 'CD4_PTCL vs. CD4_CTRL, Cohort 1',
FCcutoff = 1.5,
pCutoff = 0.05,
pointSize = 2.0,
labSize = 4.0,
labCol = 'black',
labFace = 'bold',
colAlpha = 4/5,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 6.0,
drawConnectors = TRUE,
widthConnectors = 0.2,
colConnectors = 'black'
)
# Repeat for as many comparisons as desired
volcano_data2 <- CD4_PTCLvsCD8_PTCL_final_res
EnhancedVolcano(volcano_data2,
lab = volcano_data2$gene_name,
x = 'log2FoldChange',
y = 'padj',
title = 'CD4_PTCL vs. CD8_PTCL, Cohort 1',
FCcutoff = 1.5,
pCutoff = 0.05,
pointSize = 2.0,
labSize = 4.0,
labCol = 'black',
labFace = 'bold',
colAlpha = 4/5,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 6.0,
drawConnectors = TRUE,
widthConnectors = 0.2,
colConnectors = 'black'
)
volcano_data3 <- CD4_PTCLvsDN_PTCL_final_res
EnhancedVolcano(volcano_data3,
lab = volcano_data2$gene_name,
x = 'log2FoldChange',
y = 'padj',
title = 'CD4_PTCL vs. DN_PTCL, Cohort 1',
FCcutoff = 1.5,
pCutoff = 0.05,
pointSize = 2.0,
labSize = 4.0,
labCol = 'black',
labFace = 'bold',
colAlpha = 4/5,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 6.0,
drawConnectors = TRUE,
widthConnectors = 0.2,
colConnectors = 'black'
)
volcano_data4 <- CD8_PTCLvsCD8_CTRL_final_res
EnhancedVolcano(volcano_data4,
lab = volcano_data2$gene_name,
x = 'log2FoldChange',
y = 'padj',
title = 'CD8_PTCL vs. CD8_CTRL, Cohort 1',
FCcutoff = 1.5,
pCutoff = 0.05,
pointSize = 2.0,
labSize = 4.0,
labCol = 'black',
labFace = 'bold',
colAlpha = 4/5,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 6.0,
drawConnectors = TRUE,
widthConnectors = 0.2,
colConnectors = 'black'
)
volcano_data5 <- DN_PTCLvsCD8_CTRL_final_res
EnhancedVolcano(volcano_data5,
lab = volcano_data2$gene_name,
x = 'log2FoldChange',
y = 'padj',
title = 'DN_PTCL vs. CD8_CTRL, Cohort 1',
FCcutoff = 1.5,
pCutoff = 0.05,
pointSize = 2.0,
labSize = 4.0,
labCol = 'black',
labFace = 'bold',
colAlpha = 4/5,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 6.0,
drawConnectors = TRUE,
widthConnectors = 0.2,
colConnectors = 'black'
)
volcano_data6 <- DN_PTCLvsCD4_CTRL_final_res
EnhancedVolcano(volcano_data6,
lab = volcano_data2$gene_name,
x = 'log2FoldChange',
y = 'padj',
title = 'DN_PTCL vs. CD4_CTRL, Cohort 1',
FCcutoff = 1.5,
pCutoff = 0.05,
pointSize = 2.0,
labSize = 4.0,
labCol = 'black',
labFace = 'bold',
colAlpha = 4/5,
legendPosition = 'right',
legendLabSize = 12,
legendIconSize = 6.0,
drawConnectors = TRUE,
widthConnectors = 0.2,
colConnectors = 'black'
)
# All samples
saveRDS(dds, file="Cohort_1/Output/Cohort1_dds.Rdata")
# CD4 PTCLs and CD4 controls only
keepGroups <- c("CD4_PTCL", "CD4_CTRL")
metadata_CD4 <- metadata %>%
filter(phenotype %in% keepGroups)
keepList <- rownames(metadata_CD4)
countsData_CD4 <- countsData[, colnames(countsData) %in% keepList]
cts_CD4 <- as.matrix(countsData_CD4)
coldata_CD4 <- metadata_CD4[, c("phenotype"), drop=FALSE]
coldata_CD4$phenotype <- as.factor(coldata_CD4$phenotype)
dds_CD4 <- DESeqDataSetFromMatrix(countData = cts_CD4,
colData = coldata_CD4,
design = ~ phenotype)
keep <- rowSums(counts(dds_CD4)) >= 10
dds_CD4 <- dds_CD4[keep,]
dds_CD4 <- dds_CD4[,-12] # remove CF21 outlier
dds_CD4 <- DESeq(dds_CD4)
saveRDS(dds_CD4, file="Cohort_1/Output/Cohort1_CD4sOnly_dds.Rdata")
# All samples
vsd <- assay(vst(dds))
vsd <- as.data.frame(vsd, drop=FALSE) # Convert to a data frame.
vsd$probe_id <- rownames(vsd) # Add probe ID as a column
vsd <- merge(as(vsd,"data.frame"), genenames, by="probe_id", all.x=TRUE) # Adds a column of gene names for the associated probe ID
write.csv(vsd, file="Cohort_1/Output/VST_NormalizedCounts.csv") # Export as csv
# CD4s only
vsd <- assay(vst(dds_CD4))
vsd <- as.data.frame(vsd, drop=FALSE) # Convert to a data frame.
vsd$probe_id <- rownames(vsd) # Add probe ID as a column
vsd <- merge(as(vsd,"data.frame"), genenames, by="probe_id", all.x=TRUE) # Adds a column of gene names for the associated probe ID
write.csv(vsd, file="Cohort_1/Output/VST_NormalizedCounts_CD4sOnly.csv") # Export as csv
sessionInfo()
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Denver
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggpubr_0.6.0 EnhancedVolcano_1.22.0
## [3] ggrepel_0.9.6 rlang_1.1.3
## [5] lubridate_1.9.4 forcats_1.0.0
## [7] stringr_1.5.1 dplyr_1.1.4
## [9] purrr_1.0.2 readr_2.1.5
## [11] tidyr_1.3.1 tibble_3.2.1
## [13] ggplot2_3.5.1 tidyverse_2.0.0
## [15] apeglm_1.26.1 pheatmap_1.0.12
## [17] RColorBrewer_1.1-3 corrplot_0.95
## [19] DESeq2_1.44.0 SummarizedExperiment_1.34.0
## [21] Biobase_2.64.0 MatrixGenerics_1.16.0
## [23] matrixStats_1.4.1 GenomicRanges_1.56.2
## [25] GenomeInfoDb_1.40.1 IRanges_2.38.1
## [27] S4Vectors_0.42.1 BiocGenerics_0.50.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 farver_2.1.2 fastmap_1.2.0
## [4] digest_0.6.35 timechange_0.3.0 lifecycle_1.0.4
## [7] magrittr_2.0.3 compiler_4.4.0 sass_0.4.9
## [10] tools_4.4.0 yaml_2.3.10 ggsignif_0.6.4
## [13] knitr_1.49 labeling_0.4.3 S4Arrays_1.4.1
## [16] DelayedArray_0.30.1 plyr_1.8.9 abind_1.4-8
## [19] BiocParallel_1.38.0 withr_3.0.2 numDeriv_2016.8-1.1
## [22] grid_4.4.0 colorspace_2.1-1 scales_1.3.0
## [25] MASS_7.3-60.2 bbmle_1.0.25.1 cli_3.6.2
## [28] mvtnorm_1.3-3 rmarkdown_2.29 crayon_1.5.3
## [31] generics_0.1.3 rstudioapi_0.17.1 httr_1.4.7
## [34] tzdb_0.4.0 bdsmatrix_1.3-7 cachem_1.1.0
## [37] zlibbioc_1.50.0 parallel_4.4.0 XVector_0.44.0
## [40] vctrs_0.6.5 Matrix_1.7-0 carData_3.0-5
## [43] jsonlite_1.8.9 car_3.1-3 hms_1.1.3
## [46] rstatix_0.7.2 Formula_1.2-5 locfit_1.5-9.10
## [49] jquerylib_0.1.4 glue_1.7.0 emdbook_1.3.13
## [52] codetools_0.2-20 stringi_1.8.4 gtable_0.3.6
## [55] UCSC.utils_1.0.0 munsell_0.5.1 pillar_1.10.1
## [58] htmltools_0.5.8.1 GenomeInfoDbData_1.2.12 R6_2.5.1
## [61] evaluate_1.0.3 lattice_0.22-6 backports_1.5.0
## [64] broom_1.0.7 bslib_0.8.0 Rcpp_1.0.13
## [67] coda_0.19-4.1 SparseArray_1.4.8 xfun_0.49
## [70] pkgconfig_2.0.3
citation()
## To cite R in publications use:
##
## R Core Team (2024). _R: A Language and Environment for Statistical
## Computing_. R Foundation for Statistical Computing, Vienna, Austria.
## <https://www.R-project.org/>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {R: A Language and Environment for Statistical Computing},
## author = {{R Core Team}},
## organization = {R Foundation for Statistical Computing},
## address = {Vienna, Austria},
## year = {2024},
## url = {https://www.R-project.org/},
## }
##
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.