Introduction

This script performs clustering and draws heatmaps of RNA-seq data, with a variety of filtering options.

Data

Variance stabilized transformed count matrix

vsd_cohort1 <- read.csv("Cohort_1/Output/VST_NormalizedCounts.csv")
vsd_cohort1_CD4s <- read.csv("Cohort_1/Output/VST_NormalizedCounts_CD4sOnly.csv")
vsd_cohort2 <- read.csv("Cohort_2/Output/VST_NormalizedCounts.csv")

## Make gene names the row names
rownames(vsd_cohort1) <- make.unique(as.character(vsd_cohort1$gene_name)) # make.unique accounts for duplicate gene names since R won't allow duplicate rownames
rownames(vsd_cohort1_CD4s) <- make.unique(as.character(vsd_cohort1_CD4s$gene_name))
rownames(vsd_cohort2) <- make.unique(as.character(vsd_cohort2$gene_name))

## Remove any columns other than sample count data
vsd_cohort1 <- dplyr::select(vsd_cohort1, -c("X", "probe_id", "gene_name", "description")) # exclude any columns that are not sample IDs
vsd_cohort1_CD4s <- dplyr::select(vsd_cohort1_CD4s, -c("X", "probe_id", "gene_name", "description"))
vsd_cohort2 <- dplyr::select(vsd_cohort2, -c("X", "probe_id", "gene_name", "description"))

Metadata

metadata1 <- read.table(file = "Cohort_1/Input/metadata.txt", header = FALSE)
metadata2 <- read.table(file = "Cohort_2/Input/metadata.txt", header = FALSE)
colnames(metadata1) <- c("avery_num", "sample_name", "phenotype")
colnames(metadata2) <- c("avery_num", "sample_name", "phenotype")
rownames(metadata1) <- metadata1$sample_name
metadata1 <- dplyr::select(metadata1, c("phenotype"))
rownames(metadata2) <- metadata2$sample_name
metadata2 <- dplyr::select(metadata2, c("phenotype"))

keepGroups <- c("CD4_PTCL", "CD4_CTRL")
metadata_CD4 <- metadata1 %>%
  filter(phenotype %in% keepGroups)

Unsupervised clustering of the top 2000 genes (by median absolute derivation)

# Calculate the median absolute derivation for all rows in the vst transformed data
median_absolute_derivation_cohort1 = apply(vsd_cohort1,1,mad) # "1" indicates that the manipulation is performed on rows
median_absolute_derivation_cohort1_CD4s = apply(vsd_cohort1_CD4s,1,mad)
median_absolute_derivation_cohort2 = apply(vsd_cohort2,1,mad)

# check data distribution
hist(median_absolute_derivation_cohort1, ylim=c(0,200), main=c("Histogram of median absolute derivation, Cohort 1"), breaks=nrow(vsd_cohort1)*0.1)

hist(median_absolute_derivation_cohort1_CD4s, ylim=c(0,200), main=c("Histogram of median absolute derivation, Cohort 1, \nCD4 PTCLs and CD4 Controls Only"), breaks=nrow(vsd_cohort1)*0.1)

hist(median_absolute_derivation_cohort2, ylim=c(0,200), main=c("Histogram of median absolute derivation, Cohort 2"), breaks=nrow(vsd_cohort2)*0.1)

# index the vst transformed count data to include only those rows (genes) that appeared in the top 2000 based on median absolute derivation.
mad2k_cohort1=vsd_cohort1[rev(order(median_absolute_derivation_cohort1))[1:2000],]
mad2k_cohort1_CD4s=vsd_cohort1_CD4s[rev(order(median_absolute_derivation_cohort1_CD4s))[1:2000],]
mad2k_cohort2=vsd_cohort2[rev(order(median_absolute_derivation_cohort2))[1:2000],]

Euclidean distance

# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort1 = list(
  phenotype = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral"))

# Draw heatmap
mad_heatmap_cohort1 <- pheatmap(mad2k_cohort1, 
              scale="row",
              color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
              cluster_rows=TRUE,
              cluster_cols=TRUE,
              #cutree_rows = 3,
              cutree_cols = 3,
              main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 1 \n Clustering: Ward, Distance: Euclidean",
              clustering_distance_rows = "euclidean",
              clustering_distance_cols = "euclidean",
              clustering_method = "ward.D2",
              annotation_col = metadata1,
              annotation_colors = ann_colors_cohort1,
              show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.

# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort1_CD4 = list(
  phenotype = c("CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral"))

# Draw heatmap
mad_heatmap_cohort1_CD4 <- pheatmap(mad2k_cohort1_CD4s,
              scale="row",
              color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
              cluster_rows=TRUE,
              cluster_cols=TRUE,
              cutree_cols = 2,
              main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 1 \n Clustering: Ward, Distance: Euclidean",
              clustering_distance_rows = "euclidean",
              clustering_distance_cols = "euclidean",
              clustering_method = "ward.D2",
              annotation_col = metadata_CD4,
              annotation_colors = ann_colors_cohort1_CD4,
              show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.

# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort2 = list(
  phenotype = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue"))

# Draw heatmap
mad_heatmap_cohort2 <- pheatmap(mad2k_cohort2, 
              scale="row",
              color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
              cluster_rows=TRUE,
              cluster_cols=TRUE,
              cutree_cols = 2,
              main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 2 \n Clustering: Ward, Distance: Euclidean",
              clustering_distance_rows = "euclidean",
              clustering_distance_cols = "euclidean",
              clustering_method = "ward.D2",
              annotation_col = metadata2,
              annotation_colors = ann_colors_cohort2,
              show_rownames = FALSE)

Pearson correlation

mad_heatmap_cohort1 <- pheatmap(mad2k_cohort1, 
              scale="row",
              color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
              cluster_rows=TRUE,
              cluster_cols=TRUE,
              cutree_cols = 3,
              main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 1\n Clustering: Ward, Distance: Pearson correlation",
              clustering_distance_rows = "correlation",
              clustering_distance_cols = "correlation",
              clustering_method = "ward.D2",
              annotation_col = metadata1,
              annotation_colors = ann_colors_cohort1,
              show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.

mad_heatmap_cohort1_CD4 <- pheatmap(mad2k_cohort1_CD4s, 
              scale="row",
              color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
              cluster_rows=TRUE,
              cluster_cols=TRUE,
              cutree_cols = 2,
              main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 1\n Clustering: Ward, Distance: Pearson correlation",
              clustering_distance_rows = "correlation",
              clustering_distance_cols = "correlation",
              clustering_method = "ward.D2",
              annotation_col = metadata_CD4,
              annotation_colors = ann_colors_cohort1_CD4,
              show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.

mad_heatmap_cohort2 <- pheatmap(mad2k_cohort2, 
              scale="row",
              color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
              cluster_rows=TRUE,
              cluster_cols=TRUE,
              cutree_cols = 2,
              main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 2\n Clustering: Ward, Distance: Pearson correlation",
              clustering_distance_rows = "correlation",
              clustering_distance_cols = "correlation",
              clustering_method = "ward.D2",
              annotation_col = metadata2,
              annotation_colors = ann_colors_cohort2,
              show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.

Clustering based on expression of human PTCL gene signatures

Iqbal et al.

Source: https://doi.org/10.1182/blood-2013-11-536359

# Define list of genes and their groups
geneList <- c("TBX21", "IFNG", "STAT1", "CSF2", "CCL3", "EOMES", "IL2RB", "CXCR3", "CD28", "AXL", "CD40", "CD59", "FTL", "LILRB1", "SIRPA",
              "GATA3", "EGR1", "SEPTIN6", "CAT", "IL18R1", "IK", "ACKR3", "CCR4", "MSH6")
geneGroups <- c("TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL",
                "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL")
geneGroup <- data.frame(Group = geneGroups)
rownames(geneGroup) <- geneList

# subset vst data to include only those genes
vsd_geneList_cohort1 <- vsd_cohort1 %>%
  filter(rownames(vsd_cohort1) %in% geneList)

vsd_geneList_cohort2 <- vsd_cohort2 %>%
  filter(rownames(vsd_cohort2) %in% geneList)

Euclidean distance

# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort1 = list(
  phenotype = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral"),
  Group = c("TBX21-PTCL" = "steelblue", "GATA3-PTCL" = "red4"))

ann_colors_cohort2 = list(
  phenotype = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue"),
  Group = c("TBX21-PTCL" = "steelblue", "GATA3-PTCL" = "red4"))

# Generate heatmap
vst_heatmap1 <- pheatmap(vsd_geneList_cohort1,
                        scale="row", 
                        color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
                        cluster_rows=TRUE, 
                        cluster_cols=TRUE,
                        cutree_rows = 2,
                        cutree_cols = 3,
                        clustering_distance_rows = "euclidean",
                        clustering_distance_cols = "euclidean",
                        clustering_method = "ward.D2",
                        main = "Input: Vst transformed normalized DESeq2 counts, Cohort 1\n Clustering: Ward, Distance: Euclidean",
                        annotation_col = metadata1,
                       annotation_row = geneGroup,
                       annotation_colors = ann_colors_cohort1,
                       show_rownames = TRUE) # show gene names on the heatmap

vst_heatmap2 <- pheatmap(vsd_geneList_cohort2,
                        scale="row", 
                        color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
                        cluster_rows=TRUE, 
                        cluster_cols=TRUE,
                        clustering_distance_rows = "euclidean",
                        clustering_distance_cols = "euclidean",
                        cutree_rows = 3,
                        cutree_cols = 3,
                        clustering_method = "ward.D2",
                        main = "Input: Vst transformed normalized DESeq2 counts, Cohort 2\n Clustering: Ward, Distance: Euclidean",
                        annotation_col = metadata2,
                       annotation_row = geneGroup,
                       annotation_colors = ann_colors_cohort2,
                       show_rownames = TRUE) # show gene names on the heatmap

Pearson correlation

vst_heatmap1 <- pheatmap(vsd_geneList_cohort1,
                        scale="row", 
                        color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
                        cluster_rows=TRUE, 
                        cluster_cols=TRUE,
                        clustering_distance_rows = "correlation",
                        clustering_distance_cols = "correlation",
                        cutree_rows = 2,
                        cutree_cols = 2,
                        clustering_method = "ward.D2",
                        main = "Input: Vst transformed normalized DESeq2 counts, Cohort1\nClustering: Ward, Distance: Pearson correlation",
                        annotation_col = metadata1,
                       annotation_row = geneGroup,
                       annotation_colors = ann_colors_cohort1,
                       show_rownames = TRUE) # show gene names on the heatmap

vst_heatmap2 <- pheatmap(vsd_geneList_cohort2,
                        scale="row", 
                        color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
                        cluster_rows=TRUE, 
                        cluster_cols=TRUE,
                        clustering_distance_rows = "correlation",
                        clustering_distance_cols = "correlation",
                        cutree_rows = 2,
                        cutree_cols = 3,
                        clustering_method = "ward.D2",
                        main = "Input: Vst transformed normalized DESeq2 counts, Cohort2\nClustering: Ward, Distance: Pearson correlation",
                        annotation_col = metadata2,
                       annotation_row = geneGroup,
                       annotation_colors = ann_colors_cohort2,
                       show_rownames = TRUE) # show gene names on the heatmap

Herek et al.

Source: Fig. 1A of https://doi.org/10.1182/blood.2021015019

# Define list of genes and their groups
geneList <- c("TBX21", "IFNG", "STAT1", "CSF2",
              "GATA3", "EGR1", "SEPTIN6", "CAT")
geneGroups <- c("TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL",
                "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL")
geneGroup <- data.frame(Group = geneGroups)
rownames(geneGroup) <- geneList

# subset vst data to include only those genes
vsd_geneList_cohort1 <- vsd_cohort1 %>%
  filter(rownames(vsd_cohort1) %in% geneList)

vsd_geneList_cohort2 <- vsd_cohort2 %>%
  filter(rownames(vsd_cohort2) %in% geneList)

Euclidean distance

# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort1 = list(
  phenotype = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral"),
  Group = c("TBX21-PTCL" = "steelblue", "GATA3-PTCL" = "red4"))

# Generate heatmap
vst_heatmap1 <- pheatmap(vsd_geneList_cohort1,
                        scale="row", 
                        color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
                        cluster_rows=TRUE, 
                        cluster_cols=TRUE,
                        cutree_rows = 2,
                        cutree_cols = 2,
                        clustering_distance_rows = "euclidean",
                        clustering_distance_cols = "euclidean",
                        clustering_method = "ward.D2",
                        main = "Input: Vst transformed normalized DESeq2 counts, Cohort 1\n Clustering: Ward, Distance: Euclidean",
                        annotation_col = metadata1,
                       annotation_row = geneGroup,
                       annotation_colors = ann_colors_cohort1,
                       show_rownames = TRUE) # show gene names on the heatmap

ann_colors_cohort2 = list(
  phenotype = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue"),
  Group = c("TBX21-PTCL" = "steelblue", "GATA3-PTCL" = "red4"))

# Generate heatmap
vst_heatmap2 <- pheatmap(vsd_geneList_cohort2,
                        scale="row", 
                        color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
                        cluster_rows=TRUE, 
                        cluster_cols=TRUE,
                        cutree_rows = 2,
                        cutree_cols = 2,
                        clustering_distance_rows = "euclidean",
                        clustering_distance_cols = "euclidean",
                        clustering_method = "ward.D2",
                        main = "Input: Vst transformed normalized DESeq2 counts, Cohort 2\n Clustering: Ward, Distance: Euclidean",
                        annotation_col = metadata2,
                       annotation_row = geneGroup,
                       annotation_colors = ann_colors_cohort2,
                       show_rownames = TRUE) # show gene names on the heatmap

Citations

sessionInfo()
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggplot2_3.5.1               knitr_1.49                 
##  [3] readr_2.1.5                 dplyr_1.1.4                
##  [5] pheatmap_1.0.12             RColorBrewer_1.1-3         
##  [7] DESeq2_1.44.0               SummarizedExperiment_1.34.0
##  [9] Biobase_2.64.0              MatrixGenerics_1.16.0      
## [11] matrixStats_1.4.1           GenomicRanges_1.56.2       
## [13] GenomeInfoDb_1.40.1         IRanges_2.38.1             
## [15] S4Vectors_0.42.1            BiocGenerics_0.50.0        
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6            xfun_0.49               bslib_0.8.0            
##  [4] lattice_0.22-6          tzdb_0.4.0              vctrs_0.6.5            
##  [7] tools_4.4.0             generics_0.1.3          parallel_4.4.0         
## [10] tibble_3.2.1            fansi_1.0.6             pkgconfig_2.0.3        
## [13] Matrix_1.7-0            lifecycle_1.0.4         GenomeInfoDbData_1.2.12
## [16] farver_2.1.2            compiler_4.4.0          munsell_0.5.1          
## [19] codetools_0.2-20        htmltools_0.5.8.1       sass_0.4.9             
## [22] yaml_2.3.8              pillar_1.9.0            crayon_1.5.3           
## [25] jquerylib_0.1.4         BiocParallel_1.38.0     DelayedArray_0.30.1    
## [28] cachem_1.1.0            abind_1.4-8             tidyselect_1.2.1       
## [31] locfit_1.5-9.10         digest_0.6.35           fastmap_1.2.0          
## [34] grid_4.4.0              colorspace_2.1-1        cli_3.6.2              
## [37] SparseArray_1.4.8       magrittr_2.0.3          S4Arrays_1.4.1         
## [40] utf8_1.2.4              withr_3.0.2             scales_1.3.0           
## [43] UCSC.utils_1.0.0        rmarkdown_2.29          XVector_0.44.0         
## [46] httr_1.4.7              hms_1.1.3               evaluate_1.0.1         
## [49] rlang_1.1.3             Rcpp_1.0.13             glue_1.7.0             
## [52] rstudioapi_0.17.1       jsonlite_1.8.9          R6_2.5.1               
## [55] zlibbioc_1.50.0
citation()
## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.