Introduction

This script performs clustering and draws heatmaps of RNA-seq data, with a variety of filtering options.

Data

Variance stabilized transformed count matrix

vsd_cohort1 <- read.csv("Cohort_1/Output/VST_NormalizedCounts.csv")
vsd_cohort1_CD4s <- read.csv("Cohort_1/Output/VST_NormalizedCounts_CD4sOnly.csv")
vsd_cohort2 <- read.csv("Cohort_2/Output/VST_NormalizedCounts.csv")

## Make gene names the row names
rownames(vsd_cohort1) <- make.unique(as.character(vsd_cohort1$gene_name)) # make.unique accounts for duplicate gene names since R won't allow duplicate rownames
rownames(vsd_cohort1_CD4s) <- make.unique(as.character(vsd_cohort1_CD4s$gene_name))
rownames(vsd_cohort2) <- make.unique(as.character(vsd_cohort2$gene_name))

## Remove any columns other than sample count data
vsd_cohort1 <- dplyr::select(vsd_cohort1, -c("X", "probe_id", "gene_name", "description")) # exclude any columns that are not sample IDs
vsd_cohort1_CD4s <- dplyr::select(vsd_cohort1_CD4s, -c("X", "probe_id", "gene_name", "description"))
vsd_cohort2 <- dplyr::select(vsd_cohort2, -c("X", "probe_id", "gene_name", "description"))

Metadata

metadata1 <- read.table(file = "Cohort_1/Input/metadata.txt", header = FALSE)
metadata2 <- read.table(file = "Cohort_2/Input/metadata.txt", header = FALSE)
colnames(metadata1) <- c("avery_num", "sample_name", "phenotype")
colnames(metadata2) <- c("avery_num", "sample_name", "phenotype")
rownames(metadata1) <- metadata1$sample_name
metadata1 <- dplyr::select(metadata1, c("phenotype"))
rownames(metadata2) <- metadata2$sample_name
metadata2 <- dplyr::select(metadata2, c("phenotype"))

keepGroups <- c("CD4_PTCL", "CD4_CTRL")
metadata_CD4 <- metadata1 %>%
  filter(phenotype %in% keepGroups)

Unsupervised clustering of the top 2000 genes (by median absolute derivation)

# Calculate the median absolute derivation for all rows in the vst transformed data
median_absolute_derivation_cohort1 = apply(vsd_cohort1,1,mad) # "1" indicates that the manipulation is performed on rows
median_absolute_derivation_cohort1_CD4s = apply(vsd_cohort1_CD4s,1,mad)
median_absolute_derivation_cohort2 = apply(vsd_cohort2,1,mad)

# check data distribution
hist(median_absolute_derivation_cohort1, ylim=c(0,200), main=c("Histogram of median absolute derivation, Cohort 1"), breaks=nrow(vsd_cohort1)*0.1)

hist(median_absolute_derivation_cohort1_CD4s, ylim=c(0,200), main=c("Histogram of median absolute derivation, Cohort 1, \nCD4 PTCLs and CD4 Controls Only"), breaks=nrow(vsd_cohort1)*0.1)

hist(median_absolute_derivation_cohort2, ylim=c(0,200), main=c("Histogram of median absolute derivation, Cohort 2"), breaks=nrow(vsd_cohort2)*0.1)

# index the vst transformed count data to include only those rows (genes) that appeared in the top 2000 based on median absolute derivation.
mad2k_cohort1=vsd_cohort1[rev(order(median_absolute_derivation_cohort1))[1:2000],]
mad2k_cohort1_CD4s=vsd_cohort1_CD4s[rev(order(median_absolute_derivation_cohort1_CD4s))[1:2000],]
mad2k_cohort2=vsd_cohort2[rev(order(median_absolute_derivation_cohort2))[1:2000],]

Euclidean distance

# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort1 = list(
  phenotype = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral"))

# Draw heatmap
mad_heatmap_cohort1 <- pheatmap(mad2k_cohort1, 
              scale="row",
              color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
              cluster_rows=TRUE,
              cluster_cols=TRUE,
              #cutree_rows = 3,
              cutree_cols = 3,
              main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 1 \n Clustering: Ward, Distance: Euclidean",
              clustering_distance_rows = "euclidean",
              clustering_distance_cols = "euclidean",
              clustering_method = "ward.D2",
              annotation_col = metadata1,
              annotation_colors = ann_colors_cohort1,
              show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.

# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort1_CD4 = list(
  phenotype = c("CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral"))

# Draw heatmap
mad_heatmap_cohort1_CD4 <- pheatmap(mad2k_cohort1_CD4s,
              scale="row",
              color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
              cluster_rows=TRUE,
              cluster_cols=TRUE,
              cutree_cols = 2,
              main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 1 \n Clustering: Ward, Distance: Euclidean",
              clustering_distance_rows = "euclidean",
              clustering_distance_cols = "euclidean",
              clustering_method = "ward.D2",
              annotation_col = metadata_CD4,
              annotation_colors = ann_colors_cohort1_CD4,
              show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.

# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort2 = list(
  phenotype = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue"))

# Draw heatmap
mad_heatmap_cohort2 <- pheatmap(mad2k_cohort2, 
              scale="row",
              color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
              cluster_rows=TRUE,
              cluster_cols=TRUE,
              cutree_cols = 2,
              main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 2 \n Clustering: Ward, Distance: Euclidean",
              clustering_distance_rows = "euclidean",
              clustering_distance_cols = "euclidean",
              clustering_method = "ward.D2",
              annotation_col = metadata2,
              annotation_colors = ann_colors_cohort2,
              show_rownames = FALSE)

Pearson correlation

mad_heatmap_cohort1 <- pheatmap(mad2k_cohort1, 
              scale="row",
              color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
              cluster_rows=TRUE,
              cluster_cols=TRUE,
              cutree_cols = 3,
              main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 1\n Clustering: Ward, Distance: Pearson correlation",
              clustering_distance_rows = "correlation",
              clustering_distance_cols = "correlation",
              clustering_method = "ward.D2",
              annotation_col = metadata1,
              annotation_colors = ann_colors_cohort1,
              show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.

mad_heatmap_cohort1_CD4 <- pheatmap(mad2k_cohort1_CD4s, 
              scale="row",
              color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
              cluster_rows=TRUE,
              cluster_cols=TRUE,
              cutree_cols = 2,
              main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 1\n Clustering: Ward, Distance: Pearson correlation",
              clustering_distance_rows = "correlation",
              clustering_distance_cols = "correlation",
              clustering_method = "ward.D2",
              annotation_col = metadata_CD4,
              annotation_colors = ann_colors_cohort1_CD4,
              show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.

mad_heatmap_cohort2 <- pheatmap(mad2k_cohort2, 
              scale="row",
              color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
              cluster_rows=TRUE,
              cluster_cols=TRUE,
              cutree_cols = 2,
              main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 2\n Clustering: Ward, Distance: Pearson correlation",
              clustering_distance_rows = "correlation",
              clustering_distance_cols = "correlation",
              clustering_method = "ward.D2",
              annotation_col = metadata2,
              annotation_colors = ann_colors_cohort2,
              show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.

Clustering based on expression of human PTCL gene signatures

Iqbal et al.

Source: https://doi.org/10.1182/blood-2013-11-536359

# Define list of genes and their groups
geneList <- c("TBX21", "IFNG", "STAT1", "CSF2", "CCL3", "EOMES", "IL2RB", "CXCR3", "CD28", "AXL", "CD40", "CD59", "FTL", "LILRB1", "SIRPA",
              "GATA3", "EGR1", "SEPTIN6", "CAT", "IL18R1", "IK", "ACKR3", "CCR4", "MSH6")
geneGroups <- c("TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL",
                "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL")
geneGroup <- data.frame(Group = geneGroups)
rownames(geneGroup) <- geneList

# subset vst data to include only those genes
vsd_geneList_cohort1 <- vsd_cohort1 %>%
  filter(rownames(vsd_cohort1) %in% geneList)

vsd_geneList_cohort2 <- vsd_cohort2 %>%
  filter(rownames(vsd_cohort2) %in% geneList)

Euclidean distance

# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort1 = list(
  phenotype = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral"),
  Group = c("TBX21-PTCL" = "steelblue", "GATA3-PTCL" = "red4"))

ann_colors_cohort2 = list(
  phenotype = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue"),
  Group = c("TBX21-PTCL" = "steelblue", "GATA3-PTCL" = "red4"))

# Generate heatmap
vst_heatmap1 <- pheatmap(vsd_geneList_cohort1,
                        scale="row", 
                        color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
                        cluster_rows=TRUE, 
                        cluster_cols=TRUE,
                        cutree_rows = 2,
                        cutree_cols = 3,
                        clustering_distance_rows = "euclidean",
                        clustering_distance_cols = "euclidean",
                        clustering_method = "ward.D2",
                        main = "Input: Vst transformed normalized DESeq2 counts, Cohort 1\n Clustering: Ward, Distance: Euclidean",
                        annotation_col = metadata1,
                       annotation_row = geneGroup,
                       annotation_colors = ann_colors_cohort1,
                       show_rownames = TRUE) # show gene names on the heatmap

vst_heatmap2 <- pheatmap(vsd_geneList_cohort2,
                        scale="row", 
                        color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
                        cluster_rows=TRUE, 
                        cluster_cols=TRUE,
                        clustering_distance_rows = "euclidean",
                        clustering_distance_cols = "euclidean",
                        cutree_rows = 3,
                        cutree_cols = 3,
                        clustering_method = "ward.D2",
                        main = "Input: Vst transformed normalized DESeq2 counts, Cohort 2\n Clustering: Ward, Distance: Euclidean",
                        annotation_col = metadata2,
                       annotation_row = geneGroup,
                       annotation_colors = ann_colors_cohort2,
                       show_rownames = TRUE) # show gene names on the heatmap

Pearson correlation

vst_heatmap1 <- pheatmap(vsd_geneList_cohort1,
                        scale="row", 
                        color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
                        cluster_rows=TRUE, 
                        cluster_cols=TRUE,
                        clustering_distance_rows = "correlation",
                        clustering_distance_cols = "correlation",
                        cutree_rows = 2,
                        cutree_cols = 2,
                        clustering_method = "ward.D2",
                        main = "Input: Vst transformed normalized DESeq2 counts, Cohort1\nClustering: Ward, Distance: Pearson correlation",
                        annotation_col = metadata1,
                       annotation_row = geneGroup,
                       annotation_colors = ann_colors_cohort1,
                       show_rownames = TRUE) # show gene names on the heatmap

vst_heatmap2 <- pheatmap(vsd_geneList_cohort2,
                        scale="row", 
                        color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
                        cluster_rows=TRUE, 
                        cluster_cols=TRUE,
                        clustering_distance_rows = "correlation",
                        clustering_distance_cols = "correlation",
                        cutree_rows = 2,
                        cutree_cols = 3,
                        clustering_method = "ward.D2",
                        main = "Input: Vst transformed normalized DESeq2 counts, Cohort2\nClustering: Ward, Distance: Pearson correlation",
                        annotation_col = metadata2,
                       annotation_row = geneGroup,
                       annotation_colors = ann_colors_cohort2,
                       show_rownames = TRUE) # show gene names on the heatmap

Herek et al.

Source: Fig. 1A of https://doi.org/10.1182/blood.2021015019

# Define list of genes and their groups
geneList <- c("TBX21", "IFNG", "STAT1", "CSF2",
              "GATA3", "EGR1", "SEPTIN6", "CAT")
geneGroups <- c("TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL",
                "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL")
geneGroup <- data.frame(Group = geneGroups)
rownames(geneGroup) <- geneList

# subset vst data to include only those genes
vsd_geneList_cohort1 <- vsd_cohort1 %>%
  filter(rownames(vsd_cohort1) %in% geneList)

vsd_geneList_cohort2 <- vsd_cohort2 %>%
  filter(rownames(vsd_cohort2) %in% geneList)

Euclidean distance

# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort1 = list(
  phenotype = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral"),
  Group = c("TBX21-PTCL" = "steelblue", "GATA3-PTCL" = "red4"))

# Generate heatmap
vst_heatmap1 <- pheatmap(vsd_geneList_cohort1,
                        scale="row", 
                        color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
                        cluster_rows=TRUE, 
                        cluster_cols=TRUE,
                        cutree_rows = 2,
                        cutree_cols = 2,
                        clustering_distance_rows = "euclidean",
                        clustering_distance_cols = "euclidean",
                        clustering_method = "ward.D2",
                        main = "Input: Vst transformed normalized DESeq2 counts, Cohort 1\n Clustering: Ward, Distance: Euclidean",
                        annotation_col = metadata1,
                       annotation_row = geneGroup,
                       annotation_colors = ann_colors_cohort1,
                       show_rownames = TRUE) # show gene names on the heatmap

ann_colors_cohort2 = list(
  phenotype = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue"),
  Group = c("TBX21-PTCL" = "steelblue", "GATA3-PTCL" = "red4"))

# Generate heatmap
vst_heatmap2 <- pheatmap(vsd_geneList_cohort2,
                        scale="row", 
                        color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
                        cluster_rows=TRUE, 
                        cluster_cols=TRUE,
                        cutree_rows = 2,
                        cutree_cols = 2,
                        clustering_distance_rows = "euclidean",
                        clustering_distance_cols = "euclidean",
                        clustering_method = "ward.D2",
                        main = "Input: Vst transformed normalized DESeq2 counts, Cohort 2\n Clustering: Ward, Distance: Euclidean",
                        annotation_col = metadata2,
                       annotation_row = geneGroup,
                       annotation_colors = ann_colors_cohort2,
                       show_rownames = TRUE) # show gene names on the heatmap

Citations

sessionInfo()

## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggplot2_3.5.1               knitr_1.49                 
##  [3] readr_2.1.5                 dplyr_1.1.4                
##  [5] pheatmap_1.0.12             RColorBrewer_1.1-3         
##  [7] DESeq2_1.44.0               SummarizedExperiment_1.34.0
##  [9] Biobase_2.64.0              MatrixGenerics_1.16.0      
## [11] matrixStats_1.4.1           GenomicRanges_1.56.2       
## [13] GenomeInfoDb_1.40.1         IRanges_2.38.1             
## [15] S4Vectors_0.42.1            BiocGenerics_0.50.0        
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6            xfun_0.49               bslib_0.8.0            
##  [4] lattice_0.22-6          tzdb_0.4.0              vctrs_0.6.5            
##  [7] tools_4.4.0             generics_0.1.3          parallel_4.4.0         
## [10] tibble_3.2.1            fansi_1.0.6             pkgconfig_2.0.3        
## [13] Matrix_1.7-0            lifecycle_1.0.4         GenomeInfoDbData_1.2.12
## [16] farver_2.1.2            compiler_4.4.0          munsell_0.5.1          
## [19] codetools_0.2-20        htmltools_0.5.8.1       sass_0.4.9             
## [22] yaml_2.3.8              pillar_1.9.0            crayon_1.5.3           
## [25] jquerylib_0.1.4         BiocParallel_1.38.0     DelayedArray_0.30.1    
## [28] cachem_1.1.0            abind_1.4-8             tidyselect_1.2.1       
## [31] locfit_1.5-9.10         digest_0.6.35           fastmap_1.2.0          
## [34] grid_4.4.0              colorspace_2.1-1        cli_3.6.2              
## [37] SparseArray_1.4.8       magrittr_2.0.3          S4Arrays_1.4.1         
## [40] utf8_1.2.4              withr_3.0.2             scales_1.3.0           
## [43] UCSC.utils_1.0.0        rmarkdown_2.29          XVector_0.44.0         
## [46] httr_1.4.7              hms_1.1.3               evaluate_1.0.1         
## [49] rlang_1.1.3             Rcpp_1.0.13             glue_1.7.0             
## [52] rstudioapi_0.17.1       jsonlite_1.8.9          R6_2.5.1               
## [55] zlibbioc_1.50.0

citation()

## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.

Clustering Analysis of Canine PTCL Bulk RNA-seq Data

Eileen Owens

2025-01-14

Introduction

Data

Variance stabilized transformed count matrix

Metadata

Unsupervised clustering of the top 2000 genes (by median absolute derivation)

Euclidean distance

Pearson correlation

Clustering based on expression of human PTCL gene signatures

Iqbal et al.

Euclidean distance

Pearson correlation

Herek et al.

Euclidean distance

Citations