This script performs clustering and draws heatmaps of RNA-seq data, with a variety of filtering options.
vsd_cohort1 <- read.csv("Cohort_1/Output/VST_NormalizedCounts.csv")
vsd_cohort1_CD4s <- read.csv("Cohort_1/Output/VST_NormalizedCounts_CD4sOnly.csv")
vsd_cohort2 <- read.csv("Cohort_2/Output/VST_NormalizedCounts.csv")
## Make gene names the row names
rownames(vsd_cohort1) <- make.unique(as.character(vsd_cohort1$gene_name)) # make.unique accounts for duplicate gene names since R won't allow duplicate rownames
rownames(vsd_cohort1_CD4s) <- make.unique(as.character(vsd_cohort1_CD4s$gene_name))
rownames(vsd_cohort2) <- make.unique(as.character(vsd_cohort2$gene_name))
## Remove any columns other than sample count data
vsd_cohort1 <- dplyr::select(vsd_cohort1, -c("X", "probe_id", "gene_name", "description")) # exclude any columns that are not sample IDs
vsd_cohort1_CD4s <- dplyr::select(vsd_cohort1_CD4s, -c("X", "probe_id", "gene_name", "description"))
vsd_cohort2 <- dplyr::select(vsd_cohort2, -c("X", "probe_id", "gene_name", "description"))
metadata1 <- read.table(file = "Cohort_1/Input/metadata.txt", header = FALSE)
metadata2 <- read.table(file = "Cohort_2/Input/metadata.txt", header = FALSE)
colnames(metadata1) <- c("avery_num", "sample_name", "phenotype")
colnames(metadata2) <- c("avery_num", "sample_name", "phenotype")
rownames(metadata1) <- metadata1$sample_name
metadata1 <- dplyr::select(metadata1, c("phenotype"))
rownames(metadata2) <- metadata2$sample_name
metadata2 <- dplyr::select(metadata2, c("phenotype"))
keepGroups <- c("CD4_PTCL", "CD4_CTRL")
metadata_CD4 <- metadata1 %>%
filter(phenotype %in% keepGroups)
# Calculate the median absolute derivation for all rows in the vst transformed data
median_absolute_derivation_cohort1 = apply(vsd_cohort1,1,mad) # "1" indicates that the manipulation is performed on rows
median_absolute_derivation_cohort1_CD4s = apply(vsd_cohort1_CD4s,1,mad)
median_absolute_derivation_cohort2 = apply(vsd_cohort2,1,mad)
# check data distribution
hist(median_absolute_derivation_cohort1, ylim=c(0,200), main=c("Histogram of median absolute derivation, Cohort 1"), breaks=nrow(vsd_cohort1)*0.1)
hist(median_absolute_derivation_cohort1_CD4s, ylim=c(0,200), main=c("Histogram of median absolute derivation, Cohort 1, \nCD4 PTCLs and CD4 Controls Only"), breaks=nrow(vsd_cohort1)*0.1)
hist(median_absolute_derivation_cohort2, ylim=c(0,200), main=c("Histogram of median absolute derivation, Cohort 2"), breaks=nrow(vsd_cohort2)*0.1)
# index the vst transformed count data to include only those rows (genes) that appeared in the top 2000 based on median absolute derivation.
mad2k_cohort1=vsd_cohort1[rev(order(median_absolute_derivation_cohort1))[1:2000],]
mad2k_cohort1_CD4s=vsd_cohort1_CD4s[rev(order(median_absolute_derivation_cohort1_CD4s))[1:2000],]
mad2k_cohort2=vsd_cohort2[rev(order(median_absolute_derivation_cohort2))[1:2000],]
# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort1 = list(
phenotype = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral"))
# Draw heatmap
mad_heatmap_cohort1 <- pheatmap(mad2k_cohort1,
scale="row",
color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
cluster_rows=TRUE,
cluster_cols=TRUE,
#cutree_rows = 3,
cutree_cols = 3,
main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 1 \n Clustering: Ward, Distance: Euclidean",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
annotation_col = metadata1,
annotation_colors = ann_colors_cohort1,
show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.
# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort1_CD4 = list(
phenotype = c("CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral"))
# Draw heatmap
mad_heatmap_cohort1_CD4 <- pheatmap(mad2k_cohort1_CD4s,
scale="row",
color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
cluster_rows=TRUE,
cluster_cols=TRUE,
cutree_cols = 2,
main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 1 \n Clustering: Ward, Distance: Euclidean",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
annotation_col = metadata_CD4,
annotation_colors = ann_colors_cohort1_CD4,
show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.
# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort2 = list(
phenotype = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue"))
# Draw heatmap
mad_heatmap_cohort2 <- pheatmap(mad2k_cohort2,
scale="row",
color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
cluster_rows=TRUE,
cluster_cols=TRUE,
cutree_cols = 2,
main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 2 \n Clustering: Ward, Distance: Euclidean",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
annotation_col = metadata2,
annotation_colors = ann_colors_cohort2,
show_rownames = FALSE)
mad_heatmap_cohort1 <- pheatmap(mad2k_cohort1,
scale="row",
color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
cluster_rows=TRUE,
cluster_cols=TRUE,
cutree_cols = 3,
main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 1\n Clustering: Ward, Distance: Pearson correlation",
clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation",
clustering_method = "ward.D2",
annotation_col = metadata1,
annotation_colors = ann_colors_cohort1,
show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.
mad_heatmap_cohort1_CD4 <- pheatmap(mad2k_cohort1_CD4s,
scale="row",
color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
cluster_rows=TRUE,
cluster_cols=TRUE,
cutree_cols = 2,
main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 1\n Clustering: Ward, Distance: Pearson correlation",
clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation",
clustering_method = "ward.D2",
annotation_col = metadata_CD4,
annotation_colors = ann_colors_cohort1_CD4,
show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.
mad_heatmap_cohort2 <- pheatmap(mad2k_cohort2,
scale="row",
color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
cluster_rows=TRUE,
cluster_cols=TRUE,
cutree_cols = 2,
main = "Input: Vst transformed normalized DESeq2 counts for the top 2000 genes by median absolute derivation, Cohort 2\n Clustering: Ward, Distance: Pearson correlation",
clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation",
clustering_method = "ward.D2",
annotation_col = metadata2,
annotation_colors = ann_colors_cohort2,
show_rownames = FALSE) # Change to TRUE if gene symbols should be annotated on the heatmap.
Source: https://doi.org/10.1182/blood-2013-11-536359
# Define list of genes and their groups
geneList <- c("TBX21", "IFNG", "STAT1", "CSF2", "CCL3", "EOMES", "IL2RB", "CXCR3", "CD28", "AXL", "CD40", "CD59", "FTL", "LILRB1", "SIRPA",
"GATA3", "EGR1", "SEPTIN6", "CAT", "IL18R1", "IK", "ACKR3", "CCR4", "MSH6")
geneGroups <- c("TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL",
"GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL")
geneGroup <- data.frame(Group = geneGroups)
rownames(geneGroup) <- geneList
# subset vst data to include only those genes
vsd_geneList_cohort1 <- vsd_cohort1 %>%
filter(rownames(vsd_cohort1) %in% geneList)
vsd_geneList_cohort2 <- vsd_cohort2 %>%
filter(rownames(vsd_cohort2) %in% geneList)
# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort1 = list(
phenotype = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral"),
Group = c("TBX21-PTCL" = "steelblue", "GATA3-PTCL" = "red4"))
ann_colors_cohort2 = list(
phenotype = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue"),
Group = c("TBX21-PTCL" = "steelblue", "GATA3-PTCL" = "red4"))
# Generate heatmap
vst_heatmap1 <- pheatmap(vsd_geneList_cohort1,
scale="row",
color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
cluster_rows=TRUE,
cluster_cols=TRUE,
cutree_rows = 2,
cutree_cols = 3,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
main = "Input: Vst transformed normalized DESeq2 counts, Cohort 1\n Clustering: Ward, Distance: Euclidean",
annotation_col = metadata1,
annotation_row = geneGroup,
annotation_colors = ann_colors_cohort1,
show_rownames = TRUE) # show gene names on the heatmap
vst_heatmap2 <- pheatmap(vsd_geneList_cohort2,
scale="row",
color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
cluster_rows=TRUE,
cluster_cols=TRUE,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
cutree_rows = 3,
cutree_cols = 3,
clustering_method = "ward.D2",
main = "Input: Vst transformed normalized DESeq2 counts, Cohort 2\n Clustering: Ward, Distance: Euclidean",
annotation_col = metadata2,
annotation_row = geneGroup,
annotation_colors = ann_colors_cohort2,
show_rownames = TRUE) # show gene names on the heatmap
vst_heatmap1 <- pheatmap(vsd_geneList_cohort1,
scale="row",
color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
cluster_rows=TRUE,
cluster_cols=TRUE,
clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation",
cutree_rows = 2,
cutree_cols = 2,
clustering_method = "ward.D2",
main = "Input: Vst transformed normalized DESeq2 counts, Cohort1\nClustering: Ward, Distance: Pearson correlation",
annotation_col = metadata1,
annotation_row = geneGroup,
annotation_colors = ann_colors_cohort1,
show_rownames = TRUE) # show gene names on the heatmap
vst_heatmap2 <- pheatmap(vsd_geneList_cohort2,
scale="row",
color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
cluster_rows=TRUE,
cluster_cols=TRUE,
clustering_distance_rows = "correlation",
clustering_distance_cols = "correlation",
cutree_rows = 2,
cutree_cols = 3,
clustering_method = "ward.D2",
main = "Input: Vst transformed normalized DESeq2 counts, Cohort2\nClustering: Ward, Distance: Pearson correlation",
annotation_col = metadata2,
annotation_row = geneGroup,
annotation_colors = ann_colors_cohort2,
show_rownames = TRUE) # show gene names on the heatmap
Source: Fig. 1A of https://doi.org/10.1182/blood.2021015019
# Define list of genes and their groups
geneList <- c("TBX21", "IFNG", "STAT1", "CSF2",
"GATA3", "EGR1", "SEPTIN6", "CAT")
geneGroups <- c("TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL", "TBX21-PTCL",
"GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL", "GATA3-PTCL")
geneGroup <- data.frame(Group = geneGroups)
rownames(geneGroup) <- geneList
# subset vst data to include only those genes
vsd_geneList_cohort1 <- vsd_cohort1 %>%
filter(rownames(vsd_cohort1) %in% geneList)
vsd_geneList_cohort2 <- vsd_cohort2 %>%
filter(rownames(vsd_cohort2) %in% geneList)
# Specify colors (optional). Full list of color options here: http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
ann_colors_cohort1 = list(
phenotype = c("CD8_PTCL" = "deepskyblue", "DN_PTCL" = "magenta2", "CD8_CTRL" = "seagreen3", "CD4_PTCL" = "darkkhaki", "CD4_CTRL" = "coral"),
Group = c("TBX21-PTCL" = "steelblue", "GATA3-PTCL" = "red4"))
# Generate heatmap
vst_heatmap1 <- pheatmap(vsd_geneList_cohort1,
scale="row",
color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
cluster_rows=TRUE,
cluster_cols=TRUE,
cutree_rows = 2,
cutree_cols = 2,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
main = "Input: Vst transformed normalized DESeq2 counts, Cohort 1\n Clustering: Ward, Distance: Euclidean",
annotation_col = metadata1,
annotation_row = geneGroup,
annotation_colors = ann_colors_cohort1,
show_rownames = TRUE) # show gene names on the heatmap
ann_colors_cohort2 = list(
phenotype = c("CD4_LN_CTRL" = "coral", "CD4_PTCL" = "darkkhaki", "CD4THYM_CTRL" = "deepskyblue"),
Group = c("TBX21-PTCL" = "steelblue", "GATA3-PTCL" = "red4"))
# Generate heatmap
vst_heatmap2 <- pheatmap(vsd_geneList_cohort2,
scale="row",
color = colorRampPalette(c("blue", "white", "red"), space = "Lab")(100),
cluster_rows=TRUE,
cluster_cols=TRUE,
cutree_rows = 2,
cutree_cols = 2,
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean",
clustering_method = "ward.D2",
main = "Input: Vst transformed normalized DESeq2 counts, Cohort 2\n Clustering: Ward, Distance: Euclidean",
annotation_col = metadata2,
annotation_row = geneGroup,
annotation_colors = ann_colors_cohort2,
show_rownames = TRUE) # show gene names on the heatmap
sessionInfo()
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Denver
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggplot2_3.5.1 knitr_1.49
## [3] readr_2.1.5 dplyr_1.1.4
## [5] pheatmap_1.0.12 RColorBrewer_1.1-3
## [7] DESeq2_1.44.0 SummarizedExperiment_1.34.0
## [9] Biobase_2.64.0 MatrixGenerics_1.16.0
## [11] matrixStats_1.4.1 GenomicRanges_1.56.2
## [13] GenomeInfoDb_1.40.1 IRanges_2.38.1
## [15] S4Vectors_0.42.1 BiocGenerics_0.50.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.49 bslib_0.8.0
## [4] lattice_0.22-6 tzdb_0.4.0 vctrs_0.6.5
## [7] tools_4.4.0 generics_0.1.3 parallel_4.4.0
## [10] tibble_3.2.1 fansi_1.0.6 pkgconfig_2.0.3
## [13] Matrix_1.7-0 lifecycle_1.0.4 GenomeInfoDbData_1.2.12
## [16] farver_2.1.2 compiler_4.4.0 munsell_0.5.1
## [19] codetools_0.2-20 htmltools_0.5.8.1 sass_0.4.9
## [22] yaml_2.3.8 pillar_1.9.0 crayon_1.5.3
## [25] jquerylib_0.1.4 BiocParallel_1.38.0 DelayedArray_0.30.1
## [28] cachem_1.1.0 abind_1.4-8 tidyselect_1.2.1
## [31] locfit_1.5-9.10 digest_0.6.35 fastmap_1.2.0
## [34] grid_4.4.0 colorspace_2.1-1 cli_3.6.2
## [37] SparseArray_1.4.8 magrittr_2.0.3 S4Arrays_1.4.1
## [40] utf8_1.2.4 withr_3.0.2 scales_1.3.0
## [43] UCSC.utils_1.0.0 rmarkdown_2.29 XVector_0.44.0
## [46] httr_1.4.7 hms_1.1.3 evaluate_1.0.1
## [49] rlang_1.1.3 Rcpp_1.0.13 glue_1.7.0
## [52] rstudioapi_0.17.1 jsonlite_1.8.9 R6_2.5.1
## [55] zlibbioc_1.50.0
citation()
## To cite R in publications use:
##
## R Core Team (2024). _R: A Language and Environment for Statistical
## Computing_. R Foundation for Statistical Computing, Vienna, Austria.
## <https://www.R-project.org/>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {R: A Language and Environment for Statistical Computing},
## author = {{R Core Team}},
## organization = {R Foundation for Statistical Computing},
## address = {Vienna, Austria},
## year = {2024},
## url = {https://www.R-project.org/},
## }
##
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.