The data used for the heatmap is taken from a Spatial Trancriptomics data YL acquired in 2024 April.
ComplexHeatmap (Gu, Eils, and Schlesner (2016)) is an R Programming Language (R Core Team (2020)) package that is currently listed in the Bioconductor package repository.
library(RColorBrewer)
library(ComplexHeatmap)
library(circlize)
library(digest)
library(cluster)
getwd()
## [1] "/tmp/RtmpjMhEGT/file39dbb21e14ae4"
setwd("/stor/work/Fleet/BCG/2024_04.Fleet_visium_customgenome/05.Analysis_in_R/6. DEG/DEG Analysis/z-score Heatmap/")
getwd()
## [1] "/stor/scratch/Fleet/archive/BCG/2024_04.Fleet_visium_customgenome/05.Analysis_in_R/6. DEG/DEG Analysis/z-score Heatmap"
c19b_mat <- read.csv("./C19b Top 72 relevant genes.csv", row.names = 1, check.names = FALSE)
c19b_metadata <- read.csv("./metadata.csv", row.names = 1, check.names = FALSE)
c19b_sig_genes <- read.csv("./sig_genes.csv", header = FALSE)
# Convert to a character vector
c19b_sig_genes <- as.vector(c19b_sig_genes$V1)
# Alternative using unlist
c19b_sig_genes <- unlist(c19b_sig_genes)
# Check the structure
str(c19b_sig_genes)
## chr [1:70] "A4galt" "Aatf" "Abcg5" "Adcy5" "Aldh1a1" "Aldh1b1" "Ang4" ...
Take a look at the contents of the data.
# first 5 rows; first 5 columns
c19b_mat[1:5,1:5]
## CL 0 CL 1 CL 2 CL 3 CL 4
## A4galt 55 44 57 115 45
## Aatf 129 109 66 148 122
## Abcg5 24 36 10 89 29
## Adcy5 107 21 356 749 38
## Aldh1a1 207 156 267 655 202
c19b_metadata
## Location Phenotype Cluster
## CL 0 Mid Abnormal 1 0
## CL 1 Top Abnormal 2 1
## CL 2 Base Abnormal 1 2
## CL 3 Base Normal 3
## CL 4 Top Abnormal 1 4
## CL 6 Mid Abnormal 1 6
## CL 7 Mid Abnormal 2 7
## CL 8 Mid Normal 8
## CL 14 Top Normal 14
c19b_sig_genes
## [1] "A4galt" "Aatf" "Abcg5" "Adcy5" "Aldh1a1" "Aldh1b1" "Ang4"
## [8] "Anxa13" "Camk1d" "Casp14" "Ccl27a" "Ccn3" "Cd177" "Cdca7"
## [15] "Cma1" "Col4a5" "Col4a6" "Coro1b" "Cox4i2" "Cox6b2" "Cyp2c55"
## [22] "Cyp2d10" "Cyp2d12" "Cyp2d22" "Cyp2d26" "Cyp2f2" "Cyp2s1" "Dact3"
## [29] "Efemp1" "Foxf1" "Frat1" "Fzd2" "Fzd7" "Gcg" "Gstm7"
## [36] "Gstp1" "Gstp2" "Gstt1" "Hmga1" "Hpgd" "Id4" "Igf2bp2"
## [43] "Il17re" "Itih2" "Jam3" "Map3k20" "Mapk15" "Mptx1" "Nfia"
## [50] "Nfic" "Nr1d1" "Nrarp" "Nrp2" "Pbx1" "Penk" "Porcn"
## [57] "Rab3a" "Rbm38" "Retsat" "Sparcl1" "Stmn2" "Sult1a1" "Sult1c2"
## [64] "Tcf21" "Tead3" "Tgfb1i1" "Trpm6" "Wdtc1" "Wnt4" "Wnt5b"
Now let’s build the actual annotation object, i.e., the legend:
# Create an initial data-frame of the annotation that we want to use
ann <- data.frame(
Location = c19b_metadata$Location,
Phenotype = c19b_metadata$Phenotype,
Cluster = c19b_metadata$Cluster,
stringsAsFactors = FALSE)
# create the colour mapping
colours <- list(
Location = c('Base' = '#c3488c', 'Mid' = '#727137', 'Top' = '#3E6b97'),
Phenotype = c('Normal' = '#1B9E77', 'Abnormal 1' = '#D95F02', 'Abnormal 2' = '#7570B3'),
Cluster = c('0' = '#f8766d', '1' = '#ec8239', '2' = '#db8e00', '3' = '#c79800',
'4' = '#aea200', '6' = '#64b200', '7' = '#00b81b', '8' = '#00bd5c',
'14' = '#00a6ff'))
# now create the ComplexHeatmap annotation object
# as most of these parameters are self-explanatory, comments will only appear where needed
colAnn <- HeatmapAnnotation(
df = ann,
which = 'col', # 'col' (samples) or 'row' (gene) annotation?
na_col = 'white', # default colour for any NA values in the annotation data-frame, 'ann'
col = colours,
annotation_height = 0.6,
annotation_width = unit(1, 'cm'),
gap = unit(1, 'mm'),
annotation_legend_param = list(
Phenotype = list(
nrow = 3, # number of rows across which the legend will be arranged
title = 'Phenotype',
title_position = 'topcenter',
legend_direction = 'vertical',
title_gp = gpar(fontsize = 12, fontface = 'bold'),
labels_gp = gpar(fontsize = 12, fontface = 'bold')),
Location = list(
nrow = 3, # number of rows across which the legend will be arranged
title = 'Location',
title_position = 'topcenter',
legend_direction = 'vertical',
title_gp = gpar(fontsize = 12, fontface = 'bold'),
labels_gp = gpar(fontsize = 12, fontface = 'bold')),
Cluster = list(
nrow = 3, # number of rows across which the legend will be arranged
title = 'Cluster',
title_position = 'topcenter',
legend_direction = 'vertical',
title_gp = gpar(fontsize = 12, fontface = 'bold'),
labels_gp = gpar(fontsize = 12, fontface = 'bold'))))
[a] scale the data to Z-scores (by row) This is quite standard when performing clustering and generating a heatmap.
c19b_heat <- t(scale(t(c19b_mat)))
[b] set colour scheme and choose breaks
myCol <- colorRampPalette(c('#3b4dc1', '#E0DCDC', '#b60b27'))(100)
myBreaks <- seq(-3, 3, length.out = 100)
[c] create annotation: box-and-whisker plots
boxplotCol <-HeatmapAnnotation(
boxplot = anno_boxplot(
c19b_heat,
border = FALSE,
gp = gpar(fill = '#CCCCCC'),
pch = '.',
size = unit(2, 'mm'),
axis = TRUE,
axis_param = list(
gp = gpar(fontsize = 12),
side = 'left')),
annotation_width = unit(c(2.0), 'cm'),
which = 'col')
boxplotRow <- HeatmapAnnotation(
boxplot = row_anno_boxplot(
c19b_heat,
border = FALSE,
gp = gpar(fill = '#CCCCCC'),
pch = '.',
size = unit(2, 'mm'),
axis = TRUE,
axis_param = list(
gp = gpar(fontsize = 12),
side = 'top')),
annotation_width = unit(c(2.0), 'cm'),
which = 'row')
[d] create annotation: gene labels Many heatmaps are produced from a large number of variables / genes, which result in it being difficult to label each gene in the plot space. Here, we can ‘step through’ the variables / genes and choose to only label a select few.
The number of rows (genes) in our object is:
299
In this code snippet, we label only the genes specified in my sig_genes list.
# Ensure sig_genes contains gene names present in the heatmap
valid_genes <- intersect(rownames(c19b_heat), c19b_sig_genes) # Only keep genes present in heat
# Get the indices of sig_genes in the heatmap
indices <- which(rownames(c19b_heat) %in% valid_genes)
# Create annotation using the indices of sig_genes
genelabels <- rowAnnotation(
Genes = anno_mark(
at = indices,
labels = rownames(c19b_heat)[indices],
labels_gp = gpar(fontsize = 10, fontface = 'bold'),
padding = 0.75
),
width = unit(2.0, 'cm') +
max_text_width(
rownames(c19b_heat)[indices],
gp = gpar(fontsize = 10, fontface = 'bold')
)
)
[e] perform partitioning around medoids (PAM) to identify clusters in the data Performing k-means or PAM on our data can help us to identify internal ‘structure’ in the data that may relate to biologically meaningful pathways, as an example.
# The error "No clustering performed, NAs in the computed dissimilarity matrix." means that the cluster::pam function couldn't calculate distances between some rows or columns in your heat matrix due to missing or invalid values (NA).
# Remove or Impute Missing Values: Option 1: Remove Rows/Columns with NAs (if sparse)
c19b_heat <- c19b_heat[complete.cases(c19b_heat), ]
pamClusters <- cluster::pam(c19b_heat, k = 6) # pre-select k = 3 centers
pamClusters$clustering <- paste0('Cluster ', pamClusters$clustering)
# fix order of the clusters to have 1 to 6, top to bottom
pamClusters$clustering <- factor(pamClusters$clustering,
levels = c('Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6'))
[f] create the actual heatmap object
c19b_hmap <- Heatmap(c19b_heat,
# split the genes / rows according to the PAM clusters
split = pamClusters$clustering,
cluster_row_slices = FALSE,
name = 'Gene\nZ-\nscore',
col = colorRamp2(myBreaks, myCol),
# parameters for the colour-bar that represents gradient of expression
heatmap_legend_param = list(
color_bar = 'continuous',
legend_direction = 'vertical',
legend_width = unit(8, 'cm'),
legend_height = unit(5.0, 'cm'),
title_position = 'topcenter',
title_gp=gpar(fontsize = 12, fontface = 'bold'),
labels_gp=gpar(fontsize = 12, fontface = 'bold')),
# row (gene) parameters
cluster_rows = TRUE,
show_row_dend = TRUE,
#row_title = 'Statistically significant and biologically relevant genes',
row_title_side = 'left',
row_title_gp = gpar(fontsize = 12, fontface = 'bold'),
row_title_rot = 90,
show_row_names = FALSE,
row_names_gp = gpar(fontsize = 10, fontface = 'bold'),
row_names_side = 'left',
row_dend_width = unit(25,'mm'),
# column (sample) parameters
cluster_columns = TRUE,
show_column_dend = TRUE,
column_title = '',
column_title_side = 'bottom',
column_title_gp = gpar(fontsize = 12, fontface = 'bold'),
column_title_rot = 0,
show_column_names = FALSE,
column_names_gp = gpar(fontsize = 10, fontface = 'bold'),
column_names_max_height = unit(10, 'cm'),
column_dend_height = unit(25,'mm'),
# cluster methods for rows and columns
clustering_distance_columns = function(x) as.dist(1 - cor(t(x))),
clustering_method_columns = 'ward.D2',
clustering_distance_rows = function(x) as.dist(1 - cor(t(x))),
clustering_method_rows = 'ward.D2',
# specify top and bottom annotations
top_annotation = colAnn,
bottom_annotation = boxplotCol)
[g] draw the heatmap
# Set output to PNG
p <- draw(c19b_hmap + genelabels,
heatmap_legend_side = 'left',
annotation_legend_side = 'right',
row_sub_title_side = 'left')
png(
filename = "./heatmap.png", # File name
width = 4000, # Width in pixels
height = 5500, # Height in pixels
res = 300 # Resolution in dpi
)
p
dev.off()
## png
## 2
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
##
## Matrix products: default
## BLAS: /stor/system/opt/R/R-4.3.1/lib/R/lib/libRblas.so
## LAPACK: /stor/system/opt/R/R-4.3.1/lib/R/lib/libRlapack.so; LAPACK version 3.11.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/Chicago
## tzcode source: system (glibc)
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] cluster_2.1.8 digest_0.6.35 circlize_0.4.16
## [4] ComplexHeatmap_2.18.0 RColorBrewer_1.1-3 shiny_1.8.1.1
##
## loaded via a namespace (and not attached):
## [1] jsonlite_1.8.9 highr_0.11 compiler_4.3.1
## [4] rjson_0.2.23 crayon_1.5.3 promises_1.3.2
## [7] Rcpp_1.0.14 magick_2.8.5 parallel_4.3.1
## [10] later_1.4.1 jquerylib_0.1.4 IRanges_2.36.0
## [13] png_0.1-8 yaml_2.3.10 fastmap_1.2.0
## [16] mime_0.12 R6_2.6.1 shape_1.4.6.1
## [19] knitr_1.47 Cairo_1.6-2 BiocGenerics_0.48.1
## [22] iterators_1.0.14 GetoptLong_1.0.5 bslib_0.9.0
## [25] rlang_1.1.2 cachem_1.1.0 httpuv_1.6.13
## [28] xfun_0.44 sass_0.4.9 GlobalOptions_0.1.2
## [31] doParallel_1.0.17 memoise_2.0.1 cli_3.6.3
## [34] magrittr_2.0.3 foreach_1.5.2 rstudioapi_0.17.1
## [37] xtable_1.8-4 clue_0.3-66 lifecycle_1.0.4
## [40] S4Vectors_0.40.2 evaluate_1.0.3 codetools_0.2-20
## [43] stats4_4.3.1 colorspace_2.1-1 rmarkdown_2.27
## [46] matrixStats_1.2.0 tools_4.3.1 htmltools_0.5.8.1