C19b Heatmap

Yujin L

2025-03-31

1. Introduction

The data used for the heatmap is taken from a Spatial Trancriptomics data YL acquired in 2024 April.

ComplexHeatmap (Gu, Eils, and Schlesner (2016)) is an R Programming Language (R Core Team (2020)) package that is currently listed in the Bioconductor package repository.

2. Install and load required packages

library(RColorBrewer)
library(ComplexHeatmap)
library(circlize)
library(digest)
library(cluster)

3. Load Data

getwd()

## [1] "/tmp/RtmpjMhEGT/file39dbb21e14ae4"

setwd("/stor/work/Fleet/BCG/2024_04.Fleet_visium_customgenome/05.Analysis_in_R/6. DEG/DEG Analysis/z-score Heatmap/")
getwd()

## [1] "/stor/scratch/Fleet/archive/BCG/2024_04.Fleet_visium_customgenome/05.Analysis_in_R/6. DEG/DEG Analysis/z-score Heatmap"

c19b_mat <- read.csv("./C19b Top 72 relevant genes.csv", row.names = 1, check.names = FALSE)
c19b_metadata <- read.csv("./metadata.csv", row.names = 1, check.names = FALSE)
c19b_sig_genes <- read.csv("./sig_genes.csv", header = FALSE)

# Convert to a character vector
c19b_sig_genes <- as.vector(c19b_sig_genes$V1)

# Alternative using unlist
c19b_sig_genes <- unlist(c19b_sig_genes)

# Check the structure
str(c19b_sig_genes)

##  chr [1:70] "A4galt" "Aatf" "Abcg5" "Adcy5" "Aldh1a1" "Aldh1b1" "Ang4" ...

Take a look at the contents of the data.

  # first 5 rows; first 5 columns
c19b_mat[1:5,1:5]

##         CL 0 CL 1 CL 2 CL 3 CL 4
## A4galt    55   44   57  115   45
## Aatf     129  109   66  148  122
## Abcg5     24   36   10   89   29
## Adcy5    107   21  356  749   38
## Aldh1a1  207  156  267  655  202

c19b_metadata

##       Location  Phenotype Cluster
## CL 0       Mid Abnormal 1       0
## CL 1       Top Abnormal 2       1
## CL 2      Base Abnormal 1       2
## CL 3      Base     Normal       3
## CL 4       Top Abnormal 1       4
## CL 6       Mid Abnormal 1       6
## CL 7       Mid Abnormal 2       7
## CL 8       Mid     Normal       8
## CL 14      Top     Normal      14

c19b_sig_genes

##  [1] "A4galt"  "Aatf"    "Abcg5"   "Adcy5"   "Aldh1a1" "Aldh1b1" "Ang4"   
##  [8] "Anxa13"  "Camk1d"  "Casp14"  "Ccl27a"  "Ccn3"    "Cd177"   "Cdca7"  
## [15] "Cma1"    "Col4a5"  "Col4a6"  "Coro1b"  "Cox4i2"  "Cox6b2"  "Cyp2c55"
## [22] "Cyp2d10" "Cyp2d12" "Cyp2d22" "Cyp2d26" "Cyp2f2"  "Cyp2s1"  "Dact3"  
## [29] "Efemp1"  "Foxf1"   "Frat1"   "Fzd2"    "Fzd7"    "Gcg"     "Gstm7"  
## [36] "Gstp1"   "Gstp2"   "Gstt1"   "Hmga1"   "Hpgd"    "Id4"     "Igf2bp2"
## [43] "Il17re"  "Itih2"   "Jam3"    "Map3k20" "Mapk15"  "Mptx1"   "Nfia"   
## [50] "Nfic"    "Nr1d1"   "Nrarp"   "Nrp2"    "Pbx1"    "Penk"    "Porcn"  
## [57] "Rab3a"   "Rbm38"   "Retsat"  "Sparcl1" "Stmn2"   "Sult1a1" "Sult1c2"
## [64] "Tcf21"   "Tead3"   "Tgfb1i1" "Trpm6"   "Wdtc1"   "Wnt4"    "Wnt5b"

Now let’s build the actual annotation object, i.e., the legend:

 # Create an initial data-frame of the annotation that we want to use
  ann <- data.frame(
    Location = c19b_metadata$Location,
    Phenotype = c19b_metadata$Phenotype,
    Cluster = c19b_metadata$Cluster,
    stringsAsFactors = FALSE)

  # create the colour mapping
  colours <- list(
    Location = c('Base' = '#c3488c', 'Mid' = '#727137', 'Top' = '#3E6b97'),
    Phenotype = c('Normal' = '#1B9E77', 'Abnormal 1' = '#D95F02', 'Abnormal 2' = '#7570B3'),
    Cluster = c('0' = '#f8766d', '1' = '#ec8239', '2' = '#db8e00', '3' = '#c79800',
                '4' = '#aea200', '6' = '#64b200', '7' = '#00b81b', '8' = '#00bd5c',
                '14' = '#00a6ff'))

# now create the ComplexHeatmap annotation object
  # as most of these parameters are self-explanatory, comments will only appear where needed
colAnn <- HeatmapAnnotation(
  df = ann,
  which = 'col', # 'col' (samples) or 'row' (gene) annotation?
  na_col = 'white', # default colour for any NA values in the annotation data-frame, 'ann'
  col = colours,
  annotation_height = 0.6,
  annotation_width = unit(1, 'cm'),
  gap = unit(1, 'mm'),
  annotation_legend_param = list(
    Phenotype = list(
      nrow = 3, # number of rows across which the legend will be arranged
      title = 'Phenotype',
      title_position = 'topcenter',
      legend_direction = 'vertical',
      title_gp = gpar(fontsize = 12, fontface = 'bold'),
      labels_gp = gpar(fontsize = 12, fontface = 'bold')),
    Location = list(
      nrow = 3, # number of rows across which the legend will be arranged
      title = 'Location',
      title_position = 'topcenter',
      legend_direction = 'vertical',
      title_gp = gpar(fontsize = 12, fontface = 'bold'),
      labels_gp = gpar(fontsize = 12, fontface = 'bold')),
    Cluster = list(
      nrow = 3, # number of rows across which the legend will be arranged
      title = 'Cluster',
      title_position = 'topcenter',
      legend_direction = 'vertical',
      title_gp = gpar(fontsize = 12, fontface = 'bold'),
      labels_gp = gpar(fontsize = 12, fontface = 'bold'))))

4. Generate the heatmap

[a] scale the data to Z-scores (by row) This is quite standard when performing clustering and generating a heatmap.

c19b_heat <- t(scale(t(c19b_mat)))

[b] set colour scheme and choose breaks

  myCol <- colorRampPalette(c('#3b4dc1', '#E0DCDC', '#b60b27'))(100)
  myBreaks <- seq(-3, 3, length.out = 100)

[c] create annotation: box-and-whisker plots

  boxplotCol <-HeatmapAnnotation(
    boxplot = anno_boxplot(
      c19b_heat,
      border = FALSE,
      gp = gpar(fill = '#CCCCCC'),
      pch = '.',
      size = unit(2, 'mm'),
      axis = TRUE,
      axis_param = list(
        gp = gpar(fontsize = 12),
        side = 'left')),
      annotation_width = unit(c(2.0), 'cm'),
      which = 'col')

  boxplotRow <- HeatmapAnnotation(
    boxplot = row_anno_boxplot(
      c19b_heat,
      border = FALSE,
      gp = gpar(fill = '#CCCCCC'),
      pch = '.',
      size = unit(2, 'mm'),
      axis = TRUE,
      axis_param = list(
        gp = gpar(fontsize = 12),
        side = 'top')),
      annotation_width = unit(c(2.0), 'cm'),
      which = 'row')

[d] create annotation: gene labels Many heatmaps are produced from a large number of variables / genes, which result in it being difficult to label each gene in the plot space. Here, we can ‘step through’ the variables / genes and choose to only label a select few.

The number of rows (genes) in our object is:

299

In this code snippet, we label only the genes specified in my sig_genes list.

# Ensure sig_genes contains gene names present in the heatmap
valid_genes <- intersect(rownames(c19b_heat), c19b_sig_genes) # Only keep genes present in heat

# Get the indices of sig_genes in the heatmap
indices <- which(rownames(c19b_heat) %in% valid_genes)

# Create annotation using the indices of sig_genes
genelabels <- rowAnnotation(
  Genes = anno_mark(
    at = indices,
    labels = rownames(c19b_heat)[indices],
    labels_gp = gpar(fontsize = 10, fontface = 'bold'),
    padding = 0.75
  ),
  width = unit(2.0, 'cm') +
    max_text_width(
      rownames(c19b_heat)[indices],
      gp = gpar(fontsize = 10, fontface = 'bold')
    )
)

[e] perform partitioning around medoids (PAM) to identify clusters in the data Performing k-means or PAM on our data can help us to identify internal ‘structure’ in the data that may relate to biologically meaningful pathways, as an example.

# The error "No clustering performed, NAs in the computed dissimilarity matrix." means that the cluster::pam function couldn't calculate distances between some rows or columns in your heat matrix due to missing or invalid values (NA).
# Remove or Impute Missing Values: Option 1: Remove Rows/Columns with NAs (if sparse)

  c19b_heat <- c19b_heat[complete.cases(c19b_heat), ]
  pamClusters <- cluster::pam(c19b_heat, k = 6) # pre-select k = 3 centers
  pamClusters$clustering <- paste0('Cluster ', pamClusters$clustering)

  # fix order of the clusters to have 1 to 6, top to bottom
  pamClusters$clustering <- factor(pamClusters$clustering,
    levels = c('Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5', 'Cluster 6'))

[f] create the actual heatmap object

  c19b_hmap <- Heatmap(c19b_heat,

    # split the genes / rows according to the PAM clusters
      split = pamClusters$clustering,
      cluster_row_slices = FALSE,

    name = 'Gene\nZ-\nscore',

    col = colorRamp2(myBreaks, myCol),

    # parameters for the colour-bar that represents gradient of expression
      heatmap_legend_param = list(
        color_bar = 'continuous',
        legend_direction = 'vertical',
        legend_width = unit(8, 'cm'),
        legend_height = unit(5.0, 'cm'),
        title_position = 'topcenter',
        title_gp=gpar(fontsize = 12, fontface = 'bold'),
        labels_gp=gpar(fontsize = 12, fontface = 'bold')),

    # row (gene) parameters
      cluster_rows = TRUE,
      show_row_dend = TRUE,
      #row_title = 'Statistically significant and biologically relevant genes',
      row_title_side = 'left',
      row_title_gp = gpar(fontsize = 12,  fontface = 'bold'),
      row_title_rot = 90,
      show_row_names = FALSE,
      row_names_gp = gpar(fontsize = 10, fontface = 'bold'),
      row_names_side = 'left',
      row_dend_width = unit(25,'mm'),

    # column (sample) parameters
      cluster_columns = TRUE,
      show_column_dend = TRUE,
      column_title = '',
      column_title_side = 'bottom',
      column_title_gp = gpar(fontsize = 12, fontface = 'bold'),
      column_title_rot = 0,
      show_column_names = FALSE,
      column_names_gp = gpar(fontsize = 10, fontface = 'bold'),
      column_names_max_height = unit(10, 'cm'),
      column_dend_height = unit(25,'mm'),

    # cluster methods for rows and columns
      clustering_distance_columns = function(x) as.dist(1 - cor(t(x))),
      clustering_method_columns = 'ward.D2',
      clustering_distance_rows = function(x) as.dist(1 - cor(t(x))),
      clustering_method_rows = 'ward.D2',

    # specify top and bottom annotations
      top_annotation = colAnn,
      bottom_annotation = boxplotCol)

[g] draw the heatmap

# Set output to PNG
p <- draw(c19b_hmap + genelabels,
    heatmap_legend_side = 'left',
    annotation_legend_side = 'right',
    row_sub_title_side = 'left')

png(
  filename = "./heatmap.png",          # File name
  width = 4000,                      # Width in pixels
  height = 5500,                     # Height in pixels
  res = 300                          # Resolution in dpi
) 

p

dev.off()

## png 
##   2

sessionInfo()

## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /stor/system/opt/R/R-4.3.1/lib/R/lib/libRblas.so 
## LAPACK: /stor/system/opt/R/R-4.3.1/lib/R/lib/libRlapack.so;  LAPACK version 3.11.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/Chicago
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] cluster_2.1.8         digest_0.6.35         circlize_0.4.16      
## [4] ComplexHeatmap_2.18.0 RColorBrewer_1.1-3    shiny_1.8.1.1        
## 
## loaded via a namespace (and not attached):
##  [1] jsonlite_1.8.9      highr_0.11          compiler_4.3.1     
##  [4] rjson_0.2.23        crayon_1.5.3        promises_1.3.2     
##  [7] Rcpp_1.0.14         magick_2.8.5        parallel_4.3.1     
## [10] later_1.4.1         jquerylib_0.1.4     IRanges_2.36.0     
## [13] png_0.1-8           yaml_2.3.10         fastmap_1.2.0      
## [16] mime_0.12           R6_2.6.1            shape_1.4.6.1      
## [19] knitr_1.47          Cairo_1.6-2         BiocGenerics_0.48.1
## [22] iterators_1.0.14    GetoptLong_1.0.5    bslib_0.9.0        
## [25] rlang_1.1.2         cachem_1.1.0        httpuv_1.6.13      
## [28] xfun_0.44           sass_0.4.9          GlobalOptions_0.1.2
## [31] doParallel_1.0.17   memoise_2.0.1       cli_3.6.3          
## [34] magrittr_2.0.3      foreach_1.5.2       rstudioapi_0.17.1  
## [37] xtable_1.8-4        clue_0.3-66         lifecycle_1.0.4    
## [40] S4Vectors_0.40.2    evaluate_1.0.3      codetools_0.2-20   
## [43] stats4_4.3.1        colorspace_2.1-1    rmarkdown_2.27     
## [46] matrixStats_1.2.0   tools_4.3.1         htmltools_0.5.8.1