Step-by-step heatmap tutorial with pheatmap()

original tutorial I followed: https://biostatsquid.com/step-by-step-heatmap-tutorial-with-pheatmap/

simulated gene expressions with annotations using pheatmap

pheatmap package

library(pheatmap)

synthesise gene expression test data

N_GENES <- 50
N_OBSERVATIONS <- 10

geData <- matrix(rnorm(500), N_GENES, N_OBSERVATIONS)
colnames(geData) <- paste0("Sample_", 1:N_OBSERVATIONS)
rownames(geData) <- paste0("Gene_", 1:N_GENES)
head(geData)
         Sample_1    Sample_2   Sample_3   Sample_4    Sample_5    Sample_6
Gene_1 -0.6991055  1.14613447 -1.1802587 -0.6959333 -0.78744135 -0.20791640
Gene_2  0.2742142  0.05545421 -0.8683816  0.3068968  0.86633898  0.78012622
Gene_3 -0.3837213  0.67378293  1.5813930  0.7630419  1.74542758 -0.47304076
Gene_4  1.9310654  2.38825352  0.2484976 -2.0303287 -1.16369394 -1.33636199
Gene_5  1.4499932 -0.07189191 -0.8252876 -0.8925559  0.92687998  0.79835607
Gene_6 -0.6259183 -1.15188830 -0.8685327 -0.5698344 -0.09392029 -0.03969982
          Sample_7    Sample_8   Sample_9  Sample_10
Gene_1 -0.75566072 -0.26539055  0.3108790 -0.7443402
Gene_2 -0.61100052  0.85495888  0.1907298 -0.2972436
Gene_3  0.03476893 -0.94139247 -1.0419336 -0.4293556
Gene_4  0.14741243  0.03587988  0.8580009 -0.4563711
Gene_5  0.13428051  0.03744672  0.6608050  0.8179740
Gene_6 -1.50026414  1.83770970  0.1826200  1.1988738

annotations

create a data.table for sample/column annotation

ann_dt <- data.table(
  Group = rep(
      c("Disease", "Control"),
      c(N_OBSERVATIONS/2, N_OBSERVATIONS/2)
    ),
  Lymphocyte_count = rnorm(N_OBSERVATIONS)
)
row.names(ann_dt) <- colnames(geData)
print(ann_dt)
      Group Lymphocyte_count
     <char>            <num>
 1: Disease      -1.08979285
 2: Disease      -0.26269777
 3: Disease       0.01970315
 4: Disease      -0.43202829
 5: Disease      -2.85344727
 6: Control      -0.72119582
 7: Control       0.19682254
 8: Control       0.93658749
 9: Control       0.04796364
10: Control      -0.23197666

create a data.table for feature/row annotation

gene_functions_dt <- data.table(
  gene_functions = rep(c('Oxidative_phosphorylation',
                         'Cell_cycle',
                         'Immune_regulation',
                         'Signal_transduction',
                         'Transcription'), rep(N_GENES/5, 5))
)
row.names(gene_functions_dt) <- rownames(geData)
print(gene_functions_dt)
               gene_functions
                       <char>
 1: Oxidative_phosphorylation
 2: Oxidative_phosphorylation
 3: Oxidative_phosphorylation
 4: Oxidative_phosphorylation
 5: Oxidative_phosphorylation
 6: Oxidative_phosphorylation
 7: Oxidative_phosphorylation
 8: Oxidative_phosphorylation
 9: Oxidative_phosphorylation
10: Oxidative_phosphorylation
11:                Cell_cycle
12:                Cell_cycle
13:                Cell_cycle
14:                Cell_cycle
15:                Cell_cycle
16:                Cell_cycle
17:                Cell_cycle
18:                Cell_cycle
19:                Cell_cycle
20:                Cell_cycle
21:         Immune_regulation
22:         Immune_regulation
23:         Immune_regulation
24:         Immune_regulation
25:         Immune_regulation
26:         Immune_regulation
27:         Immune_regulation
28:         Immune_regulation
29:         Immune_regulation
30:         Immune_regulation
31:       Signal_transduction
32:       Signal_transduction
33:       Signal_transduction
34:       Signal_transduction
35:       Signal_transduction
36:       Signal_transduction
37:       Signal_transduction
38:       Signal_transduction
39:       Signal_transduction
40:       Signal_transduction
41:             Transcription
42:             Transcription
43:             Transcription
44:             Transcription
45:             Transcription
46:             Transcription
47:             Transcription
48:             Transcription
49:             Transcription
50:             Transcription
               gene_functions

create a list of classification colour schemes for various annotations

Reminder: named character vectors (or named vectors) allow you to associate names with values in a vector. This feature is useful when you want to access or refer to elements by name rather than by their position (index) in the vector.

ann_colors <- list(
  
  gene_functions = c(
    "Oxidative_phosphorylation" = "#F46D43",
    "Cell_cycle" = "#708238",
    "Immune_regulation" = "#9E0142",
    "Signal_transduction" = "beige",
    "Transcription" = "violet"),
  
  DCGroup = c("Disease" = "darkgreen",
              "Control" = "blueviolet"),
  
  Lymphocyte_count = brewer.pal(5, 'PuBu')
)

Basic Heatmap

heat_plot <- pheatmap(geData, 
                      col = brewer.pal(10, 'RdYlGn'), # choose a colour scale for your data
                      cluster_rows = T, cluster_cols = T, # set to FALSE if you want to remove the dendograms
                      clustering_distance_cols = 'euclidean',
                      clustering_distance_rows = 'euclidean',
                      clustering_method = 'ward.D',
                      annotation_row = gene_functions_dt, # row (gene) annotations
                      annotation_col = ann_dt, # column (sample) annotations
                      annotation_colors = ann_colors, # colours for your annotations
                      annotation_names_row = F, 
                      annotation_names_col = F,
                      fontsize_row = 10,          # row label font size
                      fontsize_col = 7,          # column label font size 
                      angle_col = 45, # sample names at an angle
                      legend_breaks = c(-2, 0, 2), # legend customisation
                      legend_labels = c("Low", "Medium", "High"), # legend customisation
                      show_colnames = T, show_rownames = F, # displaying column and row names
                      main = "simulated gene expressions")

simulated gene expressions