This R tutorial describes how to compute and visualize a correlation matrix using R software and ggplot2 package.
The columns of arms_data includes the GISTIC2 Z-scores of amp or deletion chromosomal arms for 6 cancer types.
We first get the correlation tables for amp and deletion separately.
library(Jerry)
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Warning: replacing previous import 'ggplot2::annotate' by 'ggpmisc::annotate'
## when loading 'Jerry'
library(tidyverse)
data(arms_data)
#For AMP, Get the correlation tables.
arms_data_dist_amp = arms_data %>%
select(starts_with("Amp")) %>%
#t() %>%
cor() %>%
as.matrix()
#For Del, Get the correlation tables.
arms_data_dist_del = arms_data %>%
select(starts_with("Del")) %>%
#t() %>%
cor() %>%
as.matrix()
Note that, a correlation matrix has redundant information. We’ll use the functions below to set half of it to NA.
#set upper.triangle to be NA
arms_data_dist_amp %>%
get_tri(diag = TRUE, upper.tri = TRUE)
## set the upper tri to be NA
## Amp_ACA Amp_NEC Amp_CIN Amp_EBV Amp_GS Amp_MSI
## Amp_ACA NA NA NA NA NA NA
## Amp_NEC 0.8971477 NA NA NA NA NA
## Amp_CIN 0.7313157 0.6677643 NA NA NA NA
## Amp_EBV 0.6380635 0.6225627 0.7994932 NA NA NA
## Amp_GS 0.6683191 0.6172034 0.7019668 0.6668315 NA NA
## Amp_MSI 0.4838802 0.5112640 0.5480767 0.5583757 0.8330379 NA
arms_data_long_amp = arms_data_dist_amp %>%
get_tri(diag = TRUE, upper.tri = TRUE) %>%
reshape2::melt(na.rm = TRUE) %>%
mutate(
Var1 = mapply(function(x) x[2], str_split(Var1, "_") ),
Var2 = mapply(function(x) x[2], str_split(Var2, "_") )
)
## set the upper tri to be NA
head(arms_data_long_amp)
## Var1 Var2 value
## 2 NEC ACA 0.8971477
## 3 CIN ACA 0.7313157
## 4 EBV ACA 0.6380635
## 5 GS ACA 0.6683191
## 6 MSI ACA 0.4838802
## 9 CIN NEC 0.6677643
arms_data_long_amp = arms_data_long_amp %>%
mutate(Var1 = factor(Var1, levels = c("ACA","NEC","CIN","EBV","GS","MSI"),
labels = c("MANEC-Aca","MANEC-Nec","STAD-CIN","STAD-EBV","STAD-GS","STAD-MSI")
),
Var2 = factor(Var2, levels = c("ACA","NEC","CIN","EBV","GS","MSI"),
labels = c("MANEC-Aca","MANEC-Nec","STAD-CIN","STAD-EBV","STAD-GS","STAD-MSI"))
)
# Create a ggheatmap
ggheatmap <- ggplot(arms_data_long_amp, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "grey100",
midpoint = 0.5, limit = c(0, 0.9), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 10, hjust = 1),
axis.text.y = element_text(
size = 10)
)+
coord_fixed() +
geom_rect(xmin = 0.5, xmax = 3.5, ymin = 0.5, ymax = 3.5, col = "black", alpha = 0)
ggheatmap
ggheatmap +
geom_text(aes(Var2, Var1, label = round(value, 2) ), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
#legend.justification = c(1, 0),
#legend.position = "none"
)
Here, we plot the amp and del in the upper and lower tri-angele, thus presenting the data into one full heatmap figure.
#set upper.triangle to be NA
arms_data_dist_amp %>%
get_tri(diag = TRUE, upper.tri = TRUE)
## set the upper tri to be NA
## Amp_ACA Amp_NEC Amp_CIN Amp_EBV Amp_GS Amp_MSI
## Amp_ACA NA NA NA NA NA NA
## Amp_NEC 0.8971477 NA NA NA NA NA
## Amp_CIN 0.7313157 0.6677643 NA NA NA NA
## Amp_EBV 0.6380635 0.6225627 0.7994932 NA NA NA
## Amp_GS 0.6683191 0.6172034 0.7019668 0.6668315 NA NA
## Amp_MSI 0.4838802 0.5112640 0.5480767 0.5583757 0.8330379 NA
arms_data_long_amp = arms_data_dist_amp %>%
get_tri(diag = TRUE, upper.tri = TRUE) %>%
reshape2::melt(na.rm = TRUE) %>%
mutate(
Var1 = mapply(function(x) x[2], str_split(Var1, "_") ),
Var2 = mapply(function(x) x[2], str_split(Var2, "_") )
)
## set the upper tri to be NA
head(arms_data_long_amp)
## Var1 Var2 value
## 2 NEC ACA 0.8971477
## 3 CIN ACA 0.7313157
## 4 EBV ACA 0.6380635
## 5 GS ACA 0.6683191
## 6 MSI ACA 0.4838802
## 9 CIN NEC 0.6677643
#same for del.
#set upper.triangle to be NA
arms_data_dist_del %>%
get_tri(diag = TRUE, lower.tri = TRUE)
## set the lower tri to be NA
## Del_ACA Del_NEC Del_CIN Del_EBV Del_GS Del_MSI
## Del_ACA NA 0.7399628 0.5359301 0.1106895 0.3296669 0.29815390
## Del_NEC NA NA 0.5443698 0.1922782 0.2971650 0.14946408
## Del_CIN NA NA NA 0.4490654 0.5694798 0.51634555
## Del_EBV NA NA NA NA 0.5038875 0.06097154
## Del_GS NA NA NA NA NA 0.30906615
## Del_MSI NA NA NA NA NA NA
arms_data_long_del = arms_data_dist_del %>%
get_tri(diag = TRUE, lower.tri = TRUE) %>%
reshape2::melt(na.rm = TRUE) %>%
mutate(
Var1 = mapply(function(x) x[2], str_split(Var1, "_") ),
Var2 = mapply(function(x) x[2], str_split(Var2, "_") )
)
## set the lower tri to be NA
head(arms_data_long_del)
## Var1 Var2 value
## 7 ACA NEC 0.7399628
## 13 ACA CIN 0.5359301
## 14 NEC CIN 0.5443698
## 19 ACA EBV 0.1106895
## 20 NEC EBV 0.1922782
## 21 CIN EBV 0.4490654
arms_data_long = rbind( arms_data_long_amp, arms_data_long_del) %>%
mutate(Var1 = factor(Var1, levels = c("ACA","NEC","CIN","EBV","GS","MSI"),
labels = c("MANEC-Aca","MANEC-Nec","STAD-CIN","STAD-EBV","STAD-GS","STAD-MSI")
),
Var2 = factor(Var2, levels = c("ACA","NEC","CIN","EBV","GS","MSI"),
labels = c("MANEC-Aca","MANEC-Nec","STAD-CIN","STAD-EBV","STAD-GS","STAD-MSI"))
)
# Create a ggheatmap
ggheatmap <- ggplot(arms_data_long, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "grey100",
midpoint = 0.5, limit = c(0, 0.9), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 10, hjust = 1),
axis.text.y = element_text(
size = 10)
)+
coord_fixed() +
geom_rect(xmin = 0.5, xmax = 3.5, ymin = 0.5, ymax = 3.5, col = "black", alpha = 0)
ggheatmap
ggheatmap +
geom_text(aes(Var2, Var1, label = round(value, 2) ), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
#legend.justification = c(1, 0),
#legend.position = "none"
)