library("factoextra")
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library("cluster")
# Load the data
data("USArrests")
# Standardize the data
df <- scale(USArrests)
# Show the first 6 rows
head(df, nrow = 6)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
Debido a medidas similares se computa la “dissimilarity” en la matriz de valores del data frame.
# df = the standardized data
res.dist <- dist(df, method = "euclidean")
as.matrix(res.dist)[1:6, 1:6]
## Alabama Alaska Arizona Arkansas California Colorado
## Alabama 0.000000 2.703754 2.293520 1.289810 3.263110 2.651067
## Alaska 2.703754 0.000000 2.700643 2.826039 3.012541 2.326519
## Arizona 2.293520 2.700643 0.000000 2.717758 1.310484 1.365031
## Arkansas 1.289810 2.826039 2.717758 0.000000 3.763641 2.831051
## California 3.263110 3.012541 1.310484 3.763641 0.000000 1.287619
## Colorado 2.651067 2.326519 1.365031 2.831051 1.287619 0.000000
Se emplea a funciĂłn hclust() para realizar el dendrograma.
# hclust() can be used as follow:
res.hc <- hclust(d = res.dist, method = "ward.D2")
# Dendrogram
# cex: label size
fviz_dend(res.hc, cex = 0.5)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Posterior a esto se verifica el árbol del cluster.
# Verify the cluster tree
# Compute cophentic distance
res.coph <- cophenetic(res.hc)
# Correlation between cophenetic distance and
# the original distance
cor(res.dist, res.coph)
## [1] 0.6975266
Ante esto se realiza lo siguiente para confirmar el valor adecuado.
res.hc2 <- hclust(res.dist, method = "average")
cor(res.dist, cophenetic(res.hc2))
## [1] 0.7180382
Se corta el dendrograma en diferentes grupos, más especificamente, en cuatro grupos.
# Cut tree into 4 groups
grp <- cutree(res.hc, k = 4)
head(grp, n = 4)
## Alabama Alaska Arizona Arkansas
## 1 2 2 3
También, se realiza una tabla con los grupos.
# Number of members in each cluster
table(grp)
## grp
## 1 2 3 4
## 7 12 19 12
Y se plantean los nombres de cada uno de los grupos.
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
## [1] "Alabama" "Georgia" "Louisiana" "Mississippi"
## [5] "North Carolina" "South Carolina" "Tennessee"
Se realiza una gráfica teniendo en cuenta lo realizado anteriormente.
# Cut in 4 groups and color by groups
fviz_dend(res.hc, k = 4, # Cut in four groups
cex = 0.5, # label size
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, # color labels by groups
rect = TRUE # Add rectangle around groups
)
También se desarrolla otra manera de visualización de datos empleando la función fviz_cluster()
# Using the function fviz_cluster()
fviz_cluster(list(data = df, cluster = grp),
palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
ellipse.type = "convex", # Concentration ellipse
repel = TRUE, # Avoid label overplotting (slow)
show.clust.cent = FALSE, ggtheme = theme_minimal())
El resultado final es un Agglomerative nesting (Hierarical clustering) lo cual se presenta a como un dendrograma.
# Cluster R package
# Agglomerative Nesting (Hierarchical Clustering)
res.agnes <- agnes(x = USArrests, # data matrix
stand = TRUE, # Standardize the data
metric = "euclidean", # metric for distance matrix
method = "ward" # Linkage method
)
# DIvisive ANAlysis Clustering
res.diana <- diana(x = USArrests, # data matrix
stand = TRUE, # standardize the data
metric = "euclidean" # metric for distance matrix
)
# After running agnes() and diana(), you can use the function fviz_dend()[in factoextra] to visualize the output:
fviz_dend(res.agnes, cex = 0.6, k = 4)
Se hace una preparación de los datos por medio de un dataframe, además, se hace un subset que contenga 10 columnas para posteriormente plantear una comparación entre dendrogramas con su respectiva gráfica.
#install.packages("dendextend")
#install.packages("corrplot")
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(corrplot)
## corrplot 0.92 loaded
#Data preparation
df <- scale(USArrests)
# Subset containing 10 rows
set.seed(123)
ss <- sample(1:50, 10)
df <- df[ss,]
#Comparing dendrograms
# Compute distance matrix
res.dist <- dist(df, method = "euclidean")
# Compute 2 hierarchical clusterings
hc1 <- hclust(res.dist, method = "average")
hc2 <- hclust(res.dist, method = "ward.D2")
# Create two dendrograms
dend1 <- as.dendrogram (hc1)
dend2 <- as.dendrogram (hc2)
# Create a list to hold dendrograms
dend_list <- dendlist(dend1, dend2)
#Visual comparison of two dendrograms
#Draw a tanglegram:
tanglegram(dend1, dend2)
Al haber realizado ahora se genera el mismo gráfico pero cambiando sus caracterĂsticas estĂ©ticas de la siguiente forma.
#Customized the tanglegram using many other options as follow:
tanglegram(dend1, dend2,
highlight_distinct_edges = FALSE, # Turn-off dashed lines
common_subtrees_color_lines = FALSE, # Turn-off line colors
common_subtrees_color_branches = TRUE, # Color common branches
main = paste("Entanglement =", round(entanglement(dend_list), 2))
)
Se consigue la matriz de correlaciĂłn.
#Correlation matrix between a list of dendrograms
# Cophenetic correlation matrix
cor.dendlist(dend_list, method = "cophenetic")
## [,1] [,2]
## [1,] 1.0000000 0.9925544
## [2,] 0.9925544 1.0000000
También se crea una matriz de correlación con el método de Baker
# Baker correlation matrix
cor.dendlist(dend_list, method = "baker")
## [,1] [,2]
## [1,] 1.0000000 0.9895528
## [2,] 0.9895528 1.0000000
Se halla el coeficiente de correlaciĂłn de Cophenetic
# Cophenetic correlation coefficient
cor_cophenetic(dend1, dend2)
## [1] 0.9925544
También se halla el coeficiente de correlación de Baker
# Baker correlation coefficient
cor_bakers_gamma(dend1, dend2)
## [1] 0.9895528
Entonces, con estos datos se comparan los dendogramas
#COMPARING DENDROGRAMS
# Create multiple dendrograms by chaining
dend1 <- df %>% dist %>% hclust("complete") %>% as.dendrogram
dend2 <- df %>% dist %>% hclust("single") %>% as.dendrogram
dend3 <- df %>% dist %>% hclust("average") %>% as.dendrogram
dend4 <- df %>% dist %>% hclust("centroid") %>% as.dendrogram
# Compute correlation matrix
dend_list <- dendlist("Complete" = dend1, "Single" = dend2,
"Average" = dend3, "Centroid" = dend4)
cors <- cor.dendlist(dend_list)
# Print correlation matrix
round(cors, 2)
## Complete Single Average Centroid
## Complete 1.00 0.46 0.45 0.30
## Single 0.46 1.00 0.23 0.17
## Average 0.45 0.23 1.00 0.31
## Centroid 0.30 0.17 0.31 1.00
Se visualiza la matriz de correlaciĂłn usando una funciĂłn del paquete corrplot
# Visualize the correlation matrix using corrplot package
corrplot(cors, "pie", "lower")
En este capĂtulo se explican y presentan mĂ©todos para gráficar o presentar dendogramas de distintas clases.
#install.packages("igraph")
library(factoextra)
library(dendextend)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# Load data
data(USArrests)
# Compute distances and hierarchical clustering
dd <- dist(scale(USArrests), method = "euclidean")
hc <- hclust(dd, method = "ward.D2")
#Visualizing dendrograms
fviz_dend(hc, cex = 0.5)
Es posible usar los argumentos de main, sub, xlab y ylab para cambiar las caracterĂsticas como se muestra a continuaciĂłn.
fviz_dend(hc, cex = 0.5,
main = "Dendrogram - ward.D2",
xlab = "Objects", ylab = "Distance", sub = "")
También se puede visualizar de manera horizontal.
#To draw a horizontal dendrogram, type this:
fviz_dend(hc, cex = 0.5, horiz = TRUE)
TambiĂ©n es posible cortar el árbol a una altura determinada para particionar los datos en varios grupos, como se describe en el capĂtulo anterior: AgrupaciĂłn jerárquica (CapĂtulo 7). En este caso, es posible colorear ramas alrededor de cada grupo.
#To change the plot theme
fviz_dend(hc, k = 4, # Cut in four groups
cex = 0.5, # label size
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, # color labels by groups
ggtheme = theme_gray() # Change theme
)
Para un cambio estético, en el código a continuación, se cambiaran los colores del grupo usando la paleta de colores “jco” (journal of clinical oncology):
#
fviz_dend(hc, cex = 0.5, k = 4, # Cut in four groups
k_colors = "jco")
También se puede representar con cuadros y de manera horizontal.
fviz_dend(hc, k = 4, cex = 0.4, horiz = TRUE, k_colors = "jco",
rect = TRUE, rect_border = "jco", rect_fill = TRUE)
Además, se puece crear un dendograma horizontal añdiendo la opción _circular” al código.
fviz_dend(hc, cex = 0.5, k = 4,
k_colors = "jco", type = "circular")
Otras formas de visualizaciĂłn como la siguiente son posibles.
fviz_dend(hc, k = 4, k_colors = "jco",
type = "phylogenic", repel = TRUE)
# Case of dendrogram with large data sets
#Zooming in the dendrogram
fviz_dend(hc, xlim = c(1, 20), ylim = c(1, 8))
#Plotting a sub-tree of dendrograms
# Create a plot of the whole dendrogram,
# and extract the dendrogram data
dend_plot <- fviz_dend(hc, k = 4, # Cut in four groups
cex = 0.5, # label size
k_colors = "jco"
)
dend_data <- attr(dend_plot, "dendrogram") # Extract dendrogram data
# Cut the dendrogram at height h = 10
dend_cuts <- cut(dend_data, h = 10)
# Visualize the truncated version containing
# two branches
fviz_dend(dend_cuts$upper)
## Warning in min(-diff(our_dend_heights)): no non-missing arguments to min;
## returning Inf
A pesar de lo realizado anteriormente ahora se presenta el digarama completo.
# Plot the whole dendrogram
print(dend_plot)
Ahora se realiza el primer sub-árbol.
# Plot subtree 1
fviz_dend(dend_cuts$lower[[1]], main = "Subtree 1")
A continuación, de la misma manera, se presenta el segundo sub-árbol.
# Plot subtree 2
fviz_dend(dend_cuts$lower[[2]], main = "Subtree 2")
Esto se puede hacer de manera circular también.
#You can also plot circular trees as follow:
fviz_dend(dend_cuts$lower[[2]], type = "circular")
Con el paquete dendextend también es posible manipular y crear dendogramas. A continuación se presenta un ejemplo de como realizar esto.
#Manipulating dendrograms using dendextend
#Standard R code for creating a dendrogram:
data <- scale(USArrests)
dist.res <- dist(data)
hc <- hclust(dist.res, method = "ward.D2")
dend <- as.dendrogram(hc)
plot(dend)
#R code for creating a dendrogram using chaining operator
library(dendextend)
dend <- USArrests[1:5,] %>% # data
scale %>% # Scale the data
dist %>% # calculate a distance matrix,
hclust(method = "ward.D2") %>% # Hierarchical clustering
as.dendrogram # Turn the object into a dendrogram.
plot(dend)
Otro ejemplo del uso de dendextend se presenta aquĂ.
#Example
library(dendextend)
# 1. Create a customized dendrogram
mycols <- c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07")
dend <- as.dendrogram(hc) %>%
set("branches_lwd", 1) %>% # Branches line width
set("branches_k_color", mycols, k = 4) %>% # Color branches by groups
set("labels_colors", mycols, k = 4) %>% # Color labels by groups
set("labels_cex", 0.5) # Change label size
# 2. Create plot
fviz_dend(dend)
Como en los casos anteriores se preparan los datos y se descargan los paquetes correspondientes.
df <- scale(mtcars)
# Default plot
heatmap(df, scale = "none")
Se pueden añadir distintos colores de la siguiente forma
col<- colorRampPalette(c("red", "white", "blue"))(256)
#install.packages("RColorBrewer")
library("RColorBrewer")
col <- colorRampPalette(brewer.pal(10, "RdYlBu"))(256)
# Use RColorBrewer color palette names
library("RColorBrewer")
col <- colorRampPalette(brewer.pal(10, "RdYlBu"))(256)
heatmap(df, scale = "none", col = col,
RowSideColors = rep(c("blue", "pink"), each = 16),
ColSideColors = c(rep("purple", 5), rep("orange", 6)))
También es posible utilizar la función del paquete ggplot llamada heatmap.2()
# install.packages("gplots")
library("gplots")
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
heatmap.2(df, scale = "none", col = bluered(100),
trace = "none", density.info = "none")
Al igual que en los casos anteriores es posible mejorar la imagen de estos mapas por medio de la funciĂłn pheatmap() como se muestra a continuaciĂłn.
#install.packages("pheatmap")
library("pheatmap")
pheatmap(df, cutree_rows = 4)
También es posible usar el paquete dentextend para mejorar los heatmaps
library(dendextend)
# order for rows
Rowv <- mtcars %>% scale %>% dist %>% hclust %>% as.dendrogram %>%
set("branches_k_color", k = 3) %>% set("branches_lwd", 1.2) %>%
ladderize
# Order for columns: We must transpose the data
Colv <- mtcars %>% scale %>% t %>% dist %>% hclust %>% as.dendrogram %>%
set("branches_k_color", k = 2, value = c("orange", "blue")) %>%
set("branches_lwd", 1.2) %>%
ladderize
En la anteriores lĂneas de cĂłdigo se organizĂł la informaciĂłn, a continuaciĂłn se realizará el gráfico.
library(gplots)
heatmap.2(scale(mtcars), scale = "none", col = bluered(100),
Rowv = Rowv, Colv = Colv,
trace = "none", density.info = "none")
Para realizar heatmaps complejos es necesario empleear la siguiente lĂnea de cĂłdigo.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("ComplexHeatmap")
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.16.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## ! pheatmap() has been masked by ComplexHeatmap::pheatmap(). Most of the arguments
## in the original pheatmap() are identically supported in the new function. You
## can still use the original function by explicitly calling pheatmap::pheatmap().
##
## Attaching package: 'ComplexHeatmap'
## The following object is masked from 'package:pheatmap':
##
## pheatmap
Heatmap(df,
name = "mtcars", #title of legend
column_title = "Variables", row_title = "Samples",
row_names_gp = gpar(fontsize = 7) # Text size for row names
)
library(circlize)
## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
##
## Attaching package: 'circlize'
## The following object is masked from 'package:igraph':
##
## degree
mycols <- colorRamp2(breaks = c(-2, 0, 2),
colors = c("green", "white", "red"))
Heatmap(df, name = "mtcars", col = mycols)
library("circlize")
library("RColorBrewer")
Heatmap(df, name = "mtcars",
col = colorRamp2(c(-2, 0, 2), brewer.pal(n=3, name="RdBu")))
library(dendextend)
row_dend = hclust(dist(df)) # row clustering
col_dend = hclust(dist(t(df))) # column clustering
Heatmap(df, name = "mtcars",
row_names_gp = gpar(fontsize = 6.5),
cluster_rows = color_branches(row_dend, k = 4),
cluster_columns = color_branches(col_dend, k = 2))
Para dividir en dos o más parte un heatmap el procedimiento a seguir es el siguiente.
# Divide into 2 groups
set.seed(2)
Heatmap(df, name = "mtcars", k = 2)
# split by a vector specifying rowgroups
Heatmap(df, name = "mtcars", split = mtcars$cyl,
row_names_gp = gpar(fontsize = 7))
También es posible hacer particiones en una serie definida de la siguiente forma.
# Split by combining multiple variables
Heatmap(df, name ="mtcars",
split = data.frame(cyl = mtcars$cyl, am = mtcars$am),
row_names_gp = gpar(fontsize = 7))
Es posible realizar una o una serie de anotaciones dentro del “heatmap”.
# Define some graphics to display the distribution of columns
.hist = anno_histogram(df, gp = gpar(fill = "lightblue"))
.density = anno_density(df, type = "line", gp = gpar(col = "blue"))
ha_mix_top = HeatmapAnnotation(
hist = .hist, density = .density,
height = unit(3.8, "cm")
)
# Define some graphics to display the distribution of rows
.violin = anno_density(df, type = "violin",
gp = gpar(fill = "lightblue"), which = "row")
.boxplot = anno_boxplot(df, which = "row")
ha_mix_right = HeatmapAnnotation(violin = .violin, bxplt = .boxplot,
which = "row", width = unit(4, "cm"))
# Combine annotation with heatmap
Heatmap(df, name = "mtcars",
column_names_gp = gpar(fontsize = 8),
top_annotation = ha_mix_top) + ha_mix_right
Es posible combinar varios heatmaps por medio del siguiente cĂłdigo.
# Heatmap 1
ht1 = Heatmap(df, name = "ht1", km = 2,
column_names_gp = gpar(fontsize = 9))
# Heatmap 2
ht2 = Heatmap(df, name = "ht2",
col = circlize::colorRamp2(c(-2, 0, 2), c("green", "white", "red")),
column_names_gp = gpar(fontsize = 9))
# Combine the two heatmaps
ht1 + ht2
Además de lo visto anteriormente también es factible realizar una matriz de expresión génica. Esto significa que en los datos de expresión génica las filas son genes y las columans son muestras. Esto implica que se puede adjuntar más información sobre los genes después del mapa de calor de expresiones, como la longitud del gent y el tipo de genes.
expr <- readRDS(paste0(system.file(package = "ComplexHeatmap"),
"/extdata/gene_expression.rds"))
mat <- as.matrix(expr[, grep("cell", colnames(expr))])
type <- gsub("s\\d+_", "", colnames(mat))
ha = HeatmapAnnotation(
df = data.frame(type = type),
annotation_height = unit(4, "mm")
)
Heatmap(mat, name = "expression", km = 5, top_annotation = ha,
show_row_names = FALSE, show_column_names = FALSE) +
Heatmap(expr$length, name = "length", width = unit(5, "mm"),
col = circlize::colorRamp2(c(0, 100000), c("white", "orange"))) +
Heatmap(expr$type, name = "type", width = unit(5, "mm")) +
Heatmap(expr$chr, name = "chr", width = unit(5, "mm"),
col = circlize::rand_color(length(unique(expr$chr))))
## There are 23 unique colors in the vector `col` and 23 unique values in
## `matrix`. `Heatmap()` will treat it as an exact discrete one-to-one
## mapping. If this is not what you want, slightly change the number of
## colors, e.g. by adding one more color or removing a color.