Capítulo 8: clostering aglomerativo

Se hace una preparación de los datos por medio de un dataframe, además, se hace un subset que contenga 10 columnas para posteriormente plantear una comparación entre dendrogramas con su respectiva gráfica.

#install.packages("dendextend")
#install.packages("corrplot")
library(dendextend)

## Warning: package 'dendextend' was built under R version 4.2.3

## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.2.3

## corrplot 0.92 loaded

#Data preparation
df <- scale(USArrests)
# Subset containing 10 rows
set.seed(123)
ss <- sample(1:50, 10)
df <- df[ss,]
#Comparing dendrograms
# Compute distance matrix
res.dist <- dist(df, method = "euclidean")
# Compute 2 hierarchical clusterings
hc1 <- hclust(res.dist, method = "average")
hc2 <- hclust(res.dist, method = "ward.D2")
# Create two dendrograms
dend1 <- as.dendrogram (hc1)
dend2 <- as.dendrogram (hc2)
# Create a list to hold dendrograms
dend_list <- dendlist(dend1, dend2)
#Visual comparison of two dendrograms
#Draw a tanglegram:
tanglegram(dend1, dend2)

Al haber realizado ahora se genera el mismo gráfico pero cambiando sus características estéticas de la siguiente forma.

#Customized the tanglegram using many other options as follow:
tanglegram(dend1, dend2,
highlight_distinct_edges = FALSE, # Turn-off dashed lines
common_subtrees_color_lines = FALSE, # Turn-off line colors
common_subtrees_color_branches = TRUE, # Color common branches
main = paste("Entanglement =", round(entanglement(dend_list), 2))
)

Se consigue la matriz de correlación.

#Correlation matrix between a list of dendrograms
# Cophenetic correlation matrix
cor.dendlist(dend_list, method = "cophenetic")

##           [,1]      [,2]
## [1,] 1.0000000 0.9925544
## [2,] 0.9925544 1.0000000

También se crea una matriz de correlación con el método de Baker

# Baker correlation matrix
cor.dendlist(dend_list, method = "baker")

##           [,1]      [,2]
## [1,] 1.0000000 0.9895528
## [2,] 0.9895528 1.0000000

Se halla el coeficiente de correlación de Cophenetic

# Cophenetic correlation coefficient
cor_cophenetic(dend1, dend2)

## [1] 0.9925544

También se halla el coeficiente de correlación de Baker

# Baker correlation coefficient
cor_bakers_gamma(dend1, dend2)

## [1] 0.9895528

Entonces, con estos datos se comparan los dendogramas

#COMPARING DENDROGRAMS
# Create multiple dendrograms by chaining
dend1 <- df %>% dist %>% hclust("complete") %>% as.dendrogram
dend2 <- df %>% dist %>% hclust("single") %>% as.dendrogram
dend3 <- df %>% dist %>% hclust("average") %>% as.dendrogram
dend4 <- df %>% dist %>% hclust("centroid") %>% as.dendrogram
# Compute correlation matrix
dend_list <- dendlist("Complete" = dend1, "Single" = dend2,
"Average" = dend3, "Centroid" = dend4)
cors <- cor.dendlist(dend_list)
# Print correlation matrix
round(cors, 2)

##          Complete Single Average Centroid
## Complete     1.00   0.46    0.45     0.30
## Single       0.46   1.00    0.23     0.17
## Average      0.45   0.23    1.00     0.31
## Centroid     0.30   0.17    0.31     1.00

Se visualiza la matriz de correlación usando una función del paquete corrplot

# Visualize the correlation matrix using corrplot package
corrplot(cors, "pie", "lower")

Capítulo 9: Visualización de Dendogramas

En este capítulo se explican y presentan métodos para gráficar o presentar dendogramas de distintas clases.

#install.packages("igraph")
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.2.3

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(dendextend)
library(igraph)

## Warning: package 'igraph' was built under R version 4.2.3

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

# Load data
data(USArrests)
# Compute distances and hierarchical clustering
dd <- dist(scale(USArrests), method = "euclidean")
hc <- hclust(dd, method = "ward.D2")
#Visualizing dendrograms
fviz_dend(hc, cex = 0.5)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.

Es posible usar los argumentos de main, sub, xlab y ylab para cambiar las características como se muestra a continuación.

fviz_dend(hc, cex = 0.5,
main = "Dendrogram - ward.D2",
xlab = "Objects", ylab = "Distance", sub = "")

También se puede visualizar de manera horizontal.

#To draw a horizontal dendrogram, type this:
fviz_dend(hc, cex = 0.5, horiz = TRUE)

También es posible cortar el árbol a una altura determinada para particionar los datos en varios grupos, como se describe en el capítulo anterior: Agrupación jerárquica (Capítulo 7). En este caso, es posible colorear ramas alrededor de cada grupo.

#To change the plot theme
fviz_dend(hc, k = 4, # Cut in four groups
cex = 0.5, # label size
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, # color labels by groups
ggtheme = theme_gray() # Change theme
)

Para un cambio estético, en el código a continuación, se cambiaran los colores del grupo usando la paleta de colores “jco” (journal of clinical oncology):

#
fviz_dend(hc, cex = 0.5, k = 4, # Cut in four groups
k_colors = "jco")

También se puede representar con cuadros y de manera horizontal.

fviz_dend(hc, k = 4, cex = 0.4, horiz = TRUE, k_colors = "jco",
rect = TRUE, rect_border = "jco", rect_fill = TRUE)

Además, se puece crear un dendograma horizontal añdiendo la opción _circular” al código.

fviz_dend(hc, cex = 0.5, k = 4,
k_colors = "jco", type = "circular")

Otras formas de visualización como la siguiente son posibles.

fviz_dend(hc, k = 4, k_colors = "jco",
type = "phylogenic", repel = TRUE)

# Case of dendrogram with large data sets
#Zooming in the dendrogram
fviz_dend(hc, xlim = c(1, 20), ylim = c(1, 8))

#Plotting a sub-tree of dendrograms
# Create a plot of the whole dendrogram,
# and extract the dendrogram data
dend_plot <- fviz_dend(hc, k = 4, # Cut in four groups
cex = 0.5, # label size
k_colors = "jco"
)
dend_data <- attr(dend_plot, "dendrogram") # Extract dendrogram data
# Cut the dendrogram at height h = 10
dend_cuts <- cut(dend_data, h = 10)
# Visualize the truncated version containing
# two branches
fviz_dend(dend_cuts$upper)

## Warning in min(-diff(our_dend_heights)): ningún argumento finito para min;
## retornando Inf

A pesar de lo realizado anteriormente ahora se presenta el digarama completo.

# Plot the whole dendrogram
print(dend_plot)

Ahora se realiza el primer sub-árbol.

# Plot subtree 1
fviz_dend(dend_cuts$lower[[1]], main = "Subtree 1")

A continuación, de la misma manera, se presenta el segundo sub-árbol.

# Plot subtree 2
fviz_dend(dend_cuts$lower[[2]], main = "Subtree 2")

Esto se puede hacer de manera circular también.

#You can also plot circular trees as follow:
fviz_dend(dend_cuts$lower[[2]], type = "circular")

Con el paquete dendextend también es posible manipular y crear dendogramas. A continuación se presenta un ejemplo de como realizar esto.

#Manipulating dendrograms using dendextend
#Standard R code for creating a dendrogram:
data <- scale(USArrests)
dist.res <- dist(data)
hc <- hclust(dist.res, method = "ward.D2")
dend <- as.dendrogram(hc)
plot(dend)

#R code for creating a dendrogram using chaining operator
library(dendextend)
dend <- USArrests[1:5,] %>% # data
scale %>% # Scale the data
dist %>% # calculate a distance matrix,
hclust(method = "ward.D2") %>% # Hierarchical clustering
as.dendrogram # Turn the object into a dendrogram.
plot(dend)

Otro ejemplo del uso de dendextend se presenta aquí.

#Example
library(dendextend)
# 1. Create a customized dendrogram
mycols <- c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07")
dend <- as.dendrogram(hc) %>%
set("branches_lwd", 1) %>% # Branches line width
set("branches_k_color", mycols, k = 4) %>% # Color branches by groups
set("labels_colors", mycols, k = 4) %>% # Color labels by groups
set("labels_cex", 0.5) # Change label size
# 2. Create plot
fviz_dend(dend)

Capítulo 10: Heatmap, estadísticas interactivas

Como en los casos anteriores se preparan los datos y se descargan los paquetes correspondientes.

df <- scale(mtcars)

# Default plot
heatmap(df, scale = "none")

Se pueden añadir distintos colores de la siguiente forma

col<- colorRampPalette(c("red", "white", "blue"))(256)

#install.packages("RColorBrewer")
library("RColorBrewer")
col <- colorRampPalette(brewer.pal(10, "RdYlBu"))(256)
# Use RColorBrewer color palette names
library("RColorBrewer")
col <- colorRampPalette(brewer.pal(10, "RdYlBu"))(256)
heatmap(df, scale = "none", col =  col, 
        RowSideColors = rep(c("blue", "pink"), each = 16),
        ColSideColors = c(rep("purple", 5), rep("orange", 6)))

También es posible utilizar la función del paquete ggplot llamada heatmap.2()

# install.packages("gplots")
library("gplots")

## Warning: package 'gplots' was built under R version 4.2.3

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

heatmap.2(df, scale = "none", col = bluered(100), 
          trace = "none", density.info = "none")

Al igual que en los casos anteriores es posible mejorar la imagen de estos mapas por medio de la función pheatmap() como se muestra a continuación.

#install.packages("pheatmap")
library("pheatmap")

## Warning: package 'pheatmap' was built under R version 4.2.3

pheatmap(df, cutree_rows = 4)

También es posible usar el paquete dentextend para mejorar los heatmaps

library(dendextend)
# order for rows
Rowv  <- mtcars %>% scale %>% dist %>% hclust %>% as.dendrogram %>%
   set("branches_k_color", k = 3) %>% set("branches_lwd", 1.2) %>%
   ladderize
# Order for columns: We must transpose the data
Colv  <- mtcars %>% scale %>% t %>% dist %>% hclust %>% as.dendrogram %>%
   set("branches_k_color", k = 2, value = c("orange", "blue")) %>%
   set("branches_lwd", 1.2) %>%
   ladderize

En la anteriores líneas de código se organizó la información, a continuación se realizará el gráfico.

library(gplots)
heatmap.2(scale(mtcars), scale = "none", col = bluered(100), 
          Rowv = Rowv, Colv = Colv,
          trace = "none", density.info = "none")

Para realizar heatmaps complejos es necesario empleear la siguiente línea de código.

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("ComplexHeatmap")

library(ComplexHeatmap)

## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.14.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## ! pheatmap() has been masked by ComplexHeatmap::pheatmap(). Most of the arguments
##    in the original pheatmap() are identically supported in the new function. You 
##    can still use the original function by explicitly calling pheatmap::pheatmap().

## 
## Attaching package: 'ComplexHeatmap'

## The following object is masked from 'package:pheatmap':
## 
##     pheatmap

Heatmap(df, 
        name = "mtcars", #title of legend
        column_title = "Variables", row_title = "Samples",
        row_names_gp = gpar(fontsize = 7) # Text size for row names
        )

library(circlize)

## Warning: package 'circlize' was built under R version 4.2.3

## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================

## 
## Attaching package: 'circlize'

## The following object is masked from 'package:igraph':
## 
##     degree

mycols <- colorRamp2(breaks = c(-2, 0, 2), 
                    colors = c("green", "white", "red"))
Heatmap(df, name = "mtcars", col = mycols)

library("circlize")
library("RColorBrewer")
Heatmap(df, name = "mtcars",
        col = colorRamp2(c(-2, 0, 2), brewer.pal(n=3, name="RdBu")))

library(dendextend)
row_dend = hclust(dist(df)) # row clustering
col_dend = hclust(dist(t(df))) # column clustering
Heatmap(df, name = "mtcars", 
        row_names_gp = gpar(fontsize = 6.5),
        cluster_rows = color_branches(row_dend, k = 4),
        cluster_columns = color_branches(col_dend, k = 2))

Para dividir en dos o más parte un heatmap el procedimiento a seguir es el siguiente.

# Divide into 2 groups
set.seed(2)
Heatmap(df, name = "mtcars", k = 2)

# split by a vector specifying rowgroups
Heatmap(df, name = "mtcars", split = mtcars$cyl,
        row_names_gp = gpar(fontsize = 7))

También es posible hacer particiones en una serie definida de la siguiente forma.

# Split by combining multiple variables
Heatmap(df, name ="mtcars", 
        split = data.frame(cyl = mtcars$cyl, am = mtcars$am),
        row_names_gp = gpar(fontsize = 7))

Es posible realizar una o una serie de anotaciones dentro del “heatmap”.

# Define some graphics to display the distribution of columns
.hist = anno_histogram(df, gp = gpar(fill = "lightblue"))
.density = anno_density(df, type = "line", gp = gpar(col = "blue"))
ha_mix_top = HeatmapAnnotation(
  hist = .hist, density = .density,
  height = unit(3.8, "cm")
  )
# Define some graphics to display the distribution of rows
.violin = anno_density(df, type = "violin", 
                       gp = gpar(fill = "lightblue"), which = "row")
.boxplot = anno_boxplot(df, which = "row")
ha_mix_right = HeatmapAnnotation(violin = .violin, bxplt = .boxplot,
                              which = "row", width = unit(4, "cm"))
# Combine annotation with heatmap
Heatmap(df, name = "mtcars", 
        column_names_gp = gpar(fontsize = 8),
        top_annotation = ha_mix_top) + ha_mix_right

Es posible combinar varios heatmaps por medio del siguiente código.

# Heatmap 1
ht1 = Heatmap(df, name = "ht1", km = 2,
              column_names_gp = gpar(fontsize = 9))
# Heatmap 2
ht2 = Heatmap(df, name = "ht2", 
        col = circlize::colorRamp2(c(-2, 0, 2), c("green", "white", "red")),
        column_names_gp = gpar(fontsize = 9))
# Combine the two heatmaps
ht1 + ht2

Además de lo visto anteriormente también es factible realizar una matriz de expresión génica. Esto significa que en los datos de expresión génica las filas son genes y las columans son muestras. Esto implica que se puede adjuntar más información sobre los genes después del mapa de calor de expresiones, como la longitud del gent y el tipo de genes.

expr <- readRDS(paste0(system.file(package = "ComplexHeatmap"),
                      "/extdata/gene_expression.rds"))
mat <- as.matrix(expr[, grep("cell", colnames(expr))])
type <- gsub("s\\d+_", "", colnames(mat))
ha = HeatmapAnnotation(
  df = data.frame(type = type),
   annotation_height = unit(4, "mm")
  )

Heatmap(mat, name = "expression", km = 5, top_annotation = ha,
    show_row_names = FALSE, show_column_names = FALSE) +
Heatmap(expr$length, name = "length", width = unit(5, "mm"),
    col = circlize::colorRamp2(c(0, 100000), c("white", "orange"))) +
Heatmap(expr$type, name = "type", width = unit(5, "mm")) +
Heatmap(expr$chr, name = "chr", width = unit(5, "mm"),
    col = circlize::rand_color(length(unique(expr$chr))))

Cuarto informe - Capítulos 7 al 10

2023-05-28

Capítulo 8: clostering aglomerativo

Capítulo 9: Visualización de Dendogramas

Capítulo 10: Heatmap, estadísticas interactivas