Capítulo 7: Clustering aglomerativo

library("factoextra")

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library("cluster")
# Load the data
data("USArrests")
# Standardize the data
df <- scale(USArrests)
# Show the first 6 rows
head(df, nrow = 6)

##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292
## Colorado   0.02571456 0.3988593  0.8608085  1.864967207

Debido a medidas similares se computa la “dissimilarity” en la matriz de valores del data frame.

# df = the standardized data
res.dist <- dist(df, method = "euclidean")
as.matrix(res.dist)[1:6, 1:6]

##             Alabama   Alaska  Arizona Arkansas California Colorado
## Alabama    0.000000 2.703754 2.293520 1.289810   3.263110 2.651067
## Alaska     2.703754 0.000000 2.700643 2.826039   3.012541 2.326519
## Arizona    2.293520 2.700643 0.000000 2.717758   1.310484 1.365031
## Arkansas   1.289810 2.826039 2.717758 0.000000   3.763641 2.831051
## California 3.263110 3.012541 1.310484 3.763641   0.000000 1.287619
## Colorado   2.651067 2.326519 1.365031 2.831051   1.287619 0.000000

Se emplea a función hclust() para realizar el dendrograma.

# hclust() can be used as follow:
res.hc <- hclust(d = res.dist, method = "ward.D2")
# Dendrogram
# cex: label size
fviz_dend(res.hc, cex = 0.5)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Posterior a esto se verifica el árbol del cluster.

# Verify the cluster tree
# Compute cophentic distance
res.coph <- cophenetic(res.hc)
# Correlation between cophenetic distance and
# the original distance
cor(res.dist, res.coph)

## [1] 0.6975266

Ante esto se realiza lo siguiente para confirmar el valor adecuado.

res.hc2 <- hclust(res.dist, method = "average")
cor(res.dist, cophenetic(res.hc2))

## [1] 0.7180382

Se corta el dendrograma en diferentes grupos, más especificamente, en cuatro grupos.

# Cut tree into 4 groups
grp <- cutree(res.hc, k = 4)
head(grp, n = 4)

##  Alabama   Alaska  Arizona Arkansas 
##        1        2        2        3

También, se realiza una tabla con los grupos.

# Number of members in each cluster
table(grp)

## grp
##  1  2  3  4 
##  7 12 19 12

Y se plantean los nombres de cada uno de los grupos.

# Get the names for the members of cluster 1
rownames(df)[grp == 1]

## [1] "Alabama"        "Georgia"        "Louisiana"      "Mississippi"   
## [5] "North Carolina" "South Carolina" "Tennessee"

Se realiza una gráfica teniendo en cuenta lo realizado anteriormente.

# Cut in 4 groups and color by groups
fviz_dend(res.hc, k = 4, # Cut in four groups
cex = 0.5, # label size
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, # color labels by groups
rect = TRUE # Add rectangle around groups
)

También se desarrolla otra manera de visualización de datos empleando la función fviz_cluster()

# Using the function fviz_cluster()
fviz_cluster(list(data = df, cluster = grp),
palette = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
ellipse.type = "convex", # Concentration ellipse
repel = TRUE, # Avoid label overplotting (slow)
show.clust.cent = FALSE, ggtheme = theme_minimal())

El resultado final es un Agglomerative nesting (Hierarical clustering) lo cual se presenta a como un dendrograma.

# Cluster R package
# Agglomerative Nesting (Hierarchical Clustering)
res.agnes <- agnes(x = USArrests, # data matrix
stand = TRUE, # Standardize the data
metric = "euclidean", # metric for distance matrix
method = "ward" # Linkage method
)
# DIvisive ANAlysis Clustering
res.diana <- diana(x = USArrests, # data matrix
stand = TRUE, # standardize the data
metric = "euclidean" # metric for distance matrix
)
# After running agnes() and diana(), you can use the function fviz_dend()[in factoextra] to visualize the output:
fviz_dend(res.agnes, cex = 0.6, k = 4)

Capítulo 8: clostering aglomerativo

Se hace una preparación de los datos por medio de un dataframe, además, se hace un subset que contenga 10 columnas para posteriormente plantear una comparación entre dendrogramas con su respectiva gráfica.

#install.packages("dendextend")
#install.packages("corrplot")
library(dendextend)

## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

library(corrplot)

## corrplot 0.92 loaded

#Data preparation
df <- scale(USArrests)
# Subset containing 10 rows
set.seed(123)
ss <- sample(1:50, 10)
df <- df[ss,]
#Comparing dendrograms
# Compute distance matrix
res.dist <- dist(df, method = "euclidean")
# Compute 2 hierarchical clusterings
hc1 <- hclust(res.dist, method = "average")
hc2 <- hclust(res.dist, method = "ward.D2")
# Create two dendrograms
dend1 <- as.dendrogram (hc1)
dend2 <- as.dendrogram (hc2)
# Create a list to hold dendrograms
dend_list <- dendlist(dend1, dend2)
#Visual comparison of two dendrograms
#Draw a tanglegram:
tanglegram(dend1, dend2)

Al haber realizado ahora se genera el mismo gráfico pero cambiando sus características estéticas de la siguiente forma.

#Customized the tanglegram using many other options as follow:
tanglegram(dend1, dend2,
highlight_distinct_edges = FALSE, # Turn-off dashed lines
common_subtrees_color_lines = FALSE, # Turn-off line colors
common_subtrees_color_branches = TRUE, # Color common branches
main = paste("Entanglement =", round(entanglement(dend_list), 2))
)

Se consigue la matriz de correlación.

#Correlation matrix between a list of dendrograms
# Cophenetic correlation matrix
cor.dendlist(dend_list, method = "cophenetic")

##           [,1]      [,2]
## [1,] 1.0000000 0.9925544
## [2,] 0.9925544 1.0000000

También se crea una matriz de correlación con el método de Baker

# Baker correlation matrix
cor.dendlist(dend_list, method = "baker")

##           [,1]      [,2]
## [1,] 1.0000000 0.9895528
## [2,] 0.9895528 1.0000000

Se halla el coeficiente de correlación de Cophenetic

# Cophenetic correlation coefficient
cor_cophenetic(dend1, dend2)

## [1] 0.9925544

También se halla el coeficiente de correlación de Baker

# Baker correlation coefficient
cor_bakers_gamma(dend1, dend2)

## [1] 0.9895528

Entonces, con estos datos se comparan los dendogramas

#COMPARING DENDROGRAMS
# Create multiple dendrograms by chaining
dend1 <- df %>% dist %>% hclust("complete") %>% as.dendrogram
dend2 <- df %>% dist %>% hclust("single") %>% as.dendrogram
dend3 <- df %>% dist %>% hclust("average") %>% as.dendrogram
dend4 <- df %>% dist %>% hclust("centroid") %>% as.dendrogram
# Compute correlation matrix
dend_list <- dendlist("Complete" = dend1, "Single" = dend2,
"Average" = dend3, "Centroid" = dend4)
cors <- cor.dendlist(dend_list)
# Print correlation matrix
round(cors, 2)

##          Complete Single Average Centroid
## Complete     1.00   0.46    0.45     0.30
## Single       0.46   1.00    0.23     0.17
## Average      0.45   0.23    1.00     0.31
## Centroid     0.30   0.17    0.31     1.00

Se visualiza la matriz de correlación usando una función del paquete corrplot

# Visualize the correlation matrix using corrplot package
corrplot(cors, "pie", "lower")

Capítulo 9: Visualización de Dendogramas

En este capítulo se explican y presentan métodos para gráficar o presentar dendogramas de distintas clases.

#install.packages("igraph")
library(factoextra)
library(dendextend)
library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

# Load data
data(USArrests)
# Compute distances and hierarchical clustering
dd <- dist(scale(USArrests), method = "euclidean")
hc <- hclust(dd, method = "ward.D2")
#Visualizing dendrograms
fviz_dend(hc, cex = 0.5)

Es posible usar los argumentos de main, sub, xlab y ylab para cambiar las características como se muestra a continuación.

fviz_dend(hc, cex = 0.5,
main = "Dendrogram - ward.D2",
xlab = "Objects", ylab = "Distance", sub = "")

También se puede visualizar de manera horizontal.

#To draw a horizontal dendrogram, type this:
fviz_dend(hc, cex = 0.5, horiz = TRUE)

También es posible cortar el árbol a una altura determinada para particionar los datos en varios grupos, como se describe en el capítulo anterior: Agrupación jerárquica (Capítulo 7). En este caso, es posible colorear ramas alrededor de cada grupo.

#To change the plot theme
fviz_dend(hc, k = 4, # Cut in four groups
cex = 0.5, # label size
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, # color labels by groups
ggtheme = theme_gray() # Change theme
)

Para un cambio estético, en el código a continuación, se cambiaran los colores del grupo usando la paleta de colores “jco” (journal of clinical oncology):

#
fviz_dend(hc, cex = 0.5, k = 4, # Cut in four groups
k_colors = "jco")

También se puede representar con cuadros y de manera horizontal.

fviz_dend(hc, k = 4, cex = 0.4, horiz = TRUE, k_colors = "jco",
rect = TRUE, rect_border = "jco", rect_fill = TRUE)

Además, se puece crear un dendograma horizontal añdiendo la opción _circular” al código.

fviz_dend(hc, cex = 0.5, k = 4,
k_colors = "jco", type = "circular")

Otras formas de visualización como la siguiente son posibles.

fviz_dend(hc, k = 4, k_colors = "jco",
type = "phylogenic", repel = TRUE)

# Case of dendrogram with large data sets
#Zooming in the dendrogram
fviz_dend(hc, xlim = c(1, 20), ylim = c(1, 8))

#Plotting a sub-tree of dendrograms
# Create a plot of the whole dendrogram,
# and extract the dendrogram data
dend_plot <- fviz_dend(hc, k = 4, # Cut in four groups
cex = 0.5, # label size
k_colors = "jco"
)
dend_data <- attr(dend_plot, "dendrogram") # Extract dendrogram data
# Cut the dendrogram at height h = 10
dend_cuts <- cut(dend_data, h = 10)
# Visualize the truncated version containing
# two branches
fviz_dend(dend_cuts$upper)

## Warning in min(-diff(our_dend_heights)): no non-missing arguments to min;
## returning Inf

A pesar de lo realizado anteriormente ahora se presenta el digarama completo.

# Plot the whole dendrogram
print(dend_plot)

Ahora se realiza el primer sub-árbol.

# Plot subtree 1
fviz_dend(dend_cuts$lower[[1]], main = "Subtree 1")

A continuación, de la misma manera, se presenta el segundo sub-árbol.

# Plot subtree 2
fviz_dend(dend_cuts$lower[[2]], main = "Subtree 2")

Esto se puede hacer de manera circular también.

#You can also plot circular trees as follow:
fviz_dend(dend_cuts$lower[[2]], type = "circular")

Con el paquete dendextend también es posible manipular y crear dendogramas. A continuación se presenta un ejemplo de como realizar esto.

#Manipulating dendrograms using dendextend
#Standard R code for creating a dendrogram:
data <- scale(USArrests)
dist.res <- dist(data)
hc <- hclust(dist.res, method = "ward.D2")
dend <- as.dendrogram(hc)
plot(dend)

#R code for creating a dendrogram using chaining operator
library(dendextend)
dend <- USArrests[1:5,] %>% # data
scale %>% # Scale the data
dist %>% # calculate a distance matrix,
hclust(method = "ward.D2") %>% # Hierarchical clustering
as.dendrogram # Turn the object into a dendrogram.
plot(dend)

Otro ejemplo del uso de dendextend se presenta aquí.

#Example
library(dendextend)
# 1. Create a customized dendrogram
mycols <- c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07")
dend <- as.dendrogram(hc) %>%
set("branches_lwd", 1) %>% # Branches line width
set("branches_k_color", mycols, k = 4) %>% # Color branches by groups
set("labels_colors", mycols, k = 4) %>% # Color labels by groups
set("labels_cex", 0.5) # Change label size
# 2. Create plot
fviz_dend(dend)

Capítulo 10: Heatmap, estadísticas interactivas

Como en los casos anteriores se preparan los datos y se descargan los paquetes correspondientes.

df <- scale(mtcars)

# Default plot
heatmap(df, scale = "none")

Se pueden añadir distintos colores de la siguiente forma

col<- colorRampPalette(c("red", "white", "blue"))(256)

#install.packages("RColorBrewer")
library("RColorBrewer")
col <- colorRampPalette(brewer.pal(10, "RdYlBu"))(256)
# Use RColorBrewer color palette names
library("RColorBrewer")
col <- colorRampPalette(brewer.pal(10, "RdYlBu"))(256)
heatmap(df, scale = "none", col =  col, 
        RowSideColors = rep(c("blue", "pink"), each = 16),
        ColSideColors = c(rep("purple", 5), rep("orange", 6)))

También es posible utilizar la función del paquete ggplot llamada heatmap.2()

# install.packages("gplots")
library("gplots")

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

heatmap.2(df, scale = "none", col = bluered(100), 
          trace = "none", density.info = "none")

Al igual que en los casos anteriores es posible mejorar la imagen de estos mapas por medio de la función pheatmap() como se muestra a continuación.

#install.packages("pheatmap")
library("pheatmap")
pheatmap(df, cutree_rows = 4)

También es posible usar el paquete dentextend para mejorar los heatmaps

library(dendextend)
# order for rows
Rowv  <- mtcars %>% scale %>% dist %>% hclust %>% as.dendrogram %>%
   set("branches_k_color", k = 3) %>% set("branches_lwd", 1.2) %>%
   ladderize
# Order for columns: We must transpose the data
Colv  <- mtcars %>% scale %>% t %>% dist %>% hclust %>% as.dendrogram %>%
   set("branches_k_color", k = 2, value = c("orange", "blue")) %>%
   set("branches_lwd", 1.2) %>%
   ladderize

En la anteriores líneas de código se organizó la información, a continuación se realizará el gráfico.

library(gplots)
heatmap.2(scale(mtcars), scale = "none", col = bluered(100), 
          Rowv = Rowv, Colv = Colv,
          trace = "none", density.info = "none")

Para realizar heatmaps complejos es necesario empleear la siguiente línea de código.

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install("ComplexHeatmap")

library(ComplexHeatmap)

## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.16.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## ! pheatmap() has been masked by ComplexHeatmap::pheatmap(). Most of the arguments
##    in the original pheatmap() are identically supported in the new function. You 
##    can still use the original function by explicitly calling pheatmap::pheatmap().

## 
## Attaching package: 'ComplexHeatmap'

## The following object is masked from 'package:pheatmap':
## 
##     pheatmap

Heatmap(df, 
        name = "mtcars", #title of legend
        column_title = "Variables", row_title = "Samples",
        row_names_gp = gpar(fontsize = 7) # Text size for row names
        )

library(circlize)

## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================

## 
## Attaching package: 'circlize'

## The following object is masked from 'package:igraph':
## 
##     degree

mycols <- colorRamp2(breaks = c(-2, 0, 2), 
                    colors = c("green", "white", "red"))
Heatmap(df, name = "mtcars", col = mycols)

library("circlize")
library("RColorBrewer")
Heatmap(df, name = "mtcars",
        col = colorRamp2(c(-2, 0, 2), brewer.pal(n=3, name="RdBu")))

library(dendextend)
row_dend = hclust(dist(df)) # row clustering
col_dend = hclust(dist(t(df))) # column clustering
Heatmap(df, name = "mtcars", 
        row_names_gp = gpar(fontsize = 6.5),
        cluster_rows = color_branches(row_dend, k = 4),
        cluster_columns = color_branches(col_dend, k = 2))

Para dividir en dos o más parte un heatmap el procedimiento a seguir es el siguiente.

# Divide into 2 groups
set.seed(2)
Heatmap(df, name = "mtcars", k = 2)

# split by a vector specifying rowgroups
Heatmap(df, name = "mtcars", split = mtcars$cyl,
        row_names_gp = gpar(fontsize = 7))

También es posible hacer particiones en una serie definida de la siguiente forma.

# Split by combining multiple variables
Heatmap(df, name ="mtcars", 
        split = data.frame(cyl = mtcars$cyl, am = mtcars$am),
        row_names_gp = gpar(fontsize = 7))

Es posible realizar una o una serie de anotaciones dentro del “heatmap”.

# Define some graphics to display the distribution of columns
.hist = anno_histogram(df, gp = gpar(fill = "lightblue"))
.density = anno_density(df, type = "line", gp = gpar(col = "blue"))
ha_mix_top = HeatmapAnnotation(
  hist = .hist, density = .density,
  height = unit(3.8, "cm")
  )
# Define some graphics to display the distribution of rows
.violin = anno_density(df, type = "violin", 
                       gp = gpar(fill = "lightblue"), which = "row")
.boxplot = anno_boxplot(df, which = "row")
ha_mix_right = HeatmapAnnotation(violin = .violin, bxplt = .boxplot,
                              which = "row", width = unit(4, "cm"))
# Combine annotation with heatmap
Heatmap(df, name = "mtcars", 
        column_names_gp = gpar(fontsize = 8),
        top_annotation = ha_mix_top) + ha_mix_right

Es posible combinar varios heatmaps por medio del siguiente código.

# Heatmap 1
ht1 = Heatmap(df, name = "ht1", km = 2,
              column_names_gp = gpar(fontsize = 9))
# Heatmap 2
ht2 = Heatmap(df, name = "ht2", 
        col = circlize::colorRamp2(c(-2, 0, 2), c("green", "white", "red")),
        column_names_gp = gpar(fontsize = 9))
# Combine the two heatmaps
ht1 + ht2

Además de lo visto anteriormente también es factible realizar una matriz de expresión génica. Esto significa que en los datos de expresión génica las filas son genes y las columans son muestras. Esto implica que se puede adjuntar más información sobre los genes después del mapa de calor de expresiones, como la longitud del gent y el tipo de genes.

expr <- readRDS(paste0(system.file(package = "ComplexHeatmap"),
                      "/extdata/gene_expression.rds"))
mat <- as.matrix(expr[, grep("cell", colnames(expr))])
type <- gsub("s\\d+_", "", colnames(mat))
ha = HeatmapAnnotation(
  df = data.frame(type = type),
   annotation_height = unit(4, "mm")
  )

Heatmap(mat, name = "expression", km = 5, top_annotation = ha,
    show_row_names = FALSE, show_column_names = FALSE) +
Heatmap(expr$length, name = "length", width = unit(5, "mm"),
    col = circlize::colorRamp2(c(0, 100000), c("white", "orange"))) +
Heatmap(expr$type, name = "type", width = unit(5, "mm")) +
Heatmap(expr$chr, name = "chr", width = unit(5, "mm"),
    col = circlize::rand_color(length(unique(expr$chr))))

## There are 23 unique colors in the vector `col` and 23 unique values in
## `matrix`. `Heatmap()` will treat it as an exact discrete one-to-one
## mapping. If this is not what you want, slightly change the number of
## colors, e.g. by adding one more color or removing a color.

Cuarto informe - Capítulos 7 al 10

2023-05-28

Capítulo 7: Clustering aglomerativo

Capítulo 8: clostering aglomerativo

Capítulo 9: Visualización de Dendogramas

Capítulo 10: Heatmap, estadísticas interactivas