apedendextendlabels_cex(dend) <- value, set(dend, "labels_cex", value)labels_colors(dend, labels)by_labels_brances_col, labels_colorsset(dend, "branches_k_color", value)by_labels_branches_*dendextend::rotatestats::distを用いて距離行列を作成する場合euclidean[default], maximum, manhattan, canberra, binary, minkowskiから選択.
binaryではjaccard係数による距離行列(1-jaccard係数).hclustはdistオブジェクトを第一引数に受け取り階層的クラスタリングを実行する.
ward.D, ward.D2, single, complete[default], average(= UPGMA), mcquitty(= WPGMA), median(= WPGMC) centroid (= UPGMC)の中から選択.amap::Distを用いて距離行列を作成する場合.euclidean[default], maximum, manhattan, canberra, binary, pearson, abspearson, correlation, abscorrelation, spearman, kendallから選択amap::Distのメソッドpearsonは1-cosine類似度, correlationは1-pearsonの積率相関係数であることに注意# クラスタリング. 単位が変数毎に異なるのでcorrelationにした.
distm = "correlation"
clm = "average"
cl_cor <- hclust(amap::Dist(as.matrix(mtcars), method = distm), method = clm)
# hclust object
str(cl_cor)## List of 7
## $ merge : int [1:31, 1:2] -12 -1 -14 -10 -5 -7 -22 -15 -16 -4 ...
## $ height : num [1:31] 5.32e-06 5.43e-06 1.78e-05 3.44e-05 5.95e-05 ...
## $ order : int [1:32] 28 30 31 19 20 18 26 8 4 6 ...
## $ labels : chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## $ method : chr "average"
## $ call : language hclust(d = amap::Dist(as.matrix(mtcars), method = distm), method = clm)
## $ dist.method: chr "correlation"
## - attr(*, "class")= chr "hclust"
proxy::distを用いて距離行列を作る場合.summary(pr_DB)で一覧が見えるproxy::dist, 類似度行列はproxy::siml## * Similarity measures:
## Braun-Blanquet, Chi-squared, correlation, cosine, Cramer, Dice,
## eDice, eJaccard, Fager, Faith, Gower, Hamman, Jaccard,
## Kulczynski1, Kulczynski2, Michael, Mountford, Mozley, Ochiai,
## Pearson, Phi, Phi-squared, Russel, simple matching, Simpson,
## Stiles, Tanimoto, Tschuprow, Yule, Yule2
##
## * Distance measures:
## Bhjattacharyya, Bray, Canberra, Chord, divergence, Euclidean,
## fJaccard, Geodesic, Hellinger, Kullback, Levenshtein, Mahalanobis,
## Manhattan, Minkowski, Podani, Soergel, supremum, Wave, Whittaker
stats::dist(method="binary"), amap::Dist(method="binary")はいずれもjaccard係数による距離行列が作られている# サンプルデータ
x <- list(v1 = c("a", "b", "c", "d"), v2 = c("b", "d", "a", "c"),
v3 = c("a", "b", "c", "e"), v4 = c("b", "c", "e", "f"),
v5 = c("a", "e", "f", "g"), v6 = c("f", "h", "i", "j")
)
str(x)## List of 6
## $ v1: chr [1:4] "a" "b" "c" "d"
## $ v2: chr [1:4] "b" "d" "a" "c"
## $ v3: chr [1:4] "a" "b" "c" "e"
## $ v4: chr [1:4] "b" "c" "e" "f"
## $ v5: chr [1:4] "a" "e" "f" "g"
## $ v6: chr [1:4] "f" "h" "i" "j"
sparse_mat <- function(vlist, vorder = F) {
# unique elements of all sets
id <- sort(unique(unlist(vlist)))
# If 'vlist' consists of an ordered vector, the size of each set must be the same
if (vorder == T) {
# size of an set
vl <- unique(sapply(vlist, length))
# sparse matrix from a list consist of ordered vectors
if (length(vl) == 1) {
vdummy <- as.vector(outer(id, 1:vl, function(x, y) paste(x, y, sep = "_")))
binlist <- lapply(seq_along(vlist), function(i) {
as.vector(do.call(rbind, lapply(id, function(x) {
sapply(vlist[[i]], function(y) ifelse(y == x, 1, 0))
})))
})
dat <- data.frame(vdummy, do.call(cbind, binlist))
} else {
stop("argument 'vlist' consist from ordered vectors, which length must to be all the same.")
}
} else {# sparse matrix from a list consist of multiple sets
cnt_list <- lapply(vlist, function(x) {
sapply(seq_along(id), function(i) sum(x %in% id[i]))
})
dat <- data.frame(id, do.call(cbind, cnt_list), stringsAsFactors = F)
if (!is.null(names(vlist))) {
dat <- setNames(dat, c("id", names(vlist)))
}
}
# return data.frame
return(dat)
}
spmat1 <- sparse_mat(x, F)
spmat2 <- sparse_mat(x, T) | v1 | v2 | v3 | v4 | v5 | v6 |
|---|---|---|---|---|---|
| a | b | a | b | a | f |
| b | d | b | c | e | h |
| c | a | c | e | f | i |
| d | c | e | f | g | j |
| id | v1 | v2 | v3 | v4 | v5 | v6 |
|---|---|---|---|---|---|---|
| a | 1 | 1 | 1 | 0 | 1 | 0 |
| b | 1 | 1 | 1 | 1 | 0 | 0 |
| c | 1 | 1 | 1 | 1 | 0 | 0 |
| d | 1 | 1 | 0 | 0 | 0 | 0 |
| e | 0 | 0 | 1 | 1 | 1 | 0 |
| f | 0 | 0 | 0 | 1 | 1 | 1 |
| g | 0 | 0 | 0 | 0 | 1 | 0 |
| h | 0 | 0 | 0 | 0 | 0 | 1 |
| i | 0 | 0 | 0 | 0 | 0 | 1 |
| j | 0 | 0 | 0 | 0 | 0 | 1 |
| vdummy | X1 | X2 | X3 | X4 | X5 | X6 |
|---|---|---|---|---|---|---|
| a_1 | 1 | 0 | 1 | 0 | 1 | 0 |
| b_1 | 0 | 1 | 0 | 1 | 0 | 0 |
| c_1 | 0 | 0 | 0 | 0 | 0 | 0 |
| d_1 | 0 | 0 | 0 | 0 | 0 | 0 |
| e_1 | 0 | 0 | 0 | 0 | 0 | 0 |
| f_1 | 0 | 0 | 0 | 0 | 0 | 1 |
| g_1 | 0 | 0 | 0 | 0 | 0 | 0 |
| h_1 | 0 | 0 | 0 | 0 | 0 | 0 |
| i_1 | 0 | 0 | 0 | 0 | 0 | 0 |
| j_1 | 0 | 0 | 0 | 0 | 0 | 0 |
| a_2 | 0 | 0 | 0 | 0 | 0 | 0 |
| b_2 | 1 | 0 | 1 | 0 | 0 | 0 |
qdapTools::mtabulateを使う.| a | b | c | d | e | f | g | h | i | j | |
|---|---|---|---|---|---|---|---|---|---|---|
| v1 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| v2 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| v3 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| v4 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
| v5 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
| v6 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 |
cutree目的のクラスタ数k,または枝長(h)を指定して、グループ分け.dendextend::cutreeは stats::cutreeとほぼ同じだが、デンドログラムのラベル順に出力できる.
order_clusters_as_data = Fleaflab “perpendicular” ,“textlike”, “none”horiz, 水平に書くか否か。type “rectangle”か“triangle”. 省略して“r”,“t”でも良いnodePar ノードのパラメータnodePar = list(pch, cex, col, xpd, bg)のようにリストで与えるedgePar エッジのパラメータedgePar = list(col, lty, lwd, p.col, p.lwd, p.lty, t.col) nodeParとedgeParいずれも、内部ノードと末端ノードを区別する場合に2つの値をpch=c(inner nodes, leaves)のように書く。# dendrogramクラスオブジェクトに変換
cl <- hclust(amap::Dist(mtcars, "spearman"), "average")
den <- as.dendrogram(cl)
# dendrogramクラスオブジェクトのplot. ?plot.dendrogram
par(mfrow = c(1,2))
plot(den,
type = "r",
horiz = T,
nodePar = list(pch = NA, lab.cex = 0.5),
edgePar = list(col = 1:2, lty = 1:2, lwd = c(1,3))
# dLeaf = 1, # leafラベルとの距離
# edge.root = TRUE
)
plot(den, horiz = T, leaflab = "none",
nodePar = list(pch = c(2,20), cex = c(0.5, 1), col = c(2,4))
)apeape::as.phylophyloクラスオブジェクトに変換# クラスタリング
cl <- hclust(amap::Dist(mtcars, "spearman"), "average")
# hclustオブジェクトもしくはdendrogramオブジェクトをphyloクラスオブジェクトに変換
phy <- as.phylo(cl)
str(phy)## List of 4
## $ edge : int [1:62, 1:2] 33 34 38 38 53 58 58 53 59 59 ...
## $ edge.length: num [1:62] 6.86 3.39 1.5 0.5 1 ...
## $ tip.label : chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## $ Nnode : int 31
## - attr(*, "class")= chr "phylo"
## - attr(*, "order")= chr "cladewise"
# デンドログラム描画
types <- c("phylogram", "cladogram","fan", "unrooted","radial")
ncols <- sub(3,4, cutree(cl, 3))
par(mfrow = c(3,2), mar = c(0,0,0,0))
invisible(lapply(types, function(x){
plot.phylo(phy,
type = x,
label.offset = 0.1,
cex = 0.7,
show.tip.label = T,
edge.width = 2,
font = 4,
edge.lty = 1,
adj = 0,
tip.color = ncols
)
}))
# デンドログラム描画 ラベル追加
phy$tip.label <- sprintf("t%d", 1:32)
plot.phylo(phy, label.offset = 0.5)
nodelabels(cex=0.4)
edgelabels(cex = 0.5)
tiplabels(cex = 0.5)# par(mfrow=c(1,2))
# plot.phylo(phy, show.tip.label = F)
# tiplabels(frame="circle", adj = 0,cex=0.1)
# tiplabels(labels(phy), col = 2, bg = NULL, adj=c(1,1))
# nodelabels(cex=0.5, horiz = T)
# edgelabels(cex = 0.5)
#
# # rotate
# rot.phy <- rotate(phy, node=41)
# plot.phylo(rot.phy, show.tip.label = F)
# nodelabels(cex=0.5, horiz = T)
# edgelabels(cex = 0.5)dendextendlabels(den)デンドログラムのラベルlabels_colors, labels_cexbranches_k_color, branches_lwd, by_labels_branches_col# irisデータ(行ラベルをユニークなものに変える)
dat <- iris %>%
group_by(Species) %>%
mutate(n_gp = row_number()) %>%
ungroup() %>%
mutate(Species = paste0(substr(Species, 1, 3), n_gp)) %>%
tibble::column_to_rownames("Species") %>%
select(1:4)| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | |
|---|---|---|---|---|
| set1 | 5.1 | 3.5 | 1.4 | 0.2 |
| set2 | 4.9 | 3.0 | 1.4 | 0.2 |
| set3 | 4.7 | 3.2 | 1.3 | 0.2 |
| set4 | 4.6 | 3.1 | 1.5 | 0.2 |
| set5 | 5.0 | 3.6 | 1.4 | 0.2 |
| set6 | 5.4 | 3.9 | 1.7 | 0.4 |
labels_cex(dend) <- value, set(dend, "labels_cex", value)dendextend::nleaves(den)末端ノード数, dendextend::nnodes(den) ノードの数labels_cex(den) ラベル文字サイズ# dendrogramオブジェクト
cl_iris <- hclust(dist(dat, "euclidean"), "average")
den <- as.dendrogram(cl_iris)
# set で変更する場合(Virginicaのみ0.7に変更)
cex.den <- ifelse(substr(labels(den), 1, 3) == "vir", 0.7, 0.5)
den1 <- den %>% dendextend::set("labels_cex", cex.den)
# labels_cex()で変更する場合
den2 <- den
labels_cex(den2) <- cex.den
# プロット
par(mfrow = c(2,1))
plot(den1); plot(den2)labels_colors(dend, labels)stats::order.dendrogram()デンドログラムの末端ノードラベルの順番dendextend::labels_colors(dend, labels) ラベル色# 全ノードのカラーコード
col3 <- RColorBrewer::brewer.pal(3, "Set1")
lab.col <- col3[iris$Species][order.dendrogram(den1)]
# setを使う場合
den3 <- den1 %>% dendextend::set("labels_colors", value = lab.col)
# labels_colorsを使う場合
den4 <- den1
labels_colors(den4) <- ifelse(substr(labels(den), 1,3) == "vir", "magenta","grey")
par(mfrow = c(2,1))
plot(den3); plot(den4) by_labels_brances_col, labels_colorsby_labels_branches_colで書き換え# ラベルの種類でbranchの色分け ----
sp <- split(labels(den), factor(substr(labels(den),1,3))) # カテゴリ別のラベルリスト
col3 <- brewer.pal(3, "Set1") # カテゴリに対応したカラーコード
# mapplyを使ってデンドログラムオブジェクトにエッジカラーを追加していく
den5 <- den1
invisible(mapply(function(x, y){
den5 <<- dendextend::set(den5, "by_labels_branches_col", value = x, TF_values = y)
}, x = sp, y = col3))
# ラベルの色を追加
den5 <- den5 %>% dendextend::set("labels_colors", value = lab.col)
# プロット
plot(den5)
legend("topright", legend = c("setosa", "versicolor", "virginica"),
box.lty = 0, bg = "transparent", pch = 15, col = col3)set(dend, "branches_k_color", value)stats::cutree()ではなく、dendextend::cutreeを使ってカラーコードのベクトルを作る.order_clusters_as_data = Fとするとleafラベル順のクラスタ番号がわかる# クラスター数(k)でラベルとブランチの色を変更
col.cl3 <- col3[dendextend::cutree(cl_iris, k = 3, order_clusters_as_data = F)]
den1 %>%
dendextend::set("labels_colors", value = col.cl3) %>%
dendextend::set("branches_k_color", k = 3, col3) %>%
plot
legend("topright",
legend = c("cluster1", "cluster2", "cluster3"),
box.lty = 0, bg = "transparent", pch = 15, col = col3)by_labels_branches_*by_labels_branches_col branch色を指定ラベルと色コードで指定by_labels_branches_lwd branch幅, by_labels_branches_lty 線種# 指定のラベル及びbranchの色を変更 (cluster3に含まれるvirginicaのみ)----
ct <- cutree(cl_iris, k = 3, order_clusters_as_data = F) # クラスタ番号(ラベル順)
cl3.vir <- names(ct)[ct == 3 & grepl("vir", names(ct))] # ブランチ色を指定したいノード
col.cl3 <- labels_col(den5) # クラスタ別の色ベクトル
col.cl3[which(names(ct) %in% cl3.vir)] <- "magenta" # 指定ラベルの色を変更
den5 %>%
dendextend::set("by_labels_branches_col", value = cl3.vir, TF_values = "magenta") %>%
dendextend::set("by_labels_branches_lwd", value = cl3.vir, TF_values = 2) %>%
dendextend::set("by_labels_branches_lty", value = cl3.vir, TF_values = 2) %>%
dendextend::set("labels_colors", value = col.cl3) %>%
plot
legend("topright",
legend = c("setosa", "versicolor", "virginica_cl2", "virginica_cl3"),
box.lty = 0, bg = "transparent", pch = 15, col = append(col3, "magenta", after = 2))nodes_col, nodes_pch, nodes_cexget_nodes_attr(den, attribute="nodePar")[[2]]# Internal node(内部節)の色・形・サイズ。 場所はstr(den)で確認。
v.nc <- ifelse(1:nnodes(den) %in% c(2,102), "darkorange",
ifelse(1:nnodes(den) %in% 173, "magenta", NA))
v.np <- ifelse(1:nnodes(den) %in% c(2,102), 16,
ifelse(1:nnodes(den) %in% 173, 17, NA))
# ノードのサイズにクラスタメンバ数を反映させる
ncex <- ceiling(table(ct)/20)
v.ncx <- ifelse(1:nnodes(den) %in% 2, ncex[1],
ifelse(1:nnodes(den) %in% 102, ncex[2],
ifelse(1:nnodes(den) %in% 173, ncex[3], NA)))
# setを使って変更
den5 %>%
dendextend::set("nodes_col", v.nc) %>%
dendextend::set("nodes_pch", v.np) %>%
dendextend::set("nodes_cex", v.ncx) %>%
plot dendextend::rotateden5 %>%
dendextend::rotate(c(labels(cl_iris)[ct == 2 | ct == 3], labels(cl_iris)[ct == 1])) %>%
plotprune 指定データを取り除くleaves_pch, leaves_col, labelslab.col3 <- lab.col[!labels(den) %in% cl3.vir]
dendextend::prune(den5, leaves = cl3.vir) %>%
dendextend::set("nodes_col", v.nc) %>%
dendextend::set("nodes_pch", v.np) %>%
dendextend::set("nodes_cex", v.ncx) %>%
dendextend::set("leaves_col",lab.col3) %>%
dendextend::set("leaves_pch", 17) %>%
dendextend::set("leaves_cex", 0.7) %>%
dendextend::set("labels", c(rep(NA, 150))) %>%
plot# labels - set the labels (using labels<-.dendrogram)
# labels_colors - set the labels’ colors (using color_labels)
# labels_cex - set the labels’ size (using assign_values_to_leaves_nodePar)
# labels_to_character - set the labels’ to be characters
# leaves_pch - set the leaves’ point type (using assign_values_to_leaves_nodePar)
# leaves_cex - set the leaves’ point size (using assign_values_to_leaves_nodePar)
# leaves_col - set the leaves’ point color (using assign_values_to_leaves_nodePar)
# nodes_pch - set the nodes’ point type (using assign_values_to_nodes_nodePar)
# nodes_cex - set the nodes’ point size (using assign_values_to_nodes_nodePar)
# nodes_col - set the nodes’ point color (using assign_values_to_nodes_nodePar)
# hang_leaves - hang the leaves (using hang.dendrogram)
# branches_k_color - color the branches (using color_branches)
# branches_col - set the color of branches (using assign_values_to_branches_edgePar)
# branches_lwd - set the line width of branches (using assign_values_to_branches_edgePar)
# branches_lty - set the line type of branches (using assign_values_to_branches_edgePar)
# by_labels_branches_col - set the color of branches with specific labels (using branches_attr_by_labels)
# by_labels_branches_lwd - set the line width of branches with specific labels (using branches_attr_by_labels)
# by_labels_branches_lty - set the line type of branches with specific labels (using branches_attr_by_labels)
# clear_branches - clear branches’ attributes (using remove_branches_edgePar)
# clear_leaves - clear leaves’ attributes (using remove_branches_edgePar)# iris各種20個体づつ
set.seed(2)
dat <- iris %>%
group_by(Species) %>%
sample_n(size = 20, replace = F) %>%
mutate(n_gp = row_number()) %>%
ungroup() %>%
mutate(Species = paste0(substr(Species, 1, 3), n_gp)) %>%
tibble::column_to_rownames("Species") %>%
select(1:4)
# クラスタリング
dm1 <- "euclidean"; dm2 <- "manhattan"
den1 <- as.dendrogram(hclust(amap::Dist(dat, dm1), "average"))
den2 <- as.dendrogram(hclust(amap::Dist(dat, dm2), "average"))
dlist <- dendextend::dendlist(den1, den2)
# タングルグラム
xlm <- signif(max(attributes(den1)$height, attributes(den2)$height), 2)
dendextend::tanglegram(dlist,
which = 2:1,
lab.cex = 0.8,
edge.lwd = 2,
margin_inner = 5,
center = T,
color_lines = "gray80",
common_subtrees_color_branches = F,
dLeaf = -0.1,
xlim = c(xlm, 0),
k_branches = 3,
main_left = dm2,
main_right = dm1)# 種ごとにleafとedgeの色分けする関数
sp <- split(labels(den1), factor(substr(labels(den1),1,3)))
ecol <- RColorBrewer::brewer.pal(3, "Set1")
lcol <- ecol[factor(substr(rownames(dat), 1, 3))]
colored_den <- function(den, llab, ecol, lcol){
invisible(mapply(function(x, y){
den <<- dendextend::set(den, "by_labels_branches_col", value = x, TF_values = y)
}, x = llab, y = ecol))
lcol <- lcol[order.dendrogram(den)]
den <- dendextend::set(den, "labels_colors", value = lcol)
return(den)
}
c_den1 <- colored_den(den1, sp, ecol, lcol)
c_den2 <- colored_den(den2, sp, ecol, lcol)
dlist <- dendextend::dendlist(c_den1, c_den2)
# タングルグラム
dendextend::tanglegram(dlist,
which = 2:1,
lab.cex = 1,
edge.lwd = 2,
margin_inner = 3,
center = T,
color_lines = "gray80",
common_subtrees_color_branches = F,
dLeaf = -0.1,
xlim = c(xlm, 0),
main_left = dm2,
main_right = dm1)## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tibble_2.1.3 dendextend_1.12.0 RColorBrewer_1.1-2
## [4] ape_5.3 dplyr_0.8.1 proxy_0.4-23
## [7] amap_0.8-16
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 highr_0.8 plyr_1.8.4
## [4] pillar_1.4.2 compiler_3.5.2 bitops_1.0-6
## [7] qdapTools_1.3.3 viridis_0.5.1 tools_3.5.2
## [10] digest_0.6.19 viridisLite_0.3.0 evaluate_0.14
## [13] nlme_3.1-140 gtable_0.3.0 lattice_0.20-38
## [16] pkgconfig_2.0.2 rlang_0.4.0 rstudioapi_0.10
## [19] yaml_2.2.0 parallel_3.5.2 xfun_0.8
## [22] rsko_0.1.0 kableExtra_1.1.0 gridExtra_2.3
## [25] xml2_1.2.0 httr_1.4.0 stringr_1.4.0
## [28] knitr_1.23 hms_0.4.2 webshot_0.5.1
## [31] grid_3.5.2 tidyselect_0.2.5 data.table_1.12.2
## [34] glue_1.3.1 R6_2.4.0 rmarkdown_1.13
## [37] pacman_0.5.1 readr_1.3.1 purrr_0.3.2
## [40] ggplot2_3.2.0 magrittr_1.5 scales_1.0.0
## [43] htmltools_0.3.6 rvest_0.3.4 assertthat_0.2.1
## [46] colorspace_1.4-1 stringi_1.4.3 RCurl_1.95-4.12
## [49] lazyeval_0.2.2 munsell_0.5.0 chron_2.3-53
## [52] crayon_1.3.4