ДЗ 17. Консенсусное дерево (линейная дендрограмма)

if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  knitr::opts_knit$set(root.dir = dirname(rstudioapi::getActiveDocumentContext()$path))
}

library(ape)
library(purrr)
library(stringr)

has_ggsci <- requireNamespace("ggsci", quietly = TRUE)

path <- "table_with_frequencies.txt"
if (!file.exists(path)) {
  stop(
    "Файл не найден: ", path,
    "\nПапка Knit: ", getwd(),
    "\nФайлы рядом: ", paste(list.files(), collapse = ", ")
  )
}

freq <- read.table(
  file = path,
  header = TRUE,
  sep = "",
  quote = "\"",
  check.names = FALSE,
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8",
  comment.char = ""
)

names(freq)[1] <- "word"

freq[-1] <- lapply(freq[-1], function(x) as.numeric(gsub(",", ".", as.character(x))))

mx_words_by_docs <- as.matrix(freq[,-1])
rownames(mx_words_by_docs) <- freq$word

mx_docs_by_words <- t(mx_words_by_docs)
rownames(mx_docs_by_words) <- colnames(freq)[-1]
colnames(mx_docs_by_words) <- freq$word

cat("документов =", nrow(mx_docs_by_words), "; слов =", ncol(mx_docs_by_words), "\n")

## документов = 29 ; слов = 5000

df <- as.data.frame(mx_docs_by_words)

mfw_pool <- min(1500, ncol(df))
word_score <- colMeans(df, na.rm = TRUE)
top_idx <- order(word_score, decreasing = TRUE)[1:mfw_pool]

df_mfw <- df[, top_idx, drop = FALSE]
df_mfw <- as.data.frame(scale(df_mfw))

cat("пул предикторов =", ncol(df_mfw), "\n")

## пул предикторов = 1500

get_tree <- function(df, size = 120) {
  size <- min(size, ncol(df))
  X <- df[, sample(ncol(df), replace = FALSE, size = size), drop = FALSE]
  distmx <- dist(X, method = "manhattan")
  as.phylo(hclust(distmx, method = "average"))
}

set.seed(123)
n_trees <- 200
k_words <- 120

trees_result <- map(1:n_trees, ~ get_tree(df_mfw, size = k_words))

par(mfrow = c(2, 2), mar = c(2, 2, 2, 6), cex = 0.75)
walk(
  trees_result[1:4],
  plot.phylo,
  type = "phylogram",
  use.edge.length = FALSE,
  edge.width = 1.2,
  font = 2
)

cons <- consensus(trees_result, p = 0.5, rooted = FALSE)
cons$node.label <- round(as.numeric(cons$node.label), 2)

authors <- str_remove(cons$tip.label, "_.+")
uniq_auth <- unique(authors)

if (has_ggsci) {
  pal <- ggsci::pal_igv()(length(uniq_auth))
} else {
  pal <- grDevices::hcl.colors(length(uniq_auth), palette = "Dark 3")
}
tip_cols <- pal[match(authors, uniq_auth)]

par(mfrow = c(1, 1), mar = c(2, 2, 3, 10), cex = 0.85)

plot.phylo(
  cons,
  type = "phylogram",
  use.edge.length = FALSE,
  edge.width = 1.6,
  font = 2,
  tip.color = tip_cols,
  main = "Консенсусное дерево (p = 0.5)\nчастотности «Тихий Дон» и современная проза"
)

nodelabels(
  text = sprintf("%.2f", cons$node.label),
  node = (1:cons$Nnode) + Ntip(cons),
  frame = "none",
  cex = 0.7,
  adj = c(1.2, -0.2)
)

legend(
  "topleft",
  inset = c(-0.25, 0),
  xpd = TRUE,
  legend = uniq_auth,
  col = pal,
  pch = 19,
  bty = "n",
  cex = 0.85
)

Островский 1 и 2 объединяются с 0.98 → части «Как закалялась сталь» очень близки и почти всегда вместе. Серафимович 1 и 2 объединяются с 1.00 → ещё более устойчиво. Поднятая целина 1 и 2 объединяются с 1.00 → идеально стабильно. «Dubia Тихий Дон» фрагменты собираются в блок рядом с другими шолоховскими текстами, но не все связи одинаково устойчивы (есть 0.60 / 0.62 / 0.80 и т.п.) → это нормальная история: разные куски «Тихого Дона» могут отличаться, а кластеризация зависит от того, какие слова попали в подвыборку. На основе таблицы частотностей построен «лес» из 200 дендрограмм по случайным подвыборкам предикторов (120 слов из пула топ-1500). Далее рассчитано консенсусное дерево при p = 0.5. Подписи на узлах показывают долю деревьев, в которых соответствующий кластер воспроизводится. Наиболее устойчивые пары (например, части одного произведения) имеют поддержку 0.98–1.00, что подтверждает корректность метода на данных; менее устойчивые связи (≈0.5–0.6) отражают зависимость группировок от выбора предикторов и меньшую «жёсткость границ между кластерами. ```

ДЗ 17. Консенсусное дерево (линейная дендрограмма)

Итыгилова Юмжит, ЦМГН251

2026-02-15