Консенсусные деревья

Author

Лесниченко Михаил

Published

February 15, 2026

# Загрузка библиотек
library(stylo)

Warning: пакет 'stylo' был собран под R версии 4.6.0


### stylo version: 0.7.5 ###

If you plan to cite this software (please do!), use the following reference:
    Eder, M., Rybicki, J. and Kestemont, M. (2016). Stylometry with R:
    a package for computational text analysis. R Journal 8(1): 107-121.
    <https://journal.r-project.org/archive/2016/RJ-2016-007/index.html>

To get full BibTeX entry, type: citation("stylo")

library(ape)

Warning: пакет 'ape' был собран под R версии 4.5.2

library(tidyverse)

Warning: пакет 'tidyverse' был собран под R версии 4.5.2

Warning: пакет 'ggplot2' был собран под R версии 4.5.2

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ dplyr::where()  masks ape::where()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

url <- "https://raw.githubusercontent.com/locusclassicus/text_analysis_2024/main/files/table_with_frequencies.txt"
df <- read.table(url, header = TRUE, row.names = 1, check.names = FALSE)

# Транспонируем: строки = тексты, столбцы = слова
df_t <- as.data.frame(t(df))

#Выбираем топ-20 слов с наибольшей дисперсией -------------------------
# Убираем возможные нечисловые колонки
numeric_cols <- df_t[, sapply(df_t, is.numeric)]
words_var <- apply(numeric_cols, 2, var, na.rm = TRUE)
top_words <- names(sort(words_var, decreasing = TRUE)[1:20])

# Матрица только с топ-словами
data_matrix <- df_t[, top_words]
rownames(data_matrix) <- rownames(df_t)

# Кластеризация --------------------------------------------------------
# Расстояние на основе корреляции
dist_matrix <- as.dist(1 - cor(t(data_matrix)))
hc <- hclust(dist_matrix, method = "ward.D2")

# Преобразуем в дерево для ape
phy <- as.phylo(hc)

# Извлекаем авторов
authors <- gsub("_.*", "", phy$tip.label)
unique_authors <- unique(authors)

# Цвета
colors <- c("red", "blue", "green3", "purple", "orange", "brown", "pink", 
            "turquoise", "magenta", "darkgreen", "navy", "gold")
names(colors) <- unique_authors[1:min(length(unique_authors), length(colors))]

# Рисуем дерево
plot(phy, 
     tip.color = colors[authors],
     main = "Консенсусное дерево (топ-20 слов)",
     font = 1,
     cex = 0.7,
     align.tip.label = TRUE)

legend("topleft", 
       legend = unique_authors,
       col = colors[unique_authors],
       pch = 15,
       cex = 0.7,
       title = "Авторы")

#Дерево построено на основе частотности 20 наиболее вариативных слов.
#Расстояние: 1 - корреляция Пирсона, метод кластеризации: Ward.D2.
#Цвета узлов соответствуют авторам. Тексты одного автора группируются вместе