consensys_tree

Author

Ксения Войтова

library(dendextend)

---------------------
Welcome to dendextend version 1.19.1
Type citation('dendextend') for how to cite the package.

Type browseVignettes(package = 'dendextend') for the package vignette.
The github page is: https://github.com/talgalili/dendextend/

Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
You may ask questions at stackoverflow, use the r and dendextend tags: 
     https://stackoverflow.com/questions/tagged/dendextend

    To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
---------------------

Attaching package: 'dendextend'
The following object is masked from 'package:stats':

    cutree
library(stylo)

### stylo version: 0.7.5 ###

If you plan to cite this software (please do!), use the following reference:
    Eder, M., Rybicki, J. and Kestemont, M. (2016). Stylometry with R:
    a package for computational text analysis. R Journal 8(1): 107-121.
    <https://journal.r-project.org/archive/2016/RJ-2016-007/index.html>

To get full BibTeX entry, type: citation("stylo")
library(ape)

Attaching package: 'ape'
The following objects are masked from 'package:dendextend':

    ladderize, rotate
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.1     ✔ stringr   1.5.2
✔ ggplot2   4.0.0     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.2.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ dplyr::where()  masks ape::where()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggsci) 
Warning: package 'ggsci' was built under R version 4.5.2
library(phangorn)
library(TreeTools)
Warning: package 'TreeTools' was built under R version 4.5.2
Registered S3 method overwritten by 'TreeTools':
  method   from    
  [.phyDat phangorn
library(rgl)
Warning: package 'rgl' was built under R version 4.5.2
Warning in rgl.init(initValue, onlyNULL): X11 error: GLXBadContext
Warning: 'rgl.init' failed, will use the null device.
See '?rgl.useNULL' for ways to avoid this warning.
library(igraph)
Warning: package 'igraph' was built under R version 4.5.2

Attaching package: 'igraph'

The following object is masked from 'package:phangorn':

    diversity

The following objects are masked from 'package:lubridate':

    %--%, union

The following objects are masked from 'package:dplyr':

    as_data_frame, groups, union

The following objects are masked from 'package:purrr':

    compose, simplify

The following object is masked from 'package:tidyr':

    crossing

The following object is masked from 'package:tibble':

    as_data_frame

The following objects are masked from 'package:ape':

    degree, edges, mst, ring

The following objects are masked from 'package:stats':

    decompose, spectrum

The following object is masked from 'package:base':

    union
library(ggraph)
library(philentropy)


#читаем файл и преобразовываем таблицу с частотностями

data <- read.table("/Users/az/Documents/R scripts/table_with_frequencies.txt", 
                      header = TRUE,
                      quote = "\"",
                      check.names = FALSE
                      )
data_t <- t(data) |> as.data.frame()

dist_data <- data_t |> 
  philentropy::distance(method = "cosine", use.row.names = TRUE)
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
dist_data <- as.dist(1 - dist_data)
hc <- hclust(dist_data)
plot(hc)

#функция для консенсусного дерева

get_tree <- function(df, n_features = 100) {
  # 1. Случайная выборка столбцов
  selected <- df[, sample(ncol(df), size = n_features)]
  
  # 2. Расстояние
  dist_mx <- selected |> 
    philentropy::distance(method = "cosine", 
                          use.row.names = TRUE, 
                          mute.message = TRUE) |> 
    as.dist()
  dist_mx <- 1 - dist_mx
  
  # 3. Кластеризация 
  tr <- hclust(dist_mx) |> as.phylo()
  
  tr
}

#тестовая проверка
test_tree <- get_tree(data_t)
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
plot(test_tree)

#строим 100 деевьев
set.seed(123)
trees <- map(1:100, ~get_tree(data_t))
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
Metric: 'cosine' with unit: 'log'; comparing: 30 vectors
cons <- consensus(trees, p = 0.5, rooted = FALSE)
plot(cons, type = "fan")

#извлекаем авторов из названий
authors <- str_extract(cons$tip.label, "^[^_]+")

#цвета
authors <- str_extract(cons$tip.label, "^[^_]+")

cols <- tibble(author = authors) |> 
  mutate(color = case_when(
    author == "Шолохов" ~ "#E41A1C",
    author == "Крюков" ~ "#377EB8",
    author == "Булгаков" ~ "#4DAF4A",
    author == "Dubia" ~ "#984EA3",
    author == "Платонов" ~ "#FF7F00",
    author == "Леонов" ~ "#A65628",
    author == "Серафимович" ~ "#F781BF",
    author == "Островский" ~ "#999999",
    author == "Иванов" ~ "#66C2A5",
    author == "Фадеев" ~ "#FC8D62",
    author == "Фурманов" ~ "#8DA0CB",
    author == "Севский" ~ "#E78AC3",
    TRUE ~ "black"
  ))

#график
par(mar = c(0,0,0,0))

plot.phylo(cons, 
           type = "fan",
           tip.color = cols$color,
           font = 2,
           cex = 0.7,
           no.margin = TRUE)

#силы консенсуса на узлах
nodelabels(text = sprintf("%.2f", cons$node.label),
           cex = 0.5,
           frame = "circle",
           bg = "white")
#легенда
legend("bottomleft", 
       legend = unique(cols$author),
       fill = unique(cols$color),
       cex = 0.6,
       bty = "n")