# Анализ частотности слов

library(tidyverse)
library(igraph)
library(ggraph)

# Load the frequency table ----
# The first column of the file supplies the row names; check.names = FALSE
# keeps the column headers exactly as they are written in the file.
url <- "https://raw.githubusercontent.com/locusclassicus/text_analysis_2024/main/files/table_with_frequencies.txt"
df <- read.table(
  file = url,
  header = TRUE,
  row.names = 1,
  check.names = FALSE
)

# Transpose so that every column of `df` becomes a row of `df_t`, then store
# the row identifier as `text` and everything before its first "_" as `author`.
df_t <- as.data.frame(t(df))
df_t[["text"]] <- rownames(df_t)
df_t[["author"]] <- gsub("_.*", "", df_t[["text"]])

# Select the 20 words with the highest variance across texts ----
# `df` holds one row per word, so after transposition the word columns of
# `df_t` are its first nrow(df) columns. Counting with ncol(df) would count
# texts instead and rank only a slice of the vocabulary. The trailing `text`
# and `author` columns are character metadata and must stay out of var().
# vapply() works column by column and guarantees a numeric result, unlike
# apply(), which first coerces the whole data frame to a matrix.
words_var <- vapply(df_t[seq_len(nrow(df))], var, numeric(1))
# head() caps the selection at the number of available words; indexing with
# [1:20] would pad a shorter vocabulary with NA names.
top_words <- names(head(sort(words_var, decreasing = TRUE), 20))

# Correlation matrix ----
# Keep only the selected word columns of the transposed table.
data_matrix <- df_t[, top_words]
# cor() correlates columns, so the table is transposed again to obtain
# pairwise correlations between the rows of `data_matrix`.
cor_matrix <- cor(x = t(data_matrix), method = "pearson")

# Build the network ----
# Two vertices are linked when their correlation is at least `threshold`.
threshold <- 0.85
adj_matrix <- cor_matrix
# Undefined correlations (NA, e.g. produced by a constant profile) carry no
# evidence of similarity. A plain `adj_matrix < threshold` mask skips them,
# so they are zeroed explicitly together with the weak correlations and can
# never reach the graph constructor as edges.
adj_matrix[is.na(adj_matrix) | adj_matrix < threshold] <- 0
# Drop self-correlations so the graph has no loops.
diag(adj_matrix) <- 0

# Remaining non-zero entries become weighted, undirected edges.
g <- graph_from_adjacency_matrix(
  adj_matrix,
  mode = "undirected",
  weighted = TRUE
)
# Remove vertices that ended up without any edge.
g <- delete_vertices(g, which(degree(g) == 0))

# Vertex attributes ----
# Vertex names are matched against df_t$text to look up each vertex's author;
# the label is the part of the name after its last "_".
V(g)$author <- df_t$author[match(V(g)$name, df_t$text)]
V(g)$label <- gsub(".*_", "", V(g)$name)

# Colour palette ----
# Build exactly one named colour per author present in the graph.
authors <- unique(V(g)$author)
colors <- c("red", "blue", "green3", "purple", "orange", "brown", "pink")
if (length(authors) > length(colors)) {
  # More authors than hand-picked colours: extend the palette so that no
  # author is left without a colour.
  colors <- c(
    colors,
    grDevices::hcl.colors(length(authors) - length(colors), palette = "Dark 3")
  )
}
# seq_along() is safe for zero authors and trims unused colours, so the
# vector never carries NA names.
colors <- setNames(colors[seq_along(authors)], authors)

# Plot ----
# The seed is fixed right before the "fr" layout is computed, presumably so
# that the vertex positions are reproducible between runs.
set.seed(123)
ggraph(graph = g, layout = "fr") +
  # Edge transparency is mapped to the edge weight.
  geom_edge_link(mapping = aes(alpha = weight), color = "gray70") +
  # Vertices are coloured by author and labelled with the short text label.
  geom_node_point(mapping = aes(color = author), size = 5) +
  geom_node_text(mapping = aes(label = label), size = 3, repel = TRUE) +
  scale_colour_manual(values = colors) +
  ggtitle(
    label = "Консенсусная сеть текстов",
    subtitle = paste("Корреляция >", threshold)
  ) +
  theme_void() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    legend.position = "bottom"
  )