library(tidyverse)
library(fs)
library(tidytext)
library(tokenizers)
library(tidymodels)
library(ggrepel)
library(patchwork)
library(knitr)
library(kableExtra)
library(broom)
tidymodels_prefer()
set.seed(6769)
meta <- read_tsv(
"data/overview.tsv",
col_types = cols(.default = col_character())
) |>
mutate(across(where(is.character), str_squish)) |>
rename(
text_id = textID,
author_id = authorID,
year = `1stPubl`,
gender = author_gender
) |>
mutate(
text_id = as.integer(text_id),
year = as.integer(year),
gender = if_else(gender == "1", "female", "male")
)
meta |>
select(text_id, author, author_id, title, year, gender) |>
kable(caption = "Метаданные корпуса") |>
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
)
| text_id | author | author_id | title | year | gender |
|---|---|---|---|---|---|
| 1 | Austen, Jane | JA | Emma | 1815 | female |
| 2 | Austen, Jane | JA | Pride | 1813 | female |
| 3 | Austen, Jane | JA | Sense | 1811 | female |
| 4 | Bronte, Anne | AB | Agnes Grey | 1847 | female |
| 5 | Bronte, Anne | AB | Tentant of Wildfell Hall | 1848 | female |
| 6 | Bronte, Charlotte | CB | Jane Eyre | 1847 | female |
| 7 | Bronte, Charlotte | CB | Professor | 1845 | female |
| 8 | Bronte, Charlotte | CB | Villette | 1853 | female |
| 9 | Bronte, Emily | EB | Wuthering Heights | 1847 | female |
| 10 | Dickens, Charles | CD | Bleak House | 1852 | male |
| 11 | Dickens, Charles | CD | David Copperfield | 1849 | male |
| 12 | Dickens, Charles | CD | Hard Times | 1854 | male |
| 13 | Eliot, George | GE | Adam Bede | 1859 | female |
| 14 | Eliot, George | GE | Middlemarch | 1871 | female |
| 15 | Eliot, George | GE | The Mill on the Floss | 1860 | female |
| 16 | Fielding, Henry | HF | Joseph Andrews | 1742 | male |
| 17 | Fielding, Henry | HF | Tom Jones | 1749 | male |
| 18 | Richardson, Samuel | SR | Clarissa, or, the History of a Young Lady | 1748 | male |
| 19 | Richardson, Samuel | SR | Pamela | 1740 | male |
| 20 | Sterne, Laurence | LS | Tristam Shandy | 1759 | male |
| 21 | Sterne, Laurence | LS | Sentimental | 1768 | male |
| 22 | Trollope, Antony | AT | Prime Minister | 1876 | male |
| 23 | Trollope, Antony | AT | Barchester Towers | 1857 | male |
| 24 | Trollope, Antony | AT | Phineas Finn | 1869 | male |
| 25 | Thackeray, William Makepeace | WT | Vanity Fair | 1848 | male |
| 26 | Thackeray, William Makepeace | WT | The History of Pendennis | 1848 | male |
| 27 | Thackeray, William Makepeace | WT | The Luck of Barry Lyndon | 1844 | male |
read_novel <- function(path) {
read_lines(path, locale = locale(encoding = "UTF-8")) |>
paste(collapse = " ")
}
txt_files <- dir_ls("data/british_fiction", glob = "*.txt")
texts_raw <- tibble(
filepath = txt_files,
filename = path_file(txt_files),
raw_text = map_chr(txt_files, read_novel)
)
cat("Файлов:", nrow(texts_raw), "\n")
## Файлов: 27
print(texts_raw$filename)
## [1] "ABronte_Agnes.txt" "ABronte_Tenant.txt"
## [3] "Austen_Emma.txt" "Austen_Pride.txt"
## [5] "Austen_Sense.txt" "CBronte_Jane.txt"
## [7] "CBronte_Professor.txt" "CBronte_Villette.txt"
## [9] "Dickens_Bleak.txt" "Dickens_David.txt"
## [11] "Dickens_Hard.txt" "EBronte_Wuthering.txt"
## [13] "Eliot_Adam.txt" "Eliot_Middlemarch.txt"
## [15] "Eliot_Mill.txt" "Fielding_Joseph.txt"
## [17] "Fielding_Tom.txt" "Richardson_Clarissa.txt"
## [19] "Richardson_Pamela.txt" "Sterne_Sentimental.txt"
## [21] "Sterne_Tristram.txt" "Thackeray_Barry.txt"
## [23] "Thackeray_Pendennis.txt" "Thackeray_Vanity.txt"
## [25] "Trollope_Barchester.txt" "Trollope_Phineas.txt"
## [27] "Trollope_Prime.txt"
Лучше это сделать не кодом, а вручную (вернее, с помощью LLM) сопоставив названия файлов.
file_map <- tribble(
~filename, ~author_id, ~short_title,
# ── Austen, Jane (JA) ──────────────────────────────────────────────────────
"Austen_Emma.txt", "JA", "Emma",
"Austen_Pride.txt", "JA", "Pride & Prejudice",
"Austen_Sense.txt", "JA", "Sense & Sensibility",
# ── Bronte, Anne (AB) ──────────────────────────────────────────────────────
"ABronte_Agnes.txt", "AB", "Agnes Grey",
"ABronte_Tenant.txt", "AB", "Tenant of Wildfell Hall",
# ── Bronte, Charlotte (CB) ─────────────────────────────────────────────────
"CBronte_Jane.txt", "CB", "Jane Eyre",
"CBronte_Professor.txt", "CB", "The Professor",
"CBronte_Villette.txt", "CB", "Villette",
# ── Bronte, Emily (EB) ─────────────────────────────────────────────────────
"EBronte_Wuthering.txt", "EB", "Wuthering Heights",
# ── Dickens, Charles (CD) ──────────────────────────────────────────────────
"Dickens_Bleak.txt", "CD", "Bleak House",
"Dickens_David.txt", "CD", "David Copperfield",
"Dickens_Hard.txt", "CD", "Hard Times",
# ── Eliot, George (GE) ─────────────────────────────────────────────────────
"Eliot_Adam.txt", "GE", "Adam Bede",
"Eliot_Middlemarch.txt", "GE", "Middlemarch",
"Eliot_Mill.txt", "GE", "Mill on the Floss",
# ── Fielding, Henry (HF) ───────────────────────────────────────────────────
"Fielding_Joseph.txt", "HF", "Joseph Andrews",
"Fielding_Tom.txt", "HF", "Tom Jones",
# ── Richardson, Samuel (SR) ────────────────────────────────────────────────
"Richardson_Clarissa.txt", "SR", "Clarissa",
"Richardson_Pamela.txt", "SR", "Pamela",
# ── Sterne, Laurence (LS) ──────────────────────────────────────────────────
"Sterne_Tristram.txt", "LS", "Tristram Shandy",
"Sterne_Sentimental.txt", "LS", "Sentimental Journey",
# ── Thackeray, William Makepeace (WT) ──────────────────────────────────────
"Thackeray_Vanity.txt", "WT", "Vanity Fair",
"Thackeray_Pendennis.txt", "WT", "History of Pendennis",
"Thackeray_Barry.txt", "WT", "Barry Lyndon",
# ── Trollope, Anthony (AT) ─────────────────────────────────────────────────
"Trollope_Barchester.txt", "AT", "Barchester Towers",
"Trollope_Phineas.txt", "AT", "Phineas Finn",
"Trollope_Prime.txt", "AT", "The Prime Minister"
)
corpus <- texts_raw |>
left_join(file_map, by = "filename") |>
left_join(
meta |>
select(author_id, author, gender) |>
distinct(author_id, .keep_all = TRUE),
by = "author_id"
) |>
mutate(doc_id = row_number())
corpus |>
select(doc_id, filename, author, author_id, short_title) |>
kable(caption = "Итоговый корпус") |>
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
)
| doc_id | filename | author | author_id | short_title |
|---|---|---|---|---|
| 1 | ABronte_Agnes.txt | Bronte, Anne | AB | Agnes Grey |
| 2 | ABronte_Tenant.txt | Bronte, Anne | AB | Tenant of Wildfell Hall |
| 3 | Austen_Emma.txt | Austen, Jane | JA | Emma |
| 4 | Austen_Pride.txt | Austen, Jane | JA | Pride & Prejudice |
| 5 | Austen_Sense.txt | Austen, Jane | JA | Sense & Sensibility |
| 6 | CBronte_Jane.txt | Bronte, Charlotte | CB | Jane Eyre |
| 7 | CBronte_Professor.txt | Bronte, Charlotte | CB | The Professor |
| 8 | CBronte_Villette.txt | Bronte, Charlotte | CB | Villette |
| 9 | Dickens_Bleak.txt | Dickens, Charles | CD | Bleak House |
| 10 | Dickens_David.txt | Dickens, Charles | CD | David Copperfield |
| 11 | Dickens_Hard.txt | Dickens, Charles | CD | Hard Times |
| 12 | EBronte_Wuthering.txt | Bronte, Emily | EB | Wuthering Heights |
| 13 | Eliot_Adam.txt | Eliot, George | GE | Adam Bede |
| 14 | Eliot_Middlemarch.txt | Eliot, George | GE | Middlemarch |
| 15 | Eliot_Mill.txt | Eliot, George | GE | Mill on the Floss |
| 16 | Fielding_Joseph.txt | Fielding, Henry | HF | Joseph Andrews |
| 17 | Fielding_Tom.txt | Fielding, Henry | HF | Tom Jones |
| 18 | Richardson_Clarissa.txt | Richardson, Samuel | SR | Clarissa |
| 19 | Richardson_Pamela.txt | Richardson, Samuel | SR | Pamela |
| 20 | Sterne_Sentimental.txt | Sterne, Laurence | LS | Sentimental Journey |
| 21 | Sterne_Tristram.txt | Sterne, Laurence | LS | Tristram Shandy |
| 22 | Thackeray_Barry.txt | Thackeray, William Makepeace | WT | Barry Lyndon |
| 23 | Thackeray_Pendennis.txt | Thackeray, William Makepeace | WT | History of Pendennis |
| 24 | Thackeray_Vanity.txt | Thackeray, William Makepeace | WT | Vanity Fair |
| 25 | Trollope_Barchester.txt | Trollope, Antony | AT | Barchester Towers |
| 26 | Trollope_Phineas.txt | Trollope, Antony | AT | Phineas Finn |
| 27 | Trollope_Prime.txt | Trollope, Antony | AT | The Prime Minister |
Признаки предложений
feat_sentences <- function(text) {
sents <- tokenize_sentences(text)[[1]]
sent_lens <- map_int(sents, ~ length(tokenize_words(.x)[[1]]))
tibble(
n_sentences = length(sents),
mean_sent_len = mean(sent_lens, na.rm = TRUE),
median_sent_len = median(sent_lens, na.rm = TRUE),
sd_sent_len = sd(sent_lens, na.rm = TRUE),
max_sent_len = max(sent_lens, na.rm = TRUE),
long_sent_ratio = mean(sent_lens > 40, na.rm = TRUE)
)
}
Лексические признаки
feat_words <- function(text) {
words <- tokenize_words(text)[[1]]
n_words <- length(words)
n_unique <- n_distinct(words)
window <- 500L
mattr <- if (n_words >= window) {
idx <- seq(1, n_words - window + 1, by = 50)
mean(map_dbl(idx, ~ n_distinct(words[.x:(.x + window - 1)]) / window))
} else {
n_unique / n_words
}
word_lens <- nchar(words)
tibble(
n_words = n_words,
ttr = n_unique / n_words,
mattr = mattr,
mean_word_len = mean(word_lens, na.rm = TRUE),
median_word_len = median(word_lens, na.rm = TRUE),
sd_word_len = sd(word_lens, na.rm = TRUE),
long_word_ratio = mean(word_lens > 7, na.rm = TRUE)
)
}
Пунктуационные признаки
feat_punct <- function(text) {
n_ch <- nchar(text)
tibble(
comma_rate = str_count(text, ",") / n_ch * 1000,
semicol_rate = str_count(text, ";") / n_ch * 1000,
colon_rate = str_count(text, ":") / n_ch * 1000,
excl_rate = str_count(text, "!") / n_ch * 1000,
quest_rate = str_count(text, "\\?") / n_ch * 1000,
dash_rate = str_count(text, "\u2014|--") / n_ch * 1000,
quote_rate = str_count(
text, '"|\u2018|\u2019|\u201c|\u201d'
) / n_ch * 1000
)
}
Функциональные слова
# Соберем их список
FW <- c(
"the", "a", "an", "and", "but", "or", "in", "of", "to",
"that", "it", "he", "she", "they", "i", "you", "we",
"his", "her", "their", "my", "your", "our",
"with", "for", "on", "at", "by", "from", "as",
"not", "no", "all", "this", "which", "who", "what",
"be", "been", "have", "had", "has", "was", "were", "is", "are",
"said", "would", "could", "should", "upon", "shall", "must",
"though", "yet", "very", "quite", "rather", "indeed",
"however", "therefore", "then", "than", "when", "where", "if",
"so", "more", "such", "after", "before", "over", "again"
)
feat_funwords <- function(text) {
words <- tokenize_words(text)[[1]]
n_words <- length(words)
counts <- table(words)
# Частота на 10000 слов для каждого функционального слова
fw_tbl <- map_dfc(FW, function(w) {
cnt <- if (w %in% names(counts)) as.numeric(counts[[w]]) else 0
tibble(!!paste0("fw_", w) := cnt / n_words * 10000)
})
fw_tbl |>
mutate(stopword_ratio = sum(words %in% FW) / n_words)
}
Матрица признаков
sf <- map_dfr(corpus$raw_text, feat_sentences)
wf <- map_dfr(corpus$raw_text, feat_words)
pf <- map_dfr(corpus$raw_text, feat_punct)
ff <- map_dfr(corpus$raw_text, feat_funwords)
features_df <- corpus |>
select(doc_id, filename, author, author_id, short_title, gender) |>
bind_cols(sf, wf, pf, ff) |>
mutate(author_id = factor(author_id)) # целевая переменная — фактор
cat(
"Итоговая матрица:", nrow(features_df),
"строк ×", ncol(features_df), "столбцов\n"
)
## Итоговая матрица: 27 строк × 100 столбцов
cat("Классы:\n")
## Классы:
print(table(features_df$author_id))
##
## AB AT CB CD EB GE HF JA LS SR WT
## 2 3 3 3 1 3 2 3 2 2 3
Сводная таблица
features_df |>
select(author, short_title, n_words, n_sentences,
mean_sent_len, mattr, stopword_ratio) |>
mutate(across(where(is.numeric), ~ round(.x, 2))) |>
arrange(author) |>
kable(caption = "Основные стилистические характеристики текстов") |>
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
) |>
scroll_box(width = "100%", height = "380px")
| author | short_title | n_words | n_sentences | mean_sent_len | mattr | stopword_ratio |
|---|---|---|---|---|---|---|
| Austen, Jane | Emma | 160997 | 9126 | 17.64 | 0.49 | 0.50 |
| Austen, Jane | Pride & Prejudice | 122108 | 7058 | 17.30 | 0.50 | 0.50 |
| Austen, Jane | Sense & Sensibility | 119962 | 5483 | 21.88 | 0.50 | 0.50 |
| Bronte, Anne | Agnes Grey | 68598 | 2610 | 26.28 | 0.52 | 0.49 |
| Bronte, Anne | Tenant of Wildfell Hall | 167993 | 7693 | 21.84 | 0.51 | 0.49 |
| Bronte, Charlotte | Jane Eyre | 188202 | 10637 | 17.69 | 0.53 | 0.47 |
| Bronte, Charlotte | The Professor | 89379 | 3671 | 24.35 | 0.55 | 0.46 |
| Bronte, Charlotte | Villette | 195680 | 10780 | 18.15 | 0.56 | 0.45 |
| Bronte, Emily | Wuthering Heights | 117113 | 7211 | 16.24 | 0.56 | 0.46 |
| Dickens, Charles | Bleak House | 357476 | 24249 | 14.74 | 0.49 | 0.47 |
| Dickens, Charles | David Copperfield | 359112 | 22498 | 15.96 | 0.49 | 0.49 |
| Dickens, Charles | Hard Times | 104048 | 7044 | 14.77 | 0.50 | 0.47 |
| Eliot, George | Adam Bede | 217018 | 9701 | 22.37 | 0.51 | 0.46 |
| Eliot, George | Middlemarch | 320198 | 17450 | 18.35 | 0.52 | 0.47 |
| Eliot, George | Mill on the Floss | 209330 | 10496 | 19.94 | 0.52 | 0.46 |
| Fielding, Henry | Joseph Andrews | 137415 | 4098 | 33.53 | 0.51 | 0.49 |
| Fielding, Henry | Tom Jones | 349148 | 11262 | 31.00 | 0.50 | 0.49 |
| Richardson, Samuel | Clarissa | 969910 | 44817 | 21.64 | 0.48 | 0.51 |
| Richardson, Samuel | Pamela | 440450 | 17289 | 25.48 | 0.47 | 0.51 |
| Sterne, Laurence | Sentimental Journey | 40849 | 986 | 41.43 | 0.49 | 0.49 |
| Sterne, Laurence | Tristram Shandy | 188668 | 4074 | 46.31 | 0.50 | 0.48 |
| Thackeray, William Makepeace | Barry Lyndon | 128308 | 4627 | 27.73 | 0.51 | 0.49 |
| Thackeray, William Makepeace | History of Pendennis | 358520 | 17131 | 20.93 | 0.52 | 0.47 |
| Thackeray, William Makepeace | Vanity Fair | 306927 | 14835 | 20.69 | 0.53 | 0.46 |
| Trollope, Antony | Barchester Towers | 198811 | 9875 | 20.13 | 0.48 | 0.50 |
| Trollope, Antony | Phineas Finn | 264592 | 17730 | 14.92 | 0.46 | 0.51 |
| Trollope, Antony | The Prime Minister | 284645 | 19089 | 14.91 | 0.46 | 0.51 |
Средние длины предложений
p_sent <- features_df |>
group_by(author_id, author) |>
summarise(
m = mean(mean_sent_len),
se = sd(mean_sent_len) / sqrt(n()),
.groups = "drop"
) |>
ggplot(aes(x = reorder(author_id, m), y = m, fill = author_id)) +
geom_col(alpha = 0.85, width = 0.65) +
geom_errorbar(aes(ymin = m - se, ymax = m + se),
width = 0.25, linewidth = 0.8) +
geom_text(aes(label = round(m, 1)), hjust = -1.3, size = 3.5) +
coord_flip() +
scale_fill_brewer(palette = "Set3") +
labs(
title = "Средняя длина предложения по авторам",
subtitle = "Слов на предложение (среднее ± SE по произведениям автора)",
x = NULL, y = "Слов на предложение"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")
p_sent
Интересно, что заметное различие есть даже по такому простому параметру. Отличается длинными предложениями Laurence Sterne - и действительно, он этим известен, есть даже знаменитое длинное предложение, открывающее его произведение The Life and Opinions of Tristram Shandy, Gentleman.
Заметим, что оба автора, отличающиеся наиболее длинными предложениями - Laurence Sterne и Henry Fielding - писатели 18 века, а Anthony Trollope, Emily Brontë и Чарльз Диккенс, отличающиеся краткостью предложений - писатели 19 века. Впрочем, такой анализ - тема отдельной работы.
Лексическое разнообразие
p_mattr <- features_df |>
ggplot(aes(
x = reorder(author_id, mattr, median),
y = mattr,
fill = author_id,
colour = author_id
)) +
geom_boxplot(alpha = 0.45, outlier.shape = NA, linewidth = 0.7) +
geom_jitter(width = 0.15, size = 3.5, alpha = 0.9) +
scale_fill_brewer(palette = "Set3") +
scale_colour_brewer(palette = "Set3") +
labs(
title = "Лексическое разнообразие (MATTR)",
subtitle = "Moving Average Type-Token Ratio, окно = 500 слов",
x = NULL, y = "MATTR"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")
p_mattr
Тут заметно, что распределения лексических разнообразий у авторов неплохо разделяется, но надо учитывать, что это все в пределах сотых долей + текстов у нас довольно мало, поэтому возможно этот график описывает скорее произведения, чем авторов.
Пунктуационные профили
punct_long <- features_df |>
select(author_id, comma_rate, semicol_rate,
colon_rate, excl_rate, quest_rate, dash_rate) |>
pivot_longer(
cols = -author_id,
names_to = "punct",
values_to = "rate"
) |>
mutate(punct = recode(punct,
comma_rate = "Запятые",
semicol_rate = "Точки с запятой",
colon_rate = "Двоеточия",
excl_rate = "Восклицат. знаки",
quest_rate = "Вопросит. знаки",
dash_rate = "Тире"
))
p_punct <- punct_long |>
group_by(author_id, punct) |>
summarise(mean_rate = mean(rate), .groups = "drop") |>
ggplot(aes(x = author_id, y = mean_rate, fill = author_id)) +
geom_col(alpha = 0.85) +
facet_wrap(~punct, scales = "free_y", ncol = 3) +
scale_fill_brewer(palette = "Set3") +
labs(
title = "Пунктуационный профиль авторов",
subtitle = "Частота на 1 000 символов",
x = NULL, y = "Частота"
) +
theme_minimal(base_size = 11) +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
)
p_punct
По вопросительным знакам и запятым заметных различий не видно, но зато по восклицательным знакам и двоеточиям выделяется Эмилия Бронте, а по частоте тире - Laurence Sterne.
Тепловая карта функциональных слов
fw_cols <- names(features_df)[str_starts(names(features_df), "fw_")]
# Топ-30 слов по межавторской дисперсии
top30_fw <- features_df |>
select(author_id, all_of(fw_cols)) |>
pivot_longer(-author_id, names_to = "word", values_to = "rate") |>
group_by(word) |>
summarise(bw_var = var(rate), .groups = "drop") |>
slice_max(bw_var, n = 30) |>
pull(word)
heatmap_data <- features_df |>
select(author_id, all_of(top30_fw)) |>
group_by(author_id) |>
summarise(across(everything(), mean), .groups = "drop") |>
pivot_longer(-author_id, names_to = "word", values_to = "rate") |>
mutate(word = str_remove(word, "^fw_"))
p_heat <- heatmap_data |>
ggplot(aes(x = author_id, y = reorder(word, rate), fill = rate)) +
geom_tile(colour = "white", linewidth = 0.4) +
scale_fill_distiller(
palette = "RdYlBu",
direction = 1,
name = "Частота\n(на 10 000 сл.)"
) +
labs(
title = "Профиль функциональных слов",
subtitle = "Топ-30 слов с наибольшей вариацией между авторами",
x = "Автор", y = "Слово"
) +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
p_heat
Заметим, что даже самые частотные служебные слова (the, and, to, of) распределены неравномерно. Это хорошо для стилометрии, поскольку эти слова не связаны с темами произведений, а характеризуют писательские привычки авторов.
Поскольку текстов мало, разобъем их на фрагменты по примерно 5000 слов и будем с ними работать
chunk_size <- 5000
min_chunk_size <- 3000
make_chunks <- function(text, chunk_size = 5000, min_chunk_size = 3000) {
sents <- tokenize_sentences(text)[[1]]
sent_n_words <- map_int(
sents,
~ length(tokenize_words(.x)[[1]])
)
chunks <- list()
chunk_word_counts <- integer()
current_sents <- character()
current_n_words <- 0
chunk_id <- 1
for (i in seq_along(sents)) {
current_sents <- c(current_sents, sents[i])
current_n_words <- current_n_words + sent_n_words[i]
if (current_n_words >= chunk_size) {
chunks[[chunk_id]] <- paste(current_sents, collapse = " ")
chunk_word_counts[chunk_id] <- current_n_words
chunk_id <- chunk_id + 1
current_sents <- character()
current_n_words <- 0
}
}
if (length(current_sents) > 0) {
chunks[[chunk_id]] <- paste(current_sents, collapse = " ")
chunk_word_counts[chunk_id] <- current_n_words
}
tibble(
chunk_id_local = seq_along(chunks),
chunk_text = unlist(chunks),
chunk_n_words = chunk_word_counts
) |>
filter(chunk_n_words >= min_chunk_size)
}
chunks <- corpus |>
select(doc_id, filename, author, author_id, short_title, gender, raw_text) |>
mutate(
chunks = map(
raw_text,
make_chunks,
chunk_size = chunk_size,
min_chunk_size = min_chunk_size
)
) |>
select(-raw_text) |>
unnest(chunks) |>
mutate(
chunk_id = row_number(),
author_id = factor(author_id),
doc_id = factor(doc_id)
)
cat("Всего фрагментов:", nrow(chunks), "\n")
## Всего фрагментов: 1286
С этим уже можно работать
chunks |>
count(author_id, author, name = "n_chunks") |>
arrange(n_chunks) |>
kable(caption = "Число фрагментов по авторам") |>
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
)
| author_id | author | n_chunks |
|---|---|---|
| EB | Bronte, Emily | 23 |
| LS | Sterne, Laurence | 45 |
| AB | Bronte, Anne | 47 |
| JA | Austen, Jane | 80 |
| CB | Bronte, Charlotte | 94 |
| HF | Fielding, Henry | 96 |
| GE | Eliot, George | 149 |
| AT | Trollope, Antony | 150 |
| WT | Thackeray, William Makepeace | 157 |
| CD | Dickens, Charles | 164 |
| SR | Richardson, Samuel | 281 |
chunks |>
count(author_id, short_title, name = "n_chunks") |>
arrange(author_id, short_title) |>
kable(caption = "Число фрагментов по произведениям") |>
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
)
| author_id | short_title | n_chunks |
|---|---|---|
| AB | Agnes Grey | 14 |
| AB | Tenant of Wildfell Hall | 33 |
| AT | Barchester Towers | 40 |
| AT | Phineas Finn | 53 |
| AT | The Prime Minister | 57 |
| CB | Jane Eyre | 37 |
| CB | The Professor | 18 |
| CB | Villette | 39 |
| CD | Bleak House | 71 |
| CD | David Copperfield | 72 |
| CD | Hard Times | 21 |
| EB | Wuthering Heights | 23 |
| GE | Adam Bede | 43 |
| GE | Middlemarch | 64 |
| GE | Mill on the Floss | 42 |
| HF | Joseph Andrews | 27 |
| HF | Tom Jones | 69 |
| JA | Emma | 32 |
| JA | Pride & Prejudice | 24 |
| JA | Sense & Sensibility | 24 |
| LS | Sentimental Journey | 8 |
| LS | Tristram Shandy | 37 |
| SR | Clarissa | 193 |
| SR | Pamela | 88 |
| WT | Barry Lyndon | 25 |
| WT | History of Pendennis | 71 |
| WT | Vanity Fair | 61 |
Извлекаем признаки для фрагментов
chunk_sf <- map_dfr(chunks$chunk_text, feat_sentences)
chunk_wf <- map_dfr(chunks$chunk_text, feat_words)
chunk_pf <- map_dfr(chunks$chunk_text, feat_punct)
chunk_ff <- map_dfr(chunks$chunk_text, feat_funwords)
Создаем таблицу признаков
chunk_features_df <- chunks |>
select(
chunk_id,
doc_id,
filename,
author,
author_id,
short_title,
gender,
chunk_id_local,
chunk_n_words
) |>
bind_cols(chunk_sf, chunk_wf, chunk_pf, chunk_ff) |>
mutate(
author_id = factor(author_id),
doc_id = factor(doc_id)
)
cat(
"Матрица признаков по фрагментам:",
nrow(chunk_features_df),
"строк ×", ncol(chunk_features_df), "столбцов\n"
)
## Матрица признаков по фрагментам: 1286 строк × 103 столбцов
table(chunk_features_df$author_id)
##
## AB AT CB CD EB GE HF JA LS SR WT
## 47 150 94 164 23 149 96 80 45 281 157
Готовим данные для моделей
model_df <- chunk_features_df |>
select(author_id, where(is.numeric)) |>
select(-chunk_id, -chunk_id_local) |>
drop_na() |>
mutate(author_id = factor(author_id))
model_df |>
count(author_id, name = "n_fragments") |>
arrange(n_fragments) |>
kable(caption = "Число фрагментов по авторам в модельной таблице") |>
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
)
| author_id | n_fragments |
|---|---|
| EB | 23 |
| LS | 45 |
| AB | 47 |
| JA | 80 |
| CB | 94 |
| HF | 96 |
| GE | 149 |
| AT | 150 |
| WT | 157 |
| CD | 164 |
| SR | 281 |
Выбираем кол-во фолдов
min_fragments_per_author <- model_df |>
count(author_id) |>
summarise(min_n = min(n)) |>
pull(min_n)
min_fragments_per_author
## [1] 23
Возьмем например 10
v <- 10
set.seed(6769)
folds <- vfold_cv(
model_df,
v = v,
strata = author_id
)
Проверим, что каждый автор есть и в train и test
check_split_authors <- function(split, id, all_authors) {
train_data <- analysis(split)
test_data <- assessment(split)
train_counts <- train_data |>
count(author_id, name = "n_train") |>
complete(author_id = all_authors, fill = list(n_train = 0))
test_counts <- test_data |>
count(author_id, name = "n_test") |>
complete(author_id = all_authors, fill = list(n_test = 0))
train_counts |>
full_join(test_counts, by = "author_id") |>
mutate(
fold = id,
train_has_author = n_train > 0,
test_has_author = n_test > 0
) |>
select(fold, author_id, n_train, n_test, train_has_author, test_has_author)
}
all_authors <- levels(model_df$author_id)
fold_check <- map2_dfr(
folds$splits,
folds$id,
~ check_split_authors(.x, .y, all_authors)
)
cv_check_summary <- fold_check |>
summarise(
all_train_have_all_authors = all(train_has_author),
all_test_have_all_authors = all(test_has_author)
)
cv_check_summary
## # A tibble: 1 × 2
## all_train_have_all_authors all_test_have_all_authors
## <lgl> <lgl>
## 1 TRUE TRUE
Все есть!
Будем использовать Multinomial логистическую регрессию
Используем рецепт:
удаляем признаки с нулевой дисперсией; удаляем признаки с почти нулевой дисперсией; нормализуем числовые признаки.
base_rec <- recipe(author_id ~ ., data = model_df) |>
step_zv(all_predictors()) |>
step_nzv(all_predictors()) |>
step_normalize(all_predictors())
Возьмем метрики - accuracy, log loss и ROC-AUC
multi_metrics <- metric_set(
accuracy,
mn_log_loss,
roc_auc
)
log_spec <- multinom_reg(
penalty = tune(),
mixture = 1
) |>
set_engine("glmnet") |>
set_mode("classification")
log_wf <- workflow() |>
add_recipe(base_rec) |>
add_model(log_spec)
Сетка гиперпараметров
log_grid <- grid_regular(
penalty(range = c(-5, 1)),
levels = 20
)
log_grid
## # A tibble: 20 × 1
## penalty
## <dbl>
## 1 0.00001
## 2 0.0000207
## 3 0.0000428
## 4 0.0000886
## 5 0.000183
## 6 0.000379
## 7 0.000785
## 8 0.00162
## 9 0.00336
## 10 0.00695
## 11 0.0144
## 12 0.0298
## 13 0.0616
## 14 0.127
## 15 0.264
## 16 0.546
## 17 1.13
## 18 2.34
## 19 4.83
## 20 10
Поехали!
ctrl <- control_grid(
save_pred = TRUE,
save_workflow = TRUE,
verbose = TRUE
)
set.seed(6769)
log_res <- tune_grid(
log_wf,
resamples = folds,
grid = log_grid,
metrics = multi_metrics,
control = ctrl
)
Смотрим метрики:
collect_metrics(log_res) |>
arrange(.metric, desc(mean))
## # A tibble: 60 × 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00001 accuracy multiclass 0.998 10 0.00164 pre0_mod01_post0
## 2 0.0000207 accuracy multiclass 0.998 10 0.00164 pre0_mod02_post0
## 3 0.0000428 accuracy multiclass 0.998 10 0.00164 pre0_mod03_post0
## 4 0.0000886 accuracy multiclass 0.998 10 0.00164 pre0_mod04_post0
## 5 0.000183 accuracy multiclass 0.998 10 0.00164 pre0_mod05_post0
## 6 0.000379 accuracy multiclass 0.997 10 0.00170 pre0_mod06_post0
## 7 0.000785 accuracy multiclass 0.997 10 0.00170 pre0_mod07_post0
## 8 0.00162 accuracy multiclass 0.995 10 0.00234 pre0_mod08_post0
## 9 0.00336 accuracy multiclass 0.995 10 0.00230 pre0_mod09_post0
## 10 0.00695 accuracy multiclass 0.987 10 0.00285 pre0_mod10_post0
## # ℹ 50 more rows
Лучшие параметры по accuracy:
best_log <- select_best(log_res, metric = "accuracy")
best_log
## # A tibble: 1 × 2
## penalty .config
## <dbl> <chr>
## 1 0.00001 pre0_mod01_post0
Топ-5 настроек:
show_best(log_res, metric = "accuracy", n = 5) |>
kable(caption = "Лучшие настройки логистической регрессии по accuracy") |>
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
)
| penalty | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|
| 0.0000100 | accuracy | multiclass | 0.9976859 | 10 | 0.0016382 | pre0_mod01_post0 |
| 0.0000207 | accuracy | multiclass | 0.9976859 | 10 | 0.0016382 | pre0_mod02_post0 |
| 0.0000428 | accuracy | multiclass | 0.9976859 | 10 | 0.0016382 | pre0_mod03_post0 |
| 0.0000886 | accuracy | multiclass | 0.9976859 | 10 | 0.0016382 | pre0_mod04_post0 |
| 0.0001833 | accuracy | multiclass | 0.9976859 | 10 | 0.0016382 | pre0_mod05_post0 |
autoplot(log_res) +
labs(
title = "Настройка мультиномиальной логистической регрессии",
subtitle = "Регуляризация LASSO",
x = "Penalty",
y = "Значение метрики"
) +
theme_minimal(base_size = 13)
Вышло отлично! Посмотрим матрицу ошибок:
log_predictions <- log_res |>
collect_predictions(parameters = best_log)
log_predictions |>
select(author_id, .pred_class) |>
head()
## # A tibble: 6 × 2
## author_id .pred_class
## <fct> <fct>
## 1 AB AB
## 2 AB AB
## 3 AB AB
## 4 AB AB
## 5 AB AB
## 6 JA JA
log_conf_mat <- log_predictions |>
conf_mat(
truth = author_id,
estimate = .pred_class
)
log_conf_mat
## Truth
## Prediction AB AT CB CD EB GE HF JA LS SR WT
## AB 47 0 0 0 0 0 0 0 0 0 0
## AT 0 150 0 0 0 0 0 0 0 0 0
## CB 0 0 94 0 0 0 0 0 0 0 0
## CD 0 0 0 163 0 1 0 0 0 0 0
## EB 0 0 0 0 23 0 0 0 0 0 0
## GE 0 0 0 1 0 148 0 0 0 0 0
## HF 0 0 0 0 0 0 95 0 0 0 0
## JA 0 0 0 0 0 0 0 80 0 0 0
## LS 0 0 0 0 0 0 0 0 45 0 0
## SR 0 0 0 0 0 0 0 0 0 281 0
## WT 0 0 0 0 0 0 1 0 0 0 157
autoplot(log_conf_mat, type = "heatmap") +
scale_fill_gradient(low = "white", high = "#2C7FB8") +
labs(
title = "Матрица ошибок: логистическая регрессия",
subtitle = "Стратифицированная кросс-валидация по фрагментам",
x = "Предсказанный автор",
y = "Истинный автор"
) +
theme_minimal(base_size = 13) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
axis.text.y = element_text(size = 10)
)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
почти без ошибок! отличный результат
log_predictions <- log_res |>
collect_predictions(parameters = best_log)
pred_cols <- names(log_predictions) |>
str_subset("^\\.pred_") |>
setdiff(".pred_class")
log_roc_curve <- log_predictions |>
roc_curve(
truth = author_id,
!!!syms(pred_cols)
)
log_roc_curve |>
ggplot(aes(x = 1 - specificity, y = sensitivity, color = .level)) +
geom_abline(
slope = 1,
intercept = 0,
color = "gray50",
lty = 2,
alpha = 0.8
) +
geom_path(linewidth = 1.2, alpha = 0.75) +
labs(
title = "ROC-кривые для логистической регрессии",
subtitle = "One-vs-all классификация авторов",
x = "1 - Specificity",
y = "Sensitivity",
color = "Автор"
) +
theme_light(base_size = 13)
Попробуем обучить RF
rf_spec <- rand_forest(
mtry = tune(),
min_n = tune(),
trees = 300
) |>
set_engine("ranger", importance = "impurity") |>
set_mode("classification")
rf_wf <- workflow() |>
add_recipe(base_rec) |>
add_model(rf_spec)
n_pred <- ncol(model_df) - 1
rf_grid <- grid_regular(
mtry(range = c(2, min(30, n_pred))),
min_n(range = c(2, 10)),
levels = 4
)
Поехали!
set.seed(6769)
rf_res <- tune_grid(
rf_wf,
resamples = folds,
grid = rf_grid,
metrics = multi_metrics,
control = ctrl
)
Посмотрим лучшие результаты
show_best(rf_res, metric = "accuracy", n = 5)
## # A tibble: 5 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 11 4 accuracy multiclass 0.992 10 0.00261 pre0_mod06_post0
## 2 20 2 accuracy multiclass 0.992 10 0.00261 pre0_mod09_post0
## 3 30 4 accuracy multiclass 0.991 10 0.00338 pre0_mod14_post0
## 4 11 2 accuracy multiclass 0.991 10 0.00297 pre0_mod05_post0
## 5 30 7 accuracy multiclass 0.991 10 0.00297 pre0_mod15_post0
show_best(rf_res, metric = "accuracy", n = 5)
## # A tibble: 5 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 11 4 accuracy multiclass 0.992 10 0.00261 pre0_mod06_post0
## 2 20 2 accuracy multiclass 0.992 10 0.00261 pre0_mod09_post0
## 3 30 4 accuracy multiclass 0.991 10 0.00338 pre0_mod14_post0
## 4 11 2 accuracy multiclass 0.991 10 0.00297 pre0_mod05_post0
## 5 30 7 accuracy multiclass 0.991 10 0.00297 pre0_mod15_post0
Сравним с ЛогРегом
model_compare <- bind_rows(
collect_metrics(log_res) |>
mutate(model = "Logistic regression"),
collect_metrics(rf_res) |>
mutate(model = "Random Forest")
)
График сравнения accuracy
model_compare |>
filter(.metric == "accuracy") |>
group_by(model) |>
slice_max(mean, n = 1, with_ties = FALSE) |>
ungroup() |>
ggplot(aes(x = model, y = mean, fill = model)) +
geom_col(alpha = 0.85, width = 0.6) +
geom_text(aes(label = round(mean, 3)), vjust = -0.5) +
scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
labs(
title = "Сравнение моделей по accuracy",
x = NULL,
y = "Accuracy"
) +
theme_minimal(base_size = 12) +
theme(legend.position = "none")
LogReg даже лучшее accuracy показал чем RF.
Для логрег посмотрим на коэффициенты
final_log_wf <- finalize_workflow(
log_wf,
best_log
)
final_log_fit <- final_log_wf |>
fit(data = model_df)
final_model <- extract_fit_parsnip(final_log_fit)
log_terms <- tidy(final_model)
top_terms <- log_terms |>
filter(term != "(Intercept)") |>
group_by(class) |>
slice_max(abs(estimate), n = 6) |>
ungroup() |>
mutate(
term = str_replace(term, "^fw_", "fw: "),
term = fct_reorder(term, abs(estimate))
)
top_terms
## # A tibble: 66 × 4
## class term estimate penalty
## <chr> <fct> <dbl> <dbl>
## 1 AB fw: but 1.84 0.00001
## 2 AB fw: and 1.25 0.00001
## 3 AB fw: or 0.959 0.00001
## 4 AB sd_word_len 0.697 0.00001
## 5 AB quote_rate -0.554 0.00001
## 6 AB fw: who -0.490 0.00001
## 7 AT mattr -1.84 0.00001
## 8 AT fw: that 1.38 0.00001
## 9 AT fw: where -1.23 0.00001
## 10 AT fw: the 0.941 0.00001
## # ℹ 56 more rows
top_terms |>
mutate(
estimate = round(estimate, 3)
) |>
arrange(class, desc(abs(estimate))) |>
kable(caption = "Наиболее важные признаки логистической регрессии для каждого автора") |>
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
) |>
scroll_box(width = "100%", height = "500px")
| class | term | estimate | penalty |
|---|---|---|---|
| AB | fw: but | 1.837 | 1e-05 |
| AB | fw: and | 1.247 | 1e-05 |
| AB | fw: or | 0.959 | 1e-05 |
| AB | sd_word_len | 0.697 | 1e-05 |
| AB | quote_rate | -0.554 | 1e-05 |
| AB | fw: who | -0.490 | 1e-05 |
| AT | mattr | -1.843 | 1e-05 |
| AT | fw: that | 1.377 | 1e-05 |
| AT | fw: where | -1.232 | 1e-05 |
| AT | fw: the | 0.941 | 1e-05 |
| AT | fw: my | -0.772 | 1e-05 |
| AT | comma_rate | -0.697 | 1e-05 |
| CB | ttr | 2.320 | 1e-05 |
| CB | quote_rate | 2.264 | 1e-05 |
| CB | semicol_rate | 1.330 | 1e-05 |
| CB | colon_rate | 1.111 | 1e-05 |
| CB | fw: if | -0.864 | 1e-05 |
| CB | fw: yet | 0.816 | 1e-05 |
| CD | semicol_rate | -1.703 | 1e-05 |
| CD | dash_rate | -1.668 | 1e-05 |
| CD | colon_rate | -1.492 | 1e-05 |
| CD | comma_rate | 1.280 | 1e-05 |
| CD | median_sent_len | -0.815 | 1e-05 |
| CD | fw: in | 0.809 | 1e-05 |
| EB | mattr | 2.425 | 1e-05 |
| EB | excl_rate | 1.548 | 1e-05 |
| EB | colon_rate | 0.740 | 1e-05 |
| EB | dash_rate | -0.584 | 1e-05 |
| EB | fw: and | 0.560 | 1e-05 |
| EB | fw: of | -0.544 | 1e-05 |
| GE | fw: upon | -1.380 | 1e-05 |
| GE | fw: my | -1.317 | 1e-05 |
| GE | fw: if | 1.216 | 1e-05 |
| GE | fw: or | -0.910 | 1e-05 |
| GE | fw: so | -0.908 | 1e-05 |
| GE | comma_rate | -0.767 | 1e-05 |
| HF | fw: which | 2.115 | 1e-05 |
| HF | quote_rate | 1.292 | 1e-05 |
| HF | fw: indeed | 0.922 | 1e-05 |
| HF | semicol_rate | 0.735 | 1e-05 |
| HF | fw: were | -0.614 | 1e-05 |
| HF | fw: this | 0.590 | 1e-05 |
| JA | ttr | -1.534 | 1e-05 |
| JA | long_word_ratio | 1.416 | 1e-05 |
| JA | fw: could | 1.004 | 1e-05 |
| JA | fw: very | 0.901 | 1e-05 |
| JA | fw: must | 0.787 | 1e-05 |
| JA | fw: be | 0.567 | 1e-05 |
| LS | sd_sent_len | 1.478 | 1e-05 |
| LS | fw: upon | 0.985 | 1e-05 |
| LS | long_sent_ratio | 0.888 | 1e-05 |
| LS | fw: it | 0.792 | 1e-05 |
| LS | fw: in | 0.332 | 1e-05 |
| LS | dash_rate | 0.134 | 1e-05 |
| SR | comma_rate | 1.614 | 1e-05 |
| SR | fw: to | 1.443 | 1e-05 |
| SR | fw: yet | 1.094 | 1e-05 |
| SR | colon_rate | 1.092 | 1e-05 |
| SR | fw: upon | 0.812 | 1e-05 |
| SR | fw: no | -0.808 | 1e-05 |
| WT | fw: but | -1.888 | 1e-05 |
| WT | fw: and | 1.803 | 1e-05 |
| WT | fw: it | -1.393 | 1e-05 |
| WT | fw: who | 0.962 | 1e-05 |
| WT | sd_word_len | -0.890 | 1e-05 |
| WT | fw: which | 0.889 | 1e-05 |
top_terms |>
ggplot(aes(x = estimate, y = term, fill = class)) +
geom_col(show.legend = FALSE, alpha = 0.85) +
facet_wrap(~ class, scales = "free_y", ncol=3) +
scale_fill_brewer(palette = "Set3") +
labs(
title = "Наиболее важные признаки для каждого автора",
subtitle = "Коэффициенты регуляризованной логистической регрессии",
x = "Коэффициент",
y = "Признак"
) +
theme_minimal(base_size = 10)
Модель активно использует служебные слова: fw: but, fw: and, fw: which, fw: upon, fw: my, fw: he, fw: she, fw: that, fw: very и другие. Это хороший результат для стилометрии, тк служебные слова меньше связаны с темой произведения и сильнее отражают индивидуальные авторские привычки.
Также важными являются пунктуационные признаки: частотность запятых (quote_rate), точек с запятой (semicol_rate), двоеточий (colon_rate), кавычек (quote_rate), тире (dash_rate) и восклицательных знаков (excl_rate). Это значит, что модель различает авторов не только по лексике, но и по пунктуации.
В ходе предварительного анализа было замечено, что некоторые авторы имеют “любимые” знаки препинания - у Laurence Sterne это тире, а у Эмилии Бронте - восклицательные знаки и двоеточия. Финальная модель тоже использовала эти признаки для классификации.
Нам удалось успешно обучить модель для классификации произведений британской прозы 18-19 вв по авторам. Наилучшей моделью оказалась Multinomial Logistic Regression, accuracy которой получилась очень высокой - практически 100%. Для обучения произведения были разделены на фрагменты по ~5000 слов, чтобы иметь достаточное количество данных для train/test/validation split и 10 фолдов кросс-валидации. Для классификации лучшая модель часто использует служебные слова и знаки препинания, что показывает, что выбранные авторы хорошо разделимы по таким признакам.