Классификация авторов британской прозы конца XVIII — XIX веков

Загрузка пакетов

library(tidyverse)
library(fs)

library(tidytext)
library(tokenizers)
library(tidymodels)

library(ggrepel)
library(patchwork)
library(knitr)
library(kableExtra)
library(broom)

tidymodels_prefer()


set.seed(6769)

Загрузка данных

meta <- read_tsv(
  "data/overview.tsv",
  col_types = cols(.default = col_character())
) |>
  mutate(across(where(is.character), str_squish)) |>
  rename(
    text_id   = textID,
    author_id = authorID,
    year      = `1stPubl`,
    gender    = author_gender
  ) |>
  mutate(
    text_id = as.integer(text_id),
    year    = as.integer(year),
    gender  = if_else(gender == "1", "female", "male")
  )

meta |>
  select(text_id, author, author_id, title, year, gender) |>
  kable(caption = "Метаданные корпуса") |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )

Метаданные корпуса
text_id	author	author_id	title	year	gender
1	Austen, Jane	JA	Emma	1815	female
2	Austen, Jane	JA	Pride	1813	female
3	Austen, Jane	JA	Sense	1811	female
4	Bronte, Anne	AB	Agnes Grey	1847	female
5	Bronte, Anne	AB	Tentant of Wildfell Hall	1848	female
6	Bronte, Charlotte	CB	Jane Eyre	1847	female
7	Bronte, Charlotte	CB	Professor	1845	female
8	Bronte, Charlotte	CB	Villette	1853	female
9	Bronte, Emily	EB	Wuthering Heights	1847	female
10	Dickens, Charles	CD	Bleak House	1852	male
11	Dickens, Charles	CD	David Copperfield	1849	male
12	Dickens, Charles	CD	Hard Times	1854	male
13	Eliot, George	GE	Adam Bede	1859	female
14	Eliot, George	GE	Middlemarch	1871	female
15	Eliot, George	GE	The Mill on the Floss	1860	female
16	Fielding, Henry	HF	Joseph Andrews	1742	male
17	Fielding, Henry	HF	Tom Jones	1749	male
18	Richardson, Samuel	SR	Clarissa, or, the History of a Young Lady	1748	male
19	Richardson, Samuel	SR	Pamela	1740	male
20	Sterne, Laurence	LS	Tristam Shandy	1759	male
21	Sterne, Laurence	LS	Sentimental	1768	male
22	Trollope, Antony	AT	Prime Minister	1876	male
23	Trollope, Antony	AT	Barchester Towers	1857	male
24	Trollope, Antony	AT	Phineas Finn	1869	male
25	Thackeray, William Makepeace	WT	Vanity Fair	1848	male
26	Thackeray, William Makepeace	WT	The History of Pendennis	1848	male
27	Thackeray, William Makepeace	WT	The Luck of Barry Lyndon	1844	male

read_novel <- function(path) {
  read_lines(path, locale = locale(encoding = "UTF-8")) |>
    paste(collapse = " ")
}

txt_files <- dir_ls("data/british_fiction", glob = "*.txt")

texts_raw <- tibble(
  filepath = txt_files,
  filename = path_file(txt_files),
  raw_text = map_chr(txt_files, read_novel)
)

cat("Файлов:", nrow(texts_raw), "\n")

## Файлов: 27

print(texts_raw$filename)

##  [1] "ABronte_Agnes.txt"       "ABronte_Tenant.txt"     
##  [3] "Austen_Emma.txt"         "Austen_Pride.txt"       
##  [5] "Austen_Sense.txt"        "CBronte_Jane.txt"       
##  [7] "CBronte_Professor.txt"   "CBronte_Villette.txt"   
##  [9] "Dickens_Bleak.txt"       "Dickens_David.txt"      
## [11] "Dickens_Hard.txt"        "EBronte_Wuthering.txt"  
## [13] "Eliot_Adam.txt"          "Eliot_Middlemarch.txt"  
## [15] "Eliot_Mill.txt"          "Fielding_Joseph.txt"    
## [17] "Fielding_Tom.txt"        "Richardson_Clarissa.txt"
## [19] "Richardson_Pamela.txt"   "Sterne_Sentimental.txt" 
## [21] "Sterne_Tristram.txt"     "Thackeray_Barry.txt"    
## [23] "Thackeray_Pendennis.txt" "Thackeray_Vanity.txt"   
## [25] "Trollope_Barchester.txt" "Trollope_Phineas.txt"   
## [27] "Trollope_Prime.txt"

Сопоставим с метаданными

Лучше это сделать не кодом, а вручную (вернее, с помощью LLM) сопоставив названия файлов.

file_map <- tribble(
  ~filename,                    ~author_id,  ~short_title,

  # ── Austen, Jane (JA) ──────────────────────────────────────────────────────
  "Austen_Emma.txt",            "JA",  "Emma",
  "Austen_Pride.txt",           "JA",  "Pride & Prejudice",
  "Austen_Sense.txt",           "JA",  "Sense & Sensibility",

  # ── Bronte, Anne (AB) ──────────────────────────────────────────────────────
  "ABronte_Agnes.txt",          "AB",  "Agnes Grey",
  "ABronte_Tenant.txt",         "AB",  "Tenant of Wildfell Hall",

  # ── Bronte, Charlotte (CB) ─────────────────────────────────────────────────
  "CBronte_Jane.txt",           "CB",  "Jane Eyre",
  "CBronte_Professor.txt",      "CB",  "The Professor",
  "CBronte_Villette.txt",       "CB",  "Villette",

  # ── Bronte, Emily (EB) ─────────────────────────────────────────────────────
  "EBronte_Wuthering.txt",      "EB",  "Wuthering Heights",

  # ── Dickens, Charles (CD) ──────────────────────────────────────────────────
  "Dickens_Bleak.txt",          "CD",  "Bleak House",
  "Dickens_David.txt",          "CD",  "David Copperfield",
  "Dickens_Hard.txt",           "CD",  "Hard Times",

  # ── Eliot, George (GE) ─────────────────────────────────────────────────────
  "Eliot_Adam.txt",             "GE",  "Adam Bede",
  "Eliot_Middlemarch.txt",      "GE",  "Middlemarch",
  "Eliot_Mill.txt",             "GE",  "Mill on the Floss",

  # ── Fielding, Henry (HF) ───────────────────────────────────────────────────
  "Fielding_Joseph.txt",        "HF",  "Joseph Andrews",
  "Fielding_Tom.txt",           "HF",  "Tom Jones",

  # ── Richardson, Samuel (SR) ────────────────────────────────────────────────
  "Richardson_Clarissa.txt",    "SR",  "Clarissa",
  "Richardson_Pamela.txt",      "SR",  "Pamela",

  # ── Sterne, Laurence (LS) ──────────────────────────────────────────────────
  "Sterne_Tristram.txt",        "LS",  "Tristram Shandy",
  "Sterne_Sentimental.txt",     "LS",  "Sentimental Journey",

  # ── Thackeray, William Makepeace (WT) ──────────────────────────────────────
  "Thackeray_Vanity.txt",       "WT",  "Vanity Fair",
  "Thackeray_Pendennis.txt",    "WT",  "History of Pendennis",
  "Thackeray_Barry.txt",        "WT",  "Barry Lyndon",

  # ── Trollope, Anthony (AT) ─────────────────────────────────────────────────
  "Trollope_Barchester.txt",    "AT",  "Barchester Towers",
  "Trollope_Phineas.txt",       "AT",  "Phineas Finn",
  "Trollope_Prime.txt",         "AT",  "The Prime Minister"
)

corpus <- texts_raw |>
  left_join(file_map, by = "filename") |>
  left_join(
    meta |>
      select(author_id, author, gender) |>
      distinct(author_id, .keep_all = TRUE),
    by = "author_id"
  ) |>
  mutate(doc_id = row_number())

corpus |>
  select(doc_id, filename, author, author_id, short_title) |>
  kable(caption = "Итоговый корпус") |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )

Итоговый корпус
doc_id	filename	author	author_id	short_title
1	ABronte_Agnes.txt	Bronte, Anne	AB	Agnes Grey
2	ABronte_Tenant.txt	Bronte, Anne	AB	Tenant of Wildfell Hall
3	Austen_Emma.txt	Austen, Jane	JA	Emma
4	Austen_Pride.txt	Austen, Jane	JA	Pride & Prejudice
5	Austen_Sense.txt	Austen, Jane	JA	Sense & Sensibility
6	CBronte_Jane.txt	Bronte, Charlotte	CB	Jane Eyre
7	CBronte_Professor.txt	Bronte, Charlotte	CB	The Professor
8	CBronte_Villette.txt	Bronte, Charlotte	CB	Villette
9	Dickens_Bleak.txt	Dickens, Charles	CD	Bleak House
10	Dickens_David.txt	Dickens, Charles	CD	David Copperfield
11	Dickens_Hard.txt	Dickens, Charles	CD	Hard Times
12	EBronte_Wuthering.txt	Bronte, Emily	EB	Wuthering Heights
13	Eliot_Adam.txt	Eliot, George	GE	Adam Bede
14	Eliot_Middlemarch.txt	Eliot, George	GE	Middlemarch
15	Eliot_Mill.txt	Eliot, George	GE	Mill on the Floss
16	Fielding_Joseph.txt	Fielding, Henry	HF	Joseph Andrews
17	Fielding_Tom.txt	Fielding, Henry	HF	Tom Jones
18	Richardson_Clarissa.txt	Richardson, Samuel	SR	Clarissa
19	Richardson_Pamela.txt	Richardson, Samuel	SR	Pamela
20	Sterne_Sentimental.txt	Sterne, Laurence	LS	Sentimental Journey
21	Sterne_Tristram.txt	Sterne, Laurence	LS	Tristram Shandy
22	Thackeray_Barry.txt	Thackeray, William Makepeace	WT	Barry Lyndon
23	Thackeray_Pendennis.txt	Thackeray, William Makepeace	WT	History of Pendennis
24	Thackeray_Vanity.txt	Thackeray, William Makepeace	WT	Vanity Fair
25	Trollope_Barchester.txt	Trollope, Antony	AT	Barchester Towers
26	Trollope_Phineas.txt	Trollope, Antony	AT	Phineas Finn
27	Trollope_Prime.txt	Trollope, Antony	AT	The Prime Minister

Извлечение признаков

Функции подсчета

Признаки предложений

feat_sentences <- function(text) {
  sents     <- tokenize_sentences(text)[[1]]
  sent_lens <- map_int(sents, ~ length(tokenize_words(.x)[[1]]))

  tibble(
    n_sentences     = length(sents),
    mean_sent_len   = mean(sent_lens,          na.rm = TRUE),
    median_sent_len = median(sent_lens,         na.rm = TRUE),
    sd_sent_len     = sd(sent_lens,             na.rm = TRUE),
    max_sent_len    = max(sent_lens,            na.rm = TRUE),
    long_sent_ratio = mean(sent_lens > 40,      na.rm = TRUE)
  )
}

Лексические признаки

feat_words <- function(text) {
  words    <- tokenize_words(text)[[1]]
  n_words  <- length(words)
  n_unique <- n_distinct(words)

  window <- 500L
  mattr  <- if (n_words >= window) {
    idx <- seq(1, n_words - window + 1, by = 50)
    mean(map_dbl(idx, ~ n_distinct(words[.x:(.x + window - 1)]) / window))
  } else {
    n_unique / n_words
  }

  word_lens <- nchar(words)

  tibble(
    n_words         = n_words,
    ttr             = n_unique / n_words,
    mattr           = mattr,
    mean_word_len   = mean(word_lens,      na.rm = TRUE),
    median_word_len = median(word_lens,    na.rm = TRUE),
    sd_word_len     = sd(word_lens,        na.rm = TRUE),
    long_word_ratio = mean(word_lens > 7,  na.rm = TRUE)
  )
}

Пунктуационные признаки

feat_punct <- function(text) {
  n_ch <- nchar(text)

  tibble(
    comma_rate   = str_count(text, ",")           / n_ch * 1000,
    semicol_rate = str_count(text, ";")           / n_ch * 1000,
    colon_rate   = str_count(text, ":")           / n_ch * 1000,
    excl_rate    = str_count(text, "!")           / n_ch * 1000,
    quest_rate   = str_count(text, "\\?")         / n_ch * 1000,
    dash_rate    = str_count(text, "\u2014|--")   / n_ch * 1000,
    quote_rate   = str_count(
      text, '"|\u2018|\u2019|\u201c|\u201d'
    ) / n_ch * 1000
  )
}

Функциональные слова

# Соберем их список
FW <- c(
  "the", "a", "an", "and", "but", "or", "in", "of", "to",
  "that", "it", "he", "she", "they", "i", "you", "we",
  "his", "her", "their", "my", "your", "our",
  "with", "for", "on", "at", "by", "from", "as",
  "not", "no", "all", "this", "which", "who", "what",
  "be", "been", "have", "had", "has", "was", "were", "is", "are",
  "said", "would", "could", "should", "upon", "shall", "must",
  "though", "yet", "very", "quite", "rather", "indeed",
  "however", "therefore", "then", "than", "when", "where", "if",
  "so", "more", "such", "after", "before", "over", "again"
)

feat_funwords <- function(text) {
  words   <- tokenize_words(text)[[1]]
  n_words <- length(words)
  counts  <- table(words)

  # Частота на 10000 слов для каждого функционального слова
  fw_tbl <- map_dfc(FW, function(w) {
    cnt <- if (w %in% names(counts)) as.numeric(counts[[w]]) else 0
    tibble(!!paste0("fw_", w) := cnt / n_words * 10000)
  })

  fw_tbl |>
    mutate(stopword_ratio = sum(words %in% FW) / n_words)
}

Матрица признаков

sf <- map_dfr(corpus$raw_text, feat_sentences)

wf <- map_dfr(corpus$raw_text, feat_words)

pf <- map_dfr(corpus$raw_text, feat_punct)

ff <- map_dfr(corpus$raw_text, feat_funwords)

features_df <- corpus |>
  select(doc_id, filename, author, author_id, short_title, gender) |>
  bind_cols(sf, wf, pf, ff) |>
  mutate(author_id = factor(author_id))   # целевая переменная — фактор

cat(
  "Итоговая матрица:", nrow(features_df),
  "строк ×", ncol(features_df), "столбцов\n"
)

## Итоговая матрица: 27 строк × 100 столбцов

cat("Классы:\n")

## Классы:

print(table(features_df$author_id))

## 
## AB AT CB CD EB GE HF JA LS SR WT 
##  2  3  3  3  1  3  2  3  2  2  3

EDA

Сводная таблица

features_df |>
  select(author, short_title, n_words, n_sentences,
         mean_sent_len, mattr, stopword_ratio) |>
  mutate(across(where(is.numeric), ~ round(.x, 2))) |>
  arrange(author) |>
  kable(caption = "Основные стилистические характеристики текстов") |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) |>
  scroll_box(width = "100%", height = "380px")

Основные стилистические характеристики текстов
author	short_title	n_words	n_sentences	mean_sent_len	mattr	stopword_ratio
Austen, Jane	Emma	160997	9126	17.64	0.49	0.50
Austen, Jane	Pride & Prejudice	122108	7058	17.30	0.50	0.50
Austen, Jane	Sense & Sensibility	119962	5483	21.88	0.50	0.50
Bronte, Anne	Agnes Grey	68598	2610	26.28	0.52	0.49
Bronte, Anne	Tenant of Wildfell Hall	167993	7693	21.84	0.51	0.49
Bronte, Charlotte	Jane Eyre	188202	10637	17.69	0.53	0.47
Bronte, Charlotte	The Professor	89379	3671	24.35	0.55	0.46
Bronte, Charlotte	Villette	195680	10780	18.15	0.56	0.45
Bronte, Emily	Wuthering Heights	117113	7211	16.24	0.56	0.46
Dickens, Charles	Bleak House	357476	24249	14.74	0.49	0.47
Dickens, Charles	David Copperfield	359112	22498	15.96	0.49	0.49
Dickens, Charles	Hard Times	104048	7044	14.77	0.50	0.47
Eliot, George	Adam Bede	217018	9701	22.37	0.51	0.46
Eliot, George	Middlemarch	320198	17450	18.35	0.52	0.47
Eliot, George	Mill on the Floss	209330	10496	19.94	0.52	0.46
Fielding, Henry	Joseph Andrews	137415	4098	33.53	0.51	0.49
Fielding, Henry	Tom Jones	349148	11262	31.00	0.50	0.49
Richardson, Samuel	Clarissa	969910	44817	21.64	0.48	0.51
Richardson, Samuel	Pamela	440450	17289	25.48	0.47	0.51
Sterne, Laurence	Sentimental Journey	40849	986	41.43	0.49	0.49
Sterne, Laurence	Tristram Shandy	188668	4074	46.31	0.50	0.48
Thackeray, William Makepeace	Barry Lyndon	128308	4627	27.73	0.51	0.49
Thackeray, William Makepeace	History of Pendennis	358520	17131	20.93	0.52	0.47
Thackeray, William Makepeace	Vanity Fair	306927	14835	20.69	0.53	0.46
Trollope, Antony	Barchester Towers	198811	9875	20.13	0.48	0.50
Trollope, Antony	Phineas Finn	264592	17730	14.92	0.46	0.51
Trollope, Antony	The Prime Minister	284645	19089	14.91	0.46	0.51

Средние длины предложений

p_sent <- features_df |>
  group_by(author_id, author) |>
  summarise(
    m  = mean(mean_sent_len),
    se = sd(mean_sent_len) / sqrt(n()),
    .groups = "drop"
  ) |>
  ggplot(aes(x = reorder(author_id, m), y = m, fill = author_id)) +
  geom_col(alpha = 0.85, width = 0.65) +
  geom_errorbar(aes(ymin = m - se, ymax = m + se),
                width = 0.25, linewidth = 0.8) +
  geom_text(aes(label = round(m, 1)), hjust = -1.3, size = 3.5) +
  coord_flip() +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title    = "Средняя длина предложения по авторам",
    subtitle = "Слов на предложение (среднее ± SE по произведениям автора)",
    x = NULL, y = "Слов на предложение"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

p_sent

Интересно, что заметное различие есть даже по такому простому параметру. Отличается длинными предложениями Laurence Sterne - и действительно, он этим известен, есть даже знаменитое длинное предложение, открывающее его произведение The Life and Opinions of Tristram Shandy, Gentleman.

Заметим, что оба автора, отличающиеся наиболее длинными предложениями - Laurence Sterne и Henry Fielding - писатели 18 века, а Anthony Trollope, Emily Brontë и Чарльз Диккенс, отличающиеся краткостью предложений - писатели 19 века. Впрочем, такой анализ - тема отдельной работы.

Лексическое разнообразие

p_mattr <- features_df |>
  ggplot(aes(
    x      = reorder(author_id, mattr, median),
    y      = mattr,
    fill   = author_id,
    colour = author_id
  )) +
  geom_boxplot(alpha = 0.45, outlier.shape = NA, linewidth = 0.7) +
  geom_jitter(width = 0.15, size = 3.5, alpha = 0.9) +
  scale_fill_brewer(palette   = "Set3") +
  scale_colour_brewer(palette = "Set3") +
  labs(
    title    = "Лексическое разнообразие (MATTR)",
    subtitle = "Moving Average Type-Token Ratio, окно = 500 слов",
    x = NULL, y = "MATTR"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

p_mattr

Тут заметно, что распределения лексических разнообразий у авторов неплохо разделяется, но надо учитывать, что это все в пределах сотых долей + текстов у нас довольно мало, поэтому возможно этот график описывает скорее произведения, чем авторов.

Пунктуационные профили

punct_long <- features_df |>
  select(author_id, comma_rate, semicol_rate,
         colon_rate, excl_rate, quest_rate, dash_rate) |>
  pivot_longer(
    cols      = -author_id,
    names_to  = "punct",
    values_to = "rate"
  ) |>
  mutate(punct = recode(punct,
    comma_rate   = "Запятые",
    semicol_rate = "Точки с запятой",
    colon_rate   = "Двоеточия",
    excl_rate    = "Восклицат. знаки",
    quest_rate   = "Вопросит. знаки",
    dash_rate    = "Тире"
  ))

p_punct <- punct_long |>
  group_by(author_id, punct) |>
  summarise(mean_rate = mean(rate), .groups = "drop") |>
  ggplot(aes(x = author_id, y = mean_rate, fill = author_id)) +
  geom_col(alpha = 0.85) +
  facet_wrap(~punct, scales = "free_y", ncol = 3) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title    = "Пунктуационный профиль авторов",
    subtitle = "Частота на 1 000 символов",
    x = NULL, y = "Частота"
  ) +
  theme_minimal(base_size = 11) +
  theme(
    legend.position = "none",
    axis.text.x     = element_text(angle = 45, hjust = 1)
  )

p_punct

По вопросительным знакам и запятым заметных различий не видно, но зато по восклицательным знакам и двоеточиям выделяется Эмилия Бронте, а по частоте тире - Laurence Sterne.

Тепловая карта функциональных слов

fw_cols <- names(features_df)[str_starts(names(features_df), "fw_")]

# Топ-30 слов по межавторской дисперсии
top30_fw <- features_df |>
  select(author_id, all_of(fw_cols)) |>
  pivot_longer(-author_id, names_to = "word", values_to = "rate") |>
  group_by(word) |>
  summarise(bw_var = var(rate), .groups = "drop") |>
  slice_max(bw_var, n = 30) |>
  pull(word)

heatmap_data <- features_df |>
  select(author_id, all_of(top30_fw)) |>
  group_by(author_id) |>
  summarise(across(everything(), mean), .groups = "drop") |>
  pivot_longer(-author_id, names_to = "word", values_to = "rate") |>
  mutate(word = str_remove(word, "^fw_"))

p_heat <- heatmap_data |>
  ggplot(aes(x = author_id, y = reorder(word, rate), fill = rate)) +
  geom_tile(colour = "white", linewidth = 0.4) +
  scale_fill_distiller(
    palette   = "RdYlBu",
    direction = 1,
    name      = "Частота\n(на 10 000 сл.)"
  ) +
  labs(
    title    = "Профиль функциональных слов",
    subtitle = "Топ-30 слов с наибольшей вариацией между авторами",
    x = "Автор", y = "Слово"
  ) +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p_heat

Заметим, что даже самые частотные служебные слова (the, and, to, of) распределены неравномерно. Это хорошо для стилометрии, поскольку эти слова не связаны с темами произведений, а характеризуют писательские привычки авторов.

Машинное обучение

Подготовка и кросс-валидация

Поскольку текстов мало, разобъем их на фрагменты по примерно 5000 слов и будем с ними работать

chunk_size <- 5000
min_chunk_size <- 3000

make_chunks <- function(text, chunk_size = 5000, min_chunk_size = 3000) {
  
  sents <- tokenize_sentences(text)[[1]]
  
  sent_n_words <- map_int(
    sents,
    ~ length(tokenize_words(.x)[[1]])
  )
  
  chunks <- list()
  chunk_word_counts <- integer()
  
  current_sents <- character()
  current_n_words <- 0
  chunk_id <- 1
  
  for (i in seq_along(sents)) {
    
    current_sents <- c(current_sents, sents[i])
    current_n_words <- current_n_words + sent_n_words[i]
    
    if (current_n_words >= chunk_size) {
      chunks[[chunk_id]] <- paste(current_sents, collapse = " ")
      chunk_word_counts[chunk_id] <- current_n_words
      
      chunk_id <- chunk_id + 1
      current_sents <- character()
      current_n_words <- 0
    }
  }
  
  if (length(current_sents) > 0) {
    chunks[[chunk_id]] <- paste(current_sents, collapse = " ")
    chunk_word_counts[chunk_id] <- current_n_words
  }
  
  tibble(
    chunk_id_local = seq_along(chunks),
    chunk_text = unlist(chunks),
    chunk_n_words = chunk_word_counts
  ) |>
    filter(chunk_n_words >= min_chunk_size)
}

chunks <- corpus |>
  select(doc_id, filename, author, author_id, short_title, gender, raw_text) |>
  mutate(
    chunks = map(
      raw_text,
      make_chunks,
      chunk_size = chunk_size,
      min_chunk_size = min_chunk_size
    )
  ) |>
  select(-raw_text) |>
  unnest(chunks) |>
  mutate(
    chunk_id = row_number(),
    author_id = factor(author_id),
    doc_id = factor(doc_id)
  )

cat("Всего фрагментов:", nrow(chunks), "\n")

## Всего фрагментов: 1286

С этим уже можно работать

chunks |>
  count(author_id, author, name = "n_chunks") |>
  arrange(n_chunks) |>
  kable(caption = "Число фрагментов по авторам") |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )

Число фрагментов по авторам
author_id	author	n_chunks
EB	Bronte, Emily	23
LS	Sterne, Laurence	45
AB	Bronte, Anne	47
JA	Austen, Jane	80
CB	Bronte, Charlotte	94
HF	Fielding, Henry	96
GE	Eliot, George	149
AT	Trollope, Antony	150
WT	Thackeray, William Makepeace	157
CD	Dickens, Charles	164
SR	Richardson, Samuel	281

chunks |>
  count(author_id, short_title, name = "n_chunks") |>
  arrange(author_id, short_title) |>
  kable(caption = "Число фрагментов по произведениям") |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )

Число фрагментов по произведениям
author_id	short_title	n_chunks
AB	Agnes Grey	14
AB	Tenant of Wildfell Hall	33
AT	Barchester Towers	40
AT	Phineas Finn	53
AT	The Prime Minister	57
CB	Jane Eyre	37
CB	The Professor	18
CB	Villette	39
CD	Bleak House	71
CD	David Copperfield	72
CD	Hard Times	21
EB	Wuthering Heights	23
GE	Adam Bede	43
GE	Middlemarch	64
GE	Mill on the Floss	42
HF	Joseph Andrews	27
HF	Tom Jones	69
JA	Emma	32
JA	Pride & Prejudice	24
JA	Sense & Sensibility	24
LS	Sentimental Journey	8
LS	Tristram Shandy	37
SR	Clarissa	193
SR	Pamela	88
WT	Barry Lyndon	25
WT	History of Pendennis	71
WT	Vanity Fair	61

Извлекаем признаки для фрагментов

chunk_sf <- map_dfr(chunks$chunk_text, feat_sentences)

chunk_wf <- map_dfr(chunks$chunk_text, feat_words)

chunk_pf <- map_dfr(chunks$chunk_text, feat_punct)

chunk_ff <- map_dfr(chunks$chunk_text, feat_funwords)

Создаем таблицу признаков

chunk_features_df <- chunks |>
  select(
    chunk_id,
    doc_id,
    filename,
    author,
    author_id,
    short_title,
    gender,
    chunk_id_local,
    chunk_n_words
  ) |>
  bind_cols(chunk_sf, chunk_wf, chunk_pf, chunk_ff) |>
  mutate(
    author_id = factor(author_id),
    doc_id = factor(doc_id)
  )

cat(
  "Матрица признаков по фрагментам:",
  nrow(chunk_features_df),
  "строк ×", ncol(chunk_features_df), "столбцов\n"
)

## Матрица признаков по фрагментам: 1286 строк × 103 столбцов

table(chunk_features_df$author_id)

## 
##  AB  AT  CB  CD  EB  GE  HF  JA  LS  SR  WT 
##  47 150  94 164  23 149  96  80  45 281 157

Готовим данные для моделей

model_df <- chunk_features_df |>
  select(author_id, where(is.numeric)) |>
  select(-chunk_id, -chunk_id_local) |>
  drop_na() |>
  mutate(author_id = factor(author_id))

model_df |>
  count(author_id, name = "n_fragments") |>
  arrange(n_fragments) |>
  kable(caption = "Число фрагментов по авторам в модельной таблице") |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )

Число фрагментов по авторам в модельной таблице
author_id	n_fragments
EB	23
LS	45
AB	47
JA	80
CB	94
HF	96
GE	149
AT	150
WT	157
CD	164
SR	281

Выбираем кол-во фолдов

min_fragments_per_author <- model_df |>
  count(author_id) |>
  summarise(min_n = min(n)) |>
  pull(min_n)

min_fragments_per_author

## [1] 23

Возьмем например 10

v <- 10

set.seed(6769)

folds <- vfold_cv(
  model_df,
  v = v,
  strata = author_id
)

Проверим, что каждый автор есть и в train и test

check_split_authors <- function(split, id, all_authors) {
  
  train_data <- analysis(split)
  test_data  <- assessment(split)
  
  train_counts <- train_data |>
    count(author_id, name = "n_train") |>
    complete(author_id = all_authors, fill = list(n_train = 0))
  
  test_counts <- test_data |>
    count(author_id, name = "n_test") |>
    complete(author_id = all_authors, fill = list(n_test = 0))
  
  train_counts |>
    full_join(test_counts, by = "author_id") |>
    mutate(
      fold = id,
      train_has_author = n_train > 0,
      test_has_author  = n_test > 0
    ) |>
    select(fold, author_id, n_train, n_test, train_has_author, test_has_author)
}

all_authors <- levels(model_df$author_id)

fold_check <- map2_dfr(
  folds$splits,
  folds$id,
  ~ check_split_authors(.x, .y, all_authors)
)

cv_check_summary <- fold_check |>
  summarise(
    all_train_have_all_authors = all(train_has_author),
    all_test_have_all_authors  = all(test_has_author)
  )

cv_check_summary

## # A tibble: 1 × 2
##   all_train_have_all_authors all_test_have_all_authors
##   <lgl>                      <lgl>                    
## 1 TRUE                       TRUE

Все есть!

Подготовка моделей

Будем использовать Multinomial логистическую регрессию

Используем рецепт:

удаляем признаки с нулевой дисперсией; удаляем признаки с почти нулевой дисперсией; нормализуем числовые признаки.

base_rec <- recipe(author_id ~ ., data = model_df) |>
  step_zv(all_predictors()) |>
  step_nzv(all_predictors()) |>
  step_normalize(all_predictors())

Возьмем метрики - accuracy, log loss и ROC-AUC

multi_metrics <- metric_set(
  accuracy,
  mn_log_loss,
  roc_auc
)

Модели

Логрег

log_spec <- multinom_reg(
  penalty = tune(),
  mixture = 1
) |>
  set_engine("glmnet") |>
  set_mode("classification")

log_wf <- workflow() |>
  add_recipe(base_rec) |>
  add_model(log_spec)

Сетка гиперпараметров

log_grid <- grid_regular(
  penalty(range = c(-5, 1)),
  levels = 20
)

log_grid

## # A tibble: 20 × 1
##       penalty
##         <dbl>
##  1  0.00001  
##  2  0.0000207
##  3  0.0000428
##  4  0.0000886
##  5  0.000183 
##  6  0.000379 
##  7  0.000785 
##  8  0.00162  
##  9  0.00336  
## 10  0.00695  
## 11  0.0144   
## 12  0.0298   
## 13  0.0616   
## 14  0.127    
## 15  0.264    
## 16  0.546    
## 17  1.13     
## 18  2.34     
## 19  4.83     
## 20 10

Поехали!

ctrl <- control_grid(
  save_pred = TRUE,
  save_workflow = TRUE,
  verbose = TRUE
)

set.seed(6769)

log_res <- tune_grid(
  log_wf,
  resamples = folds,
  grid = log_grid,
  metrics = multi_metrics,
  control = ctrl
)

Смотрим метрики:

collect_metrics(log_res) |>
  arrange(.metric, desc(mean))

## # A tibble: 60 × 7
##      penalty .metric  .estimator  mean     n std_err .config         
##        <dbl> <chr>    <chr>      <dbl> <int>   <dbl> <chr>           
##  1 0.00001   accuracy multiclass 0.998    10 0.00164 pre0_mod01_post0
##  2 0.0000207 accuracy multiclass 0.998    10 0.00164 pre0_mod02_post0
##  3 0.0000428 accuracy multiclass 0.998    10 0.00164 pre0_mod03_post0
##  4 0.0000886 accuracy multiclass 0.998    10 0.00164 pre0_mod04_post0
##  5 0.000183  accuracy multiclass 0.998    10 0.00164 pre0_mod05_post0
##  6 0.000379  accuracy multiclass 0.997    10 0.00170 pre0_mod06_post0
##  7 0.000785  accuracy multiclass 0.997    10 0.00170 pre0_mod07_post0
##  8 0.00162   accuracy multiclass 0.995    10 0.00234 pre0_mod08_post0
##  9 0.00336   accuracy multiclass 0.995    10 0.00230 pre0_mod09_post0
## 10 0.00695   accuracy multiclass 0.987    10 0.00285 pre0_mod10_post0
## # ℹ 50 more rows

Лучшие параметры по accuracy:

best_log <- select_best(log_res, metric = "accuracy")
best_log

## # A tibble: 1 × 2
##   penalty .config         
##     <dbl> <chr>           
## 1 0.00001 pre0_mod01_post0

Топ-5 настроек:

show_best(log_res, metric = "accuracy", n = 5) |>
  kable(caption = "Лучшие настройки логистической регрессии по accuracy") |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  )

Лучшие настройки логистической регрессии по accuracy
penalty	.metric	.estimator	mean	n	std_err	.config
0.0000100	accuracy	multiclass	0.9976859	10	0.0016382	pre0_mod01_post0
0.0000207	accuracy	multiclass	0.9976859	10	0.0016382	pre0_mod02_post0
0.0000428	accuracy	multiclass	0.9976859	10	0.0016382	pre0_mod03_post0
0.0000886	accuracy	multiclass	0.9976859	10	0.0016382	pre0_mod04_post0
0.0001833	accuracy	multiclass	0.9976859	10	0.0016382	pre0_mod05_post0

autoplot(log_res) +
  labs(
    title = "Настройка мультиномиальной логистической регрессии",
    subtitle = "Регуляризация LASSO",
    x = "Penalty",
    y = "Значение метрики"
  ) +
  theme_minimal(base_size = 13)

Вышло отлично! Посмотрим матрицу ошибок:

log_predictions <- log_res |>
  collect_predictions(parameters = best_log)

log_predictions |>
  select(author_id, .pred_class) |>
  head()

## # A tibble: 6 × 2
##   author_id .pred_class
##   <fct>     <fct>      
## 1 AB        AB         
## 2 AB        AB         
## 3 AB        AB         
## 4 AB        AB         
## 5 AB        AB         
## 6 JA        JA

log_conf_mat <- log_predictions |>
  conf_mat(
    truth = author_id,
    estimate = .pred_class
  )

log_conf_mat

##           Truth
## Prediction  AB  AT  CB  CD  EB  GE  HF  JA  LS  SR  WT
##         AB  47   0   0   0   0   0   0   0   0   0   0
##         AT   0 150   0   0   0   0   0   0   0   0   0
##         CB   0   0  94   0   0   0   0   0   0   0   0
##         CD   0   0   0 163   0   1   0   0   0   0   0
##         EB   0   0   0   0  23   0   0   0   0   0   0
##         GE   0   0   0   1   0 148   0   0   0   0   0
##         HF   0   0   0   0   0   0  95   0   0   0   0
##         JA   0   0   0   0   0   0   0  80   0   0   0
##         LS   0   0   0   0   0   0   0   0  45   0   0
##         SR   0   0   0   0   0   0   0   0   0 281   0
##         WT   0   0   0   0   0   0   1   0   0   0 157

autoplot(log_conf_mat, type = "heatmap") +
  scale_fill_gradient(low = "white", high = "#2C7FB8") +
  labs(
    title = "Матрица ошибок: логистическая регрессия",
    subtitle = "Стратифицированная кросс-валидация по фрагментам",
    x = "Предсказанный автор",
    y = "Истинный автор"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_text(size = 10)
  )

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

почти без ошибок! отличный результат

log_predictions <- log_res |>
  collect_predictions(parameters = best_log)

pred_cols <- names(log_predictions) |>
  str_subset("^\\.pred_") |>
  setdiff(".pred_class")

log_roc_curve <- log_predictions |>
  roc_curve(
    truth = author_id,
    !!!syms(pred_cols)
  )

log_roc_curve |>
  ggplot(aes(x = 1 - specificity, y = sensitivity, color = .level)) +
  geom_abline(
    slope = 1,
    intercept = 0,
    color = "gray50",
    lty = 2,
    alpha = 0.8
  ) +
  geom_path(linewidth = 1.2, alpha = 0.75) +
  labs(
    title = "ROC-кривые для логистической регрессии",
    subtitle = "One-vs-all классификация авторов",
    x = "1 - Specificity",
    y = "Sensitivity",
    color = "Автор"
  ) +
  theme_light(base_size = 13)

Random Forest

Попробуем обучить RF

rf_spec <- rand_forest(
  mtry = tune(),
  min_n = tune(),
  trees = 300
) |>
  set_engine("ranger", importance = "impurity") |>
  set_mode("classification")

rf_wf <- workflow() |>
  add_recipe(base_rec) |>
  add_model(rf_spec)

n_pred <- ncol(model_df) - 1

rf_grid <- grid_regular(
  mtry(range = c(2, min(30, n_pred))),
  min_n(range = c(2, 10)),
  levels = 4
)

Поехали!

set.seed(6769)

rf_res <- tune_grid(
  rf_wf,
  resamples = folds,
  grid = rf_grid,
  metrics = multi_metrics,
  control = ctrl
)

Посмотрим лучшие результаты

show_best(rf_res, metric = "accuracy", n = 5)

## # A tibble: 5 × 8
##    mtry min_n .metric  .estimator  mean     n std_err .config         
##   <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>           
## 1    11     4 accuracy multiclass 0.992    10 0.00261 pre0_mod06_post0
## 2    20     2 accuracy multiclass 0.992    10 0.00261 pre0_mod09_post0
## 3    30     4 accuracy multiclass 0.991    10 0.00338 pre0_mod14_post0
## 4    11     2 accuracy multiclass 0.991    10 0.00297 pre0_mod05_post0
## 5    30     7 accuracy multiclass 0.991    10 0.00297 pre0_mod15_post0

show_best(rf_res, metric = "accuracy", n = 5)

## # A tibble: 5 × 8
##    mtry min_n .metric  .estimator  mean     n std_err .config         
##   <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>           
## 1    11     4 accuracy multiclass 0.992    10 0.00261 pre0_mod06_post0
## 2    20     2 accuracy multiclass 0.992    10 0.00261 pre0_mod09_post0
## 3    30     4 accuracy multiclass 0.991    10 0.00338 pre0_mod14_post0
## 4    11     2 accuracy multiclass 0.991    10 0.00297 pre0_mod05_post0
## 5    30     7 accuracy multiclass 0.991    10 0.00297 pre0_mod15_post0

Сравним с ЛогРегом

model_compare <- bind_rows(
  collect_metrics(log_res) |>
    mutate(model = "Logistic regression"),
  collect_metrics(rf_res) |>
    mutate(model = "Random Forest")
)

График сравнения accuracy

model_compare |>
  filter(.metric == "accuracy") |>
  group_by(model) |>
  slice_max(mean, n = 1, with_ties = FALSE) |>
  ungroup() |>
  ggplot(aes(x = model, y = mean, fill = model)) +
  geom_col(alpha = 0.85, width = 0.6) +
  geom_text(aes(label = round(mean, 3)), vjust = -0.5) +
  scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
  labs(
    title = "Сравнение моделей по accuracy",
    x = NULL,
    y = "Accuracy"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

LogReg даже лучшее accuracy показал чем RF.

Интерпретация коэффициентов

Для логрег посмотрим на коэффициенты

final_log_wf <- finalize_workflow(
  log_wf,
  best_log
)

final_log_fit <- final_log_wf |>
  fit(data = model_df)
final_model <- extract_fit_parsnip(final_log_fit)
log_terms <- tidy(final_model)

top_terms <- log_terms |>
  filter(term != "(Intercept)") |>
  group_by(class) |>
  slice_max(abs(estimate), n = 6) |>
  ungroup() |>
  mutate(
    term = str_replace(term, "^fw_", "fw: "),
    term = fct_reorder(term, abs(estimate))
  )

top_terms

## # A tibble: 66 × 4
##    class term        estimate penalty
##    <chr> <fct>          <dbl>   <dbl>
##  1 AB    fw: but        1.84  0.00001
##  2 AB    fw: and        1.25  0.00001
##  3 AB    fw: or         0.959 0.00001
##  4 AB    sd_word_len    0.697 0.00001
##  5 AB    quote_rate    -0.554 0.00001
##  6 AB    fw: who       -0.490 0.00001
##  7 AT    mattr         -1.84  0.00001
##  8 AT    fw: that       1.38  0.00001
##  9 AT    fw: where     -1.23  0.00001
## 10 AT    fw: the        0.941 0.00001
## # ℹ 56 more rows

top_terms |>
  mutate(
    estimate = round(estimate, 3)
  ) |>
  arrange(class, desc(abs(estimate))) |>
  kable(caption = "Наиболее важные признаки логистической регрессии для каждого автора") |>
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE
  ) |>
  scroll_box(width = "100%", height = "500px")

Наиболее важные признаки логистической регрессии для каждого автора
class	term	estimate	penalty
AB	fw: but	1.837	1e-05
AB	fw: and	1.247	1e-05
AB	fw: or	0.959	1e-05
AB	sd_word_len	0.697	1e-05
AB	quote_rate	-0.554	1e-05
AB	fw: who	-0.490	1e-05
AT	mattr	-1.843	1e-05
AT	fw: that	1.377	1e-05
AT	fw: where	-1.232	1e-05
AT	fw: the	0.941	1e-05
AT	fw: my	-0.772	1e-05
AT	comma_rate	-0.697	1e-05
CB	ttr	2.320	1e-05
CB	quote_rate	2.264	1e-05
CB	semicol_rate	1.330	1e-05
CB	colon_rate	1.111	1e-05
CB	fw: if	-0.864	1e-05
CB	fw: yet	0.816	1e-05
CD	semicol_rate	-1.703	1e-05
CD	dash_rate	-1.668	1e-05
CD	colon_rate	-1.492	1e-05
CD	comma_rate	1.280	1e-05
CD	median_sent_len	-0.815	1e-05
CD	fw: in	0.809	1e-05
EB	mattr	2.425	1e-05
EB	excl_rate	1.548	1e-05
EB	colon_rate	0.740	1e-05
EB	dash_rate	-0.584	1e-05
EB	fw: and	0.560	1e-05
EB	fw: of	-0.544	1e-05
GE	fw: upon	-1.380	1e-05
GE	fw: my	-1.317	1e-05
GE	fw: if	1.216	1e-05
GE	fw: or	-0.910	1e-05
GE	fw: so	-0.908	1e-05
GE	comma_rate	-0.767	1e-05
HF	fw: which	2.115	1e-05
HF	quote_rate	1.292	1e-05
HF	fw: indeed	0.922	1e-05
HF	semicol_rate	0.735	1e-05
HF	fw: were	-0.614	1e-05
HF	fw: this	0.590	1e-05
JA	ttr	-1.534	1e-05
JA	long_word_ratio	1.416	1e-05
JA	fw: could	1.004	1e-05
JA	fw: very	0.901	1e-05
JA	fw: must	0.787	1e-05
JA	fw: be	0.567	1e-05
LS	sd_sent_len	1.478	1e-05
LS	fw: upon	0.985	1e-05
LS	long_sent_ratio	0.888	1e-05
LS	fw: it	0.792	1e-05
LS	fw: in	0.332	1e-05
LS	dash_rate	0.134	1e-05
SR	comma_rate	1.614	1e-05
SR	fw: to	1.443	1e-05
SR	fw: yet	1.094	1e-05
SR	colon_rate	1.092	1e-05
SR	fw: upon	0.812	1e-05
SR	fw: no	-0.808	1e-05
WT	fw: but	-1.888	1e-05
WT	fw: and	1.803	1e-05
WT	fw: it	-1.393	1e-05
WT	fw: who	0.962	1e-05
WT	sd_word_len	-0.890	1e-05
WT	fw: which	0.889	1e-05

top_terms |>
  ggplot(aes(x = estimate, y = term, fill = class)) +
  geom_col(show.legend = FALSE, alpha = 0.85) +
  facet_wrap(~ class, scales = "free_y", ncol=3) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Наиболее важные признаки для каждого автора",
    subtitle = "Коэффициенты регуляризованной логистической регрессии",
    x = "Коэффициент",
    y = "Признак"
  ) +
  theme_minimal(base_size = 10)

Модель активно использует служебные слова: fw: but, fw: and, fw: which, fw: upon, fw: my, fw: he, fw: she, fw: that, fw: very и другие. Это хороший результат для стилометрии, тк служебные слова меньше связаны с темой произведения и сильнее отражают индивидуальные авторские привычки.

Также важными являются пунктуационные признаки: частотность запятых (quote_rate), точек с запятой (semicol_rate), двоеточий (colon_rate), кавычек (quote_rate), тире (dash_rate) и восклицательных знаков (excl_rate). Это значит, что модель различает авторов не только по лексике, но и по пунктуации.

В ходе предварительного анализа было замечено, что некоторые авторы имеют “любимые” знаки препинания - у Laurence Sterne это тире, а у Эмилии Бронте - восклицательные знаки и двоеточия. Финальная модель тоже использовала эти признаки для классификации.

Выводы

Нам удалось успешно обучить модель для классификации произведений британской прозы 18-19 вв по авторам. Наилучшей моделью оказалась Multinomial Logistic Regression, accuracy которой получилась очень высокой - практически 100%. Для обучения произведения были разделены на фрагменты по ~5000 слов, чтобы иметь достаточное количество данных для train/test/validation split и 10 фолдов кросс-валидации. Для классификации лучшая модель часто использует служебные слова и знаки препинания, что показывает, что выбранные авторы хорошо разделимы по таким признакам.

British prose classification

EgorVoron

2026-05-29