Цель исследования

Цель работы — исследовать стилометрические особенности британских писателей XVIII–XIX веков и построить модель классификации произведений по автору с использованием фреймворка {tidymodels}.

Загрузка корпуса

temp <- tempfile(fileext = ".zip")

download.file(
  "https://github.com/computationalstylistics/A_Small_Collection_of_British_Fiction/archive/refs/heads/master.zip",
  temp,
  mode = "wb"
)

unzip(temp, exdir = ".")

root <- "A_Small_Collection_of_British_Fiction-master"

overview_file <- file.path(root, "overview")

metadata <- read.delim(
  overview_file,
  sep = "\t",
  stringsAsFactors = FALSE
)

files <- list.files(
  file.path(root, "corpus"),
  pattern = "\\.txt$",
  full.names = TRUE
)

read_text <- function(x) {
  paste(
    readLines(x,
              warn = FALSE,
              encoding = "UTF-8"),
    collapse = " "
  )
}

corpus <- tibble(
  file_path = files,
  file_name = basename(files),
  text = map_chr(files, read_text)
)

corpus

## # A tibble: 27 × 3
##    file_path                                                     file_name text 
##    <chr>                                                         <chr>     <chr>
##  1 A_Small_Collection_of_British_Fiction-master/corpus/ABronte_… ABronte_… "AGN…
##  2 A_Small_Collection_of_British_Fiction-master/corpus/ABronte_… ABronte_… "The…
##  3 A_Small_Collection_of_British_Fiction-master/corpus/Austen_E… Austen_E… "EMM…
##  4 A_Small_Collection_of_British_Fiction-master/corpus/Austen_P… Austen_P… " Pr…
##  5 A_Small_Collection_of_British_Fiction-master/corpus/Austen_S… Austen_S… "SEN…
##  6 A_Small_Collection_of_British_Fiction-master/corpus/CBronte_… CBronte_… " Ja…
##  7 A_Small_Collection_of_British_Fiction-master/corpus/CBronte_… CBronte_… "THE…
##  8 A_Small_Collection_of_British_Fiction-master/corpus/CBronte_… CBronte_… "VIL…
##  9 A_Small_Collection_of_British_Fiction-master/corpus/Dickens_… Dickens_… "BLE…
## 10 A_Small_Collection_of_British_Fiction-master/corpus/Dickens_… Dickens_… "DAV…
## # ℹ 17 more rows

Соединение с метаданными

metadata$file_name <- c(
  "Austen_Emma.txt",
  "Austen_Pride.txt",
  "Austen_Sense.txt",
  "ABronte_Agnes.txt",
  "ABronte_Tenant.txt",
  "CBronte_Jane.txt",
  "CBronte_Professor.txt",
  "CBronte_Villette.txt",
  "EBronte_Wuthering.txt",
  "Dickens_Bleak.txt",
  "Dickens_David.txt",
  "Dickens_Hard.txt",
  "Eliot_Adam.txt",
  "Eliot_Middlemarch.txt",
  "Eliot_Mill.txt",
  "Fielding_Joseph.txt",
  "Fielding_Tom.txt",
  "Richardson_Clarissa.txt",
  "Richardson_Pamela.txt",
  "Sterne_Tristram.txt",
  "Sterne_Sentimental.txt",
  "Trollope_Prime.txt",
  "Trollope_Barchester.txt",
  "Trollope_Phineas.txt",
  "Thackeray_Vanity.txt",
  "Thackeray_Pendennis.txt",
  "Thackeray_Barry.txt"
)

corpus <- corpus |>
  left_join(metadata, by = "file_name")

cat("Количество текстов:", nrow(corpus), "\n")

## Количество текстов: 27

cat("Количество авторов:", n_distinct(corpus$authorID), "\n")

## Количество авторов: 11

table(corpus$authorID)

## 
## AB AT CB CD EB GE HF JA LS SR WT 
##  2  3  3  3  1  3  2  3  2  2  3

Предобработка

clean_text <- function(x){

  x |>
    tolower() |>
    str_replace_all("[^a-z\\s]", " ") |>
    str_squish()

}

corpus <- corpus |>
  mutate(
    clean = map_chr(text, clean_text),
    doc_id = row_number()
  )

corpus |>
  select(authorID, title, clean) |>
  slice(1:3)

## # A tibble: 3 × 3
##   authorID title                    clean                                       
##   <chr>    <chr>                    <chr>                                       
## 1 AB       Agnes Grey               agnes grey chapter i the parsonage all true…
## 2 AB       Tentant of Wildfell Hall the tenant of wildfell hall author s prefac…
## 3 JA       Emma                     emma by jane austen volume i chapter i emma…

Токенизация

tokens <- corpus |>
  select(doc_id,
         authorID,
         title,
         clean) |>
  unnest_tokens(word, clean)

cat("Количество токенов:", nrow(tokens), "\n")

## Количество токенов: 6533448

cat("Уникальных слов:", n_distinct(tokens$word), "\n")

## Уникальных слов: 53249

Самые частотные слова корпуса

tokens |>
  count(word,
        sort = TRUE) |>
  slice_head(n = 20)

## # A tibble: 20 × 2
##    word       n
##    <chr>  <int>
##  1 the   268180
##  2 and   210004
##  3 to    206364
##  4 of    165055
##  5 i     156590
##  6 a     132109
##  7 in    102162
##  8 that   90337
##  9 he     83970
## 10 it     78879
## 11 you    77941
## 12 was    77014
## 13 her    73786
## 14 his    64735
## 15 as     64520
## 16 my     59863
## 17 for    56628
## 18 not    56529
## 19 she    55726
## 20 with   55704

Извлечение признаков

Базовые стилометрические признаки

doc_features <- tokens |>
  group_by(doc_id,
           authorID,
           title) |>
  summarise(
    words_total = n(),
    vocab_size = n_distinct(word),
    ttr = vocab_size / words_total,
    mean_word_length = mean(nchar(word)),
    .groups = "drop"
  )

doc_features

## # A tibble: 27 × 7
##    doc_id authorID title          words_total vocab_size    ttr mean_word_length
##     <int> <chr>    <chr>                <int>      <int>  <dbl>            <dbl>
##  1      1 AB       Agnes Grey           69283       6665 0.0962             4.18
##  2      2 AB       Tentant of Wi…      169798      10081 0.0594             4.17
##  3      3 JA       Emma                161973       7095 0.0438             4.22
##  4      4 JA       Pride               122726       6258 0.0510             4.37
##  5      5 JA       Sense               120736       6278 0.0520             4.35
##  6      6 CB       Jane Eyre           189219      12545 0.0663             4.16
##  7      7 CB       Professor            89942       9580 0.107              4.34
##  8      8 CB       Villette            196737      14418 0.0733             4.30
##  9      9 CD       Bleak House         362023      14957 0.0413             4.13
## 10     10 CD       David Copperf…      363815      13954 0.0384             4.07
## # ℹ 17 more rows

Средняя длина предложения

sentence_features <- corpus |>
  rowwise() |>
  mutate(
    mean_sentence_length = {

      s <- unlist(
        strsplit(text, "[.!?]+")
      )

      s <- s[
        nchar(trimws(s)) > 0
      ]

      mean(
        stringr::str_count(s, "\\S+"),
        na.rm = TRUE
      )
    }
  ) |>
  ungroup() |>
  select(doc_id,
         mean_sentence_length)

sentence_features

## # A tibble: 27 × 2
##    doc_id mean_sentence_length
##     <int>                <dbl>
##  1      1                 25.8
##  2      2                 21.7
##  3      3                 15.3
##  4      4                 17.2
##  5      5                 20.3
##  6      6                 17.7
##  7      7                 23.9
##  8      8                 17.8
##  9      9                 14.7
## 10     10                 15.9
## # ℹ 17 more rows

Частоты наиболее употребительных слов

top_words <- tokens |>
  count(word,
        sort = TRUE) |>
  slice_head(n = 50) |>
  pull(word)

word_features <- tokens |>
  filter(word %in% top_words) |>
  count(doc_id,
        word) |>
  group_by(doc_id) |>
  mutate(freq = n / sum(n)) |>
  ungroup() |>
  select(doc_id,
         word,
         freq) |>
  pivot_wider(
    names_from = word,
    values_from = freq,
    values_fill = 0
  )

Финальная матрица признаков

features <- doc_features |>
  left_join(sentence_features,
            by = "doc_id") |>
  left_join(word_features,
            by = "doc_id") |>
  mutate(authorID = factor(authorID))

cat(
  "Размер матрицы признаков:",
  nrow(features),
  "x",
  ncol(features),
  "\n"
)

## Размер матрицы признаков: 27 x 58

glimpse(features)

## Rows: 27
## Columns: 58
## $ doc_id               <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15…
## $ authorID             <fct> AB, AB, JA, JA, JA, CB, CB, CB, CD, CD, CD, EB, G…
## $ title                <chr> "Agnes Grey", "Tentant of Wildfell Hall", "Emma",…
## $ words_total          <int> 69283, 169798, 161973, 122726, 120736, 189219, 89…
## $ vocab_size           <int> 6665, 10081, 7095, 6258, 6278, 12545, 9580, 14418…
## $ ttr                  <dbl> 0.09619964, 0.05937055, 0.04380360, 0.05099164, 0…
## $ mean_word_length     <dbl> 4.184980, 4.167222, 4.224939, 4.367958, 4.349482,…
## $ mean_sentence_length <dbl> 25.78305, 21.70841, 15.25515, 17.22024, 20.27884,…
## $ a                    <dbl> 0.04049912, 0.03429175, 0.04147014, 0.03372958, 0…
## $ all                  <dbl> 0.009629377, 0.008216755, 0.011235360, 0.01076586…
## $ an                   <dbl> 0.006223488, 0.004311033, 0.006147650, 0.00621107…
## $ and                  <dbl> 0.08471375, 0.08252374, 0.06488155, 0.06183468, 0…
## $ as                   <dbl> 0.02071400, 0.01812845, 0.01903916, 0.02037577, 0…
## $ at                   <dbl> 0.012880453, 0.012073349, 0.013686470, 0.01359534…
## $ be                   <dbl> 0.015759978, 0.013964799, 0.026180508, 0.02137644…
## $ been                 <dbl> 0.004365731, 0.003942569, 0.010069426, 0.00890253…
## $ but                  <dbl> 0.022571756, 0.023655412, 0.019092162, 0.01728748…
## $ by                   <dbl> 0.008267022, 0.007762316, 0.007565319, 0.01097289…
## $ `for`                <dbl> 0.02006378, 0.02036380, 0.01784673, 0.01828816, 0…
## $ from                 <dbl> 0.007740657, 0.007921984, 0.007234088, 0.00850571…
## $ had                  <dbl> 0.015667090, 0.012404967, 0.021543272, 0.02030675…
## $ have                 <dbl> 0.011084621, 0.012503224, 0.017502252, 0.01452701…
## $ he                   <dbl> 0.02161191, 0.02824893, 0.02399438, 0.02308449, 0…
## $ her                  <dbl> 0.023221971, 0.021616576, 0.032990620, 0.03842238…
## $ him                  <dbl> 0.010093817, 0.015586043, 0.010215168, 0.01318127…
## $ his                  <dbl> 0.01334489, 0.02129724, 0.01523663, 0.02192854, 0…
## $ i                    <dbl> 0.07146175, 0.07832324, 0.04229159, 0.03562740, 0…
## $ `if`                 <dbl> 0.009846116, 0.010660902, 0.006425884, 0.00602129…
## $ `in`                 <dbl> 0.02851658, 0.02487134, 0.02898935, 0.03243560, 0…
## $ is                   <dbl> 0.007895470, 0.013068203, 0.016468811, 0.01483756…
## $ it                   <dbl> 0.02579187, 0.02909640, 0.03350734, 0.02643157, 0…
## $ me                   <dbl> 0.019630306, 0.023692259, 0.007724310, 0.00771208…
## $ mr                   <dbl> 0.0048301700, 0.0044584188, 0.0152896285, 0.01356…
## $ my                   <dbl> 0.026844599, 0.029194660, 0.009698447, 0.01230137…
## $ no                   <dbl> 0.009350714, 0.009371277, 0.009830940, 0.00843670…
## $ not                  <dbl> 0.01891817, 0.02003218, 0.02849913, 0.02460275, 0…
## $ of                   <dbl> 0.04963309, 0.04588610, 0.05687901, 0.06226601, 0…
## $ on                   <dbl> 0.008390872, 0.008855427, 0.009168477, 0.01235313…
## $ one                  <dbl> 0.006378301, 0.005440990, 0.006068154, 0.00471006…
## $ or                   <dbl> 0.011456172, 0.009567791, 0.006545127, 0.00515864…
## $ s                    <dbl> 0.009133975, 0.009088788, 0.012361545, 0.01131795…
## $ said                 <dbl> 0.005759049, 0.008450116, 0.006412634, 0.00691844…
## $ she                  <dbl> 0.019475493, 0.017244132, 0.031321215, 0.02950259…
## $ so                   <dbl> 0.013282967, 0.010452106, 0.012904764, 0.01016200…
## $ that                 <dbl> 0.02461529, 0.02393790, 0.02408713, 0.02720795, 0…
## $ the                  <dbl> 0.07784005, 0.07288225, 0.06894907, 0.07470541, 0…
## $ they                 <dbl> 0.008700498, 0.003868876, 0.007167841, 0.01036904…
## $ this                 <dbl> 0.005604236, 0.006939412, 0.006969103, 0.00772933…
## $ to                   <dbl> 0.07387683, 0.06963976, 0.06945254, 0.07178965, 0…
## $ was                  <dbl> 0.03096263, 0.02253774, 0.03179819, 0.03186625, 0…
## $ what                 <dbl> 0.006595040, 0.006914848, 0.007114844, 0.00826417…
## $ when                 <dbl> 0.006471189, 0.005367298, 0.004822725, 0.00643536…
## $ which                <dbl> 0.005820974, 0.003205640, 0.007366580, 0.00929935…
## $ will                 <dbl> 0.005820974, 0.007258748, 0.007591817, 0.00709097…
## $ with                 <dbl> 0.01876335, 0.01882853, 0.01613758, 0.01815014, 0…
## $ would                <dbl> 0.011672911, 0.008732605, 0.010877630, 0.00812615…
## $ you                  <dbl> 0.024057962, 0.035827018, 0.026485240, 0.02341229…
## $ your                 <dbl> 0.004148992, 0.007492109, 0.004862473, 0.00790186…

Разведывательный анализ

Лексическое разнообразие

ggplot(
  features,
  aes(
    x = authorID,
    y = ttr,
    fill = authorID
  )
) +
  geom_boxplot() +
  theme(
    legend.position = "none"
  )

Размер словаря

ggplot(
  features,
  aes(
    x = reorder(authorID,
                vocab_size,
                mean),
    y = vocab_size,
    fill = authorID
  )
) +
  geom_boxplot() +
  coord_flip() +
  theme(
    legend.position = "none"
  )

TTR и длина предложения

ggplot(
  features,
  aes(
    ttr,
    mean_sentence_length,
    color = authorID
  )
) +
  geom_point(size = 3)

PCA признаков

pca_data <- features |>
  select(where(is.numeric))

pca_fit <- prcomp(
  pca_data,
  scale. = TRUE
)

pca_df <- as.data.frame(
  pca_fit$x[,1:2]
)

pca_df$authorID <- features$authorID

ggplot(
  pca_df,
  aes(
    PC1,
    PC2,
    color = authorID
  )
) +
  geom_point(size = 3) +
  labs(
    title = "PCA стилометрических признаков"
  )

Классификация авторов

Разделение выборки

split <- initial_split(
  features,
  prop = 0.75,
  strata = authorID
)

train_data <- training(split)
test_data <- testing(split)

nrow(train_data)

## [1] 18

nrow(test_data)

## [1] 9

Recipe

rec <- recipe(
  authorID ~ .,
  data = train_data
) |>
  step_rm(doc_id,
          title) |>
  step_normalize(
    all_numeric_predictors()
  )

Random Forest

rf_model <- rand_forest(
  trees = 300
) |>
  set_engine(
    "ranger",
    importance = "permutation"
  ) |>
  set_mode("classification")

Random Forest Workflow

rf_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(rf_model)

Multinomial Logistic Regression

log_model <- multinom_reg() |>
  set_engine("nnet") |>
  set_mode("classification")

log_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(log_model)

Кросс-валидация

folds <- vfold_cv(
  train_data,
  v = 3,
  strata = authorID
)

rf_cv <- fit_resamples(
  rf_wf,
  folds,
  metrics = metric_set(
    accuracy,
    kap
  )
)

log_cv <- fit_resamples(
  log_wf,
  folds,
  metrics = metric_set(
    accuracy,
    kap
  )
)

Сравнение моделей

rf_metrics <- collect_metrics(rf_cv) |>
  mutate(model = "Random Forest")

log_metrics <- collect_metrics(log_cv) |>
  mutate(model = "Multinomial Logistic")

bind_rows(
  rf_metrics,
  log_metrics
) |>
  select(
    model,
    .metric,
    mean,
    std_err
  )

## # A tibble: 4 × 4
##   model                .metric   mean std_err
##   <chr>                <chr>    <dbl>   <dbl>
## 1 Random Forest        accuracy 0.222  0.147 
## 2 Random Forest        kap      0.150  0.0976
## 3 Multinomial Logistic accuracy 0.819  0.108 
## 4 Multinomial Logistic kap      0.782  0.131

Финальная модель

final_model <- fit(
  rf_wf,
  train_data
)

pred <- predict(
  final_model,
  test_data
) |>
  bind_cols(test_data)

acc <- accuracy(
  pred,
  truth = authorID,
  estimate = .pred_class
)

kap_score <- kap(
  pred,
  truth = authorID,
  estimate = .pred_class
)

acc

## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.222

kap_score

## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 kap     multiclass     0.192

Матрица ошибок

pred |>
  conf_mat(
    truth = authorID,
    estimate = .pred_class
  ) |>
  autoplot(type = "heatmap")

# Важность признаков

rf_fit <- extract_fit_parsnip(
  final_model
)

vip(
  rf_fit$fit,
  num_features = 15
)

Интерпретация результатов

cat(
  "Количество произведений:",
  nrow(features),
  "\n"
)

## Количество произведений: 27

cat(
  "Количество авторов:",
  n_distinct(features$authorID),
  "\n"
)

## Количество авторов: 11

cat(
  "Accuracy:",
  round(acc$.estimate * 100, 2),
  "%\n"
)

## Accuracy: 22.22 %

cat(
  "Cohen Kappa:",
  round(kap_score$.estimate, 3),
  "\n"
)

## Cohen Kappa: 0.192

cat(
  "\nНаиболее частотные слова корпуса:\n"
)

## 
## Наиболее частотные слова корпуса:

print(top_words[1:15])

##  [1] "the"  "and"  "to"   "of"   "i"    "a"    "in"   "that" "he"   "it"  
## [11] "you"  "was"  "her"  "his"  "as"

График выше показывает наиболее важные признаки для классификации авторов. Наибольший вклад вносят частоты наиболее употребительных слов, показатели лексического разнообразия (TTR), длина слов и длина предложений. Эти признаки традиционно считаются устойчивыми маркерами авторского стиля.

Выводы

Был загружен корпус из 27 произведений британской художественной прозы XVIII–XIX веков.
Для каждого текста были рассчитаны стилометрические признаки: лексическое разнообразие, средняя длина слов, средняя длина предложений и частоты наиболее употребительных слов.
Разведывательный анализ показал различия между авторами по размеру словаря и показателю TTR.
Для снижения размерности и визуального анализа была использована PCA.
Были построены две модели классификации: Random Forest и Multinomial Logistic Regression.
Сравнение по кросс-валидации показало, какая модель лучше справляется с задачей атрибуции авторства.
Наиболее важными предикторами оказались частоты распространённых слов корпуса, а также показатели лексического разнообразия.
Полученные результаты подтверждают, что количественные характеристики текста позволяют успешно различать авторские стили и использовать их для автоматической атрибуции произведений.

Stylometric Analysis of British Fiction

Aleksandr Tarasov

2026-05-31