library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.5.0 ──
## ✔ broom        1.0.13     ✔ rsample      1.3.2 
## ✔ dials        1.4.3      ✔ tailor       0.1.0 
## ✔ infer        1.1.0      ✔ tune         2.1.0 
## ✔ modeldata    1.5.1      ✔ workflows    1.3.0 
## ✔ parsnip      1.6.0      ✔ workflowsets 1.1.1 
## ✔ recipes      1.3.2      ✔ yardstick    1.4.0 
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
library(textrecipes)
library(tidytext)
library(stylo)
## 
## ### stylo version: 0.7.7 ###
## 
## If you plan to cite this software (please do!), use the following reference:
##     Eder, M., Rybicki, J. and Kestemont, M. (2016). Stylometry with R:
##     a package for computational text analysis. R Journal 8(1): 107-121.
##     <https://journal.r-project.org/archive/2016/RJ-2016-007/index.html>
## 
## To get full BibTeX entry, type: citation("stylo")
library(embed)
library(baguette)
library(discrim)
## 
## Attaching package: 'discrim'
## 
## The following object is masked from 'package:dials':
## 
##     smoothness
library(ggrepel)
library(learntidymodels)



corpus <- load.corpus.and.parse(corpus.dir = "british_fiction") 
## slicing input text into tokens...
## 
## 
## turning words into features, e.g. char n-grams (if applicable)...
#прочитать метаданные 
  
#разделяем тексты на отрывки длиной в 2000 слов
corpus_samples <- make.samples(corpus,
                               sample.size = 2000,
                               sampling = "normal.sampling",
                               sample.overlap = 0,
                               sampling.with.replacement = FALSE)
## ABronte_Agnes
##  - text length (in words): 69283
##  - nr. of samples: 34
##  - nr. of words dropped at the end of the text: 1283
## ABronte_Tenant
##  - text length (in words): 169798
##  - nr. of samples: 84
##  - nr. of words dropped at the end of the text: 1798
## Austen_Emma
##  - text length (in words): 161973
##  - nr. of samples: 80
##  - nr. of words dropped at the end of the text: 1973
## Austen_Pride
##  - text length (in words): 122726
##  - nr. of samples: 61
##  - nr. of words dropped at the end of the text: 726
## Austen_Sense
##  - text length (in words): 120736
##  - nr. of samples: 60
##  - nr. of words dropped at the end of the text: 736
## CBronte_Jane
##  - text length (in words): 189219
##  - nr. of samples: 94
##  - nr. of words dropped at the end of the text: 1219
## CBronte_Professor
##  - text length (in words): 89942
##  - nr. of samples: 44
##  - nr. of words dropped at the end of the text: 1942
## CBronte_Villette
##  - text length (in words): 196737
##  - nr. of samples: 98
##  - nr. of words dropped at the end of the text: 737
## Dickens_Bleak
##  - text length (in words): 362023
##  - nr. of samples: 181
##  - nr. of words dropped at the end of the text: 23
## Dickens_David
##  - text length (in words): 363815
##  - nr. of samples: 181
##  - nr. of words dropped at the end of the text: 1815
## Dickens_Hard
##  - text length (in words): 105607
##  - nr. of samples: 52
##  - nr. of words dropped at the end of the text: 1607
## EBronte_Wuthering
##  - text length (in words): 119399
##  - nr. of samples: 59
##  - nr. of words dropped at the end of the text: 1399
## Eliot_Adam
##  - text length (in words): 222833
##  - nr. of samples: 111
##  - nr. of words dropped at the end of the text: 833
## Eliot_Middlemarch
##  - text length (in words): 323599
##  - nr. of samples: 161
##  - nr. of words dropped at the end of the text: 1599
## Eliot_Mill
##  - text length (in words): 214189
##  - nr. of samples: 107
##  - nr. of words dropped at the end of the text: 189
## Fielding_Joseph
##  - text length (in words): 138056
##  - nr. of samples: 69
##  - nr. of words dropped at the end of the text: 56
## Fielding_Tom
##  - text length (in words): 350626
##  - nr. of samples: 175
##  - nr. of words dropped at the end of the text: 626
## Richardson_Clarissa
##  - text length (in words): 975536
##  - nr. of samples: 487
##  - nr. of words dropped at the end of the text: 1536
## Richardson_Pamela
##  - text length (in words): 444070
##  - nr. of samples: 222
##  - nr. of words dropped at the end of the text: 70
## Sterne_Sentimental
##  - text length (in words): 41350
##  - nr. of samples: 20
##  - nr. of words dropped at the end of the text: 1350
## Sterne_Tristram
##  - text length (in words): 190500
##  - nr. of samples: 95
##  - nr. of words dropped at the end of the text: 500
## Thackeray_Barry
##  - text length (in words): 129414
##  - nr. of samples: 64
##  - nr. of words dropped at the end of the text: 1414
## Thackeray_Pendennis
##  - text length (in words): 364714
##  - nr. of samples: 182
##  - nr. of words dropped at the end of the text: 714
## Thackeray_Vanity
##  - text length (in words): 311459
##  - nr. of samples: 155
##  - nr. of words dropped at the end of the text: 1459
## Trollope_Barchester
##  - text length (in words): 200730
##  - nr. of samples: 100
##  - nr. of words dropped at the end of the text: 730
## Trollope_Phineas
##  - text length (in words): 266991
##  - nr. of samples: 133
##  - nr. of words dropped at the end of the text: 991
## Trollope_Prime
##  - text length (in words): 288061
##  - nr. of samples: 144
##  - nr. of words dropped at the end of the text: 61
#perform.culling(culling.level = 20) #почитать про аргумент и подумать добавлять или нет

# приводим к нижнему регистру, убираем 's
corpus_clean <- map(corpus_samples, function(text) {
  str_to_lower(str_remove_all(text, "[''`]s\\b"))
})

#подумать сотавлять или нет нижний регистр, убирать ли имена потому что имена по идее убирает perform.culling

# 27.2 подготовка датасета

# 500 самых частотных слов
mfw <- make.frequency.list(corpus_clean)[1:500]

# частотная матрица
corpus_tf <- stylo::make.table.of.frequencies(corpus_clean, mfw) |>
  as.data.frame.matrix() |>
  rownames_to_column("id") |>
  as_tibble()
## processing  3253  text samples
## .....................................................................................................................................................................................................................................................................................................................................
## combining frequencies into a table...
# разбираем id вида Author_Book_sampleN на части
corpus_tf <- corpus_tf |>
  separate(id, into = c("author", "book", "sample"), sep = "_",
           extra = "merge", fill = "right", remove = FALSE)

corpus_tf |> 
  select(id, author, book, sample) |>
  head()
## # A tibble: 6 × 4
##   id              author  book  sample
##   <chr>           <chr>   <chr> <chr> 
## 1 ABronte_Agnes_1 ABronte Agnes 1     
## 2 ABronte_Agnes_2 ABronte Agnes 2     
## 3 ABronte_Agnes_3 ABronte Agnes 3     
## 4 ABronte_Agnes_4 ABronte Agnes 4     
## 5 ABronte_Agnes_5 ABronte Agnes 5     
## 6 ABronte_Agnes_6 ABronte Agnes 6
#сколько отрывков для каждого автора в корпусе
corpus_tf |> 
  count(author) |> 
  ggplot(aes(reorder(author, n), n, fill = author)) +
  geom_col(show.legend = FALSE) +
  xlab(NULL) +
  ylab(NULL) +
  scale_fill_viridis_d() + 
  theme_light() +
  coord_flip()

#удаляем авторов, у которых не так много отрывков.
corpus_top <- corpus_tf |>
  add_count(author) |>
  filter(n > 100) |>
  select(-n, -id, -book, -sample) |>
  mutate(author = as.factor(author))

#визуализиуем
corpus_top |> 
  count(author) |> 
  ggplot(aes(reorder(author, n), n, fill = author)) +
  geom_col(show.legend = FALSE) +
  xlab(NULL) +
  ylab(NULL) +
  scale_fill_viridis_d() + 
  theme_light() +
  coord_flip()

#обучающая и тестовые выборки 

set.seed(30052026)
data_split <- corpus_top |>
  initial_split(strata = author)

data_train <- training(data_split)
data_test  <- testing(data_split)

# folds
set.seed(30052026)
folds <- vfold_cv(data_train, strata = author, v = 5)
folds
## #  5-fold cross-validation using stratification 
## # A tibble: 5 × 2
##   splits             id   
##   <list>             <chr>
## 1 <split [1912/481]> Fold1
## 2 <split [1914/479]> Fold2
## 3 <split [1915/478]> Fold3
## 4 <split [1915/478]> Fold4
## 5 <split [1916/477]> Fold5
#при prep и bake подумать над количеством компонент (зависит от количества авторов в корпусе) num_comp =7 /8/10

base_rec <- recipe(author ~ ., data = data_train) |>
  step_zv(all_predictors()) |> 
  step_normalize(all_predictors())
base_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs 
## Number of variables by role
## outcome:     1
## predictor: 499
## 
## ── Operations 
## • Zero variance filter on: all_predictors()
## • Centering and scaling for: all_predictors()
#рецепт, в котором используем главные компоненты в качестве предикторов
pca_rec <- base_rec |> 
  step_pca(all_predictors(), num_comp = 11)
pca_rec
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs 
## Number of variables by role
## outcome:     1
## predictor: 499
## 
## ── Operations 
## • Zero variance filter on: all_predictors()
## • Centering and scaling for: all_predictors()
## • PCA extraction with: all_predictors()
base_trained <- base_rec |>
  prep(data_train) 
base_trained
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs 
## Number of variables by role
## outcome:     1
## predictor: 499
## 
## ── Training information 
## Training data contained 2393 data points and no incomplete rows.
## 
## ── Operations 
## • Zero variance filter removed: <none> | Trained
## • Centering and scaling for: the, and, to, of, i, a, in, that, ... | Trained
base_trained |> 
  bake(new_data = NULL)
## # A tibble: 2,393 × 500
##        the     and      to       of       i      a    `in`    that      he
##      <dbl>   <dbl>   <dbl>    <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
##  1  0.0968  2.03   -0.154   0.807   -0.0322  1.01  -0.0774 -1.61   -1.24  
##  2 -1.33   -0.143   0.761  -2.22     2.33   -0.635 -2.27   -0.693   1.48  
##  3 -0.975  -0.0153  0.339  -0.745    0.0406 -0.828 -0.335  -0.287   0.551 
##  4 -0.499   1.84    0.550  -0.0801   0.877   0.620 -1.11    0.119  -1.51  
##  5 -0.0620  0.561  -0.0132  0.142    2.01   -0.152 -2.01   -0.896   1.21  
##  6  1.37    3.50   -1.21    0.216   -0.796   1.78  -0.851  -1.61   -0.910 
##  7  0.256   1.01    2.17   -0.302    0.332  -0.828 -1.11    0.423  -0.578 
##  8 -0.975   0.625  -0.576  -1.26     1.46   -0.924 -0.980  -0.592  -0.246 
##  9 -0.657   0.497  -0.0836 -1.26     1.86   -1.70  -0.464  -0.693  -1.44  
## 10 -0.896   1.52    0.339  -0.00618  0.695  -1.21  -0.722  -0.0843 -0.0472
## # ℹ 2,383 more rows
## # ℹ 491 more variables: it <dbl>, you <dbl>, was <dbl>, her <dbl>, his <dbl>,
## #   as <dbl>, my <dbl>, `for` <dbl>, not <dbl>, she <dbl>, with <dbl>,
## #   had <dbl>, be <dbl>, but <dbl>, have <dbl>, me <dbl>, is <dbl>, at <dbl>,
## #   s <dbl>, him <dbl>, so <dbl>, on <dbl>, said <dbl>, this <dbl>,
## #   which <dbl>, by <dbl>, all <dbl>, mr <dbl>, would <dbl>, `if` <dbl>,
## #   from <dbl>, will <dbl>, what <dbl>, your <dbl>, no <dbl>, or <dbl>, …
#методы снижения размерности
pca_trained <- pca_rec |>
  prep(data_train)

pca_trained |>
  juice()
## # A tibble: 2,393 × 12
##    author    PC01   PC02   PC03   PC04   PC05  PC06   PC07    PC08   PC09   PC10
##    <fct>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <dbl>  <dbl>   <dbl>  <dbl>  <dbl>
##  1 ABron… -0.589   2.64   0.824 -4.58   0.215  1.68 -3.27   0.748   1.28  -1.28 
##  2 ABron… -0.131  -8.21  -1.54   1.76   1.56   1.63 -1.41  -4.37    1.63  -1.84 
##  3 ABron…  0.499  -1.29   3.34  -2.04  -1.31   2.26 -2.80  -0.123   2.40  -1.19 
##  4 ABron… -0.0798 -0.246 -0.665 -2.52  -3.29   3.39 -2.45   0.856   0.632 -1.52 
##  5 ABron… -1.54   -4.47   1.27  -1.28  -1.67   2.55  2.08  -0.729   2.94  -1.44 
##  6 ABron… -5.44    5.43  -3.54  -2.82   1.71   1.11 -0.915  2.08    1.27  -3.79 
##  7 ABron…  4.04   -1.13   0.813  1.20   2.90   2.12 -1.20   0.605   0.933 -0.974
##  8 ABron…  0.634  -7.52  -0.844 -0.482 -0.850  1.95  1.18   1.59   -2.49  -1.09 
##  9 ABron… -0.0939 -6.90  -2.15  -7.67   0.208  1.26  3.16   1.91   -0.765 -2.69 
## 10 ABron…  2.52   -1.71   1.80  -1.14  -0.112  3.17  0.132 -0.0150  0.773 -3.26 
## # ℹ 2,383 more rows
## # ℹ 1 more variable: PC11 <dbl>
pca_trained |> juice() |> names()
##  [1] "author" "PC01"   "PC02"   "PC03"   "PC04"   "PC05"   "PC06"   "PC07"  
##  [9] "PC08"   "PC09"   "PC10"   "PC11"
#визуализация
pca_trained |>
  juice() |>
  ggplot(aes(PC01, PC02, color = author)) +
  geom_point(alpha = 0.7, size = 2) +
  scale_color_brewer(palette = "Paired") +   
  theme_light()

#нагрузки компонент
pca_trained |> 
  plot_top_loadings(component_number <= 4, n = 10) +
  scale_fill_brewer(palette = "Paired") +
  theme_light()

# install.packages("BiocManager")
# BiocManager::install("mixOmics")
pls_trained <- base_trained |> 
  step_pls(all_numeric_predictors(), outcome = "author", num_comp = 11) |> 
  prep() 

pls_trained |> 
  juice()
## # A tibble: 2,393 × 12
##    author       PLS01  PLS02   PLS03  PLS04  PLS05   PLS06  PLS07  PLS08  PLS09
##    <fct>        <dbl>  <dbl>   <dbl>  <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
##  1 ABronte  0.255      0.585 -0.320  -3.74  -1.88  -2.07    3.23  0.830   0.442
##  2 ABronte  0.570      2.52  -5.13    0.134  1.46  -0.0164  0.786 0.0879 -6.46 
##  3 ABronte -0.0750     3.79  -0.0875 -1.92  -3.46  -1.28    2.39  0.103  -3.16 
##  4 ABronte -0.0000960 -0.263 -0.858  -2.58  -3.85  -2.21    2.38  0.316  -4.08 
##  5 ABronte  1.99       2.76  -2.17   -1.91  -2.42   2.28    1.16  0.722  -4.73 
##  6 ABronte  4.21      -4.98   0.0952 -3.13  -0.582 -0.628   3.75  3.63   -0.527
##  7 ABronte -3.20       1.56  -0.945  -0.986  1.12  -0.369   1.06  0.111  -4.21 
##  8 ABronte  0.0787     2.23  -4.98   -0.644 -2.15   0.284  -1.45  1.98   -4.51 
##  9 ABronte  0.244      1.96  -6.37   -4.69  -2.97   2.38    1.89  3.59   -4.49 
## 10 ABronte -1.69       2.14  -0.318  -2.24  -1.32  -0.145   0.204 1.27   -3.73 
## # ℹ 2,383 more rows
## # ℹ 2 more variables: PLS10 <dbl>, PLS11 <dbl>
#визуализация
pls_trained |> 
  juice() |> 
  ggplot(aes(PLS01, PLS02, color = author)) +
  geom_point() +
  theme_light()

#нагрузки компонент
pls_trained |> 
  plot_top_loadings(component_number <= 4, n = 10, type = "pls") +
  scale_fill_brewer(palette = "Paired") +
  theme_light()

library(embed)
base_trained |> 
  step_umap(all_numeric_predictors(), outcome = "author", num_comp = 4) |> 
  prep() |> 
  juice() |> 
  ggplot(aes(UMAP1, UMAP2, color = author)) +
  geom_point(alpha = 0.5) +
  theme_light()

pls_rec <- base_rec |> 
  step_pls(all_numeric_predictors(), outcome = "author", num_comp = tune())

umap_rec <- base_rec |> 
  step_umap(all_numeric_predictors(),
            outcome = "author",
            num_comp = tune(),
            neighbors = tune(),
            min_dist = tune())

lasso_spec <- multinom_reg(penalty = tune(), mixture = 1) |> 
  set_mode("classification") |> 
  set_engine("glmnet")

ridge_spec <- multinom_reg(penalty = tune(), mixture = 0) |> 
  set_mode("classification") |> 
  set_engine("glmnet")

svm_spec <- svm_linear(cost = tune()) |> 
  set_mode("classification") |> 
  set_engine("LiblineaR")
svm_spec
## Linear Support Vector Machine Model Specification (classification)
## 
## Main Arguments:
##   cost = tune()
## 
## Computational engine: LiblineaR
mlp_spec <- mlp(hidden_units = tune(),
                penalty = tune(),
                epochs = tune()) |> 
  set_engine("nnet") |> 
  set_mode("classification")

bagging_spec <- bag_tree() |> 
  set_engine("rpart") |> 
  set_mode("classification")

fda_spec <- discrim_flexible(prod_degree = tune()) |> 
  set_engine("earth")

rda_spec <- discrim_regularized(frac_common_cov = tune(), 
                                frac_identity = tune())  |> 
  set_engine('klaR')

# devtools::install_github("KlausVigo/kknn")

knn_spec <- nearest_neighbor(neighbors = 5) |> 
  set_engine("kknn") |> 
  set_mode("classification")

library(baguette) # для бэггинга
library(discrim) # для fda_spec и rda_spec

wflow_set <- workflow_set(
  preproc = list(base = base_rec, pca = pca_rec),
  models  = list(ridge = ridge_spec, lasso = lasso_spec, svm = svm_spec),
  cross = TRUE
)
wflow_set
## # A workflow set/tibble: 6 × 4
##   wflow_id   info             option    result    
##   <chr>      <list>           <list>    <list>    
## 1 base_ridge <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 base_lasso <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 base_svm   <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 pca_ridge  <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 pca_lasso  <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 pca_svm    <tibble [1 × 4]> <opts[0]> <list [0]>
pca_rec <- base_rec |> step_pca(all_predictors(), num_comp = tune())

library(future)
plan(multisession, workers = 6)

plan(sequential)

train_res <- wflow_set |>
  workflow_map(
    verbose = TRUE,
    seed = 180525,
    resamples = folds,
    grid = 3,
    metrics = metric_set(f_meas, accuracy),
    control = control_resamples(save_pred = TRUE)
  )
## i 1 of 6 tuning:     base_ridge
## ✔ 1 of 6 tuning:     base_ridge (13.3s)
## i 2 of 6 tuning:     base_lasso
## → A | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 20, 'Austen': 28, 'CBronte': 37, 'Dickens': 58, 'Eliot': 65,
##                'Fielding': 38, 'Sterne': 17, 'Thackeray': 56, 'Trollope': 54
## There were issues with some computations   A: x1                                                 → B | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 20, 'Austen': 28, 'CBronte': 31, 'Dickens': 63, 'Eliot': 55,
##                'Fielding': 40, 'Sterne': 15, 'Thackeray': 63, 'Trollope': 61
## There were issues with some computations   A: x1There were issues with some computations   A: x1   B: x1                                                         → C | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 10, 'Austen': 32, 'CBronte': 28, 'Dickens': 62, 'Eliot': 64,
##                'Fielding': 31, 'Sterne': 17, 'Thackeray': 65, 'Trollope': 62
## There were issues with some computations   A: x1   B: x1There were issues with some computations   A: x1   B: x1   C: x1                                                                 → D | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 15, 'Austen': 36, 'CBronte': 39, 'Dickens': 64, 'Eliot': 52,
##                'Fielding': 30, 'Sterne': 20, 'Thackeray': 58, 'Trollope': 57
## There were issues with some computations   A: x1   B: x1   C: x1There were issues with some computations   A: x1   B: x1   C: x1   D: x1                                                                         → E | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 19, 'Austen': 25, 'CBronte': 38, 'Dickens': 59, 'Eliot': 51,
##                'Fielding': 39, 'Sterne': 19, 'Thackeray': 66, 'Trollope': 51
## There were issues with some computations   A: x1   B: x1   C: x1   D: x1There were issues with some computations   A: x1   B: x1   C: x1   D: x1   E: x1There were issues with some computations   A: x1   B: x1   C: x1   D: x1   E: x1
## ✔ 2 of 6 tuning:     base_lasso (6.7s)
## i 3 of 6 tuning:     base_svm
## ✔ 3 of 6 tuning:     base_svm (6.7s)
## i 4 of 6 tuning:     pca_ridge
## → A | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 20, 'Austen': 28, 'CBronte': 37, 'Sterne': 17
## There were issues with some computations   A: x1                                                 → B | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 20, 'Austen': 28, 'CBronte': 31, 'Sterne': 15
## There were issues with some computations   A: x1There were issues with some computations   A: x1   B: x1                                                         → C | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 10, 'Austen': 32, 'CBronte': 28, 'Sterne': 17
## There were issues with some computations   A: x1   B: x1There were issues with some computations   A: x1   B: x1   C: x1                                                                 → D | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 15
## There were issues with some computations   A: x1   B: x1   C: x1                                                                 → E | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 15, 'Austen': 36, 'CBronte': 39, 'Sterne': 20
## There were issues with some computations   A: x1   B: x1   C: x1There were issues with some computations   A: x1   B: x1   C: x1   D: x2   E: x1                                                                                 → F | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 19
## There were issues with some computations   A: x1   B: x1   C: x1   D: x2   E: x1                                                                                 → G | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 19, 'Austen': 25, 'CBronte': 38, 'Sterne': 19
## There were issues with some computations   A: x1   B: x1   C: x1   D: x2   E: x1There were issues with some computations   A: x1   B: x1   C: x1   D: x2   E: x…There were issues with some computations   A: x1   B: x1   C: x1   D: x2   E: x…
## ✔ 4 of 6 tuning:     pca_ridge (1.6s)
## i 5 of 6 tuning:     pca_lasso
## → A | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 20, 'Austen': 28, 'CBronte': 37, 'Dickens': 58, 'Eliot': 65,
##                'Fielding': 38, 'Sterne': 17, 'Thackeray': 56, 'Trollope': 54
## There were issues with some computations   A: x1                                                 → B | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 20, 'Austen': 28, 'CBronte': 31, 'Dickens': 63, 'Eliot': 55,
##                'Fielding': 40, 'Sterne': 15, 'Thackeray': 63, 'Trollope': 61
## There were issues with some computations   A: x1There were issues with some computations   A: x1   B: x1                                                         → C | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 10, 'Austen': 32, 'CBronte': 28, 'Dickens': 62, 'Eliot': 64,
##                'Fielding': 31, 'Sterne': 17, 'Thackeray': 65, 'Trollope': 62
## There were issues with some computations   A: x1   B: x1There were issues with some computations   A: x1   B: x1   C: x1                                                                 → D | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 15, 'Austen': 36, 'CBronte': 39, 'Dickens': 64, 'Eliot': 52,
##                'Fielding': 30, 'Sterne': 20, 'Thackeray': 58, 'Trollope': 57
## There were issues with some computations   A: x1   B: x1   C: x1There were issues with some computations   A: x1   B: x1   C: x1   D: x1                                                                         → E | warning: While computing multiclass `precision()`, some levels had no predicted events
##                (i.e. `true_positive + false_positive = 0`).
##                Precision is undefined in this case, and those levels will be removed from the
##                averaged result.
##                Note that the following number of true events actually occurred for each
##                problematic event level:
##                'ABronte': 19, 'Austen': 25, 'CBronte': 38, 'Dickens': 59, 'Eliot': 51,
##                'Fielding': 39, 'Sterne': 19, 'Thackeray': 66, 'Trollope': 51
## There were issues with some computations   A: x1   B: x1   C: x1   D: x1There were issues with some computations   A: x1   B: x1   C: x1   D: x1   E: x1There were issues with some computations   A: x1   B: x1   C: x1   D: x1   E: x1
## ✔ 5 of 6 tuning:     pca_lasso (2s)
## i 6 of 6 tuning:     pca_svm
## ✔ 6 of 6 tuning:     pca_svm (3.3s)
plan(sequential)

autoplot(train_res, metric = "accuracy") + 
  theme_light() +
  theme(legend.position = "none") +
  geom_text(aes(y = (mean - 2*std_err), label = wflow_id),
            angle = 90, hjust = 1.5) +
  coord_cartesian(ylim = c(-0.3, NA))

rank_results(train_res, select_best = TRUE) |> 
  print()
## # A tibble: 12 × 9
##    wflow_id   .config       .metric  mean std_err     n preprocessor model  rank
##    <chr>      <chr>         <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
##  1 base_ridge pre0_mod1_po… accura… 0.998 6.63e-4     5 recipe       mult…     1
##  2 base_ridge pre0_mod1_po… f_meas  0.997 1.06e-3     5 recipe       mult…     1
##  3 base_svm   pre0_mod1_po… accura… 0.995 5.17e-4     5 recipe       svm_…     2
##  4 base_svm   pre0_mod1_po… f_meas  0.994 1.04e-3     5 recipe       svm_…     2
##  5 base_lasso pre0_mod1_po… accura… 0.990 2.03e-3     5 recipe       mult…     3
##  6 base_lasso pre0_mod1_po… f_meas  0.988 2.37e-3     5 recipe       mult…     3
##  7 pca_lasso  pre0_mod1_po… accura… 0.957 3.55e-3     5 recipe       mult…     4
##  8 pca_lasso  pre0_mod1_po… f_meas  0.940 4.44e-3     5 recipe       mult…     4
##  9 pca_svm    pre0_mod2_po… accura… 0.950 2.19e-3     5 recipe       svm_…     5
## 10 pca_svm    pre0_mod2_po… f_meas  0.917 6.97e-3     5 recipe       svm_…     5
## 11 pca_ridge  pre0_mod1_po… accura… 0.935 4.54e-3     5 recipe       mult…     6
## 12 pca_ridge  pre0_mod1_po… f_meas  0.903 2.12e-2     5 recipe       mult…     6
autoplot(train_res, id = "base_ridge") +
  theme_light()

best_results <- 
  train_res |> 
  extract_workflow_set_result("base_svm") |> 
  select_best(metric = "accuracy")

print(best_results)
## # A tibble: 1 × 2
##      cost .config        
##     <dbl> <chr>          
## 1 0.00101 pre0_mod1_post0
best_results <- train_res |> 
  extract_workflow_set_result("base_svm") |> 
  select_best(metric = "accuracy")

print(best_results)
## # A tibble: 1 × 2
##      cost .config        
##     <dbl> <chr>          
## 1 0.00101 pre0_mod1_post0
best_results <- train_res |>
  extract_workflow_set_result("base_ridge") |>
  select_best(metric = "accuracy")

ridge_res <- train_res |>
  extract_workflow("base_ridge") |>
  finalize_workflow(best_results) |>
  last_fit(split = data_split, metrics = metric_set(f_meas, accuracy, roc_auc))
collect_metrics(ridge_res) |> 
  print()
## # A tibble: 3 × 4
##   .metric  .estimator .estimate .config        
##   <chr>    <chr>          <dbl> <chr>          
## 1 f_meas   macro              1 pre0_mod0_post0
## 2 accuracy multiclass         1 pre0_mod0_post0
## 3 roc_auc  hand_till          1 pre0_mod0_post0
collect_predictions(ridge_res) |> 
  conf_mat(truth = author, estimate = .pred_class) |> 
  autoplot(type = "heatmap") +
  scale_fill_gradient(low = "white", high = "#233857") +
  theme(panel.grid.major = element_line(colour = "#233857"),
        axis.text = element_text(color = "#233857"),
        axis.title = element_text(color = "#233857"),
        plot.title = element_text(color = "#233857"),
        axis.text.x = element_text(angle = 90))
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

collect_predictions(ridge_res) |> 
  select(starts_with(".pred_")) |> 
  names() #можно оставить, а можно убрать
##  [1] ".pred_class"      ".pred_ABronte"    ".pred_Austen"     ".pred_CBronte"   
##  [5] ".pred_Dickens"    ".pred_Eliot"      ".pred_Fielding"   ".pred_Richardson"
##  [9] ".pred_Sterne"     ".pred_Thackeray"  ".pred_Trollope"
collect_predictions(ridge_res) |>
  roc_curve(truth = author, .pred_ABronte:.pred_Trollope) |>
  ggplot(aes(1 - specificity, sensitivity, color = .level)) +
  geom_abline(slope = 1, color = "gray50", lty = 2, alpha = 0.8) +
  geom_path(linewidth = 1.5, alpha = 0.7) +
  scale_color_brewer(palette = "Paired") +
  labs(color = NULL) +
  theme_light()

final_model <- extract_fit_parsnip(ridge_res)

top_terms <- tidy(final_model) |>
  filter(term != "(Intercept)") |>
  group_by(class) |>                           
  slice_max(abs(estimate), n = 10)  |>             
  ungroup()  |> 
  mutate(term = fct_reorder(term, abs(estimate)))
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 5.0
print(top_terms)
## # A tibble: 100 × 4
##    class   term    estimate  penalty
##    <chr>   <fct>      <dbl>    <dbl>
##  1 ABronte but       0.203  1.07e-10
##  2 ABronte and       0.149  1.07e-10
##  3 ABronte which    -0.117  1.07e-10
##  4 ABronte replied   0.113  1.07e-10
##  5 ABronte or        0.106  1.07e-10
##  6 ABronte in       -0.104  1.07e-10
##  7 ABronte out      -0.0913 1.07e-10
##  8 ABronte this     -0.0840 1.07e-10
##  9 ABronte who      -0.0817 1.07e-10
## 10 ABronte alone     0.0784 1.07e-10
## # ℹ 90 more rows
library(tidytext)

top_terms |>
  mutate(term = reorder_within(term, abs(estimate), class)) |>
  ggplot(aes(x = estimate, y = term, fill = class)) +
  geom_col(show.legend = FALSE, alpha = 0.85) +
  facet_wrap(~ class, scales = "free_y", nrow = 4) +
  scale_y_reordered() +
  scale_fill_viridis_d() +
  labs(title = "Наиболее важные признаки для каждого автора",
       x = "Коэффициент", y = "Признак") +
  theme_minimal()