library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.5.0 ──
## ✔ broom 1.0.13 ✔ rsample 1.3.2
## ✔ dials 1.4.3 ✔ tailor 0.1.0
## ✔ infer 1.1.0 ✔ tune 2.1.0
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.6.0 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.2 ✔ yardstick 1.4.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(textrecipes)
library(tidytext)
library(stylo)
##
## ### stylo version: 0.7.7 ###
##
## If you plan to cite this software (please do!), use the following reference:
## Eder, M., Rybicki, J. and Kestemont, M. (2016). Stylometry with R:
## a package for computational text analysis. R Journal 8(1): 107-121.
## <https://journal.r-project.org/archive/2016/RJ-2016-007/index.html>
##
## To get full BibTeX entry, type: citation("stylo")
library(embed)
library(baguette)
library(discrim)
##
## Attaching package: 'discrim'
##
## The following object is masked from 'package:dials':
##
## smoothness
library(ggrepel)
library(learntidymodels)
corpus <- load.corpus.and.parse(corpus.dir = "british_fiction")
## slicing input text into tokens...
##
##
## turning words into features, e.g. char n-grams (if applicable)...
#прочитать метаданные
#разделяем тексты на отрывки длиной в 2000 слов
corpus_samples <- make.samples(corpus,
sample.size = 2000,
sampling = "normal.sampling",
sample.overlap = 0,
sampling.with.replacement = FALSE)
## ABronte_Agnes
## - text length (in words): 69283
## - nr. of samples: 34
## - nr. of words dropped at the end of the text: 1283
## ABronte_Tenant
## - text length (in words): 169798
## - nr. of samples: 84
## - nr. of words dropped at the end of the text: 1798
## Austen_Emma
## - text length (in words): 161973
## - nr. of samples: 80
## - nr. of words dropped at the end of the text: 1973
## Austen_Pride
## - text length (in words): 122726
## - nr. of samples: 61
## - nr. of words dropped at the end of the text: 726
## Austen_Sense
## - text length (in words): 120736
## - nr. of samples: 60
## - nr. of words dropped at the end of the text: 736
## CBronte_Jane
## - text length (in words): 189219
## - nr. of samples: 94
## - nr. of words dropped at the end of the text: 1219
## CBronte_Professor
## - text length (in words): 89942
## - nr. of samples: 44
## - nr. of words dropped at the end of the text: 1942
## CBronte_Villette
## - text length (in words): 196737
## - nr. of samples: 98
## - nr. of words dropped at the end of the text: 737
## Dickens_Bleak
## - text length (in words): 362023
## - nr. of samples: 181
## - nr. of words dropped at the end of the text: 23
## Dickens_David
## - text length (in words): 363815
## - nr. of samples: 181
## - nr. of words dropped at the end of the text: 1815
## Dickens_Hard
## - text length (in words): 105607
## - nr. of samples: 52
## - nr. of words dropped at the end of the text: 1607
## EBronte_Wuthering
## - text length (in words): 119399
## - nr. of samples: 59
## - nr. of words dropped at the end of the text: 1399
## Eliot_Adam
## - text length (in words): 222833
## - nr. of samples: 111
## - nr. of words dropped at the end of the text: 833
## Eliot_Middlemarch
## - text length (in words): 323599
## - nr. of samples: 161
## - nr. of words dropped at the end of the text: 1599
## Eliot_Mill
## - text length (in words): 214189
## - nr. of samples: 107
## - nr. of words dropped at the end of the text: 189
## Fielding_Joseph
## - text length (in words): 138056
## - nr. of samples: 69
## - nr. of words dropped at the end of the text: 56
## Fielding_Tom
## - text length (in words): 350626
## - nr. of samples: 175
## - nr. of words dropped at the end of the text: 626
## Richardson_Clarissa
## - text length (in words): 975536
## - nr. of samples: 487
## - nr. of words dropped at the end of the text: 1536
## Richardson_Pamela
## - text length (in words): 444070
## - nr. of samples: 222
## - nr. of words dropped at the end of the text: 70
## Sterne_Sentimental
## - text length (in words): 41350
## - nr. of samples: 20
## - nr. of words dropped at the end of the text: 1350
## Sterne_Tristram
## - text length (in words): 190500
## - nr. of samples: 95
## - nr. of words dropped at the end of the text: 500
## Thackeray_Barry
## - text length (in words): 129414
## - nr. of samples: 64
## - nr. of words dropped at the end of the text: 1414
## Thackeray_Pendennis
## - text length (in words): 364714
## - nr. of samples: 182
## - nr. of words dropped at the end of the text: 714
## Thackeray_Vanity
## - text length (in words): 311459
## - nr. of samples: 155
## - nr. of words dropped at the end of the text: 1459
## Trollope_Barchester
## - text length (in words): 200730
## - nr. of samples: 100
## - nr. of words dropped at the end of the text: 730
## Trollope_Phineas
## - text length (in words): 266991
## - nr. of samples: 133
## - nr. of words dropped at the end of the text: 991
## Trollope_Prime
## - text length (in words): 288061
## - nr. of samples: 144
## - nr. of words dropped at the end of the text: 61
#perform.culling(culling.level = 20) #почитать про аргумент и подумать добавлять или нет
# приводим к нижнему регистру, убираем 's
corpus_clean <- map(corpus_samples, function(text) {
str_to_lower(str_remove_all(text, "[''`]s\\b"))
})
#подумать сотавлять или нет нижний регистр, убирать ли имена потому что имена по идее убирает perform.culling
# 27.2 подготовка датасета
# 500 самых частотных слов
mfw <- make.frequency.list(corpus_clean)[1:500]
# частотная матрица
corpus_tf <- stylo::make.table.of.frequencies(corpus_clean, mfw) |>
as.data.frame.matrix() |>
rownames_to_column("id") |>
as_tibble()
## processing 3253 text samples
## .....................................................................................................................................................................................................................................................................................................................................
## combining frequencies into a table...
# разбираем id вида Author_Book_sampleN на части
corpus_tf <- corpus_tf |>
separate(id, into = c("author", "book", "sample"), sep = "_",
extra = "merge", fill = "right", remove = FALSE)
corpus_tf |>
select(id, author, book, sample) |>
head()
## # A tibble: 6 × 4
## id author book sample
## <chr> <chr> <chr> <chr>
## 1 ABronte_Agnes_1 ABronte Agnes 1
## 2 ABronte_Agnes_2 ABronte Agnes 2
## 3 ABronte_Agnes_3 ABronte Agnes 3
## 4 ABronte_Agnes_4 ABronte Agnes 4
## 5 ABronte_Agnes_5 ABronte Agnes 5
## 6 ABronte_Agnes_6 ABronte Agnes 6
#сколько отрывков для каждого автора в корпусе
corpus_tf |>
count(author) |>
ggplot(aes(reorder(author, n), n, fill = author)) +
geom_col(show.legend = FALSE) +
xlab(NULL) +
ylab(NULL) +
scale_fill_viridis_d() +
theme_light() +
coord_flip()

#удаляем авторов, у которых не так много отрывков.
corpus_top <- corpus_tf |>
add_count(author) |>
filter(n > 100) |>
select(-n, -id, -book, -sample) |>
mutate(author = as.factor(author))
#визуализиуем
corpus_top |>
count(author) |>
ggplot(aes(reorder(author, n), n, fill = author)) +
geom_col(show.legend = FALSE) +
xlab(NULL) +
ylab(NULL) +
scale_fill_viridis_d() +
theme_light() +
coord_flip()

#обучающая и тестовые выборки
set.seed(30052026)
data_split <- corpus_top |>
initial_split(strata = author)
data_train <- training(data_split)
data_test <- testing(data_split)
# folds
set.seed(30052026)
folds <- vfold_cv(data_train, strata = author, v = 5)
folds
## # 5-fold cross-validation using stratification
## # A tibble: 5 × 2
## splits id
## <list> <chr>
## 1 <split [1912/481]> Fold1
## 2 <split [1914/479]> Fold2
## 3 <split [1915/478]> Fold3
## 4 <split [1915/478]> Fold4
## 5 <split [1916/477]> Fold5
#при prep и bake подумать над количеством компонент (зависит от количества авторов в корпусе) num_comp =7 /8/10
base_rec <- recipe(author ~ ., data = data_train) |>
step_zv(all_predictors()) |>
step_normalize(all_predictors())
base_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 499
##
## ── Operations
## • Zero variance filter on: all_predictors()
## • Centering and scaling for: all_predictors()
#рецепт, в котором используем главные компоненты в качестве предикторов
pca_rec <- base_rec |>
step_pca(all_predictors(), num_comp = 11)
pca_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 499
##
## ── Operations
## • Zero variance filter on: all_predictors()
## • Centering and scaling for: all_predictors()
## • PCA extraction with: all_predictors()
base_trained <- base_rec |>
prep(data_train)
base_trained
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 499
##
## ── Training information
## Training data contained 2393 data points and no incomplete rows.
##
## ── Operations
## • Zero variance filter removed: <none> | Trained
## • Centering and scaling for: the, and, to, of, i, a, in, that, ... | Trained
base_trained |>
bake(new_data = NULL)
## # A tibble: 2,393 × 500
## the and to of i a `in` that he
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0968 2.03 -0.154 0.807 -0.0322 1.01 -0.0774 -1.61 -1.24
## 2 -1.33 -0.143 0.761 -2.22 2.33 -0.635 -2.27 -0.693 1.48
## 3 -0.975 -0.0153 0.339 -0.745 0.0406 -0.828 -0.335 -0.287 0.551
## 4 -0.499 1.84 0.550 -0.0801 0.877 0.620 -1.11 0.119 -1.51
## 5 -0.0620 0.561 -0.0132 0.142 2.01 -0.152 -2.01 -0.896 1.21
## 6 1.37 3.50 -1.21 0.216 -0.796 1.78 -0.851 -1.61 -0.910
## 7 0.256 1.01 2.17 -0.302 0.332 -0.828 -1.11 0.423 -0.578
## 8 -0.975 0.625 -0.576 -1.26 1.46 -0.924 -0.980 -0.592 -0.246
## 9 -0.657 0.497 -0.0836 -1.26 1.86 -1.70 -0.464 -0.693 -1.44
## 10 -0.896 1.52 0.339 -0.00618 0.695 -1.21 -0.722 -0.0843 -0.0472
## # ℹ 2,383 more rows
## # ℹ 491 more variables: it <dbl>, you <dbl>, was <dbl>, her <dbl>, his <dbl>,
## # as <dbl>, my <dbl>, `for` <dbl>, not <dbl>, she <dbl>, with <dbl>,
## # had <dbl>, be <dbl>, but <dbl>, have <dbl>, me <dbl>, is <dbl>, at <dbl>,
## # s <dbl>, him <dbl>, so <dbl>, on <dbl>, said <dbl>, this <dbl>,
## # which <dbl>, by <dbl>, all <dbl>, mr <dbl>, would <dbl>, `if` <dbl>,
## # from <dbl>, will <dbl>, what <dbl>, your <dbl>, no <dbl>, or <dbl>, …
#методы снижения размерности
pca_trained <- pca_rec |>
prep(data_train)
pca_trained |>
juice()
## # A tibble: 2,393 × 12
## author PC01 PC02 PC03 PC04 PC05 PC06 PC07 PC08 PC09 PC10
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ABron… -0.589 2.64 0.824 -4.58 0.215 1.68 -3.27 0.748 1.28 -1.28
## 2 ABron… -0.131 -8.21 -1.54 1.76 1.56 1.63 -1.41 -4.37 1.63 -1.84
## 3 ABron… 0.499 -1.29 3.34 -2.04 -1.31 2.26 -2.80 -0.123 2.40 -1.19
## 4 ABron… -0.0798 -0.246 -0.665 -2.52 -3.29 3.39 -2.45 0.856 0.632 -1.52
## 5 ABron… -1.54 -4.47 1.27 -1.28 -1.67 2.55 2.08 -0.729 2.94 -1.44
## 6 ABron… -5.44 5.43 -3.54 -2.82 1.71 1.11 -0.915 2.08 1.27 -3.79
## 7 ABron… 4.04 -1.13 0.813 1.20 2.90 2.12 -1.20 0.605 0.933 -0.974
## 8 ABron… 0.634 -7.52 -0.844 -0.482 -0.850 1.95 1.18 1.59 -2.49 -1.09
## 9 ABron… -0.0939 -6.90 -2.15 -7.67 0.208 1.26 3.16 1.91 -0.765 -2.69
## 10 ABron… 2.52 -1.71 1.80 -1.14 -0.112 3.17 0.132 -0.0150 0.773 -3.26
## # ℹ 2,383 more rows
## # ℹ 1 more variable: PC11 <dbl>
pca_trained |> juice() |> names()
## [1] "author" "PC01" "PC02" "PC03" "PC04" "PC05" "PC06" "PC07"
## [9] "PC08" "PC09" "PC10" "PC11"
#визуализация
pca_trained |>
juice() |>
ggplot(aes(PC01, PC02, color = author)) +
geom_point(alpha = 0.7, size = 2) +
scale_color_brewer(palette = "Paired") +
theme_light()

#нагрузки компонент
pca_trained |>
plot_top_loadings(component_number <= 4, n = 10) +
scale_fill_brewer(palette = "Paired") +
theme_light()

# install.packages("BiocManager")
# BiocManager::install("mixOmics")
pls_trained <- base_trained |>
step_pls(all_numeric_predictors(), outcome = "author", num_comp = 11) |>
prep()
pls_trained |>
juice()
## # A tibble: 2,393 × 12
## author PLS01 PLS02 PLS03 PLS04 PLS05 PLS06 PLS07 PLS08 PLS09
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ABronte 0.255 0.585 -0.320 -3.74 -1.88 -2.07 3.23 0.830 0.442
## 2 ABronte 0.570 2.52 -5.13 0.134 1.46 -0.0164 0.786 0.0879 -6.46
## 3 ABronte -0.0750 3.79 -0.0875 -1.92 -3.46 -1.28 2.39 0.103 -3.16
## 4 ABronte -0.0000960 -0.263 -0.858 -2.58 -3.85 -2.21 2.38 0.316 -4.08
## 5 ABronte 1.99 2.76 -2.17 -1.91 -2.42 2.28 1.16 0.722 -4.73
## 6 ABronte 4.21 -4.98 0.0952 -3.13 -0.582 -0.628 3.75 3.63 -0.527
## 7 ABronte -3.20 1.56 -0.945 -0.986 1.12 -0.369 1.06 0.111 -4.21
## 8 ABronte 0.0787 2.23 -4.98 -0.644 -2.15 0.284 -1.45 1.98 -4.51
## 9 ABronte 0.244 1.96 -6.37 -4.69 -2.97 2.38 1.89 3.59 -4.49
## 10 ABronte -1.69 2.14 -0.318 -2.24 -1.32 -0.145 0.204 1.27 -3.73
## # ℹ 2,383 more rows
## # ℹ 2 more variables: PLS10 <dbl>, PLS11 <dbl>
#визуализация
pls_trained |>
juice() |>
ggplot(aes(PLS01, PLS02, color = author)) +
geom_point() +
theme_light()

#нагрузки компонент
pls_trained |>
plot_top_loadings(component_number <= 4, n = 10, type = "pls") +
scale_fill_brewer(palette = "Paired") +
theme_light()

library(embed)
base_trained |>
step_umap(all_numeric_predictors(), outcome = "author", num_comp = 4) |>
prep() |>
juice() |>
ggplot(aes(UMAP1, UMAP2, color = author)) +
geom_point(alpha = 0.5) +
theme_light()

pls_rec <- base_rec |>
step_pls(all_numeric_predictors(), outcome = "author", num_comp = tune())
umap_rec <- base_rec |>
step_umap(all_numeric_predictors(),
outcome = "author",
num_comp = tune(),
neighbors = tune(),
min_dist = tune())
lasso_spec <- multinom_reg(penalty = tune(), mixture = 1) |>
set_mode("classification") |>
set_engine("glmnet")
ridge_spec <- multinom_reg(penalty = tune(), mixture = 0) |>
set_mode("classification") |>
set_engine("glmnet")
svm_spec <- svm_linear(cost = tune()) |>
set_mode("classification") |>
set_engine("LiblineaR")
svm_spec
## Linear Support Vector Machine Model Specification (classification)
##
## Main Arguments:
## cost = tune()
##
## Computational engine: LiblineaR
mlp_spec <- mlp(hidden_units = tune(),
penalty = tune(),
epochs = tune()) |>
set_engine("nnet") |>
set_mode("classification")
bagging_spec <- bag_tree() |>
set_engine("rpart") |>
set_mode("classification")
fda_spec <- discrim_flexible(prod_degree = tune()) |>
set_engine("earth")
rda_spec <- discrim_regularized(frac_common_cov = tune(),
frac_identity = tune()) |>
set_engine('klaR')
# devtools::install_github("KlausVigo/kknn")
knn_spec <- nearest_neighbor(neighbors = 5) |>
set_engine("kknn") |>
set_mode("classification")
library(baguette) # для бэггинга
library(discrim) # для fda_spec и rda_spec
wflow_set <- workflow_set(
preproc = list(base = base_rec, pca = pca_rec),
models = list(ridge = ridge_spec, lasso = lasso_spec, svm = svm_spec),
cross = TRUE
)
wflow_set
## # A workflow set/tibble: 6 × 4
## wflow_id info option result
## <chr> <list> <list> <list>
## 1 base_ridge <tibble [1 × 4]> <opts[0]> <list [0]>
## 2 base_lasso <tibble [1 × 4]> <opts[0]> <list [0]>
## 3 base_svm <tibble [1 × 4]> <opts[0]> <list [0]>
## 4 pca_ridge <tibble [1 × 4]> <opts[0]> <list [0]>
## 5 pca_lasso <tibble [1 × 4]> <opts[0]> <list [0]>
## 6 pca_svm <tibble [1 × 4]> <opts[0]> <list [0]>
pca_rec <- base_rec |> step_pca(all_predictors(), num_comp = tune())
library(future)
plan(multisession, workers = 6)
plan(sequential)
train_res <- wflow_set |>
workflow_map(
verbose = TRUE,
seed = 180525,
resamples = folds,
grid = 3,
metrics = metric_set(f_meas, accuracy),
control = control_resamples(save_pred = TRUE)
)
## i 1 of 6 tuning: base_ridge
## ✔ 1 of 6 tuning: base_ridge (13.3s)
## i 2 of 6 tuning: base_lasso
## → A | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 20, 'Austen': 28, 'CBronte': 37, 'Dickens': 58, 'Eliot': 65,
## 'Fielding': 38, 'Sterne': 17, 'Thackeray': 56, 'Trollope': 54
## There were issues with some computations A: x1 → B | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 20, 'Austen': 28, 'CBronte': 31, 'Dickens': 63, 'Eliot': 55,
## 'Fielding': 40, 'Sterne': 15, 'Thackeray': 63, 'Trollope': 61
## There were issues with some computations A: x1There were issues with some computations A: x1 B: x1 → C | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 10, 'Austen': 32, 'CBronte': 28, 'Dickens': 62, 'Eliot': 64,
## 'Fielding': 31, 'Sterne': 17, 'Thackeray': 65, 'Trollope': 62
## There were issues with some computations A: x1 B: x1There were issues with some computations A: x1 B: x1 C: x1 → D | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 15, 'Austen': 36, 'CBronte': 39, 'Dickens': 64, 'Eliot': 52,
## 'Fielding': 30, 'Sterne': 20, 'Thackeray': 58, 'Trollope': 57
## There were issues with some computations A: x1 B: x1 C: x1There were issues with some computations A: x1 B: x1 C: x1 D: x1 → E | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 19, 'Austen': 25, 'CBronte': 38, 'Dickens': 59, 'Eliot': 51,
## 'Fielding': 39, 'Sterne': 19, 'Thackeray': 66, 'Trollope': 51
## There were issues with some computations A: x1 B: x1 C: x1 D: x1There were issues with some computations A: x1 B: x1 C: x1 D: x1 E: x1There were issues with some computations A: x1 B: x1 C: x1 D: x1 E: x1
## ✔ 2 of 6 tuning: base_lasso (6.7s)
## i 3 of 6 tuning: base_svm
## ✔ 3 of 6 tuning: base_svm (6.7s)
## i 4 of 6 tuning: pca_ridge
## → A | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 20, 'Austen': 28, 'CBronte': 37, 'Sterne': 17
## There were issues with some computations A: x1 → B | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 20, 'Austen': 28, 'CBronte': 31, 'Sterne': 15
## There were issues with some computations A: x1There were issues with some computations A: x1 B: x1 → C | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 10, 'Austen': 32, 'CBronte': 28, 'Sterne': 17
## There were issues with some computations A: x1 B: x1There were issues with some computations A: x1 B: x1 C: x1 → D | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 15
## There were issues with some computations A: x1 B: x1 C: x1 → E | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 15, 'Austen': 36, 'CBronte': 39, 'Sterne': 20
## There were issues with some computations A: x1 B: x1 C: x1There were issues with some computations A: x1 B: x1 C: x1 D: x2 E: x1 → F | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 19
## There were issues with some computations A: x1 B: x1 C: x1 D: x2 E: x1 → G | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 19, 'Austen': 25, 'CBronte': 38, 'Sterne': 19
## There were issues with some computations A: x1 B: x1 C: x1 D: x2 E: x1There were issues with some computations A: x1 B: x1 C: x1 D: x2 E: x…There were issues with some computations A: x1 B: x1 C: x1 D: x2 E: x…
## ✔ 4 of 6 tuning: pca_ridge (1.6s)
## i 5 of 6 tuning: pca_lasso
## → A | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 20, 'Austen': 28, 'CBronte': 37, 'Dickens': 58, 'Eliot': 65,
## 'Fielding': 38, 'Sterne': 17, 'Thackeray': 56, 'Trollope': 54
## There were issues with some computations A: x1 → B | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 20, 'Austen': 28, 'CBronte': 31, 'Dickens': 63, 'Eliot': 55,
## 'Fielding': 40, 'Sterne': 15, 'Thackeray': 63, 'Trollope': 61
## There were issues with some computations A: x1There were issues with some computations A: x1 B: x1 → C | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 10, 'Austen': 32, 'CBronte': 28, 'Dickens': 62, 'Eliot': 64,
## 'Fielding': 31, 'Sterne': 17, 'Thackeray': 65, 'Trollope': 62
## There were issues with some computations A: x1 B: x1There were issues with some computations A: x1 B: x1 C: x1 → D | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 15, 'Austen': 36, 'CBronte': 39, 'Dickens': 64, 'Eliot': 52,
## 'Fielding': 30, 'Sterne': 20, 'Thackeray': 58, 'Trollope': 57
## There were issues with some computations A: x1 B: x1 C: x1There were issues with some computations A: x1 B: x1 C: x1 D: x1 → E | warning: While computing multiclass `precision()`, some levels had no predicted events
## (i.e. `true_positive + false_positive = 0`).
## Precision is undefined in this case, and those levels will be removed from the
## averaged result.
## Note that the following number of true events actually occurred for each
## problematic event level:
## 'ABronte': 19, 'Austen': 25, 'CBronte': 38, 'Dickens': 59, 'Eliot': 51,
## 'Fielding': 39, 'Sterne': 19, 'Thackeray': 66, 'Trollope': 51
## There were issues with some computations A: x1 B: x1 C: x1 D: x1There were issues with some computations A: x1 B: x1 C: x1 D: x1 E: x1There were issues with some computations A: x1 B: x1 C: x1 D: x1 E: x1
## ✔ 5 of 6 tuning: pca_lasso (2s)
## i 6 of 6 tuning: pca_svm
## ✔ 6 of 6 tuning: pca_svm (3.3s)
plan(sequential)
autoplot(train_res, metric = "accuracy") +
theme_light() +
theme(legend.position = "none") +
geom_text(aes(y = (mean - 2*std_err), label = wflow_id),
angle = 90, hjust = 1.5) +
coord_cartesian(ylim = c(-0.3, NA))

rank_results(train_res, select_best = TRUE) |>
print()
## # A tibble: 12 × 9
## wflow_id .config .metric mean std_err n preprocessor model rank
## <chr> <chr> <chr> <dbl> <dbl> <int> <chr> <chr> <int>
## 1 base_ridge pre0_mod1_po… accura… 0.998 6.63e-4 5 recipe mult… 1
## 2 base_ridge pre0_mod1_po… f_meas 0.997 1.06e-3 5 recipe mult… 1
## 3 base_svm pre0_mod1_po… accura… 0.995 5.17e-4 5 recipe svm_… 2
## 4 base_svm pre0_mod1_po… f_meas 0.994 1.04e-3 5 recipe svm_… 2
## 5 base_lasso pre0_mod1_po… accura… 0.990 2.03e-3 5 recipe mult… 3
## 6 base_lasso pre0_mod1_po… f_meas 0.988 2.37e-3 5 recipe mult… 3
## 7 pca_lasso pre0_mod1_po… accura… 0.957 3.55e-3 5 recipe mult… 4
## 8 pca_lasso pre0_mod1_po… f_meas 0.940 4.44e-3 5 recipe mult… 4
## 9 pca_svm pre0_mod2_po… accura… 0.950 2.19e-3 5 recipe svm_… 5
## 10 pca_svm pre0_mod2_po… f_meas 0.917 6.97e-3 5 recipe svm_… 5
## 11 pca_ridge pre0_mod1_po… accura… 0.935 4.54e-3 5 recipe mult… 6
## 12 pca_ridge pre0_mod1_po… f_meas 0.903 2.12e-2 5 recipe mult… 6
autoplot(train_res, id = "base_ridge") +
theme_light()

best_results <-
train_res |>
extract_workflow_set_result("base_svm") |>
select_best(metric = "accuracy")
print(best_results)
## # A tibble: 1 × 2
## cost .config
## <dbl> <chr>
## 1 0.00101 pre0_mod1_post0
best_results <- train_res |>
extract_workflow_set_result("base_svm") |>
select_best(metric = "accuracy")
print(best_results)
## # A tibble: 1 × 2
## cost .config
## <dbl> <chr>
## 1 0.00101 pre0_mod1_post0
best_results <- train_res |>
extract_workflow_set_result("base_ridge") |>
select_best(metric = "accuracy")
ridge_res <- train_res |>
extract_workflow("base_ridge") |>
finalize_workflow(best_results) |>
last_fit(split = data_split, metrics = metric_set(f_meas, accuracy, roc_auc))
collect_metrics(ridge_res) |>
print()
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 f_meas macro 1 pre0_mod0_post0
## 2 accuracy multiclass 1 pre0_mod0_post0
## 3 roc_auc hand_till 1 pre0_mod0_post0
collect_predictions(ridge_res) |>
conf_mat(truth = author, estimate = .pred_class) |>
autoplot(type = "heatmap") +
scale_fill_gradient(low = "white", high = "#233857") +
theme(panel.grid.major = element_line(colour = "#233857"),
axis.text = element_text(color = "#233857"),
axis.title = element_text(color = "#233857"),
plot.title = element_text(color = "#233857"),
axis.text.x = element_text(angle = 90))
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

collect_predictions(ridge_res) |>
select(starts_with(".pred_")) |>
names() #можно оставить, а можно убрать
## [1] ".pred_class" ".pred_ABronte" ".pred_Austen" ".pred_CBronte"
## [5] ".pred_Dickens" ".pred_Eliot" ".pred_Fielding" ".pred_Richardson"
## [9] ".pred_Sterne" ".pred_Thackeray" ".pred_Trollope"
collect_predictions(ridge_res) |>
roc_curve(truth = author, .pred_ABronte:.pred_Trollope) |>
ggplot(aes(1 - specificity, sensitivity, color = .level)) +
geom_abline(slope = 1, color = "gray50", lty = 2, alpha = 0.8) +
geom_path(linewidth = 1.5, alpha = 0.7) +
scale_color_brewer(palette = "Paired") +
labs(color = NULL) +
theme_light()

final_model <- extract_fit_parsnip(ridge_res)
top_terms <- tidy(final_model) |>
filter(term != "(Intercept)") |>
group_by(class) |>
slice_max(abs(estimate), n = 10) |>
ungroup() |>
mutate(term = fct_reorder(term, abs(estimate)))
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 5.0
print(top_terms)
## # A tibble: 100 × 4
## class term estimate penalty
## <chr> <fct> <dbl> <dbl>
## 1 ABronte but 0.203 1.07e-10
## 2 ABronte and 0.149 1.07e-10
## 3 ABronte which -0.117 1.07e-10
## 4 ABronte replied 0.113 1.07e-10
## 5 ABronte or 0.106 1.07e-10
## 6 ABronte in -0.104 1.07e-10
## 7 ABronte out -0.0913 1.07e-10
## 8 ABronte this -0.0840 1.07e-10
## 9 ABronte who -0.0817 1.07e-10
## 10 ABronte alone 0.0784 1.07e-10
## # ℹ 90 more rows
library(tidytext)
top_terms |>
mutate(term = reorder_within(term, abs(estimate), class)) |>
ggplot(aes(x = estimate, y = term, fill = class)) +
geom_col(show.legend = FALSE, alpha = 0.85) +
facet_wrap(~ class, scales = "free_y", nrow = 4) +
scale_y_reordered() +
scale_fill_viridis_d() +
labs(title = "Наиболее важные признаки для каждого автора",
x = "Коэффициент", y = "Признак") +
theme_minimal()
