Technical information and libraries

rm(list = ls())
date()
## [1] "Fri Feb  7 22:16:37 2025"
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-apple-darwin20
## Running under: macOS Ventura 13.7.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Moscow
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices datasets  utils     methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.36     R6_2.5.1          fastmap_1.2.0     xfun_0.49        
##  [5] cachem_1.1.0      knitr_1.49        htmltools_0.5.8.1 rmarkdown_2.27   
##  [9] lifecycle_1.0.4   cli_3.6.3         sass_0.4.9        renv_1.0.11      
## [13] jquerylib_0.1.4   compiler_4.4.2    rstudioapi_0.16.0 tools_4.4.2      
## [17] evaluate_1.0.1    bslib_0.7.0       yaml_2.3.10       jsonlite_1.8.8   
## [21] rlang_1.1.4

Libraries

library(quanteda)
# library(quanteda.textstats)
# library(stm)
library(knitr)
# library(topicmodels)
# library(ldatuning)
library(stringr)
library(tidyr)
# library(wordcloud)
library(dplyr)
library(ggplot2)

Loading objects

#load(file = "Corpus_S.Rdata")
load(file = "Tokens_S.Rdata")
load(file = "Tokens_S3.Rdata")

Preprocessing

Build a dictionary that splits each word_POS type into the bare word and its part-of-speech tag.


Dictionary <- data.frame(
  Word_POS = types(Tokens_S)
  , Word = str_replace(types(Tokens_S), "([^_]+)_.*", "\\1")  # the word before the underscore
  , POS = str_replace(types(Tokens_S), ".*_([^_?]+).*", "\\1")  # the POS tag after the underscore
)

# Inspect the result
head(Dictionary)
##        Word_POS        Word  POS
## 1 романтичный_A романтичный    A
## 2     чувство_S     чувство    S
## 3    красивый_A    красивый    A
## 4    персонаж_S    персонаж    S
## 5        я_SPRO           я SPRO
## 6     считать_V     считать    V
unique(Dictionary$POS)  
##  [1] "A"      "S"      "SPRO"   "V"      "CONJ"   "PR"     "APRO"   "ADVPRO"
##  [9] "ADV"    "PART"   "ANUM"   "NUM"    "INTJ"

Replace each word with its part-of-speech tag and build a dfm (documents × parts of speech).

DFM_POS <- tokens_replace(Tokens_S
                          , pattern = Dictionary$Word_POS
                          , replacement = Dictionary$POS
                          ) |>
  dfm() 

head(DFM_POS)
## Document-feature matrix of: 6 documents, 13 features (42.31% sparse) and 3 docvars.
##                                        features
## docs                                    a  s spro  v conj pr apro advpro adv
##   Clinical_Испытуемый_1_Картинка_1.txt  7  6    6  9    3  3    0      0   0
##   Clinical_Испытуемый_1_Картинка_2.txt  8 30   24 27   19  9    4      3   9
##   Clinical_Испытуемый_1_Картинка_3.txt  3 15    6 11    8  5    1      1   0
##   Clinical_Испытуемый_10_Картинка_1.txt 2 11    3  4    1  3    2      2   0
##   Clinical_Испытуемый_10_Картинка_2.txt 0  7    5  9    0  3    0      0   1
##   Clinical_Испытуемый_10_Картинка_3.txt 1  5    3  4    1  1    1      1   0
##                                        features
## docs                                    part
##   Clinical_Испытуемый_1_Картинка_1.txt     0
##   Clinical_Испытуемый_1_Картинка_2.txt     6
##   Clinical_Испытуемый_1_Картинка_3.txt     0
##   Clinical_Испытуемый_10_Картинка_1.txt    0
##   Clinical_Испытуемый_10_Картинка_2.txt    0
##   Clinical_Испытуемый_10_Картинка_3.txt    0
## [ reached max_nfeat ... 3 more features ]
featnames(DFM_POS)
##  [1] "a"      "s"      "spro"   "v"      "conj"   "pr"     "apro"   "advpro"
##  [9] "adv"    "part"   "anum"   "num"    "intj"

Convert the dfm to a data.frame and add the total text length (number of words).

POS <- convert(DFM_POS, to = "data.frame")

POS$Sample <- docvars(DFM_POS, "Sample")
POS$Image <- docvars(DFM_POS, "Image")

POS <- select(POS, -doc_id)


POS <- POS |>
  mutate(Words = rowSums(select(POS, -Sample, -Image))) |>
  select(Sample, Image, Words, everything())  # column order


head(POS)
##     Sample     Image Words a  s spro  v conj pr apro advpro adv part anum num
## 1 Clinical picture_1    34 7  6    6  9    3  3    0      0   0    0    0   0
## 2 Clinical picture_2   139 8 30   24 27   19  9    4      3   9    6    0   0
## 3 Clinical picture_3    50 3 15    6 11    8  5    1      1   0    0    0   0
## 4 Clinical picture_1    28 2 11    3  4    1  3    2      2   0    0    0   0
## 5 Clinical picture_2    25 0  7    5  9    0  3    0      0   1    0    0   0
## 6 Clinical picture_3    17 1  5    3  4    1  1    1      1   0    0    0   0
##   intj
## 1    0
## 2    0
## 3    0
## 4    0
## 5    0
## 6    0

Add percentages.

POS <- POS |> 
  mutate(across(a:intj, ~ . / Words * 100, .names = "pct_{.col}"))

head(POS)
##     Sample     Image Words a  s spro  v conj pr apro advpro adv part anum num
## 1 Clinical picture_1    34 7  6    6  9    3  3    0      0   0    0    0   0
## 2 Clinical picture_2   139 8 30   24 27   19  9    4      3   9    6    0   0
## 3 Clinical picture_3    50 3 15    6 11    8  5    1      1   0    0    0   0
## 4 Clinical picture_1    28 2 11    3  4    1  3    2      2   0    0    0   0
## 5 Clinical picture_2    25 0  7    5  9    0  3    0      0   1    0    0   0
## 6 Clinical picture_3    17 1  5    3  4    1  1    1      1   0    0    0   0
##   intj     pct_a    pct_s pct_spro    pct_v  pct_conj    pct_pr pct_apro
## 1    0 20.588235 17.64706 17.64706 26.47059  8.823529  8.823529 0.000000
## 2    0  5.755396 21.58273 17.26619 19.42446 13.669065  6.474820 2.877698
## 3    0  6.000000 30.00000 12.00000 22.00000 16.000000 10.000000 2.000000
## 4    0  7.142857 39.28571 10.71429 14.28571  3.571429 10.714286 7.142857
## 5    0  0.000000 28.00000 20.00000 36.00000  0.000000 12.000000 0.000000
## 6    0  5.882353 29.41176 17.64706 23.52941  5.882353  5.882353 5.882353
##   pct_advpro pct_adv pct_part pct_anum pct_num pct_intj
## 1   0.000000 0.00000 0.000000        0       0        0
## 2   2.158273 6.47482 4.316547        0       0        0
## 3   2.000000 0.00000 0.000000        0       0        0
## 4   7.142857 0.00000 0.000000        0       0        0
## 5   0.000000 4.00000 0.000000        0       0        0
## 6   5.882353 0.00000 0.000000        0       0        0
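
As a sanity check (a sketch, not part of the original script), the pct_ columns should sum to 100 for every text, up to floating-point error:

# Sketch: each row's percentages should sum to ~100.
summary(rowSums(select(POS, starts_with("pct_"))))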

A separate table: the three texts of each participant pooled together. The part-of-speech matrix is computed over the sum of the three texts.

DFM_POS3 <- tokens_replace(Tokens_S3
                          , pattern = Dictionary$Word_POS
                          , replacement = Dictionary$POS
                          ) |>
  dfm() 

head(DFM_POS3)
## Document-feature matrix of: 6 documents, 13 features (23.08% sparse) and 2 docvars.
##        features
## docs     a  s spro  v conj pr apro advpro adv part
##   Cli1  18 51   36 47   30 17    5      4   9    6
##   Cli10  3 23   11 17    2  7    3      3   1    0
##   Cli11 13 75   24 65   24 25   20      0  11   15
##   Cli12 12 56   10 35    9 23    6      0   4    3
##   Cli13  6 47   25 47   22 27    9     12  17   26
##   Cli14  7 30   42 43   24 19   10     14  21   29
## [ reached max_nfeat ... 3 more features ]
featnames(DFM_POS3)
##  [1] "a"      "s"      "spro"   "v"      "conj"   "pr"     "apro"   "advpro"
##  [9] "adv"    "part"   "anum"   "num"    "intj"

Creating the data.frame

POS3 <- convert(DFM_POS3, to = "data.frame")

POS3$Sample <- docvars(DFM_POS3, "Sample")

POS3 <- select(POS3, -doc_id)


POS3 <- POS3 |>
  mutate(Words = rowSums(select(POS3, -Sample))) |>
  select(Sample, Words, everything())


head(POS3)
##     Sample Words  a  s spro  v conj pr apro advpro adv part anum num intj
## 1 Clinical   223 18 51   36 47   30 17    5      4   9    6    0   0    0
## 2 Clinical    70  3 23   11 17    2  7    3      3   1    0    0   0    0
## 3 Clinical   273 13 75   24 65   24 25   20      0  11   15    1   0    0
## 4 Clinical   160 12 56   10 35    9 23    6      0   4    3    2   0    0
## 5 Clinical   239  6 47   25 47   22 27    9     12  17   26    0   1    0
## 6 Clinical   239  7 30   42 43   24 19   10     14  21   29    0   0    0
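
The pooled table should account for exactly the same words as the per-picture table (a sketch; it assumes Tokens_S3 simply pools the three texts of each participant):

# Sketch: grand totals of the two tables should match.
sum(POS3$Words) == sum(POS$Words)   # expected: TRUE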

Convert to percentages.

POS3 <- POS3 |> 
  mutate(across(a:intj, ~ . / Words * 100, .names = "pct_{.col}"))

head(POS3)
##     Sample Words  a  s spro  v conj pr apro advpro adv part anum num intj
## 1 Clinical   223 18 51   36 47   30 17    5      4   9    6    0   0    0
## 2 Clinical    70  3 23   11 17    2  7    3      3   1    0    0   0    0
## 3 Clinical   273 13 75   24 65   24 25   20      0  11   15    1   0    0
## 4 Clinical   160 12 56   10 35    9 23    6      0   4    3    2   0    0
## 5 Clinical   239  6 47   25 47   22 27    9     12  17   26    0   1    0
## 6 Clinical   239  7 30   42 43   24 19   10     14  21   29    0   0    0
##      pct_a    pct_s  pct_spro    pct_v  pct_conj    pct_pr pct_apro pct_advpro
## 1 8.071749 22.86996 16.143498 21.07623 13.452915  7.623318 2.242152   1.793722
## 2 4.285714 32.85714 15.714286 24.28571  2.857143 10.000000 4.285714   4.285714
## 3 4.761905 27.47253  8.791209 23.80952  8.791209  9.157509 7.326007   0.000000
## 4 7.500000 35.00000  6.250000 21.87500  5.625000 14.375000 3.750000   0.000000
## 5 2.510460 19.66527 10.460251 19.66527  9.205021 11.297071 3.765690   5.020921
## 6 2.928870 12.55230 17.573222 17.99163 10.041841  7.949791 4.184100   5.857741
##    pct_adv  pct_part  pct_anum pct_num pct_intj
## 1 4.035874  2.690583 0.0000000 0.00000        0
## 2 1.428571  0.000000 0.0000000 0.00000        0
## 3 4.029304  5.494505 0.3663004 0.00000        0
## 4 2.500000  1.875000 1.2500000 0.00000        0
## 5 7.112971 10.878661 0.0000000 0.41841        0
## 6 8.786611 12.133891 0.0000000 0.00000        0

Long version of the table

POS3_long_perc <- POS3 %>%
  select(Sample, starts_with("pct_")) %>%
  pivot_longer(
    cols = starts_with("pct_"),    # select the columns prefixed with "pct_"
    names_to = "Feature",          # new column with the variable names
    values_to = "Value"            # new column with their values
  )

POS3_long_abs <- POS3 %>%
  select(-starts_with("pct_")) %>%  # drop the columns prefixed with "pct_"
  pivot_longer(
    cols = -Sample,        # keep the Sample column as is
    names_to = "Feature",  # new column with the variable names
    values_to = "Value"    # new column with their values
  )


head(POS3_long_perc)
## # A tibble: 6 × 3
##   Sample   Feature  Value
##   <fct>    <chr>    <dbl>
## 1 Clinical pct_a     8.07
## 2 Clinical pct_s    22.9 
## 3 Clinical pct_spro 16.1 
## 4 Clinical pct_v    21.1 
## 5 Clinical pct_conj 13.5 
## 6 Clinical pct_pr    7.62
head(POS3_long_abs)
## # A tibble: 6 × 3
##   Sample   Feature Value
##   <fct>    <chr>   <dbl>
## 1 Clinical Words     223
## 2 Clinical a          18
## 3 Clinical s          51
## 4 Clinical spro       36
## 5 Clinical v          47
## 6 Clinical conj       30

Part-of-speech glossary

Parts of speech:
A adjective
ADV adverb
ADVPRO pronominal adverb
ANUM numeral-adjective
APRO pronominal adjective
COM part of a compound word
CONJ conjunction
INTJ interjection
NUM numeral
PART particle
PR preposition
S noun
SPRO pronominal noun
V verb
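
For reference, a hypothetical lookup (not used elsewhere in this script) mapping the lower-cased tags, as they appear among the dfm features, to English labels; it could be handy for relabelling plot axes. The com tag does not occur in this corpus, so it is omitted.

# Hypothetical helper: tag -> readable label (tags are lower-cased by dfm()).
pos_labels <- c(
  a      = "adjective",            adv  = "adverb",
  advpro = "pronominal adverb",    anum = "numeral-adjective",
  apro   = "pronominal adjective", conj = "conjunction",
  intj   = "interjection",         num  = "numeral",
  part   = "particle",             pr   = "preposition",
  s      = "noun",                 spro = "pronominal noun",
  v      = "verb"
)
# Example: pos_labels["spro"]  ->  "pronominal noun"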

By parts of speech and by the three texts

All words

POS %>%
  group_by(Sample, Image) %>%
  summarise(across(Words, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop")%>%
  kable(digits = 2)
Sample Image Words_mean Words_sd Words_median Words_min Words_max
Clinical picture_1 63.72 27.57 61.0 28 131
Clinical picture_2 59.04 26.98 51.0 24 139
Clinical picture_3 70.16 32.01 64.0 17 160
Control picture_1 120.33 60.02 107.5 45 265
Control picture_2 101.73 46.86 96.5 28 210
Control picture_3 110.57 48.74 105.5 49 266

Noun (s)

POS %>%
  group_by(Sample, Image) %>%
  summarise(across(s, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop")%>%
  kable(digits = 2)
Sample Image s_mean s_sd s_median s_min s_max
Clinical picture_1 12.12 5.53 11.0 5 27
Clinical picture_2 11.44 6.93 12.0 2 30
Clinical picture_3 17.84 9.18 15.0 5 48
Control picture_1 20.50 12.48 17.0 8 62
Control picture_2 17.63 9.32 16.5 2 37
Control picture_3 22.97 9.46 22.5 8 54
POS %>%
  group_by(Sample, Image) %>%
  summarise(across(pct_s, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop")%>%
  kable(digits = 2)
Sample Image pct_s_mean pct_s_sd pct_s_median pct_s_min pct_s_max
Clinical picture_1 20.27 7.12 19.70 8.77 39.29
Clinical picture_2 19.10 7.96 20.00 4.17 37.70
Clinical picture_3 26.06 6.40 24.56 17.05 41.67
Control picture_1 17.65 6.42 16.23 6.67 32.86
Control picture_2 17.63 6.17 18.02 5.56 29.58
Control picture_3 21.57 5.81 21.30 10.65 33.72

Adjective (a)

POS %>%
  group_by(Sample, Image) %>%
  summarise(across(a, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop")%>%
  kable(digits = 2)
Sample Image a_mean a_sd a_median a_min a_max
Clinical picture_1 3.16 2.06 3 0 8
Clinical picture_2 2.16 2.30 1 0 8
Clinical picture_3 2.84 2.27 2 0 9
Control picture_1 5.17 5.30 4 0 29
Control picture_2 4.40 2.70 5 0 11
Control picture_3 4.83 3.31 4 0 13
POS %>%
  group_by(Sample, Image) %>%
  summarise(across(pct_a, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop")%>%
  kable(digits = 2)
Sample Image pct_a_mean pct_a_sd pct_a_median pct_a_min pct_a_max
Clinical picture_1 5.32 4.08 4.44 0 20.59
Clinical picture_2 3.21 2.80 2.44 0 10.17
Clinical picture_3 4.25 3.08 2.78 0 12.07
Control picture_1 4.18 2.33 4.26 0 10.94
Control picture_2 4.41 2.57 3.99 0 9.80
Control picture_3 4.10 1.93 4.23 0 7.94

Adverb (adv)

POS %>%
  group_by(Sample, Image) %>%
  summarise(across(adv, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop") %>%
  kable(digits = 2)
Sample Image adv_mean adv_sd adv_median adv_min adv_max
Clinical picture_1 3.28 2.92 2.0 0 11
Clinical picture_2 3.36 2.40 3.0 0 9
Clinical picture_3 4.32 3.02 4.0 0 11
Control picture_1 8.73 6.96 6.5 1 29
Control picture_2 7.10 3.90 7.0 1 18
Control picture_3 9.37 5.97 8.0 2 26
POS %>%
  group_by(Sample, Image) %>%
  summarise(across(pct_adv, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop") %>%
  kable(digits = 2)
Sample Image pct_adv_mean pct_adv_sd pct_adv_median pct_adv_min pct_adv_max
Clinical picture_1 4.96 3.49 4.08 0.00 11.67
Clinical picture_2 5.77 3.86 5.48 0.00 18.60
Clinical picture_3 5.89 3.50 5.21 0.00 12.50
Control picture_1 6.94 3.68 6.81 2.22 20.69
Control picture_2 7.12 2.85 7.29 2.75 15.49
Control picture_3 8.12 2.81 8.12 3.70 13.11

Verb (v)

POS %>%
  group_by(Sample, Image) %>%
  summarise(across(v, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop") %>%
  kable(digits = 2)
Sample Image v_mean v_sd v_median v_min v_max
Clinical picture_1 13.96 6.08 13.0 3 26
Clinical picture_2 13.88 5.54 14.0 5 27
Clinical picture_3 14.80 6.67 14.0 4 30
Control picture_1 25.40 11.05 22.5 9 53
Control picture_2 20.90 9.95 20.0 5 48
Control picture_3 20.17 7.10 20.0 9 36
POS %>%
  group_by(Sample, Image) %>%
  summarise(across(pct_v, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop") %>%
  kable(digits = 2)
Sample Image pct_v_mean pct_v_sd pct_v_median pct_v_min pct_v_max
Clinical picture_1 21.96 4.89 21.31 10.71 31.03
Clinical picture_2 24.69 6.81 24.59 11.63 36.00
Clinical picture_3 21.31 3.70 21.74 12.50 28.38
Control picture_1 21.78 3.96 21.58 15.75 31.46
Control picture_2 21.17 5.86 20.67 7.59 36.11
Control picture_3 18.98 4.07 18.99 11.76 27.91

Pronominal noun (spro)

Examples: она, он, они, я ("she", "he", "they", "I").

POS %>%
  group_by(Sample, Image) %>%
  summarise(across(spro, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop") %>%
  kable(digits = 2)
Sample Image spro_mean spro_sd spro_median spro_min spro_max
Clinical picture_1 9.64 5.16 9.0 3 21
Clinical picture_2 9.16 5.57 9.0 2 24
Clinical picture_3 7.24 3.90 7.0 3 21
Control picture_1 17.30 9.92 14.0 6 38
Control picture_2 16.17 8.57 15.5 4 34
Control picture_3 13.63 7.84 13.0 2 41
POS %>%
  group_by(Sample, Image) %>%
  summarise(across(pct_spro, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop") %>%
  kable(digits = 2)
Sample Image pct_spro_mean pct_spro_sd pct_spro_median pct_spro_min pct_spro_max
Clinical picture_1 14.98 4.85 12.73 8.06 24.32
Clinical picture_2 16.12 7.29 16.67 3.28 33.33
Clinical picture_3 10.62 3.53 11.11 4.69 18.92
Control picture_1 14.58 4.84 14.17 5.78 25.98
Control picture_2 15.87 4.37 16.15 8.00 25.00
Control picture_3 12.25 4.52 12.50 4.08 25.45

Pronominal adjective (apro)

Examples: этот, свой, какой-то ("this", "one's own", "some kind of").
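
To see which tokens these two columns actually count, one can pull the most frequent pronominal nouns and pronominal adjectives straight from the corpus (a sketch using quanteda's glob patterns):

# Sketch: most frequent SPRO / APRO tokens in the per-picture corpus.
Tokens_S |>
  tokens_select(pattern = c("*_SPRO", "*_APRO")) |>
  dfm() |>
  topfeatures(10)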

POS %>%
  group_by(Sample, Image) %>%
  summarise(across(apro, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop") %>%
  kable(digits = 2)
Sample Image apro_mean apro_sd apro_median apro_min apro_max
Clinical picture_1 2.92 1.87 3 0 6
Clinical picture_2 2.24 1.94 2 0 6
Clinical picture_3 2.80 3.10 2 0 11
Control picture_1 6.53 4.93 5 2 19
Control picture_2 6.40 4.47 5 0 16
Control picture_3 6.20 5.52 5 0 27
POS %>%
  group_by(Sample, Image) %>%
  summarise(across(pct_apro, list(mean = mean
                           , sd = sd
                           , median = median
                           , min = min
                           , max = max))
                   , .groups = "drop") %>%
  kable(digits = 2)
Sample Image pct_apro_mean pct_apro_sd pct_apro_median pct_apro_min pct_apro_max
Clinical picture_1 4.80 3.37 4.58 0.00 14.29
Clinical picture_2 3.43 2.59 3.28 0.00 8.82
Clinical picture_3 3.52 2.63 2.70 0.00 8.33
Control picture_1 5.33 2.22 5.00 1.65 10.91
Control picture_2 6.01 2.99 6.34 0.00 12.24
Control picture_3 4.99 3.02 4.36 0.00 11.90

The three texts combined

POS3 %>%
  group_by(Sample) %>%
  summarise(
    across(everything(), list(
      mean = ~ mean(.),
      sd = ~ sd(.),
      median = ~ median(.),
      min = ~ min(.),
      max = ~ max(.)
    ), .names = "{.col}_{.fn}")
  ) %>%
   pivot_longer(
     cols = -Sample,                          # keep 'Sample' as the fixed column
     names_to = c("Feature", "Statistic"),    # split each name into two parts:
     names_pattern = "(.*)_(.*)",             # Feature and Statistic
     values_to = "Value"                      # column that holds the values
   ) %>%
   mutate(
     # flag whether Feature is a percentage column
     Type = if_else(str_detect(Feature, "^pct_"), "Percentage", "Absolute")
     , Feature = str_remove(Feature, "^pct_")  # strip the pct_ prefix if present
   ) %>%
    pivot_wider(
      names_from = c(Sample, Type),
      values_from = Value                      # take the values from the Value column
   ) %>%
  # move the percentage columns to the end of the table
  select(-Clinical_Percentage, -Control_Percentage, Clinical_Percentage, Control_Percentage) %>%
  rename(
    Clin_Abs = Clinical_Absolute
    , Clin_Perc = Clinical_Percentage
    , Cont_Abs = Control_Absolute
    , Cont_Perc = Control_Percentage
  ) %>%
  kable(digits = 2)
Feature Statistic Clin_Abs Cont_Abs Clin_Perc Cont_Perc
Words mean 192.92 332.63 NA NA
Words sd 67.68 142.54 NA NA
Words median 184.00 308.00 NA NA
Words min 70.00 151.00 NA NA
Words max 388.00 730.00 NA NA
a mean 8.16 14.40 4.24 4.27
a sd 4.68 8.94 1.95 1.67
a median 7.00 12.50 4.00 4.17
a min 2.00 3.00 1.13 0.91
a max 19.00 41.00 8.07 7.90
s mean 41.40 61.10 21.92 18.93
s sd 17.75 26.67 6.22 5.09
s median 36.00 55.50 20.65 18.74
s min 16.00 23.00 11.70 9.17
s max 96.00 123.00 35.00 31.69
spro mean 26.04 47.10 13.76 14.22
spro sd 9.92 23.21 3.88 3.66
spro median 25.00 44.50 14.49 14.18
spro min 10.00 13.00 6.25 7.65
spro max 44.00 102.00 20.24 22.35
v mean 42.64 66.47 22.34 20.35
v sd 14.19 26.20 3.33 3.28
v median 44.00 59.00 21.88 19.66
v min 17.00 28.00 14.29 14.41
v max 72.00 135.00 29.35 28.23
conj mean 18.52 33.43 9.11 9.78
conj sd 9.81 18.19 2.57 2.09
conj median 18.00 30.00 9.14 9.92
conj min 2.00 11.00 2.86 5.76
conj max 46.00 80.00 13.45 13.51
pr mean 18.44 30.23 9.68 9.41
pr sd 7.41 11.36 2.20 2.51
pr median 18.00 29.50 9.77 9.26
pr min 7.00 11.00 5.71 5.25
pr max 43.00 58.00 14.37 16.09
apro mean 7.96 19.13 3.98 5.48
apro sd 5.62 12.91 2.09 1.89
apro median 7.00 16.50 3.80 5.12
apro min 1.00 5.00 0.57 2.66
apro max 22.00 61.00 8.70 9.64
advpro mean 5.40 11.73 2.77 3.35
advpro sd 4.74 8.39 1.98 1.94
advpro median 4.00 10.50 2.58 3.44
advpro min 0.00 0.00 0.00 0.00
advpro max 18.00 33.00 6.79 8.24
adv mean 10.96 25.20 5.59 7.50
adv sd 6.21 13.52 2.77 2.13
adv median 9.00 23.00 5.93 7.61
adv min 1.00 7.00 1.43 3.70
adv max 23.00 65.00 13.53 14.62
part mean 12.96 22.53 6.34 6.38
part sd 9.41 15.13 3.73 2.77
part median 11.00 21.50 5.56 6.81
part min 0.00 1.00 0.00 0.57
part max 39.00 55.00 14.72 11.73
anum mean 0.12 0.33 0.06 0.10
anum sd 0.44 0.48 0.26 0.16
anum median 0.00 0.00 0.00 0.00
anum min 0.00 0.00 0.00 0.00
anum max 2.00 1.00 1.25 0.57
num mean 0.28 0.63 0.18 0.17
num sd 0.46 1.07 0.33 0.27
num median 0.00 0.00 0.00 0.00
num min 0.00 0.00 0.00 0.00
num max 1.00 4.00 1.09 0.98
intj mean 0.04 0.33 0.02 0.07
intj sd 0.20 0.99 0.12 0.19
intj median 0.00 0.00 0.00 0.00
intj min 0.00 0.00 0.00 0.00
intj max 1.00 5.00 0.60 0.82

Plots

Number of words per text

POS %>%
  ggplot(aes(x = Image, y = Words, fill = Sample)) +
  geom_boxplot() +
  labs(
    title = "Распределение количества слов по выборкам и текстам",
    x = "Тексты (картинки)",
    y = "Количество слов",
    fill = "Выборка"
  ) +
  theme_minimal() 

For the article: distribution of text length (word count) by sample and TAT stimulus card

POS %>%
 mutate( Image = recode(Image, 
                   "picture_1" = "Карточка 4", 
                   "picture_2" = "Карточка 6gf", 
                   "picture_3" = "Карточка 7gf"),
    Sample = recode(Sample, 
                    "Clinical" = "Клиническая (ПРЛ)", 
                    "Control" = "Контрольная")
    ) %>%
  ggplot(aes(x = Image, y = Words, fill = Sample)) +
  geom_boxplot() +
  labs(
    title = "Распределение длины текстов  (количество слов) по выборкам и стимульным карточкам ТАТ",
    x = "Стимульные карточки ТАТ",
    y = "Количество слов",
    fill = "Выборка"
  ) +
  theme_minimal()

POS %>%
  mutate(
    Image = recode(Image, 
                   "picture_1" = "Card 4", 
                   "picture_2" = "Card 6gf", 
                   "picture_3" = "Card gf"),
    Sample = recode(Sample, 
                    "Clinical" = "Clinical (BPD)", 
                    "Control" = "Control")
  ) %>%
  ggplot(aes(x = Image, y = Words, fill = Sample)) +
  geom_boxplot() +
  labs(
    title = "Distribution of text length (word count) by samples and TAT stimulus cards",
    x = "TAT stimulus cards",
    y = "Word count",
    fill = "Sample"
  ) +
  theme_minimal()

POS %>%
 mutate( Image = recode(Image, 
                   "picture_1" = "Карточка 4\nCard 4", 
                   "picture_2" = "Карточка 6gf\nCard 6gf", 
                   "picture_3" = "Карточка 7gf\nCard 7gf"),
    Sample = recode(Sample, 
                    "Clinical" = "Клиническая (ПРЛ)\nClinical (BPD)", 
                    "Control" = "Контрольная\nControl")
    ) %>%
  ggplot(aes(x = Image, y = Words, fill = Sample)) +
  geom_boxplot() +
  labs(
    title = "Распределение длины текстов (количество слов) по выборкам и стимульным карточкам ТАТ\nDistribution of text length (word count) by samples and TAT stimulus cards",
    x = "Стимульные карточки ТАТ\nTAT stimulus cards",
    y = "Количество слов\nWord count",
    fill = "Выборка\nSample"
  ) +
  theme_minimal()

POS %>%
  ggplot(aes(x = Words)) +
  geom_histogram(
    position = "identity",
    alpha = 0.6,
    bins = 20
  ) +
  facet_grid(Sample ~ Image) +
  labs(
    title = "Распределение количества слов по текстам и выборкам",
    x = "Количество слов",
    y = "Частота",
    fill = "Выборка"
  ) +
  theme_minimal() 

POS3_long_perc |>
   filter(!Feature %in% c("pct_anum", "pct_num", "pct_intj"))|>
    mutate(Feature = str_remove(Feature, "^pct_"))|>
ggplot(aes(x = Feature, y = Value, fill = Sample)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Распределение частей речи (доля)",
    x = "Части речи",
    y = "Процентное значение",
    fill = "Группа"
  ) 

For the article: distribution of parts of speech

POS3_long_perc |>
  filter(Feature %in% c("pct_a", "pct_adv", "pct_v", "pct_s")) |>
  mutate(Feature = recode(Feature, 
                          "pct_a" = "Прилагательные", 
                          "pct_adv" = "Наречия**", 
                          "pct_v" = "Глаголы*", 
                          "pct_s" = "Существительные"),
         Sample = recode(Sample, 
                    "Clinical" = "Клиническая (ПРЛ)", 
                    "Control" = "Контрольная")) |>
  ggplot(aes(x = Feature, y = Value, fill = Sample)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Распределение доли отдельных частей речи в текстах испытуемых",
    x = "Часть речи",
    y = "Доля от общего количества слов (%)",
    fill = "Выборка"
  )

POS3_long_perc |>
  filter(Feature %in% c("pct_a", "pct_adv", "pct_v", "pct_s")) |>
  mutate(Feature = recode(Feature, 
                          "pct_a" = "Adjectives", 
                          "pct_adv" = "Adverbs**", 
                          "pct_v" = "Verbs*", 
                          "pct_s" = "Nouns"),
         Sample = recode(Sample, 
                    "Clinical" = "Clinical (BPD)", 
                    "Control" = "Control")) |>
  ggplot(aes(x = Feature, y = Value, fill = Sample)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Distribution of the Proportion of Speech Parts in Participants' Texts",
    x = "Part of Speech",
    y = "Proportion of Total Words (%)",
    fill = "Group"
  )

POS3_long_perc |>
  filter(Feature %in% c("pct_a", "pct_adv", "pct_v", "pct_s")) |>
  mutate(Feature = recode(Feature, 
                          "pct_a" = "Прилагательные\nAdjectives", 
                          "pct_adv" = "Наречия**\nAdverbs**", 
                          "pct_v" = "Глаголы*\nVerbs*", 
                          "pct_s" = "Существительные\nNouns"),
         Sample = recode(Sample, 
                    "Clinical" = "Клиническая (ПРЛ)\nClinical (BPD)", 
                    "Control" = "Контрольная\nControl")) |>
  ggplot(aes(x = Feature, y = Value, fill = Sample)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Распределение доли отдельных частей речи в текстах испытуемых\nDistribution of the Proportion of Speech Parts in Participants' Texts",
    x = "Часть речи\nPart of Speech",
    y = "Доля от общего количества слов (%)\nProportion of Total Words (%)",
    fill = "Выборка\nGroup"
  )

POS3_long_abs |>
   filter(!Feature %in% c("anum", "num", "intj", "Words"))|>
ggplot(aes(x = Feature, y = Value, fill = Sample)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Распределение частей речи (абсолютные значения)",
    x = "Части речи",
    y = "Частота",
    fill = "Группа"
  ) 

Testing the significance of differences

Text length

We compare the total length of all three texts per participant.

Checking for normality of the distribution

POS3 %>%
  group_by(Sample) %>%
  summarise(p_value = shapiro.test(Words)$p.value)%>%
  kable(digits = 3)
Sample p_value
Clinical 0.426
Control 0.014

The distribution in the control group deviates significantly (p < 0.05) from normality, so we choose the non-parametric Wilcoxon rank-sum test. (Because of ties in the data, R falls back to the normal approximation with continuity correction even though exact = TRUE is requested.)

wilcox.test(Words ~ Sample, data = POS3, exact = TRUE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Words by Sample
## W = 121.5, p-value = 1.897e-05
## alternative hypothesis: true location shift is not equal to 0
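
Alongside the test, it is useful to report medians and interquartile ranges per group (a quick sketch using the already-loaded dplyr):

# Sketch: group medians and IQRs of total text length for reporting.
POS3 |>
  group_by(Sample) |>
  summarise(Words_median = median(Words), Words_IQR = IQR(Words))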

Parts of speech

Wilcoxon tests for the remaining variables (the part-of-speech percentages)

 POS3 %>%
  select(Sample, starts_with("pct_")) %>%
  pivot_longer(cols = -Sample, names_to = "Feature", values_to = "Value") %>%
  group_by(Feature) %>%
  summarise(
    p_value = wilcox.test(Value ~ Sample)$p.value,
    W = wilcox.test(Value ~ Sample)$statistic
  ) %>%
   mutate(
    p_adj = p.adjust(p_value, method = "BH"),
    Significance_adj = case_when(
      p_adj < 0.001 ~ "***",
      p_adj < 0.01  ~ "**",
      p_adj < 0.05  ~ "*",
      p_adj < 0.1  ~ "'",
      TRUE          ~ ""
    ),
    Significance = case_when(
      p_value < 0.001 ~ "***",
      p_value < 0.01  ~ "**",
      p_value < 0.05  ~ "*",
      p_value < 0.1  ~ "'",
      TRUE          ~ ""
    ))%>%
    select(Feature, p_value, Significance, everything())%>%
  as.data.frame()%>%
  kable(digits = 4)
Feature p_value Significance W p_adj Significance_adj
pct_a 0.8459 363.0 0.9000
pct_adv 0.0032 ** 203.0 0.0418 *
pct_advpro 0.2869 311.5 0.5328
pct_anum 0.0430 * 288.0 0.1399
pct_apro 0.0087 ** 221.0 0.0566
pct_conj 0.3658 321.0 0.5945
pct_intj 0.1553 329.0 0.3365
pct_num 0.7534 359.0 0.8904
pct_part 0.6326 346.0 0.8224
pct_pr 0.4251 423.0 0.6140
pct_s 0.0832 478.0 0.2162
pct_spro 0.9000 367.0 0.9000
pct_v 0.0298 * 504.0 0.1293

Noticeable differences appear for the adverbs, and they remain after the correction for multiple comparisons is applied. The verbs are also worth attention: there is a trend there, which is significant only if the multiple-comparison correction is dropped.
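
The p_adj column can be reproduced (up to rounding of the inputs) by applying the Benjamini-Hochberg step-up adjustment to the 13 unadjusted p-values from the table above, which shows why only the adverb contrast stays below 0.05:

# Sketch: reproduce the BH-adjusted p-values from the rounded table values.
p_raw <- c(pct_a = 0.8459, pct_adv = 0.0032, pct_advpro = 0.2869, pct_anum = 0.0430,
           pct_apro = 0.0087, pct_conj = 0.3658, pct_intj = 0.1553, pct_num = 0.7534,
           pct_part = 0.6326, pct_pr = 0.4251, pct_s = 0.0832, pct_spro = 0.9000,
           pct_v = 0.0298)
round(p.adjust(p_raw, method = "BH"), 4)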

By individual texts:

 POS %>%
  select(Sample, Image, starts_with("pct_")) %>%
  pivot_longer(cols = c(-Sample, -Image), names_to = "Feature", values_to = "Value") %>%
  group_by(Image, Feature) %>%
  summarise(
    p_value = wilcox.test(Value ~ Sample)$p.value,
    W = wilcox.test(Value ~ Sample)$statistic
  ) %>%
   mutate(p_adj = p.adjust(p_value, method = "BH")) %>%
  kable(digits = 2)
Image Feature p_value W p_adj
picture_1 pct_a 0.26 441.5 0.65
picture_1 pct_adv 0.04 255.5 0.58
picture_1 pct_advpro 0.82 361.0 0.91
picture_1 pct_anum 0.18 323.5 0.65
picture_1 pct_apro 0.30 313.0 0.65
picture_1 pct_conj 0.31 315.0 0.65
picture_1 pct_intj 0.38 362.5 0.65
picture_1 pct_num 0.54 355.0 0.78
picture_1 pct_part 0.40 325.0 0.65
picture_1 pct_pr 0.87 385.5 0.91
picture_1 pct_s 0.17 457.5 0.65
picture_1 pct_spro 0.91 382.5 0.91
picture_1 pct_v 0.83 388.0 0.91
picture_2 pct_a 0.03 247.5 0.11
picture_2 pct_adv 0.03 248.0 0.11
picture_2 pct_advpro 0.18 296.0 0.37
picture_2 pct_anum 0.38 362.5 0.62
picture_2 pct_apro 0.00 189.0 0.02
picture_2 pct_conj 0.66 348.5 0.82
picture_2 pct_intj 0.20 350.0 0.37
picture_2 pct_num 0.11 337.5 0.29
picture_2 pct_part 0.70 398.5 0.82
picture_2 pct_pr 0.92 381.5 0.92
picture_2 pct_s 0.48 417.5 0.69
picture_2 pct_spro 0.82 361.0 0.89
picture_2 pct_v 0.03 500.5 0.11
picture_3 pct_a 0.59 342.5 0.72
picture_3 pct_adv 0.02 231.5 0.11
picture_3 pct_advpro 0.22 302.5 0.39
picture_3 pct_anum 0.20 350.0 0.39
picture_3 pct_apro 0.09 275.0 0.30
picture_3 pct_conj 0.67 349.0 0.72
picture_3 pct_intj 0.27 342.0 0.39
picture_3 pct_num 0.65 354.0 0.72
picture_3 pct_part 0.25 306.0 0.39
picture_3 pct_pr 0.83 388.0 0.83
picture_3 pct_s 0.02 516.5 0.11
picture_3 pct_spro 0.14 287.5 0.37
picture_3 pct_v 0.03 504.5 0.13

save(Dictionary, file = "Dictionary.RData")
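
Optionally, the derived tables could be saved as well, so downstream scripts do not need to rebuild them (a sketch; the file name is an assumption):

# Hypothetical: save the part-of-speech tables for reuse.
save(POS, POS3, POS3_long_perc, POS3_long_abs, file = "POS_tables.RData")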