rm(list = ls())
date()
## [1] "Fri Feb 7 22:16:37 2025"
sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-apple-darwin20
## Running under: macOS Ventura 13.7.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Moscow
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices datasets utils methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.36 R6_2.5.1 fastmap_1.2.0 xfun_0.49
## [5] cachem_1.1.0 knitr_1.49 htmltools_0.5.8.1 rmarkdown_2.27
## [9] lifecycle_1.0.4 cli_3.6.3 sass_0.4.9 renv_1.0.11
## [13] jquerylib_0.1.4 compiler_4.4.2 rstudioapi_0.16.0 tools_4.4.2
## [17] evaluate_1.0.1 bslib_0.7.0 yaml_2.3.10 jsonlite_1.8.8
## [21] rlang_1.1.4
Библиотеки
library(quanteda)
# library(quanteda.textstats)
# library(stm)
library(knitr)
# #library(topicmodels)
# library(ldatuning)
library(stringr)
library(tidyr)
# library(wordcloud)
library(dplyr)
library(ggplot2)
Загрузка объектов
#load(file = "Corpus_S.Rdata")
load(file = "Tokens_S.Rdata")
load(file = "Tokens_S3.Rdata")
Создаю словарь перевода слов в форму без части речи.
Dictionary <- data.frame(
Word_POS = types(Tokens_S)
, Word = str_replace(types(Tokens_S), "([^_]+)_.*", "\\1") # Слово до подчеркивания
, POS = str_replace(types(Tokens_S), ".*_([^_?]+).*", "\\1") # Часть речи после подчеркивания
)
# Просмотр результата
head(Dictionary)
## Word_POS Word POS
## 1 романтичный_A романтичный A
## 2 чувство_S чувство S
## 3 красивый_A красивый A
## 4 персонаж_S персонаж S
## 5 я_SPRO я SPRO
## 6 считать_V считать V
unique(Dictionary$POS)
## [1] "A" "S" "SPRO" "V" "CONJ" "PR" "APRO" "ADVPRO"
## [9] "ADV" "PART" "ANUM" "NUM" "INTJ"
Заменяю слова на их части речи. И создаю матрицу dfm Текст х Часть речи
DFM_POS <- tokens_replace(Tokens_S
, pattern = Dictionary$Word_POS
, replacement = Dictionary$POS
) |>
dfm()
head(DFM_POS)
## Document-feature matrix of: 6 documents, 13 features (42.31% sparse) and 3 docvars.
## features
## docs a s spro v conj pr apro advpro adv
## Clinical_Испытуемый_1_Картинка_1.txt 7 6 6 9 3 3 0 0 0
## Clinical_Испытуемый_1_Картинка_2.txt 8 30 24 27 19 9 4 3 9
## Clinical_Испытуемый_1_Картинка_3.txt 3 15 6 11 8 5 1 1 0
## Clinical_Испытуемый_10_Картинка_1.txt 2 11 3 4 1 3 2 2 0
## Clinical_Испытуемый_10_Картинка_2.txt 0 7 5 9 0 3 0 0 1
## Clinical_Испытуемый_10_Картинка_3.txt 1 5 3 4 1 1 1 1 0
## features
## docs part
## Clinical_Испытуемый_1_Картинка_1.txt 0
## Clinical_Испытуемый_1_Картинка_2.txt 6
## Clinical_Испытуемый_1_Картинка_3.txt 0
## Clinical_Испытуемый_10_Картинка_1.txt 0
## Clinical_Испытуемый_10_Картинка_2.txt 0
## Clinical_Испытуемый_10_Картинка_3.txt 0
## [ reached max_nfeat ... 3 more features ]
featnames(DFM_POS)
## [1] "a" "s" "spro" "v" "conj" "pr" "apro" "advpro"
## [9] "adv" "part" "anum" "num" "intj"
dfm перевожу в таблицу data.frame Добавляю общую длину текста
POS <- convert(DFM_POS, to = "data.frame")
POS$Sample <- docvars(DFM_POS, "Sample")
POS$Image <- docvars(DFM_POS, "Image")
POS <- select(POS, -doc_id)
POS <- POS |>
mutate(Words = rowSums(select(POS, -Sample, -Image))) |>
select(Sample, Image, Words, everything()) #порядок колонок
head(POS)
## Sample Image Words a s spro v conj pr apro advpro adv part anum num
## 1 Clinical picture_1 34 7 6 6 9 3 3 0 0 0 0 0 0
## 2 Clinical picture_2 139 8 30 24 27 19 9 4 3 9 6 0 0
## 3 Clinical picture_3 50 3 15 6 11 8 5 1 1 0 0 0 0
## 4 Clinical picture_1 28 2 11 3 4 1 3 2 2 0 0 0 0
## 5 Clinical picture_2 25 0 7 5 9 0 3 0 0 1 0 0 0
## 6 Clinical picture_3 17 1 5 3 4 1 1 1 1 0 0 0 0
## intj
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
Добавляем проценты.
POS <- POS |>
mutate(across(a:intj, ~ . / Words * 100, .names = "pct_{.col}"))
head(POS)
## Sample Image Words a s spro v conj pr apro advpro adv part anum num
## 1 Clinical picture_1 34 7 6 6 9 3 3 0 0 0 0 0 0
## 2 Clinical picture_2 139 8 30 24 27 19 9 4 3 9 6 0 0
## 3 Clinical picture_3 50 3 15 6 11 8 5 1 1 0 0 0 0
## 4 Clinical picture_1 28 2 11 3 4 1 3 2 2 0 0 0 0
## 5 Clinical picture_2 25 0 7 5 9 0 3 0 0 1 0 0 0
## 6 Clinical picture_3 17 1 5 3 4 1 1 1 1 0 0 0 0
## intj pct_a pct_s pct_spro pct_v pct_conj pct_pr pct_apro
## 1 0 20.588235 17.64706 17.64706 26.47059 8.823529 8.823529 0.000000
## 2 0 5.755396 21.58273 17.26619 19.42446 13.669065 6.474820 2.877698
## 3 0 6.000000 30.00000 12.00000 22.00000 16.000000 10.000000 2.000000
## 4 0 7.142857 39.28571 10.71429 14.28571 3.571429 10.714286 7.142857
## 5 0 0.000000 28.00000 20.00000 36.00000 0.000000 12.000000 0.000000
## 6 0 5.882353 29.41176 17.64706 23.52941 5.882353 5.882353 5.882353
## pct_advpro pct_adv pct_part pct_anum pct_num pct_intj
## 1 0.000000 0.00000 0.000000 0 0 0
## 2 2.158273 6.47482 4.316547 0 0 0
## 3 2.000000 0.00000 0.000000 0 0 0
## 4 7.142857 0.00000 0.000000 0 0 0
## 5 0.000000 4.00000 0.000000 0 0 0
## 6 5.882353 0.00000 0.000000 0 0 0
Отдельная таблица - все три текста по испытуемому вместе. Матрица частей речи по сумме трёх текстов
DFM_POS3 <- tokens_replace(Tokens_S3
, pattern = Dictionary$Word_POS
, replacement = Dictionary$POS
) |>
dfm()
head(DFM_POS3)
## Document-feature matrix of: 6 documents, 13 features (23.08% sparse) and 2 docvars.
## features
## docs a s spro v conj pr apro advpro adv part
## Cli1 18 51 36 47 30 17 5 4 9 6
## Cli10 3 23 11 17 2 7 3 3 1 0
## Cli11 13 75 24 65 24 25 20 0 11 15
## Cli12 12 56 10 35 9 23 6 0 4 3
## Cli13 6 47 25 47 22 27 9 12 17 26
## Cli14 7 30 42 43 24 19 10 14 21 29
## [ reached max_nfeat ... 3 more features ]
featnames(DFM_POS3)
## [1] "a" "s" "spro" "v" "conj" "pr" "apro" "advpro"
## [9] "adv" "part" "anum" "num" "intj"
Создание data.frame
POS3 <- convert(DFM_POS3, to = "data.frame")
POS3$Sample <- docvars(DFM_POS3, "Sample")
POS3 <- select(POS3, -doc_id)
POS3 <- POS3 |>
mutate(Words = rowSums(select(POS3, -Sample))) |>
select(Sample, Words, everything())
head(POS3)
## Sample Words a s spro v conj pr apro advpro adv part anum num intj
## 1 Clinical 223 18 51 36 47 30 17 5 4 9 6 0 0 0
## 2 Clinical 70 3 23 11 17 2 7 3 3 1 0 0 0 0
## 3 Clinical 273 13 75 24 65 24 25 20 0 11 15 1 0 0
## 4 Clinical 160 12 56 10 35 9 23 6 0 4 3 2 0 0
## 5 Clinical 239 6 47 25 47 22 27 9 12 17 26 0 1 0
## 6 Clinical 239 7 30 42 43 24 19 10 14 21 29 0 0 0
Пересчитываем в проценты.
POS3 <- POS3 |>
mutate(across(a:intj, ~ . / Words * 100, .names = "pct_{.col}"))
head(POS3)
## Sample Words a s spro v conj pr apro advpro adv part anum num intj
## 1 Clinical 223 18 51 36 47 30 17 5 4 9 6 0 0 0
## 2 Clinical 70 3 23 11 17 2 7 3 3 1 0 0 0 0
## 3 Clinical 273 13 75 24 65 24 25 20 0 11 15 1 0 0
## 4 Clinical 160 12 56 10 35 9 23 6 0 4 3 2 0 0
## 5 Clinical 239 6 47 25 47 22 27 9 12 17 26 0 1 0
## 6 Clinical 239 7 30 42 43 24 19 10 14 21 29 0 0 0
## pct_a pct_s pct_spro pct_v pct_conj pct_pr pct_apro pct_advpro
## 1 8.071749 22.86996 16.143498 21.07623 13.452915 7.623318 2.242152 1.793722
## 2 4.285714 32.85714 15.714286 24.28571 2.857143 10.000000 4.285714 4.285714
## 3 4.761905 27.47253 8.791209 23.80952 8.791209 9.157509 7.326007 0.000000
## 4 7.500000 35.00000 6.250000 21.87500 5.625000 14.375000 3.750000 0.000000
## 5 2.510460 19.66527 10.460251 19.66527 9.205021 11.297071 3.765690 5.020921
## 6 2.928870 12.55230 17.573222 17.99163 10.041841 7.949791 4.184100 5.857741
## pct_adv pct_part pct_anum pct_num pct_intj
## 1 4.035874 2.690583 0.0000000 0.00000 0
## 2 1.428571 0.000000 0.0000000 0.00000 0
## 3 4.029304 5.494505 0.3663004 0.00000 0
## 4 2.500000 1.875000 1.2500000 0.00000 0
## 5 7.112971 10.878661 0.0000000 0.41841 0
## 6 8.786611 12.133891 0.0000000 0.00000 0
Длинная версия таблицы
POS3_long_perc <- POS3 %>%
select(Sample, starts_with("pct_")) %>%
pivot_longer(
cols = starts_with("pct_"), # Выбираем колонки с префиксом "pct_"
names_to = "Feature", # Новая колонка с именами переменных
values_to = "Value" # Новая колонка с их значениями
)
POS3_long_abs <- POS3 %>%
select(-starts_with("pct_")) %>% # Удаляем колонки с префиксом "pct_"
pivot_longer(
cols = -Sample, # Оставляем колонку Sample неизменной
names_to = "Feature", # Новая колонка с именами переменных
values_to = "Value" # Новая колонка с их значениями
)
head(POS3_long_perc)
## # A tibble: 6 × 3
## Sample Feature Value
## <fct> <chr> <dbl>
## 1 Clinical pct_a 8.07
## 2 Clinical pct_s 22.9
## 3 Clinical pct_spro 16.1
## 4 Clinical pct_v 21.1
## 5 Clinical pct_conj 13.5
## 6 Clinical pct_pr 7.62
head(POS3_long_abs)
## # A tibble: 6 × 3
## Sample Feature Value
## <fct> <chr> <dbl>
## 1 Clinical Words 223
## 2 Clinical a 18
## 3 Clinical s 51
## 4 Clinical spro 36
## 5 Clinical v 47
## 6 Clinical conj 30
Части речи
A прилагательное
ADV наречие
ADVPRO местоименное наречие
ANUM числительное-прилагательное
APRO местоимение-прилагательное
COM часть композита - сложного слова
CONJ союз
INTJ междометие
NUM числительное
PART частица
PR предлог
S существительное
SPRO местоимение-существительное
V глагол
POS %>%
group_by(Sample, Image) %>%
summarise(across(Words, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop")%>%
kable(digits = 2)
| Sample | Image | Words_mean | Words_sd | Words_median | Words_min | Words_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 63.72 | 27.57 | 61.0 | 28 | 131 |
| Clinical | picture_2 | 59.04 | 26.98 | 51.0 | 24 | 139 |
| Clinical | picture_3 | 70.16 | 32.01 | 64.0 | 17 | 160 |
| Control | picture_1 | 120.33 | 60.02 | 107.5 | 45 | 265 |
| Control | picture_2 | 101.73 | 46.86 | 96.5 | 28 | 210 |
| Control | picture_3 | 110.57 | 48.74 | 105.5 | 49 | 266 |
POS %>%
group_by(Sample, Image) %>%
summarise(across(s, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop")%>%
kable(digits = 2)
| Sample | Image | s_mean | s_sd | s_median | s_min | s_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 12.12 | 5.53 | 11.0 | 5 | 27 |
| Clinical | picture_2 | 11.44 | 6.93 | 12.0 | 2 | 30 |
| Clinical | picture_3 | 17.84 | 9.18 | 15.0 | 5 | 48 |
| Control | picture_1 | 20.50 | 12.48 | 17.0 | 8 | 62 |
| Control | picture_2 | 17.63 | 9.32 | 16.5 | 2 | 37 |
| Control | picture_3 | 22.97 | 9.46 | 22.5 | 8 | 54 |
POS %>%
group_by(Sample, Image) %>%
summarise(across(pct_s, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop")%>%
kable(digits = 2)
| Sample | Image | pct_s_mean | pct_s_sd | pct_s_median | pct_s_min | pct_s_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 20.27 | 7.12 | 19.70 | 8.77 | 39.29 |
| Clinical | picture_2 | 19.10 | 7.96 | 20.00 | 4.17 | 37.70 |
| Clinical | picture_3 | 26.06 | 6.40 | 24.56 | 17.05 | 41.67 |
| Control | picture_1 | 17.65 | 6.42 | 16.23 | 6.67 | 32.86 |
| Control | picture_2 | 17.63 | 6.17 | 18.02 | 5.56 | 29.58 |
| Control | picture_3 | 21.57 | 5.81 | 21.30 | 10.65 | 33.72 |
POS %>%
group_by(Sample, Image) %>%
summarise(across(a, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop")%>%
kable(digits = 2)
| Sample | Image | a_mean | a_sd | a_median | a_min | a_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 3.16 | 2.06 | 3 | 0 | 8 |
| Clinical | picture_2 | 2.16 | 2.30 | 1 | 0 | 8 |
| Clinical | picture_3 | 2.84 | 2.27 | 2 | 0 | 9 |
| Control | picture_1 | 5.17 | 5.30 | 4 | 0 | 29 |
| Control | picture_2 | 4.40 | 2.70 | 5 | 0 | 11 |
| Control | picture_3 | 4.83 | 3.31 | 4 | 0 | 13 |
POS %>%
group_by(Sample, Image) %>%
summarise(across(pct_a, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop")%>%
kable(digits = 2)
| Sample | Image | pct_a_mean | pct_a_sd | pct_a_median | pct_a_min | pct_a_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 5.32 | 4.08 | 4.44 | 0 | 20.59 |
| Clinical | picture_2 | 3.21 | 2.80 | 2.44 | 0 | 10.17 |
| Clinical | picture_3 | 4.25 | 3.08 | 2.78 | 0 | 12.07 |
| Control | picture_1 | 4.18 | 2.33 | 4.26 | 0 | 10.94 |
| Control | picture_2 | 4.41 | 2.57 | 3.99 | 0 | 9.80 |
| Control | picture_3 | 4.10 | 1.93 | 4.23 | 0 | 7.94 |
POS %>%
group_by(Sample, Image) %>%
summarise(across(adv, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop") %>%
kable(digits = 2)
| Sample | Image | adv_mean | adv_sd | adv_median | adv_min | adv_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 3.28 | 2.92 | 2.0 | 0 | 11 |
| Clinical | picture_2 | 3.36 | 2.40 | 3.0 | 0 | 9 |
| Clinical | picture_3 | 4.32 | 3.02 | 4.0 | 0 | 11 |
| Control | picture_1 | 8.73 | 6.96 | 6.5 | 1 | 29 |
| Control | picture_2 | 7.10 | 3.90 | 7.0 | 1 | 18 |
| Control | picture_3 | 9.37 | 5.97 | 8.0 | 2 | 26 |
POS %>%
group_by(Sample, Image) %>%
summarise(across(pct_adv, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop") %>%
kable(digits = 2)
| Sample | Image | pct_adv_mean | pct_adv_sd | pct_adv_median | pct_adv_min | pct_adv_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 4.96 | 3.49 | 4.08 | 0.00 | 11.67 |
| Clinical | picture_2 | 5.77 | 3.86 | 5.48 | 0.00 | 18.60 |
| Clinical | picture_3 | 5.89 | 3.50 | 5.21 | 0.00 | 12.50 |
| Control | picture_1 | 6.94 | 3.68 | 6.81 | 2.22 | 20.69 |
| Control | picture_2 | 7.12 | 2.85 | 7.29 | 2.75 | 15.49 |
| Control | picture_3 | 8.12 | 2.81 | 8.12 | 3.70 | 13.11 |
POS %>%
group_by(Sample, Image) %>%
summarise(across(v, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop") %>%
kable(digits = 2)
| Sample | Image | v_mean | v_sd | v_median | v_min | v_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 13.96 | 6.08 | 13.0 | 3 | 26 |
| Clinical | picture_2 | 13.88 | 5.54 | 14.0 | 5 | 27 |
| Clinical | picture_3 | 14.80 | 6.67 | 14.0 | 4 | 30 |
| Control | picture_1 | 25.40 | 11.05 | 22.5 | 9 | 53 |
| Control | picture_2 | 20.90 | 9.95 | 20.0 | 5 | 48 |
| Control | picture_3 | 20.17 | 7.10 | 20.0 | 9 | 36 |
POS %>%
group_by(Sample, Image) %>%
summarise(across(pct_v, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop") %>%
kable(digits = 2)
| Sample | Image | pct_v_mean | pct_v_sd | pct_v_median | pct_v_min | pct_v_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 21.96 | 4.89 | 21.31 | 10.71 | 31.03 |
| Clinical | picture_2 | 24.69 | 6.81 | 24.59 | 11.63 | 36.00 |
| Clinical | picture_3 | 21.31 | 3.70 | 21.74 | 12.50 | 28.38 |
| Control | picture_1 | 21.78 | 3.96 | 21.58 | 15.75 | 31.46 |
| Control | picture_2 | 21.17 | 5.86 | 20.67 | 7.59 | 36.11 |
| Control | picture_3 | 18.98 | 4.07 | 18.99 | 11.76 | 27.91 |
она, он, они, я
POS %>%
group_by(Sample, Image) %>%
summarise(across(spro, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop") %>%
kable(digits = 2)
| Sample | Image | spro_mean | spro_sd | spro_median | spro_min | spro_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 9.64 | 5.16 | 9.0 | 3 | 21 |
| Clinical | picture_2 | 9.16 | 5.57 | 9.0 | 2 | 24 |
| Clinical | picture_3 | 7.24 | 3.90 | 7.0 | 3 | 21 |
| Control | picture_1 | 17.30 | 9.92 | 14.0 | 6 | 38 |
| Control | picture_2 | 16.17 | 8.57 | 15.5 | 4 | 34 |
| Control | picture_3 | 13.63 | 7.84 | 13.0 | 2 | 41 |
POS %>%
group_by(Sample, Image) %>%
summarise(across(pct_spro, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop") %>%
kable(digits = 2)
| Sample | Image | pct_spro_mean | pct_spro_sd | pct_spro_median | pct_spro_min | pct_spro_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 14.98 | 4.85 | 12.73 | 8.06 | 24.32 |
| Clinical | picture_2 | 16.12 | 7.29 | 16.67 | 3.28 | 33.33 |
| Clinical | picture_3 | 10.62 | 3.53 | 11.11 | 4.69 | 18.92 |
| Control | picture_1 | 14.58 | 4.84 | 14.17 | 5.78 | 25.98 |
| Control | picture_2 | 15.87 | 4.37 | 16.15 | 8.00 | 25.00 |
| Control | picture_3 | 12.25 | 4.52 | 12.50 | 4.08 | 25.45 |
этот, свой, какой-то.
POS %>%
group_by(Sample, Image) %>%
summarise(across(apro, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop") %>%
kable(digits = 2)
| Sample | Image | apro_mean | apro_sd | apro_median | apro_min | apro_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 2.92 | 1.87 | 3 | 0 | 6 |
| Clinical | picture_2 | 2.24 | 1.94 | 2 | 0 | 6 |
| Clinical | picture_3 | 2.80 | 3.10 | 2 | 0 | 11 |
| Control | picture_1 | 6.53 | 4.93 | 5 | 2 | 19 |
| Control | picture_2 | 6.40 | 4.47 | 5 | 0 | 16 |
| Control | picture_3 | 6.20 | 5.52 | 5 | 0 | 27 |
POS %>%
group_by(Sample, Image) %>%
summarise(across(pct_apro, list(mean = mean
, sd = sd
, median = median
, min = min
, max = max))
, .groups = "drop") %>%
kable(digits = 2)
| Sample | Image | pct_apro_mean | pct_apro_sd | pct_apro_median | pct_apro_min | pct_apro_max |
|---|---|---|---|---|---|---|
| Clinical | picture_1 | 4.80 | 3.37 | 4.58 | 0.00 | 14.29 |
| Clinical | picture_2 | 3.43 | 2.59 | 3.28 | 0.00 | 8.82 |
| Clinical | picture_3 | 3.52 | 2.63 | 2.70 | 0.00 | 8.33 |
| Control | picture_1 | 5.33 | 2.22 | 5.00 | 1.65 | 10.91 |
| Control | picture_2 | 6.01 | 2.99 | 6.34 | 0.00 | 12.24 |
| Control | picture_3 | 4.99 | 3.02 | 4.36 | 0.00 | 11.90 |
POS3 %>%
group_by(Sample) %>%
summarise(
across(everything(), list(
mean = ~ mean(.),
sd = ~ sd(.),
median = ~ median(.),
min = ~ min(.),
max = ~ max(.)
), .names = "{.col}_{.fn}")
) %>%
pivot_longer(
cols = -Sample,
# Оставляем 'Sample' как неизменную колонку
names_to = c("Feature", "Statistic"),
# Разбиваем название на части
names_pattern = "(.*)_(.*)",
# Разделяем на две части: Feature и Statistic
values_to = "Value"
# Указываем название колонки для значений
) %>%
mutate(
#Добавляем колонку, указывающую, является ли Feature процентной колонкой
Type = if_else(str_detect(Feature, "^pct_"), "Percentage", "Absolute")
, Feature = str_remove(Feature, "^pct_")
# Убираем префикс pct_, если он есть
) %>%
pivot_wider(
names_from = c(Sample, Type),
values_from = Value # Значения берём из колонки Value
) %>%
select(-Clinical_Percentage, -Control_Percentage, Clinical_Percentage, Control_Percentage)%>%
rename(
Clin_Abs = Clinical_Absolute
, Clin_Perc = Clinical_Percentage
, Cont_Abs = Control_Absolute
, Cont_Perc = Control_Percentage
) %>%
kable(digits = 2)
| Feature | Statistic | Clin_Abs | Cont_Abs | Clin_Perc | Cont_Perc |
|---|---|---|---|---|---|
| Words | mean | 192.92 | 332.63 | NA | NA |
| Words | sd | 67.68 | 142.54 | NA | NA |
| Words | median | 184.00 | 308.00 | NA | NA |
| Words | min | 70.00 | 151.00 | NA | NA |
| Words | max | 388.00 | 730.00 | NA | NA |
| a | mean | 8.16 | 14.40 | 4.24 | 4.27 |
| a | sd | 4.68 | 8.94 | 1.95 | 1.67 |
| a | median | 7.00 | 12.50 | 4.00 | 4.17 |
| a | min | 2.00 | 3.00 | 1.13 | 0.91 |
| a | max | 19.00 | 41.00 | 8.07 | 7.90 |
| s | mean | 41.40 | 61.10 | 21.92 | 18.93 |
| s | sd | 17.75 | 26.67 | 6.22 | 5.09 |
| s | median | 36.00 | 55.50 | 20.65 | 18.74 |
| s | min | 16.00 | 23.00 | 11.70 | 9.17 |
| s | max | 96.00 | 123.00 | 35.00 | 31.69 |
| spro | mean | 26.04 | 47.10 | 13.76 | 14.22 |
| spro | sd | 9.92 | 23.21 | 3.88 | 3.66 |
| spro | median | 25.00 | 44.50 | 14.49 | 14.18 |
| spro | min | 10.00 | 13.00 | 6.25 | 7.65 |
| spro | max | 44.00 | 102.00 | 20.24 | 22.35 |
| v | mean | 42.64 | 66.47 | 22.34 | 20.35 |
| v | sd | 14.19 | 26.20 | 3.33 | 3.28 |
| v | median | 44.00 | 59.00 | 21.88 | 19.66 |
| v | min | 17.00 | 28.00 | 14.29 | 14.41 |
| v | max | 72.00 | 135.00 | 29.35 | 28.23 |
| conj | mean | 18.52 | 33.43 | 9.11 | 9.78 |
| conj | sd | 9.81 | 18.19 | 2.57 | 2.09 |
| conj | median | 18.00 | 30.00 | 9.14 | 9.92 |
| conj | min | 2.00 | 11.00 | 2.86 | 5.76 |
| conj | max | 46.00 | 80.00 | 13.45 | 13.51 |
| pr | mean | 18.44 | 30.23 | 9.68 | 9.41 |
| pr | sd | 7.41 | 11.36 | 2.20 | 2.51 |
| pr | median | 18.00 | 29.50 | 9.77 | 9.26 |
| pr | min | 7.00 | 11.00 | 5.71 | 5.25 |
| pr | max | 43.00 | 58.00 | 14.37 | 16.09 |
| apro | mean | 7.96 | 19.13 | 3.98 | 5.48 |
| apro | sd | 5.62 | 12.91 | 2.09 | 1.89 |
| apro | median | 7.00 | 16.50 | 3.80 | 5.12 |
| apro | min | 1.00 | 5.00 | 0.57 | 2.66 |
| apro | max | 22.00 | 61.00 | 8.70 | 9.64 |
| advpro | mean | 5.40 | 11.73 | 2.77 | 3.35 |
| advpro | sd | 4.74 | 8.39 | 1.98 | 1.94 |
| advpro | median | 4.00 | 10.50 | 2.58 | 3.44 |
| advpro | min | 0.00 | 0.00 | 0.00 | 0.00 |
| advpro | max | 18.00 | 33.00 | 6.79 | 8.24 |
| adv | mean | 10.96 | 25.20 | 5.59 | 7.50 |
| adv | sd | 6.21 | 13.52 | 2.77 | 2.13 |
| adv | median | 9.00 | 23.00 | 5.93 | 7.61 |
| adv | min | 1.00 | 7.00 | 1.43 | 3.70 |
| adv | max | 23.00 | 65.00 | 13.53 | 14.62 |
| part | mean | 12.96 | 22.53 | 6.34 | 6.38 |
| part | sd | 9.41 | 15.13 | 3.73 | 2.77 |
| part | median | 11.00 | 21.50 | 5.56 | 6.81 |
| part | min | 0.00 | 1.00 | 0.00 | 0.57 |
| part | max | 39.00 | 55.00 | 14.72 | 11.73 |
| anum | mean | 0.12 | 0.33 | 0.06 | 0.10 |
| anum | sd | 0.44 | 0.48 | 0.26 | 0.16 |
| anum | median | 0.00 | 0.00 | 0.00 | 0.00 |
| anum | min | 0.00 | 0.00 | 0.00 | 0.00 |
| anum | max | 2.00 | 1.00 | 1.25 | 0.57 |
| num | mean | 0.28 | 0.63 | 0.18 | 0.17 |
| num | sd | 0.46 | 1.07 | 0.33 | 0.27 |
| num | median | 0.00 | 0.00 | 0.00 | 0.00 |
| num | min | 0.00 | 0.00 | 0.00 | 0.00 |
| num | max | 1.00 | 4.00 | 1.09 | 0.98 |
| intj | mean | 0.04 | 0.33 | 0.02 | 0.07 |
| intj | sd | 0.20 | 0.99 | 0.12 | 0.19 |
| intj | median | 0.00 | 0.00 | 0.00 | 0.00 |
| intj | min | 0.00 | 0.00 | 0.00 | 0.00 |
| intj | max | 1.00 | 5.00 | 0.60 | 0.82 |
Число слов в тексте
POS %>%
ggplot(aes(x = Image, y = Words, fill = Sample)) +
geom_boxplot() +
labs(
title = "Распределение количества слов по выборкам и текстам",
x = "Тексты (картинки)",
y = "Количество слов",
fill = "Выборка"
) +
theme_minimal()
POS %>%
mutate( Image = recode(Image,
"picture_1" = "Карточка 4",
"picture_2" = "Карточка 6gf",
"picture_3" = "Карточка 7gf"),
Sample = recode(Sample,
"Clinical" = "Клиническая (ПРЛ)",
"Control" = "Контрольная")
) %>%
ggplot(aes(x = Image, y = Words, fill = Sample)) +
geom_boxplot() +
labs(
title = "Распределение длины текстов (количество слов) по выборкам и стимульным карточкам ТАТ",
x = "Стимульные карточки ТАТ",
y = "Количество слов",
fill = "Выборка"
) +
theme_minimal()
POS %>%
mutate(
Image = recode(Image,
"picture_1" = "Card 4",
"picture_2" = "Card 6gf",
"picture_3" = "Card gf"),
Sample = recode(Sample,
"Clinical" = "Clinical (BPD)",
"Control" = "Control")
) %>%
ggplot(aes(x = Image, y = Words, fill = Sample)) +
geom_boxplot() +
labs(
title = "Distribution of text length (word count) by samples and TAT stimulus cards",
x = "TAT stimulus cards",
y = "Word count",
fill = "Sample"
) +
theme_minimal()
POS %>%
mutate( Image = recode(Image,
"picture_1" = "Карточка 4\nCard 4",
"picture_2" = "Карточка 6gf\nCard 6gf",
"picture_3" = "Карточка 7gf\nCard 7gf"),
Sample = recode(Sample,
"Clinical" = "Клиническая (ПРЛ)\nClinical (BPD)",
"Control" = "Контрольная\nControl")
) %>%
ggplot(aes(x = Image, y = Words, fill = Sample)) +
geom_boxplot() +
labs(
title = "Распределение длины текстов (количество слов) по выборкам и стимульным карточкам ТАТ\nDistribution of text length (word count) by samples and TAT stimulus cards",
x = "Стимульные карточки ТАТ\nTAT stimulus cards",
y = "Количество слов\nWord count",
fill = "Выборка\nSample"
) +
theme_minimal()
POS %>%
ggplot(aes(x = Words)) +
geom_histogram(
position = "identity",
alpha = 0.6,
bins = 20
) +
facet_grid(Sample ~ Image) +
labs(
title = "Распределение количества слов по текстам и выборкам",
x = "Количество слов",
y = "Частота",
fill = "Выборка"
) +
theme_minimal()
POS3_long_perc |>
filter(!Feature %in% c("pct_anum", "pct_num", "pct_intj"))|>
mutate(Feature = str_remove(Feature, "^pct_"))|>
ggplot(aes(x = Feature, y = Value, fill = Sample)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Распределение частей речи (доля)",
x = "Части речи",
y = "Процентное значение",
fill = "Группа"
)
POS3_long_perc |>
filter(Feature %in% c("pct_a", "pct_adv", "pct_v", "pct_s")) |>
mutate(Feature = recode(Feature,
"pct_a" = "Прилагательные",
"pct_adv" = "Наречия**",
"pct_v" = "Глаголы*",
"pct_s" = "Существительные"),
Sample = recode(Sample,
"Clinical" = "Клиническая (ПРЛ)",
"Control" = "Контрольная")) |>
ggplot(aes(x = Feature, y = Value, fill = Sample)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Распределение доли отдельных частей речи в текстах испытуемых",
x = "Часть речи",
y = "Доля от общего количества слов (%)",
fill = "Выборка"
)
POS3_long_perc |>
filter(Feature %in% c("pct_a", "pct_adv", "pct_v", "pct_s")) |>
mutate(Feature = recode(Feature,
"pct_a" = "Adjectives",
"pct_adv" = "Adverbs**",
"pct_v" = "Verbs*",
"pct_s" = "Nouns"),
Sample = recode(Sample,
"Clinical" = "Clinical (BPD)",
"Control" = "Control")) |>
ggplot(aes(x = Feature, y = Value, fill = Sample)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Distribution of the Proportion of Speech Parts in Participants' Texts",
x = "Part of Speech",
y = "Proportion of Total Words (%)",
fill = "Group"
)
POS3_long_perc |>
filter(Feature %in% c("pct_a", "pct_adv", "pct_v", "pct_s")) |>
mutate(Feature = recode(Feature,
"pct_a" = "Прилагательные\nAdjectives",
"pct_adv" = "Наречия**\nAdverbs**",
"pct_v" = "Глаголы*\nVerbs*",
"pct_s" = "Существительные\nNouns"),
Sample = recode(Sample,
"Clinical" = "Клиническая (ПРЛ)\nClinical (BPD)",
"Control" = "Контрольная\nControl")) |>
ggplot(aes(x = Feature, y = Value, fill = Sample)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Распределение доли отдельных частей речи в текстах испытуемых\nDistribution of the Proportion of Speech Parts in Participants' Texts",
x = "Часть речи\nPart of Speech",
y = "Доля от общего количества слов (%)\nProportion of Total Words (%)",
fill = "Выборка\nGroup"
)
POS3_long_abs |>
filter(!Feature %in% c("anum", "num", "intj", "Words"))|>
ggplot(aes(x = Feature, y = Value, fill = Sample)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Распределение частей речи (абсолютные значения)",
x = "Части речи",
y = "Частота",
fill = "Группа"
)
Сравниваем общую длину всех трёх текстов по испытуемому.
Проверка на нормальность распределения
POS3 %>%
group_by(Sample) %>%
summarise(p_value = shapiro.test(Words)$p.value)%>%
kable(digits = 3)
| Sample | p_value |
|---|---|
| Clinical | 0.426 |
| Control | 0.014 |
Распределение в контрольной группе значимо (p<0,05) отличается от нормального. Выбираем непараметрический тест Вилкоксона
wilcox.test(Words ~ Sample, data = POS3, exact = TRUE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Words by Sample
## W = 121.5, p-value = 1.897e-05
## alternative hypothesis: true location shift is not equal to 0
Критерий Вилкоксона по остальным переменным
POS3 %>%
select(Sample, starts_with("pct_")) %>%
pivot_longer(cols = -Sample, names_to = "Feature", values_to = "Value") %>%
group_by(Feature) %>%
summarise(
p_value = wilcox.test(Value ~ Sample)$p.value,
W = wilcox.test(Value ~ Sample)$statistic
) %>%
mutate(p_adj = p.adjust(p_value, method = "BH")) %>%
mutate(
p_adj = p.adjust(p_value, method = "BH"),
Significance_adj = case_when(
p_adj < 0.001 ~ "***",
p_adj < 0.01 ~ "**",
p_adj < 0.05 ~ "*",
p_adj < 0.1 ~ "'",
TRUE ~ ""
),
Significance = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
p_value < 0.1 ~ "'",
TRUE ~ ""
))%>%
select(Feature, p_value, Significance, everything())%>%
as.data.frame()%>%
kable(digits = 4)
| Feature | p_value | Significance | W | p_adj | Significance_adj |
|---|---|---|---|---|---|
| pct_a | 0.8459 | 363.0 | 0.9000 | ||
| pct_adv | 0.0032 | ** | 203.0 | 0.0418 | * |
| pct_advpro | 0.2869 | 311.5 | 0.5328 | ||
| pct_anum | 0.0430 | * | 288.0 | 0.1399 | |
| pct_apro | 0.0087 | ** | 221.0 | 0.0566 | ’ |
| pct_conj | 0.3658 | 321.0 | 0.5945 | ||
| pct_intj | 0.1553 | 329.0 | 0.3365 | ||
| pct_num | 0.7534 | 359.0 | 0.8904 | ||
| pct_part | 0.6326 | 346.0 | 0.8224 | ||
| pct_pr | 0.4251 | 423.0 | 0.6140 | ||
| pct_s | 0.0832 | ’ | 478.0 | 0.2162 | |
| pct_spro | 0.9000 | 367.0 | 0.9000 | ||
| pct_v | 0.0298 | * | 504.0 | 0.1293 |
Заметные различия появляются в Наречиях. Они остаются если ввести поправку на множественные сравнения. Но еще можно обратить внимание на глаголы. Есть тенденция. Можно исключить проверку на множественные сравнения.
По отдельным текстам:
POS %>%
select(Sample, Image, starts_with("pct_")) %>%
pivot_longer(cols = c(-Sample, -Image), names_to = "Feature", values_to = "Value") %>%
group_by(Image, Feature) %>%
summarise(
p_value = wilcox.test(Value ~ Sample)$p.value,
W = wilcox.test(Value ~ Sample)$statistic
) %>%
mutate(p_adj = p.adjust(p_value, method = "BH")) %>%
kable(digits = 2)
| Image | Feature | p_value | W | p_adj |
|---|---|---|---|---|
| picture_1 | pct_a | 0.26 | 441.5 | 0.65 |
| picture_1 | pct_adv | 0.04 | 255.5 | 0.58 |
| picture_1 | pct_advpro | 0.82 | 361.0 | 0.91 |
| picture_1 | pct_anum | 0.18 | 323.5 | 0.65 |
| picture_1 | pct_apro | 0.30 | 313.0 | 0.65 |
| picture_1 | pct_conj | 0.31 | 315.0 | 0.65 |
| picture_1 | pct_intj | 0.38 | 362.5 | 0.65 |
| picture_1 | pct_num | 0.54 | 355.0 | 0.78 |
| picture_1 | pct_part | 0.40 | 325.0 | 0.65 |
| picture_1 | pct_pr | 0.87 | 385.5 | 0.91 |
| picture_1 | pct_s | 0.17 | 457.5 | 0.65 |
| picture_1 | pct_spro | 0.91 | 382.5 | 0.91 |
| picture_1 | pct_v | 0.83 | 388.0 | 0.91 |
| picture_2 | pct_a | 0.03 | 247.5 | 0.11 |
| picture_2 | pct_adv | 0.03 | 248.0 | 0.11 |
| picture_2 | pct_advpro | 0.18 | 296.0 | 0.37 |
| picture_2 | pct_anum | 0.38 | 362.5 | 0.62 |
| picture_2 | pct_apro | 0.00 | 189.0 | 0.02 |
| picture_2 | pct_conj | 0.66 | 348.5 | 0.82 |
| picture_2 | pct_intj | 0.20 | 350.0 | 0.37 |
| picture_2 | pct_num | 0.11 | 337.5 | 0.29 |
| picture_2 | pct_part | 0.70 | 398.5 | 0.82 |
| picture_2 | pct_pr | 0.92 | 381.5 | 0.92 |
| picture_2 | pct_s | 0.48 | 417.5 | 0.69 |
| picture_2 | pct_spro | 0.82 | 361.0 | 0.89 |
| picture_2 | pct_v | 0.03 | 500.5 | 0.11 |
| picture_3 | pct_a | 0.59 | 342.5 | 0.72 |
| picture_3 | pct_adv | 0.02 | 231.5 | 0.11 |
| picture_3 | pct_advpro | 0.22 | 302.5 | 0.39 |
| picture_3 | pct_anum | 0.20 | 350.0 | 0.39 |
| picture_3 | pct_apro | 0.09 | 275.0 | 0.30 |
| picture_3 | pct_conj | 0.67 | 349.0 | 0.72 |
| picture_3 | pct_intj | 0.27 | 342.0 | 0.39 |
| picture_3 | pct_num | 0.65 | 354.0 | 0.72 |
| picture_3 | pct_part | 0.25 | 306.0 | 0.39 |
| picture_3 | pct_pr | 0.83 | 388.0 | 0.83 |
| picture_3 | pct_s | 0.02 | 516.5 | 0.11 |
| picture_3 | pct_spro | 0.14 | 287.5 | 0.37 |
| picture_3 | pct_v | 0.03 | 504.5 | 0.13 |
save(Dictionary, file = "Dictionary.RData")