# Record when and in which environment the analysis was run.
# The former `rm(list = ls())` was removed: it deletes the caller's workspace
# objects without actually resetting the session (attached packages and
# options stay as they are). Run the script in a fresh R session instead.
date()
## [1] "Thu Aug 27 00:32:49 2020"
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.6.1 magrittr_1.5 tools_3.6.1 htmltools_0.5.0
## [5] yaml_2.2.1 stringi_1.4.6 rmarkdown_2.3 knitr_1.29
## [9] stringr_1.4.0 xfun_0.16 digest_0.6.25 rlang_0.4.7
## [13] evaluate_0.14
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.6.2
## Package version: 2.1.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2
#library(stringr)
# Restore the saved objects from the working directory. `Tokens_S`, used
# below, is expected to come from one of these files.
# NOTE(review): the two file names use different extension casing
# (.RData / .Rdata) - confirm the names on disk; this matters on
# case-sensitive file systems.
load("Corpus.RData")
#load(file = "Corpus_S.RData")
load("Tokens_S.Rdata")
# load(file = "Freq_plus_Fun.RData") #Таблица с частотами НКРЯ
# Количество текстов
# Number of documents in the tokens object.
ndoc(x = Tokens_S)
## [1] 281
# Build a per-document table of descriptive statistics from the tokens.
DFM <- dfm(Tokens_S)

# Start from the per-document counts of the "__s" feature, renamed to
# `Sentences`.
# NOTE(review): "__s" looks like a sentence-boundary marker token - confirm
# against the code that produced Tokens_S.
Descript <- DFM %>%
  dfm_keep(pattern = "__s") %>%
  convert(to = "data.frame") %>%
  rename(Sentences = "__s")

# Token and type totals are taken over the full DFM.
# NOTE(review): the full DFM still contains the "__s" feature, so it is
# presumably counted in both totals - confirm this is intended.
Descript[["Tokens"]] <- ntoken(DFM)
Descript[["Types"]] <- ntype(DFM)

# Mean sentence length = tokens per "__s" occurrence.
Descript[["SentLength"]] <- with(Descript, Tokens / Sentences)

# Group label of each document, taken from the tokens object.
Descript[["Group"]] <- Tokens_S$Group
summary(Descript)
## doc_id Sentences Tokens Types
## Length:281 Min. :16.00 Min. : 366.0 Min. :193.0
## Class :character 1st Qu.:34.00 1st Qu.: 619.0 1st Qu.:312.0
## Mode :character Median :41.00 Median : 726.0 Median :357.0
## Mean :43.29 Mean : 751.1 Mean :360.2
## 3rd Qu.:51.00 3rd Qu.: 848.0 3rd Qu.:398.0
## Max. :93.00 Max. :1400.0 Max. :633.0
## SentLength Group
## Min. :10.58 Length:281
## 1st Qu.:15.22 Class :character
## Median :17.23 Mode :character
## Mean :18.39
## 3rd Qu.:20.34
## Max. :38.22
# Вывод сортированной таблицы для поиска аномалий
# First six rows after sorting by SentLength in ascending order
# (the documents with the smallest tokens-per-sentence ratio).
Descript %>%
  arrange(SentLength) %>%
  head(n = 6L)
## doc_id Sentences Tokens Types SentLength Group
## 1 IDOR_100 83 878 456 10.57831 IDOR
## 2 Sluzh_274 65 720 297 11.07692 Sluzh
## 3 IDOR_164 82 911 301 11.10976 IDOR
## 4 IP1610_222 64 712 290 11.12500 IP1610
## 5 IDOR_92 93 1103 469 11.86022 IDOR
## 6 IDOR_154 54 646 371 11.96296 IDOR
# Last six rows after sorting by SentLength in ascending order
# (the documents with the largest tokens-per-sentence ratio).
Descript %>%
  arrange(SentLength) %>%
  tail(n = 6L)
## doc_id Sentences Tokens Types SentLength Group
## 276 Bio_3 18 608 289 33.77778 Bio
## 277 Chem2_70 29 993 396 34.24138 Chem2
## 278 IDOR_187 20 686 377 34.30000 IDOR
## 279 IDOR_220 21 726 321 34.57143 IDOR
## 280 IDOR_213 20 698 338 34.90000 IDOR
## 281 Phy2_249 18 688 349 38.22222 Phy2
# First six rows after sorting by Sentences in ascending order
# (the documents with the fewest "__s" occurrences).
Descript %>%
  arrange(Sentences) %>%
  head(n = 6L)
## doc_id Sentences Tokens Types SentLength Group
## 1 Bio_16 16 410 232 25.62500 Bio
## 2 IDOR_174 16 426 256 26.62500 IDOR
## 3 Bio_3 18 608 289 33.77778 Bio
## 4 Bio2_30 18 445 223 24.72222 Bio2
## 5 Chem_59 18 459 226 25.50000 Chem
## 6 IDOR_137 18 600 301 33.33333 IDOR
# Last six rows after sorting by Sentences in ascending order
# (the documents with the most "__s" occurrences).
Descript %>%
  arrange(Sentences) %>%
  tail(n = 6L)
## doc_id Sentences Tokens Types SentLength Group
## 276 Chem_54 85 1399 588 16.45882 Chem
## 277 IDOR_107 86 1204 531 14.00000 IDOR
## 278 IDOR_195 88 1301 610 14.78409 IDOR
## 279 IDOR_200 89 1077 518 12.10112 IDOR
## 280 IDOR_217 92 1400 633 15.21739 IDOR
## 281 IDOR_92 93 1103 469 11.86022 IDOR
# Печать текста для проверки
# texts(Corpus[docnames(Corpus) == "Phy2_249"])
# docnames(Corpus_S)
# Corpus_S$Group
# docvars(Corpus_S)
# Number of documents in each group.
table(Descript[["Group"]])
##
## Bio Bio2 Chem Chem2 IDOR IP1610 IP1842 Phy2 Sluzh
## 29 18 22 21 130 13 11 17 20
# Mean and standard deviation of text length (in tokens) for each group.
# `.groups = "drop"` returns an ungrouped table with one row per group.
# The earlier `groups = Group` was not the `.groups` option: it added a column
# holding every row's Group value, which expanded the summary to one row per
# document (281 rows) with the group statistics repeated.
# The misspelled column name `Lenth` is corrected to `Length`.
# Re-knit to refresh the recorded output that follows this call.
Descript %>%
  group_by(Group) %>%
  summarise(Length = mean(Tokens), SD = sd(Tokens), .groups = "drop")
## `summarise()` regrouping output by 'Group' (override with `.groups` argument)
## # A tibble: 281 x 4
## # Groups: Group [9]
## Group Lenth SD groups
## <chr> <dbl> <dbl> <chr>
## 1 Bio 717. 188. Bio
## 2 Bio 717. 188. Bio
## 3 Bio 717. 188. Bio
## 4 Bio 717. 188. Bio
## 5 Bio 717. 188. Bio
## 6 Bio 717. 188. Bio
## 7 Bio 717. 188. Bio
## 8 Bio 717. 188. Bio
## 9 Bio 717. 188. Bio
## 10 Bio 717. 188. Bio
## # … with 271 more rows
# Box plots of text length: Tokens on the x-axis, one box per Group on the
# y-axis.
ggplot(data = Descript, mapping = aes(x = Tokens, y = Group)) +
  geom_boxplot()
# Mean and standard deviation of sentence length (tokens per sentence) for
# each group.
# `.groups = "drop"` returns an ungrouped table with one row per group.
# The earlier `groups = Group` was not the `.groups` option: it added a column
# holding every row's Group value, which expanded the summary to one row per
# document (281 rows) with the group statistics repeated.
# The misspelled column name `Lenth` is corrected to `Length`.
# Re-knit to refresh the recorded output that follows this call.
Descript %>%
  group_by(Group) %>%
  summarise(Length = mean(SentLength), SD = sd(SentLength), .groups = "drop")
## `summarise()` regrouping output by 'Group' (override with `.groups` argument)
## # A tibble: 281 x 4
## # Groups: Group [9]
## Group Lenth SD groups
## <chr> <dbl> <dbl> <chr>
## 1 Bio 18.1 5.19 Bio
## 2 Bio 18.1 5.19 Bio
## 3 Bio 18.1 5.19 Bio
## 4 Bio 18.1 5.19 Bio
## 5 Bio 18.1 5.19 Bio
## 6 Bio 18.1 5.19 Bio
## 7 Bio 18.1 5.19 Bio
## 8 Bio 18.1 5.19 Bio
## 9 Bio 18.1 5.19 Bio
## 10 Bio 18.1 5.19 Bio
## # … with 271 more rows
# Box plots of sentence length: SentLength on the x-axis, one box per Group
# on the y-axis.
ggplot(data = Descript, mapping = aes(x = SentLength, y = Group)) +
  geom_boxplot()