# Run this analysis in a fresh R session (restart R or knit the document)
# rather than clearing the workspace from inside the script: rm(list = ls())
# only deletes objects in the global environment -- attached packages and
# options survive -- and it silently destroys the caller's own objects.
# Record the run time and the session details for reproducibility.
date()
## [1] "Thu Aug 27 00:32:49 2020"
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.1  magrittr_1.5    tools_3.6.1     htmltools_0.5.0
##  [5] yaml_2.2.1      stringi_1.4.6   rmarkdown_2.3   knitr_1.29     
##  [9] stringr_1.4.0   xfun_0.16       digest_0.6.25   rlang_0.4.7    
## [13] evaluate_0.14

Библиотеки

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(quanteda)
## Warning: package 'quanteda' was built under R version 3.6.2
## Package version: 2.1.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2
#library(stringr)

Загрузка объектов

# Load previously saved R objects into the workspace.
# Presumably "Tokens_S.Rdata" provides the `Tokens_S` object used below -- TODO confirm.
load("Corpus.RData")
# load("Corpus_S.RData")
load("Tokens_S.Rdata")
# load("Freq_plus_Fun.RData")  # table with Russian National Corpus (НКРЯ) frequencies

Количество текстов

# Number of documents in the tokens object.
Tokens_S %>% ndoc()
## [1] 281

Сводная таблица по объёму

# Document-feature matrix built from the tokens object.
DFM <- dfm(Tokens_S)

# One row per document: keep only the "__s" feature and expose its count as
# `Sentences` (presumably "__s" is a sentence-boundary marker -- TODO confirm).
Descript <- dfm_keep(DFM, "__s") %>%
        convert(to = "data.frame") %>%
        rename(Sentences = "__s")

# NOTE(review): ntoken()/ntype() are computed on the full DFM, which still
# contains the "__s" feature, so that feature is counted in Tokens and Types
# (and therefore in SentLength) -- confirm this is intended.
Descript$Tokens <- ntoken(DFM)
Descript$Types <- ntype(DFM)
# Tokens per sentence for each document.
Descript$SentLength <- with(Descript, Tokens / Sentences)
# Group label of each document, taken from the tokens object.
Descript$Group <- Tokens_S$Group

# Column-wise overview of the assembled table.
summary(Descript)
##     doc_id            Sentences         Tokens           Types      
##  Length:281         Min.   :16.00   Min.   : 366.0   Min.   :193.0  
##  Class :character   1st Qu.:34.00   1st Qu.: 619.0   1st Qu.:312.0  
##  Mode  :character   Median :41.00   Median : 726.0   Median :357.0  
##                     Mean   :43.29   Mean   : 751.1   Mean   :360.2  
##                     3rd Qu.:51.00   3rd Qu.: 848.0   3rd Qu.:398.0  
##                     Max.   :93.00   Max.   :1400.0   Max.   :633.0  
##    SentLength       Group          
##  Min.   :10.58   Length:281        
##  1st Qu.:15.22   Class :character  
##  Median :17.23   Mode  :character  
##  Mean   :18.39                     
##  3rd Qu.:20.34                     
##  Max.   :38.22

Вывод сортированной таблицы для поиска аномалий

# First six rows after sorting by ascending SentLength (lowest values).
arrange(Descript, SentLength) %>% head(n = 6)
##       doc_id Sentences Tokens Types SentLength  Group
## 1   IDOR_100        83    878   456   10.57831   IDOR
## 2  Sluzh_274        65    720   297   11.07692  Sluzh
## 3   IDOR_164        82    911   301   11.10976   IDOR
## 4 IP1610_222        64    712   290   11.12500 IP1610
## 5    IDOR_92        93   1103   469   11.86022   IDOR
## 6   IDOR_154        54    646   371   11.96296   IDOR
# Last six rows after sorting by ascending SentLength (highest values).
arrange(Descript, SentLength) %>% tail(n = 6)
##       doc_id Sentences Tokens Types SentLength Group
## 276    Bio_3        18    608   289   33.77778   Bio
## 277 Chem2_70        29    993   396   34.24138 Chem2
## 278 IDOR_187        20    686   377   34.30000  IDOR
## 279 IDOR_220        21    726   321   34.57143  IDOR
## 280 IDOR_213        20    698   338   34.90000  IDOR
## 281 Phy2_249        18    688   349   38.22222  Phy2
# First six rows after sorting by ascending Sentences (lowest counts).
arrange(Descript, Sentences) %>% head(n = 6)
##     doc_id Sentences Tokens Types SentLength Group
## 1   Bio_16        16    410   232   25.62500   Bio
## 2 IDOR_174        16    426   256   26.62500  IDOR
## 3    Bio_3        18    608   289   33.77778   Bio
## 4  Bio2_30        18    445   223   24.72222  Bio2
## 5  Chem_59        18    459   226   25.50000  Chem
## 6 IDOR_137        18    600   301   33.33333  IDOR
# Last six rows after sorting by ascending Sentences (highest counts).
arrange(Descript, Sentences) %>% tail(n = 6)
##       doc_id Sentences Tokens Types SentLength Group
## 276  Chem_54        85   1399   588   16.45882  Chem
## 277 IDOR_107        86   1204   531   14.00000  IDOR
## 278 IDOR_195        88   1301   610   14.78409  IDOR
## 279 IDOR_200        89   1077   518   12.10112  IDOR
## 280 IDOR_217        92   1400   633   15.21739  IDOR
## 281  IDOR_92        93   1103   469   11.86022  IDOR

Печать текста для проверки

# texts(Corpus[docnames(Corpus) == "Phy2_249"])
# docnames(Corpus_S)
# Corpus_S$Group
# docvars(Corpus_S)

Объёмы групп

# Number of rows of Descript (one per document) in each Group.
table(Descript$Group)
## 
##    Bio   Bio2   Chem  Chem2   IDOR IP1610 IP1842   Phy2  Sluzh 
##     29     18     22     21    130     13     11     17     20

Длина текстов

# Mean and standard deviation of Tokens within each Group, one row per group.
# `.groups = "drop"` returns an ungrouped tibble. Do not pass `groups = Group`:
# that is not the `.groups` option but an extra document-length column, which
# repeats every group's row once per document (281 rows instead of 9).
# NOTE: the output recorded below was captured with `groups = Group` and a
# column named `Lenth`; re-knit the document to refresh it.
Descript %>%
        group_by(Group) %>%
        summarise(Length = mean(Tokens), SD = sd(Tokens), .groups = "drop")
## `summarise()` regrouping output by 'Group' (override with `.groups` argument)
## # A tibble: 281 x 4
## # Groups:   Group [9]
##    Group Lenth    SD groups
##    <chr> <dbl> <dbl> <chr> 
##  1 Bio    717.  188. Bio   
##  2 Bio    717.  188. Bio   
##  3 Bio    717.  188. Bio   
##  4 Bio    717.  188. Bio   
##  5 Bio    717.  188. Bio   
##  6 Bio    717.  188. Bio   
##  7 Bio    717.  188. Bio   
##  8 Bio    717.  188. Bio   
##  9 Bio    717.  188. Bio   
## 10 Bio    717.  188. Bio   
## # … with 271 more rows
# Box plot of Tokens (x axis) for each Group (y axis).
ggplot(data = Descript, mapping = aes(x = Tokens, y = Group)) +
        geom_boxplot()

Длина предложений

# Mean and standard deviation of SentLength within each Group, one row per
# group. `.groups = "drop"` returns an ungrouped tibble. Do not pass
# `groups = Group`: that is not the `.groups` option but an extra
# document-length column, which repeats every group's row once per document
# (281 rows instead of 9).
# NOTE: the output recorded below was captured with `groups = Group` and a
# column named `Lenth`; re-knit the document to refresh it.
Descript %>%
        group_by(Group) %>%
        summarise(Length = mean(SentLength), SD = sd(SentLength), .groups = "drop")
## `summarise()` regrouping output by 'Group' (override with `.groups` argument)
## # A tibble: 281 x 4
## # Groups:   Group [9]
##    Group Lenth    SD groups
##    <chr> <dbl> <dbl> <chr> 
##  1 Bio    18.1  5.19 Bio   
##  2 Bio    18.1  5.19 Bio   
##  3 Bio    18.1  5.19 Bio   
##  4 Bio    18.1  5.19 Bio   
##  5 Bio    18.1  5.19 Bio   
##  6 Bio    18.1  5.19 Bio   
##  7 Bio    18.1  5.19 Bio   
##  8 Bio    18.1  5.19 Bio   
##  9 Bio    18.1  5.19 Bio   
## 10 Bio    18.1  5.19 Bio   
## # … with 271 more rows
# Box plot of SentLength (x axis) for each Group (y axis).
ggplot(data = Descript, mapping = aes(x = SentLength, y = Group)) +
        geom_boxplot()