# Record when and in which environment this analysis was run.
# The workspace is deliberately NOT wiped with rm(list = ls()): clearing the
# caller's global environment is a destructive side effect and still does not
# give a clean session (attached packages and options survive). Start a fresh
# R session instead when a clean state is needed.
date()
## [1] "Sat Oct  5 15:50:51 2019"
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.1  magrittr_1.5    tools_3.6.1     htmltools_0.3.6
##  [5] yaml_2.2.0      Rcpp_1.0.2      stringi_1.4.3   rmarkdown_1.16 
##  [9] knitr_1.25      stringr_1.4.0   xfun_0.10       digest_0.6.21  
## [13] evaluate_0.14

## Библиотеки

library(quanteda)
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# library(ggplot2)
library(stringr)
library(ggplot2)

## Импорт данных

# Restore the previously saved workspace file; it is expected to provide the
# `Corpus_S` corpus object used by the rest of the script.
load(file = "Corpus_S.RData")
# Summarise the per-document statistics. Ask summary() for every document in
# the corpus (instead of a hard-coded cap of 1000) so that a larger corpus is
# never silently truncated. The first column of the per-document table is
# dropped before summarising (presumably the document identifier, which is
# not a statistic).
summary(summary(Corpus_S, n = ndoc(Corpus_S))[-1])
##      Types           Tokens         Sentences         group          
##  Min.   :194.0   Min.   : 387.0   Min.   : 1.000   Length:225        
##  1st Qu.:315.0   1st Qu.: 663.0   1st Qu.: 1.000   Class :character  
##  Median :361.0   Median : 776.0   Median : 2.000   Mode  :character  
##  Mean   :365.8   Mean   : 804.3   Mean   : 2.898                     
##  3rd Qu.:407.0   3rd Qu.: 907.0   3rd Qu.: 3.000                     
##  Max.   :637.0   Max.   :1506.0   Max.   :31.000

## Создание корпуса и токенизация

# Split every document into segments at each match of the delimiter pattern.
# As an R string, "\\\\s" is the regex `\\s`, i.e. a literal backslash followed
# by "s" -- presumably an explicit end-of-sentence marker placed in the texts
# (TODO confirm). pattern_position = "after" declares that the delimiter
# follows the text it terminates.
Corpus_S2 <- corpus_segment(
  Corpus_S,
  pattern = "\\\\s",
  valuetype = "regex",
  pattern_position = "after"
)
# Optional step, currently disabled: drop segments that contain no tokens.
# Corpus_S2 <- corpus_trim(Corpus_S2, what = "documents", min_ntoken = 1)

# Expose the source-document name of every segment as a regular docvar.
# It is read through docvars() -- the same accessor this script applies to the
# tokens object -- rather than by reaching into Corpus_S2$documents, which
# depends on how the corpus object happens to be stored internally.
# NOTE(review): newer quanteda releases may expose this value through a
# dedicated accessor instead; confirm before upgrading the package.
docvars(Corpus_S2, "documentN") <- docvars(Corpus_S2, "_document")

# Tokenise each segment with the "fastestword" tokenizer (quanteda's simplest
# splitter; presumably splits on spaces only -- see the tokens() documentation).
Tokens_S2 <- tokens(Corpus_S2, what = "fastestword")

Добавил в корпус атрибут с названием текста в явном виде; он же передаётся в токенизатор. Убрал знаки абзаца, и нулевые предложения исчезли.

## Статистика по длине предложений

# Build a per-sentence table: the group and source document of every segment,
# the segment's name, and its length in tokens.
Sentences <- docvars(Tokens_S2, field = c("group", "_document"))
Sentences[["sentence"]] <- names(Tokens_S2)
Sentences[["n"]] <- ntoken(Tokens_S2)

# Overall sentence-length statistics (mean, extremes, standard deviation).
summarise(
  Sentences,
  avg = mean(n),
  min = min(n),
  max = max(n),
  sd = sd(n)
)
##        avg min max       sd
## 1 15.94699   1 133 10.63197
# List the sentences that consist of exactly one token.
Sentences %>%
  filter(n == 1)
##      group  _document      sentence n
## 1      Bio       Bio4       Bio4.31 1
## 2      Bio       Bio4       Bio4.33 1
## 3      Bio       Bio9       Bio9.24 1
## 4      Bio      Bio16       Bio16.1 1
## 5      Bio      Bio16       Bio16.3 1
## 6      Bio      Bio23      Bio23.56 1
## 7      Bio      Bio25       Bio25.3 1
## 8      Bio      Bio29      Bio29.30 1
## 9     Chem     Chem34     Chem34.11 1
## 10    Chem     Chem34     Chem34.24 1
## 11    Chem     Chem50     Chem50.26 1
## 12    IDOR     IDOR53     IDOR53.41 1
## 13    IDOR     IDOR53     IDOR53.79 1
## 14    IDOR     IDOR82      IDOR82.2 1
## 15    IDOR    IDOR112    IDOR112.13 1
## 16    IDOR    IDOR125    IDOR125.13 1
## 17    IDOR    IDOR125    IDOR125.50 1
## 18    IDOR    IDOR137    IDOR137.33 1
## 19    IDOR    IDOR139    IDOR139.18 1
## 20    IDOR    IDOR142    IDOR142.26 1
## 21    IDOR    IDOR164    IDOR164.62 1
## 22    IDOR    IDOR167    IDOR167.34 1
## 23 IP16-10 IP16-10188 IP16-10188.22 1
## 24 IP16-10 IP16-10192 IP16-10192.13 1
## 25 IP16-10 IP16-10192 IP16-10192.19 1
## 26 IP18-42 IP18-42203 IP18-42203.54 1
# Show the 20 longest sentences, ordered from longest to shortest.
head(arrange(Sentences, desc(n)), 20)
##      group  _document     sentence   n
## 1    Sluzh   Sluzh206  Sluzh206.20 133
## 2     IDOR     IDOR98     IDOR98.7 125
## 3      Bio       Bio3      Bio3.12 120
## 4     IDOR    IDOR165   IDOR165.27 118
## 5      Bio       Bio6      Bio6.14 117
## 6    Sluzh   Sluzh221  Sluzh221.13 114
## 7     IDOR    IDOR171   IDOR171.54 103
## 8      Bio       Bio6      Bio6.16  92
## 9     IDOR     IDOR88    IDOR88.26  90
## 10    IDOR    IDOR166    IDOR166.7  90
## 11    IDOR    IDOR181   IDOR181.17  89
## 12    IDOR     IDOR92    IDOR92.24  87
## 13    IDOR    IDOR181   IDOR181.12  86
## 14    IDOR     IDOR92     IDOR92.4  84
## 15     Bio       Bio6      Bio6.10  81
## 16 IP18-42 IP18-42198 IP18-42198.2  81
## 17   Sluzh   Sluzh224  Sluzh224.30  81
## 18    IDOR    IDOR116   IDOR116.23  80
## 19     Bio      Bio10      Bio10.8  79
## 20    Chem     Chem50    Chem50.49  79
# Corpus_S2 %>% 
#   corpus_subset(documentN == 
#                   # "Sluzh206"
#                 "IDOR98"
#                 ) %>% 
#   texts()

## Статистика по группам и график

# Sentence-length statistics over the whole corpus (all groups pooled).
summarise(Sentences, avg = mean(n), min = min(n), max = max(n), sd = sd(n))
##        avg min max       sd
## 1 15.94699   1 133 10.63197
# Box plot of sentence length (in tokens), one box per group.
ggplot(data = Sentences, mapping = aes(x = group, y = n)) +
  geom_boxplot()

# Sentence-length statistics computed separately for every group.
summarise(
  group_by(Sentences, group),
  avg = mean(n),
  min = min(n),
  max = max(n),
  sd = sd(n)
)
## # A tibble: 6 x 5
##   group     avg   min   max    sd
##   <chr>   <dbl> <int> <int> <dbl>
## 1 Bio      16.1     1   120 11.5 
## 2 Chem     15.6     1    79 10.1 
## 3 IDOR     16.1     1   125 10.6 
## 4 IP16-10  14.6     1    62  8.48
## 5 IP18-42  16.5     1    81 10.4 
## 6 Sluzh    15.6     2   133 11.2

## Запись объектов на диск

# Persist the segmented corpus and its tokens, each in its own .RData file.
save(list = "Corpus_S2", file = "Corpus_S2.RData")
save(list = "Tokens_S2", file = "Tokens_S2.RData")