# Record when and in which R environment this analysis was run.
# NOTE: the workspace is deliberately NOT cleared with `rm(list = ls())`.
# That call only removes visible objects from the global environment; it does
# not detach packages or reset options, so it gives no clean session, and it
# would silently wipe the workspace of anyone who sources this script.
# Run the script in a fresh R session instead.
date()
## [1] "Sat Oct 5 15:50:51 2019"
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.6.1 magrittr_1.5 tools_3.6.1 htmltools_0.3.6
## [5] yaml_2.2.0 Rcpp_1.0.2 stringi_1.4.3 rmarkdown_1.16
## [9] knitr_1.25 stringr_1.4.0 xfun_0.10 digest_0.6.21
## [13] evaluate_0.14
## Библиотеки
library(quanteda)
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# library(ggplot2)
library(stringr)
library(ggplot2)
## Импорт данных
# Load the saved workspace file; the code below expects it to provide the
# corpus object `Corpus_S`.
load("Corpus_S.RData")

# Summarise the per-document corpus statistics (up to 1000 documents),
# leaving out the first column of the summary table.
Corpus_S %>%
  summary(n = 1000) %>%
  .[-1] %>%
  summary()
## Types Tokens Sentences group
## Min. :194.0 Min. : 387.0 Min. : 1.000 Length:225
## 1st Qu.:315.0 1st Qu.: 663.0 1st Qu.: 1.000 Class :character
## Median :361.0 Median : 776.0 Median : 2.000 Mode :character
## Mean :365.8 Mean : 804.3 Mean : 2.898
## 3rd Qu.:407.0 3rd Qu.: 907.0 3rd Qu.: 3.000
## Max. :637.0 Max. :1506.0 Max. :31.000
Создание корпуса и токенизация
# Split each document of Corpus_S into segments wherever the regular
# expression below matches; "after" tells quanteda that the pattern follows
# the text it delimits.
Corpus_S2 <- corpus_segment(
  Corpus_S,
  pattern = "\\\\s",
  valuetype = "regex",
  pattern_position = "after"
)
# Disabled step, kept for reference:
# Corpus_S2 <- corpus_trim(Corpus_S2, what = "documents", min_ntoken = 1)

# Store the `_document` value of every segment as an explicit docvar.
# NOTE(review): this reads the corpus object's internal `documents` table
# directly -- confirm it still works before upgrading quanteda.
docvars(Corpus_S2, "documentN") <- Corpus_S2$documents$`_document`

# Tokenize the segmented corpus with the "fastestword" tokenizer.
Tokens_S2 <- Corpus_S2 %>%
  tokens(what = "fastestword")
Добавил атрибут названия текста в явном виде в корпус. Он же передаётся в токенизатор. Убрал знаки абзаца, и исчезли нулевые предложения.
# One row per segment: `group` and `_document` taken from the token docvars,
# plus the segment name and its token count.
Sentences <- docvars(Tokens_S2, field = c("group", "_document"))
Sentences[["sentence"]] <- names(Tokens_S2)
Sentences[["n"]] <- ntoken(Tokens_S2)

# Overall distribution of segment length in tokens.
summarise(
  Sentences,
  avg = mean(n),
  min = min(n),
  max = max(n),
  sd = sd(n)
)
## avg min max sd
## 1 15.94699 1 133 10.63197
# List the segments that consist of a single token.
Sentences %>%
  filter(n == 1)
## group _document sentence n
## 1 Bio Bio4 Bio4.31 1
## 2 Bio Bio4 Bio4.33 1
## 3 Bio Bio9 Bio9.24 1
## 4 Bio Bio16 Bio16.1 1
## 5 Bio Bio16 Bio16.3 1
## 6 Bio Bio23 Bio23.56 1
## 7 Bio Bio25 Bio25.3 1
## 8 Bio Bio29 Bio29.30 1
## 9 Chem Chem34 Chem34.11 1
## 10 Chem Chem34 Chem34.24 1
## 11 Chem Chem50 Chem50.26 1
## 12 IDOR IDOR53 IDOR53.41 1
## 13 IDOR IDOR53 IDOR53.79 1
## 14 IDOR IDOR82 IDOR82.2 1
## 15 IDOR IDOR112 IDOR112.13 1
## 16 IDOR IDOR125 IDOR125.13 1
## 17 IDOR IDOR125 IDOR125.50 1
## 18 IDOR IDOR137 IDOR137.33 1
## 19 IDOR IDOR139 IDOR139.18 1
## 20 IDOR IDOR142 IDOR142.26 1
## 21 IDOR IDOR164 IDOR164.62 1
## 22 IDOR IDOR167 IDOR167.34 1
## 23 IP16-10 IP16-10188 IP16-10188.22 1
## 24 IP16-10 IP16-10192 IP16-10192.13 1
## 25 IP16-10 IP16-10192 IP16-10192.19 1
## 26 IP18-42 IP18-42203 IP18-42203.54 1
# Show the 20 longest segments, longest first.
head(arrange(Sentences, desc(n)), 20)
## group _document sentence n
## 1 Sluzh Sluzh206 Sluzh206.20 133
## 2 IDOR IDOR98 IDOR98.7 125
## 3 Bio Bio3 Bio3.12 120
## 4 IDOR IDOR165 IDOR165.27 118
## 5 Bio Bio6 Bio6.14 117
## 6 Sluzh Sluzh221 Sluzh221.13 114
## 7 IDOR IDOR171 IDOR171.54 103
## 8 Bio Bio6 Bio6.16 92
## 9 IDOR IDOR88 IDOR88.26 90
## 10 IDOR IDOR166 IDOR166.7 90
## 11 IDOR IDOR181 IDOR181.17 89
## 12 IDOR IDOR92 IDOR92.24 87
## 13 IDOR IDOR181 IDOR181.12 86
## 14 IDOR IDOR92 IDOR92.4 84
## 15 Bio Bio6 Bio6.10 81
## 16 IP18-42 IP18-42198 IP18-42198.2 81
## 17 Sluzh Sluzh224 Sluzh224.30 81
## 18 IDOR IDOR116 IDOR116.23 80
## 19 Bio Bio10 Bio10.8 79
## 20 Chem Chem50 Chem50.49 79
# Corpus_S2 %>%
# corpus_subset(documentN ==
# # "Sluzh206"
# "IDOR98"
# ) %>%
# texts()
Статистика по группам и график
# Overall segment-length statistics (in tokens) across all groups.
summarise(
  Sentences,
  avg = mean(n),
  min = min(n),
  max = max(n),
  sd = sd(n)
)
## avg min max sd
## 1 15.94699 1 133 10.63197
# Box plot of segment length (in tokens) for each group.
ggplot(data = Sentences, mapping = aes(x = group, y = n)) +
  geom_boxplot()

# Segment-length statistics computed separately for each group.
summarise(
  group_by(Sentences, group),
  avg = mean(n),
  min = min(n),
  max = max(n),
  sd = sd(n)
)
## # A tibble: 6 x 5
## group avg min max sd
## <chr> <dbl> <int> <int> <dbl>
## 1 Bio 16.1 1 120 11.5
## 2 Chem 15.6 1 79 10.1
## 3 IDOR 16.1 1 125 10.6
## 4 IP16-10 14.6 1 62 8.48
## 5 IP18-42 16.5 1 81 10.4
## 6 Sluzh 15.6 2 133 11.2
## Запись объектов на диск
# Persist the segmented corpus and its tokens, one .RData file per object.
save(list = "Corpus_S2", file = "Corpus_S2.RData")
save(list = "Tokens_S2", file = "Tokens_S2.RData")