rm(list = ls())
date()
## [1] "Sun Oct  6 22:16:29 2019"
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.1  magrittr_1.5    tools_3.6.1     htmltools_0.3.6
##  [5] yaml_2.2.0      Rcpp_1.0.2      stringi_1.4.3   rmarkdown_1.16 
##  [9] knitr_1.25      stringr_1.4.0   xfun_0.10       digest_0.6.21  
## [13] evaluate_0.14

Библиотеки

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(quanteda)
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
## 
##     View
library(ggplot2)
#library(stringr)

Загрузка корпусов

load(file = "ToleranceS.RData")
load(file = "NotLikeS.RData")
ndoc(ToleranceS)
## [1] 51
ndoc(NotLikeS)
## [1] 51

Токенизация

TokensTolerance <- tokens(ToleranceS, what = "fastestword")
TokensNotLike <- tokens(NotLikeS, what = "fastestword")

Удаление знаков предложения

TokensNotLike <- tokens_remove(TokensNotLike, pattern = "\\s")
TokensTolerance <- tokens_remove(TokensTolerance, pattern = "\\s")

Создаём таблицу объёмами текстов

LengtText <- full_join(
        data.frame(
                Text = names(TokensNotLike)
                , Group = docvars(TokensNotLike)
                , NotLikeLength = ntoken(TokensNotLike)
                , stringsAsFactors = FALSE
        )
        , data.frame(
                Text = names(TokensTolerance)
                , Group = docvars(TokensTolerance)
                , ToleranceLength = ntoken(TokensTolerance)
                , stringsAsFactors = FALSE
        )
)
## Joining, by = c("Text", "group")
LengtText$group <- as.factor(LengtText$group)

summary(LengtText)
##      Text             group    NotLikeLength   ToleranceLength
##  Length:51          actor:21   Min.   : 47.0   Min.   : 3.00  
##  Class :character   dance:14   1st Qu.:126.5   1st Qu.:12.50  
##  Mode  :character   music:16   Median :154.0   Median :18.00  
##                                Mean   :174.4   Mean   :20.39  
##                                3rd Qu.:211.5   3rd Qu.:27.00  
##                                Max.   :659.0   Max.   :47.00

График взаимосвязи длины двух текстов

ggplot(LengtText, aes(NotLikeLength, ToleranceLength, col = group)) +
        geom_point()

Не такой как все

ggplot(LengtText, aes(group, NotLikeLength)) +
        geom_boxplot()

Не такой (среднее с стандартное отклонение)

LengtText %>% 
        group_by(group) %>% 
        summarise(среднее = mean(NotLikeLength), sd = sd(NotLikeLength))
## # A tibble: 3 x 3
##   group среднее    sd
##   <fct>   <dbl> <dbl>
## 1 actor    173. 124. 
## 2 dance    192.  69.4
## 3 music    162.  62.2
summary(
aov(NotLikeLength ~ group, LengtText)
)
##             Df Sum Sq Mean Sq F value Pr(>F)
## group        2   6974    3487   0.389   0.68
## Residuals   48 430205    8963

нет достоверных различий между группами

Толерантность

ggplot(LengtText, aes(group, ToleranceLength)) +
        geom_boxplot()

Толерантность (среднее и стандартное отклонение)

LengtText %>% 
        group_by(group) %>% 
        summarise(
          количество = n()
          , минимум = min(ToleranceLength)
          , максимум = max(ToleranceLength)
          , среднее = mean(ToleranceLength)
          , медиана = median(ToleranceLength)
          , sd = sd(ToleranceLength))
## # A tibble: 3 x 7
##   group количество минимум максимум среднее медиана    sd
##   <fct>      <int>   <int>    <int>   <dbl>   <dbl> <dbl>
## 1 actor         21       4       44    18.5      18 10.4 
## 2 dance         14       6       47    23.9      20 13.6 
## 3 music         16       3       31    19.8      19  8.05
summary(
aov(ToleranceLength ~ group, LengtText)
)
##             Df Sum Sq Mean Sq F value Pr(>F)
## group        2    258   128.8   1.115  0.336
## Residuals   48   5543   115.5

Нет значимых различий между группами