rm(list = ls())
date()
## [1] "Sun Oct 6 22:16:29 2019"
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.6.1 magrittr_1.5 tools_3.6.1 htmltools_0.3.6
## [5] yaml_2.2.0 Rcpp_1.0.2 stringi_1.4.3 rmarkdown_1.16
## [9] knitr_1.25 stringr_1.4.0 xfun_0.10 digest_0.6.21
## [13] evaluate_0.14
Библиотеки
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(quanteda)
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
library(ggplot2)
#library(stringr)
Загрузка корпусов
load(file = "ToleranceS.RData")
load(file = "NotLikeS.RData")
ndoc(ToleranceS)
## [1] 51
ndoc(NotLikeS)
## [1] 51
Токенизация
TokensTolerance <- tokens(ToleranceS, what = "fastestword")
TokensNotLike <- tokens(NotLikeS, what = "fastestword")
Удаление знаков предложения
TokensNotLike <- tokens_remove(TokensNotLike, pattern = "\\s")
TokensTolerance <- tokens_remove(TokensTolerance, pattern = "\\s")
Создаём таблицу объёмами текстов
LengtText <- full_join(
data.frame(
Text = names(TokensNotLike)
, Group = docvars(TokensNotLike)
, NotLikeLength = ntoken(TokensNotLike)
, stringsAsFactors = FALSE
)
, data.frame(
Text = names(TokensTolerance)
, Group = docvars(TokensTolerance)
, ToleranceLength = ntoken(TokensTolerance)
, stringsAsFactors = FALSE
)
)
## Joining, by = c("Text", "group")
LengtText$group <- as.factor(LengtText$group)
summary(LengtText)
## Text group NotLikeLength ToleranceLength
## Length:51 actor:21 Min. : 47.0 Min. : 3.00
## Class :character dance:14 1st Qu.:126.5 1st Qu.:12.50
## Mode :character music:16 Median :154.0 Median :18.00
## Mean :174.4 Mean :20.39
## 3rd Qu.:211.5 3rd Qu.:27.00
## Max. :659.0 Max. :47.00
ggplot(LengtText, aes(NotLikeLength, ToleranceLength, col = group)) +
geom_point()
ggplot(LengtText, aes(group, NotLikeLength)) +
geom_boxplot()
Не такой (среднее с стандартное отклонение)
LengtText %>%
group_by(group) %>%
summarise(среднее = mean(NotLikeLength), sd = sd(NotLikeLength))
## # A tibble: 3 x 3
## group среднее sd
## <fct> <dbl> <dbl>
## 1 actor 173. 124.
## 2 dance 192. 69.4
## 3 music 162. 62.2
summary(
aov(NotLikeLength ~ group, LengtText)
)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 6974 3487 0.389 0.68
## Residuals 48 430205 8963
нет достоверных различий между группами
ggplot(LengtText, aes(group, ToleranceLength)) +
geom_boxplot()
Толерантность (среднее и стандартное отклонение)
LengtText %>%
group_by(group) %>%
summarise(
количество = n()
, минимум = min(ToleranceLength)
, максимум = max(ToleranceLength)
, среднее = mean(ToleranceLength)
, медиана = median(ToleranceLength)
, sd = sd(ToleranceLength))
## # A tibble: 3 x 7
## group количество минимум максимум среднее медиана sd
## <fct> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 actor 21 4 44 18.5 18 10.4
## 2 dance 14 6 47 23.9 20 13.6
## 3 music 16 3 31 19.8 19 8.05
summary(
aov(ToleranceLength ~ group, LengtText)
)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 258 128.8 1.115 0.336
## Residuals 48 5543 115.5
Нет значимых различий между группами