# Record when and in which environment this analysis was run.
# The workspace is intentionally NOT cleared with rm(list = ls()): that call
# deletes the caller's objects when the script is run inside an existing
# session, yet it does not detach packages or reset options, so it gives no
# real isolation. Run the script in a fresh R session instead.
date()
## [1] "Mon Feb 21 15:35:36 2022"
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.29 R6_2.5.1 jsonlite_1.7.3 magrittr_2.0.1
## [5] evaluate_0.14 rlang_0.4.12 stringi_1.7.6 jquerylib_0.1.4
## [9] bslib_0.3.1 rmarkdown_2.11 tools_4.1.2 stringr_1.4.0
## [13] xfun_0.29 yaml_2.2.1 fastmap_1.1.0 compiler_4.1.2
## [17] htmltools_0.5.2 knitr_1.37 sass_0.4.0
Библиотеки и загрузка объектов
library(quanteda)
## Package version: 3.2.0
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
library(quanteda.textmodels)
library(quanteda.textstats)
library(stm)
## stm v1.3.6 successfully loaded. See ?stm for help.
## Papers, resources, and other materials at structuraltopicmodel.com
#library(topicmodels)
library(ldatuning)
library(stringr)
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Restore the saved objects from disk; the code below expects this file to
# provide an object named `DFM`.
load("DFM.RData")
Подбор параметров-1
# Convert the DFM into the "topicmodels" format that is handed to
# FindTopicsNumber() in this and the following sweeps.
DFM2topicmodels <- convert(DFM, to = "topicmodels")

# Coarse sweep over the number of topics: k = 5, 55, 105, 155, 205, 255.
# NOTE(review): the object name says "10_110" but the grid spans 5..255 --
# the name looks stale; confirm before renaming.
started_at <- Sys.time()
started_at
## [1] "2022-02-20 21:29:38 MSK"
LDAtuning.metrics_Gibbs_10_110 <- FindTopicsNumber(
  DFM2topicmodels,
  topics = seq(5, 255, by = 50),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)
## fit models... done.
## calculate metrics:
## Griffiths2004... done.
## CaoJuan2009... done.
## Arun2010... done.
## Deveaud2014... done.
# Wall-clock time taken by the sweep.
Sys.time() - started_at
## Time difference of 31.53412 mins
FindTopicsNumber_plot(LDAtuning.metrics_Gibbs_10_110)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# Narrower sweep: k = 5, 15, 25, 35, 45.
# NOTE(review): with by = 10 starting at 5, the upper bound 50 itself is never
# evaluated -- confirm this is intended.
started_at <- Sys.time()
started_at
## [1] "2022-02-20 22:01:11 MSK"
LDAtuning.metrics_Gibbs_5_50 <- FindTopicsNumber(
  DFM2topicmodels,
  topics = seq(5, 50, by = 10),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)
## fit models... done.
## calculate metrics:
## Griffiths2004... done.
## CaoJuan2009... done.
## Arun2010... done.
## Deveaud2014... done.
# Wall-clock time taken by the sweep.
Sys.time() - started_at
## Time difference of 3.978375 mins
FindTopicsNumber_plot(LDAtuning.metrics_Gibbs_5_50)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# Sweep in steps of five: k = 5, 10, 15, 20, 25, 30, 35.
started_at <- Sys.time()
started_at
## [1] "2022-02-20 22:05:10 MSK"
LDAtuning.metrics_Gibbs_5_35 <- FindTopicsNumber(
  DFM2topicmodels,
  topics = seq(5, 35, by = 5),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)
## fit models... done.
## calculate metrics:
## Griffiths2004... done.
## CaoJuan2009... done.
## Arun2010... done.
## Deveaud2014... done.
# Wall-clock time taken by the sweep.
Sys.time() - started_at
## Time difference of 5.60961 mins
FindTopicsNumber_plot(LDAtuning.metrics_Gibbs_5_35)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# Sweep in steps of two: k = 10, 12, ..., 24.
# NOTE(review): with by = 2 starting at 10, the upper bound 25 itself is never
# evaluated -- confirm this is intended.
started_at <- Sys.time()
started_at
## [1] "2022-02-20 22:10:47 MSK"
LDAtuning.metrics_Gibbs_10_25 <- FindTopicsNumber(
  DFM2topicmodels,
  topics = seq(10, 25, by = 2),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)
## fit models... done.
## calculate metrics:
## Griffiths2004... done.
## CaoJuan2009... done.
## Arun2010... done.
## Deveaud2014... done.
# Wall-clock time taken by the sweep.
Sys.time() - started_at
## Time difference of 4.53642 mins
FindTopicsNumber_plot(LDAtuning.metrics_Gibbs_10_25)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

# Finest sweep, every value of k from 14 to 18.
started_at <- Sys.time()
started_at
## [1] "2022-02-20 22:15:20 MSK"
LDAtuning.metrics_Gibbs_14_18 <- FindTopicsNumber(
  DFM2topicmodels,
  topics = seq(14, 18, by = 1),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)
## fit models... done.
## calculate metrics:
## Griffiths2004... done.
## CaoJuan2009... done.
## Arun2010... done.
## Deveaud2014... done.
# Wall-clock time taken by the sweep.
Sys.time() - started_at
## Time difference of 2.769603 mins
FindTopicsNumber_plot(LDAtuning.metrics_Gibbs_14_18)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Подбор параметров-2
# Convert the DFM into the "stm" format; its `documents` and `vocab` elements
# are what searchK() is given in this and the following searches.
DFM2stm <- convert(DFM, to = "stm")

# Coarse search over the number of topics: K = 5, 55, 105, 155, 205, 255.
started_at <- Sys.time()
started_at
## [1] "2022-02-20 22:18:07 MSK"
IdealK_5_255 <- searchK(
  DFM2stm$documents,
  DFM2stm$vocab,
  K = seq(5, 255, by = 50),
  max.em.its = 75,
  cores = 4,
  seed = 9999,
  # `seed` is only forwarded to stm(); the held-out document split has its own
  # seed argument. Without it the held-out likelihood differs between runs.
  heldout.seed = 9999
)
## Using multiple-cores. Progress will not be shown.
# Wall-clock time taken by the search (recorded value is from the original run).
Sys.time() - started_at
## Time difference of 42.92988 mins
plot(IdealK_5_255)

# Narrower search: K = 5, 15, 25, 35, 45.
# NOTE(review): with by = 10 starting at 5, the upper bound 50 itself is never
# evaluated -- confirm this is intended.
started_at <- Sys.time()
started_at
## [1] "2022-02-20 23:01:03 MSK"
IdealK_5_50 <- searchK(
  DFM2stm$documents,
  DFM2stm$vocab,
  K = seq(5, 50, by = 10),
  max.em.its = 75,
  cores = 4,
  seed = 9999,
  # `seed` is only forwarded to stm(); the held-out document split has its own
  # seed argument. Without it the held-out likelihood differs between runs.
  heldout.seed = 9999
)
## Using multiple-cores. Progress will not be shown.
# Wall-clock time taken by the search (recorded value is from the original run).
Sys.time() - started_at
## Time difference of 5.124475 mins
plot(IdealK_5_50)

# Search in steps of two: K = 10, 12, 14, 16, 18, 20.
started_at <- Sys.time()
started_at
## [1] "2022-02-20 23:06:11 MSK"
IdealK_10_20 <- searchK(
  DFM2stm$documents,
  DFM2stm$vocab,
  K = seq(10, 20, by = 2),
  max.em.its = 75,
  cores = 4,
  seed = 9999,
  # `seed` is only forwarded to stm(); the held-out document split has its own
  # seed argument. Without it the held-out likelihood differs between runs.
  heldout.seed = 9999
)
## Using multiple-cores. Progress will not be shown.
# Wall-clock time taken by the search (recorded value is from the original run).
Sys.time() - started_at
## Time difference of 3.572476 mins
plot(IdealK_10_20)

# Finest search, every value of K from 14 to 17.
# NOTE(review): the object name says "13_17" but the grid starts at 14 --
# either the name or the lower bound looks stale; confirm which was intended.
started_at <- Sys.time()
started_at
## [1] "2022-02-20 23:09:46 MSK"
IdealK_13_17 <- searchK(
  DFM2stm$documents,
  DFM2stm$vocab,
  K = seq(14, 17, by = 1),
  max.em.its = 75,
  cores = 4,
  seed = 9999,
  # `seed` is only forwarded to stm(); the held-out document split has its own
  # seed argument. Without it the held-out likelihood differs between runs.
  heldout.seed = 9999
)
## Using multiple-cores. Progress will not be shown.
# Wall-clock time taken by the search (recorded value is from the original run).
Sys.time() - started_at
## Time difference of 2.658564 mins
plot(IdealK_13_17)

Сохраняю на диск
# Write DFM back exactly as before (same object, same file), so anything that
# later loads "DFM.RData" keeps working.
save(DFM, file = "DFM.RData")

# Also persist the topic-number tuning results. They take a long time to
# compute and were previously discarded when the session ended. Objects are
# picked up by name prefix, so this works for whichever sweeps were run.
tuning_results <- c(
  ls(pattern = "^LDAtuning\\.metrics_"),
  ls(pattern = "^IdealK_")
)
if (length(tuning_results) > 0) {
  save(list = tuning_results, file = "TopicNumberTuning.RData")
}