tread_hse <- read_csv("/srv/store/students/vvsuschevskiy/noobsQA/littleone/littleone_csv/Thread_6941702_full_306_pages.csv")
## Parsed with column specification:
## cols(
## post_text = col_character(),
## user_title = col_character(),
## Адрес = col_character(),
## Регистрация = col_character(),
## Сообщений = col_double()
## )
tread_hse$uni = "hse"
tread_leti <- read_csv("/srv/store/students/vvsuschevskiy/noobsQA/littleone/littleone_csv/Thread_7376205_full_509_pages.csv")
## Parsed with column specification:
## cols(
## post_text = col_character(),
## user_title = col_character(),
## Адрес = col_character(),
## Регистрация = col_character(),
## Сообщений = col_double()
## )
tread_leti$uni = "leti"
tread_poly <- read_csv("/srv/store/students/vvsuschevskiy/noobsQA/littleone/littleone_csv/Thread_5600277_full_490_pages.csv")
## Parsed with column specification:
## cols(
## post_text = col_character(),
## user_title = col_character(),
## Адрес = col_character(),
## Регистрация = col_character(),
## Сообщений = col_double()
## )
tread_poly$uni = "poly"
treads = bind_rows(tread_hse, tread_leti, tread_poly) %>%
select(post_text, uni)
remove(tread_hse, tread_leti, tread_poly)
stop_words_ru = get_stopwords("ru", "snowball")
# Thread_8210092 %>%
# unnest_tokens(word, post_text) %>%
# anti_join(stop_words_ru) -> Thread_tidy
library(tidytext)
# library(SnowballC)
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
##
## available.koRpus.lang()
##
## and see ?install.koRpus.lang()
##
## Attaching package: 'koRpus'
## The following object is masked from 'package:readr':
##
## tokenize
## The following objects are masked from 'package:quanteda':
##
## tokens, types
library(analogue)
## Loading required package: vegan
## Loading required package: permute
##
## Attaching package: 'permute'
## The following object is masked from 'package:igraph':
##
## permute
## Loading required package: lattice
## This is vegan 2.5-6
##
## Attaching package: 'vegan'
## The following object is masked from 'package:igraph':
##
## diversity
## analogue version 0.17-5
##
## Attaching package: 'analogue'
## The following object is masked from 'package:igraph':
##
## compare
# treads$post_text %>% tail(100) %>%
# str_remove_all("\\n\\n\\n\\n\\n\\n Сообщение от.*\\s") %>%
# rm_between_multiple("\\n\\n","\\n\\n") %>%
# rm_between_multiple("quote", "quote") %>%
# str_remove_all('\\"') %>%
# str_remove_all("[:punct:]") %>%
# str_remove_all("[:digit:]") %>%
# str_to_lower()
# library(qdapRegex)
treads_1 = treads %>%
mutate(
post_text = post_text %>%
str_remove_all("\\n\\n\\n\\n\\n\\n Сообщение от.*\\s") %>%
qdapRegex::rm_between_multiple("\\n\\n","\\n\\n") %>%
qdapRegex::rm_between_multiple("quote", "quote") %>%
str_remove_all('\\"') %>%
str_remove_all("[:punct:]") %>%
str_remove_all("[:digit:]") %>%
str_to_lower()
)
treads_1$index <- c(0, rep(1:(nrow(treads_1)-1)%/%10))
treads_1 = treads_1 %>%
mutate(index = case_when(
uni == "hse"~index+1000,
uni == "leti"~index+2000,
uni == "poly"~index+3000
)) %>%
ungroup() %>%
group_by(index) %>%
summarise(text = paste(post_text, collapse = " "), uni = uni) %>%
unique()
## `summarise()` regrouping output by 'index' (override with `.groups` argument)
word_tokens <- treads_1 %>%
unnest_tokens(word, text)
# library(SnowballC)
word_tokens_s = word_tokens %>%
ungroup() %>%
filter(!word %in% stop_words_ru$word) %>%
mutate(word_stem = SnowballC::wordStem(word, language = "russian")) %>%
select(index, uni, word_stem) %>%
rename("word" = "word_stem")
stopstems_littleone = c("эт", "так", "мо")
word_tokens_s %>%
filter(!word %in% stopstems_littleone) %>%
add_count(word) %>%
filter(n > 100) %>%
select(-n) -> tidy_treads
word_tokens_s %>%
filter(!word %in% stopstems_littleone) %>%
group_by(uni) %>%
add_count(word) %>%
mutate(n = ifelse(uni=="hse", n*((nrow(word_tokens_s %>% filter(uni == "poly")))/nrow(word_tokens_s %>% filter(uni == "hse"))), n)
) %>%
group_by(word, uni) %>%
summarise(n = sum(n)) %>%
filter(n > 100) %>%
ungroup() %>%
top_n(40, n) %>%
ggplot(aes(x = n, y= word, fill = uni))+
geom_bar(stat = "identity") +
theme_minimal()+
# facet_grid(~uni)+
coord_flip()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `summarise()` regrouping output by 'word' (override with `.groups` argument)
treads_sparse <- tidy_treads %>%
count(index, word) %>%
cast_sparse(index, word, n)
?cast_sparse
library("future")
##
## Attaching package: 'future'
## The following objects are masked from 'package:igraph':
##
## %->%, %<-%
# library(purrr)
library(furrr)
library(stm)
## stm v1.3.5 successfully loaded. See ?stm for help.
## Papers, resources, and other materials at structuraltopicmodel.com
##
## Attaching package: 'stm'
## The following object is masked from 'package:lattice':
##
## cloud
plan(multiprocess, workers =6 )
## Warning: [ONE-TIME WARNING] Forked processing ('multicore') is disabled
## in future (>= 1.13.0) when running R from RStudio, because it is
## considered unstable. Because of this, plan("multicore") will fall
## back to plan("sequential"), and plan("multiprocess") will fall back to
## plan("multisession") - not plan("multicore") as in the past. For more details,
## how to control forked processing or not, and how to silence this warning in
## future R sessions, see ?future::supportsMulticore
many_models_trd <- tibble(K = c(20, 40, 50, 60, 70, 80, 100)) %>%
mutate(topic_model = future_map(K, ~stm(treads_sparse, K = .,
verbose = FALSE)))
heldout_treads <- make.heldout(treads_sparse)
k_result_treads <- many_models_trd %>%
mutate(exclusivity = map(topic_model, exclusivity),
semantic_coherence = map(topic_model, semanticCoherence, treads_sparse),
eval_heldout = map(topic_model, eval.heldout, heldout_treads$missing),
residual = map(topic_model, checkResiduals, treads_sparse),
bound = map_dbl(topic_model, function(x) max(x$convergence$bound)),
lfact = map_dbl(topic_model, function(x) lfactorial(x$settings$dim$K)),
lbound = bound + lfact,
iterations = map_dbl(topic_model, function(x) length(x$convergence$bound)))
k_result_treads %>%
transmute(K,
`Lower bound` = lbound,
Residuals = map_dbl(residual, "dispersion"),
`Semantic coherence` = map_dbl(semantic_coherence, mean),
`Held-out likelihood` = map_dbl(eval_heldout, "expected.heldout")) %>%
gather(Metric, Value, -K) %>%
ggplot(aes(K, Value, color = Metric)) +
geom_line(size = 2.5, alpha = 0.7, show.legend = FALSE) +
facet_wrap(~Metric, scales = "free_y") +
labs(x = "K (number of topics)",
y = NULL,
title = "Model diagnostics by number of topics",
subtitle = "These diagnostics indicate that a good number of topics would be around 25-30")+
theme_minimal()
k_result_treads %>%
select(K, exclusivity, semantic_coherence) %>%
filter(K %in% c(40, 60, 80)) %>%
unnest(cols = c(exclusivity, semantic_coherence)) %>%
mutate(K = as.factor(K)) %>%
ggplot(aes(semantic_coherence, exclusivity, color = K)) +
geom_point(size = 2, alpha = 0.7) +
labs(x = "Semantic coherence",
y = "Exclusivity",
title = "Comparing exclusivity and semantic coherence",
subtitle = "Models with fewer topics have higher semantic coherence for more topics, but lower exclusivity")+
theme_minimal()
topic_model_trd <- k_result_treads %>%
filter(K == 60) %>%
pull(topic_model) %>%
.[[1]]
topic_model_trd
## A topic model with 60 topics, 1306 documents and a 408 word dictionary.
# topic_model_trd
td_beta_trd <- tidy(topic_model_trd)
td_beta_trd %>%head()
## # A tibble: 6 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 буд 0.00000358
## 2 2 буд 0.00000131
## 3 3 буд 0.000000375
## 4 4 буд 0.00560
## 5 5 буд 0.0000180
## 6 6 буд 0.000752
td_gamma_trd <- tidy(topic_model_trd, matrix = "gamma",
document_names = treads_1$index)
td_gamma_trd %>%
group_by(document) %>%
filter(gamma == max(gamma)) %>%
filter(topic == 16)%>% arrange(-gamma)
## # A tibble: 16 x 3
## # Groups: document [16]
## document topic gamma
## <dbl> <int> <dbl>
## 1 2370 16 0.794
## 2 4179 16 0.621
## 3 2632 16 0.587
## 4 2368 16 0.582
## 5 2369 16 0.533
## 6 2598 16 0.383
## 7 2807 16 0.318
## 8 1249 16 0.220
## 9 1231 16 0.198
## 10 1245 16 0.196
## 11 2309 16 0.187
## 12 1230 16 0.183
## 13 4118 16 0.165
## 14 4059 16 0.163
## 15 2556 16 0.130
## 16 2737 16 0.114
top_terms <- td_beta_trd %>%
arrange(beta) %>%
group_by(topic) %>%
top_n(7, beta) %>%
arrange(-beta) %>%
select(topic, term) %>%
summarise(terms = list(term)) %>%
mutate(terms = map(terms, paste, collapse = ", ")) %>%
unnest(cols = c(terms))
## `summarise()` ungrouping output (override with `.groups` argument)
gamma_terms<- td_gamma_trd %>%
group_by(topic) %>%
summarise(gamma = mean(gamma)) %>%
arrange(desc(gamma)) %>%
left_join(top_terms, by = "topic") %>%
mutate(topic = paste0("Topic ", topic),
topic = reorder(topic, gamma))
## `summarise()` ungrouping output (override with `.groups` argument)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
gamma_terms %>%
top_n(20, gamma) %>%
# knitr::kable(digits = 3,
# col.names = c("Topic", "Expected topic proportion (gamma)", "Top 7 journals"),
# #"latex",
# booktabs = T,longtable = T, caption = "The most common journals") %>%
# kable_styling(full_width = F) %>%
# column_spec(1, width = "5em") %>%
# column_spec(2, width = "5em") %>%
# column_spec(3, width = "30em")
ggplot(aes(topic, gamma, label = terms, fill = topic)) +
geom_col(show.legend = FALSE, fill = "white", color = "black") +
geom_text(hjust = 0, nudge_y = 0.0005, size = 3,
family = "IBMPlexSans") +
coord_flip() +
scale_y_continuous(expand = c(0,0),
limits = c(0, 0.09),
labels = percent_format()) +
theme(plot.title = element_text(size = 16,
family="IBMPlexSans-Bold"),
plot.subtitle = element_text(size = 13)) +
labs(x = NULL, y = expression(gamma),
title = "Top 20 topics by prevalence among JASSS authors' journals",
subtitle = "With the top journals that contribute to each topic")+
theme_minimal()
library(kableExtra)
gamma_terms %>%
select(topic, gamma, terms) %>%
kable(digits = 3,
col.names = c("Topic", "Expected topic proportion", "Top 7 terms"))
Topic | Expected topic proportion | Top 7 terms |
---|---|---|
Topic 58 | 0.035 | очен, уч, нрав, сын, интересн, сложн, понрав |
Topic 48 | 0.028 | студент, дет, сво, сам, родител, проблем, мам |
Topic 52 | 0.028 | поздравля, спасиб, удач, всем, молодец, ваш, пуст |
Topic 57 | 0.028 | добр, спасиб, ден, знает, ком, пожалуйст, девочк |
Topic 44 | 0.027 | год, наш, след, прошл, плат, скольк, девочк |
Topic 36 | 0.026 | ход, занят, физр, физкультур, зал, платн, занима |
Topic 42 | 0.025 | физик, школ, математик, курс, информатик, репетитор, класс |
Topic 5 | 0.024 | сдал, мата, физик, матан, сам, оста, говор |
Topic 43 | 0.024 | вуз, друг, количеств, котор, например, имен, студент |
Topic 18 | 0.023 | расписан, пар, утр, суббот, час, перв, ден |
Topic 30 | 0.023 | экзам, зачет, экзамен, сдава, автомат, сдан, семестр |
Topic 1 | 0.023 | балл, год, прошл, проходн, направлен, поступ, меньш |
Topic 53 | 0.022 | работ, очен, хорош, студент, знан, имеет, желан |
Topic 40 | 0.022 | пок, дом, как, врод, собира, дают, дела |
Topic 46 | 0.021 | сесс, каникул, последн, го, перв, оста, экзам |
Topic 29 | 0.020 | политех, институт, факультет, фтк, техническ, как, программирован |
Topic 47 | 0.020 | групп, сын, фкти, остальн, друг, вмест, врод |
Topic 35 | 0.020 | курс, законч, учеб, окончан, перв, прост, оста |
Topic 27 | 0.019 | лэт, вуз, итм, уч, поступ, хочет, спбгу |
Topic 15 | 0.019 | лекц, преподавател, учебник, занима, чита, студент, говор |
Topic 54 | 0.018 | сын, преподавател, деканат, качеств, сдава, ден, сдат |
Topic 45 | 0.017 | специальн, программ, бакалавриат, план, учебн, курс, направлен |
Topic 21 | 0.017 | работа, магистратур, работ, дипл, диплом, врем, сво |
Topic 6 | 0.017 | нов, будут, корпус, чтот, какт, завтр, полн |
Topic 28 | 0.016 | хим, лаб, семестр, две, предмет, готов, очен |
Topic 32 | 0.016 | курс, одн, контрольн, предмет, кажд, задан, сын |
Topic 33 | 0.016 | втор, перв, волн, оста, помн, трет, могут |
Topic 37 | 0.015 | кафедр, воен, направлен, факультет, друг, интересн, здоров |
Topic 51 | 0.015 | вышк, вшэ, москв, истор, студент, информац, интерес |
Topic 12 | 0.015 | стипенд, тройк, троек, семестр, получа, сесс, студент |
Topic 22 | 0.015 | дума, сильн, фэл, ваш, одн, сын, зна |
Topic 34 | 0.015 | бюджет, перевод, мест, платн, курс, те, реальн |
Topic 7 | 0.015 | математик, вшэ, экономик, менеджмент, зна, программ, направлен |
Topic 9 | 0.015 | куратор, курс, зна, сво, оп, учеб, групп |
Topic 3 | 0.014 | рейтинг, скидк, оп, получ, студент, котор, курс |
Topic 8 | 0.014 | подскаж, первокурсник, спасиб, карт, пожалуйст, сын, повод |
Topic 31 | 0.014 | английск, язык, курс, школ, предмет, сам, дочк |
Topic 41 | 0.014 | мест, заявлен, пода, документ, абитуриент, сво, мал |
Topic 39 | 0.014 | вопрос, ответ, обучен, оплат, понима, семестр, учебн |
Topic 49 | 0.013 | вчер, студенческ, жизн, билет, лет, сво, выда |
Topic 24 | 0.013 | сайт, собран, спасиб, будут, написа, информац, посмотрет |
Topic 38 | 0.013 | приказ, виж, смотр, появ, чтот, направлен, видим |
Topic 50 | 0.013 | оценк, предмет, экзам, преподавател, писа, тем, наш |
Topic 2 | 0.013 | сентябр, медосмотр, проход, нужн, здоров, сын, сказа |
Topic 13 | 0.013 | егэ, подготовк, олимпиад, поступа, курс, поступлен, готов |
Topic 10 | 0.013 | недел, зачетн, семестр, стал, час, меньш, сраз |
Topic 25 | 0.012 | практик, курс, старш, младш, проход, го, год |
Topic 17 | 0.012 | допс, долг, пересдач, комисс, сдал, зна, отличн |
Topic 20 | 0.012 | списк, контракт, нам, тех, вообщ, сказа, стал |
Topic 4 | 0.012 | вк, офицер, поступ, лет, сын, год, магистратур |
Topic 23 | 0.011 | общежит, общаг, месяц, говор, общ, очен, информац |
Topic 11 | 0.011 | приоритет, направлен, перв, сам, систем, специальн, сын |
Topic 55 | 0.011 | поступлен, конкурс, средн, высок, нужн, ребят, общ |
Topic 56 | 0.011 | очеред, постав, сдела, жела, сам, друг, люд |
Topic 19 | 0.010 | врач, справк, сын, ден, завтр, дом, врем |
Topic 26 | 0.010 | результат, тест, писа, написа, говор, программ, котор |
Topic 16 | 0.009 | военкомат, сказа, сын, выда, ден, дела, дал |
Topic 60 | 0.007 | получ, допуск, врем, всем, понятн, дел, дума |
Topic 59 | 0.006 | отправл, реальн, дочк, сам, ещ, час, как |
Topic 14 | 0.005 | tapatalk, нужн, пок, поэт, завис, дан, выбира |