Start
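No setup chunk survives in this section, so it assumes the required packages were attached earlier. A minimal sketch of the attachments the code below relies on (inferred from the calls that follow, not from the original source):

library(tidyverse)   # read_csv(), bind_rows(), mutate(), str_*(), ggplot2, purrr
library(tidytext)    # get_stopwords(), unnest_tokens(), cast_sparse()
library(qdapRegex)   # rm_between_multiple()
library(SnowballC)   # wordStem()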

Data

Three littleone.ru threads, one per university: HSE (306 pages), LETI (509 pages), and Polytech (490 pages). Each CSV carries the post text, the poster's forum title, and three Cyrillic columns: Адрес (location), Регистрация (registration date), and Сообщений (post count).

tread_hse <- read_csv("/srv/store/students/vvsuschevskiy/noobsQA/littleone/littleone_csv/Thread_6941702_full_306_pages.csv")
## Parsed with column specification:
## cols(
##   post_text = col_character(),
##   user_title = col_character(),
##   Адрес = col_character(),
##   Регистрация = col_character(),
##   Сообщений = col_double()
## )
tread_hse$uni = "hse"

tread_leti <- read_csv("/srv/store/students/vvsuschevskiy/noobsQA/littleone/littleone_csv/Thread_7376205_full_509_pages.csv")
## Parsed with column specification: (same columns as above)
tread_leti$uni = "leti"

tread_poly <- read_csv("/srv/store/students/vvsuschevskiy/noobsQA/littleone/littleone_csv/Thread_5600277_full_490_pages.csv")
## Parsed with column specification: (same columns as above)
tread_poly$uni = "poly"

# stack the three threads, keeping only the post text and the university label
treads <- bind_rows(tread_hse, tread_leti, tread_poly) %>% 
  select(post_text, uni)

remove(tread_hse, tread_leti, tread_poly)
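A quick sanity check (an illustrative addition, not part of the original run) confirms the stacked data frame and the relative thread sizes:

treads %>% count(uni)   # number of posts per university thread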

Stopwords

stop_words_ru <- get_stopwords(language = "ru", source = "snowball")

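get_stopwords() returns a tibble with word and lexicon columns; a quick peek at the Russian Snowball list (an illustrative addition, not from the original run):

head(stop_words_ru$word)   # the first entries are common function words: и, в, во, не, ...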

Preprocessing

library(tidytext)
library(textstem)   # attached in the original run, though nothing below appears to use it
library(analogue)   # likewise apparently unused in this section

treads_1 <- treads %>% 
  mutate(
    post_text = post_text %>% 
      # drop quote headers ("Сообщение от" = "Originally posted by")
      str_remove_all("\\n\\n\\n\\n\\n\\n Сообщение от.*\\s") %>% 
      # drop quoted material enclosed in blank lines or in [quote] tags
      qdapRegex::rm_between_multiple("\\n\\n", "\\n\\n") %>% 
      qdapRegex::rm_between_multiple("quote", "quote") %>% 
      str_remove_all('\\"') %>% 
      str_remove_all("[:punct:]") %>%   # strip punctuation ...
      str_remove_all("[:digit:]") %>%   # ... and digits
      str_to_lower()
  )

# give every 10 consecutive posts a shared index: single forum posts are short,
# so each pseudo-document pools ten of them into something long enough to model
treads_1$index <- c(0, seq_len(nrow(treads_1) - 1) %/% 10)
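For example, 25 posts yield indices of ten 0s, ten 1s, and five 2s:

c(0, seq_len(24) %/% 10)
## [1] 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2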

treads_1 <- treads_1 %>% 
  mutate(index = case_when(          # offset the index per thread so that
    uni == "hse"  ~ index + 1000,    # pseudo-documents never mix universities
    uni == "leti" ~ index + 2000,
    uni == "poly" ~ index + 3000
  )) %>% 
  ungroup() %>% 
  group_by(index) %>% 
  summarise(text = paste(post_text, collapse = " "), uni = uni) %>% 
  unique()   # `uni = uni` leaves one row per post, so collapse the duplicates
## `summarise()` regrouping output by 'index' (override with `.groups` argument)
word_tokens <- treads_1 %>%
  unnest_tokens(word, text) 

Stemming


word_tokens_s <- word_tokens %>%
  ungroup() %>% 
  filter(!word %in% stop_words_ru$word) %>%   # drop stopwords before stemming
  mutate(word_stem = SnowballC::wordStem(word, language = "russian")) %>% 
  select(index, uni, word_stem) %>% 
  rename("word" = "word_stem")
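SnowballC's Russian stemmer truncates inflectional endings rather than lemmatizing, which is why the stems in the tables below look clipped (студент, преподавател, экзамен). A small illustration of the truncation (assumed output, worth re-checking in a live session):

SnowballC::wordStem(c("студенты", "студентов", "экзамены"), language = "russian")
## [1] "студент" "студент" "экзамен"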

Top stems

Domain-specific stop-stems are removed, then the most frequent stems are compared across the three threads.

# frequent but uninformative stems: эт ("this"), так ("so"), мо ("my")
stopstems_littleone <- c("эт", "так", "мо")

word_tokens_s %>% 
  filter(!word %in% stopstems_littleone) %>% 
  add_count(word) %>%
  filter(n > 100) %>%   # keep stems that occur more than 100 times overall
  select(-n) -> tidy_treads

word_tokens_s %>% 
  filter(!word %in% stopstems_littleone) %>% 
  count(uni, word) %>% 
  # the hse thread is much shorter, so rescale its counts to the poly thread's
  # token total to keep the bars comparable (leti is left as-is)
  mutate(n = ifelse(uni == "hse",
                    n * nrow(filter(word_tokens_s, uni == "poly")) /
                        nrow(filter(word_tokens_s, uni == "hse")),
                    n)) %>% 
  filter(n > 100) %>% 
  top_n(40, n) %>% 
  ggplot(aes(x = n, y = word, fill = uni)) +
  geom_col() +
  theme_minimal() +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

STM

treads_sparse <- tidy_treads %>%
  count(index, word) %>%
  cast_sparse(index, word, n)
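cast_sparse() pivots the tidy (index, word, n) triplets into the documents-by-terms sparse Matrix that stm() accepts directly; per the model printout further down, it comes out at 1306 pseudo-documents over a 408-stem vocabulary:

dim(treads_sparse)   # expect 1306 x 408, matching the model summary below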

library("future")
## 
## Attaching package: 'future'
## The following objects are masked from 'package:igraph':
## 
##     %->%, %<-%
# library(purrr)
library(furrr)
library(stm)
## stm v1.3.5 successfully loaded. See ?stm for help. 
##  Papers, resources, and other materials at structuraltopicmodel.com
## 
## Attaching package: 'stm'
## The following object is masked from 'package:lattice':
## 
##     cloud
plan(multiprocess, workers = 6)   # under RStudio this falls back to multisession (see warning)
## Warning: [ONE-TIME WARNING] Forked processing ('multicore') is disabled
## in future (>= 1.13.0) when running R from RStudio, because it is
## considered unstable. Because of this, plan("multicore") will fall
## back to plan("sequential"), and plan("multiprocess") will fall back to
## plan("multisession") - not plan("multicore") as in the past. For more details,
## how to control forked processing or not, and how to silence this warning in
## future R sessions, see ?future::supportsMulticore
# fit one model per candidate K, in parallel across the workers; no seed is
# set, so a refit will differ slightly from run to run
many_models_trd <- tibble(K = c(20, 40, 50, 60, 70, 80, 100)) %>%
  mutate(topic_model = future_map(K, ~stm(treads_sparse, K = .,
                                          verbose = FALSE)))

heldout_treads <- make.heldout(treads_sparse)   # hold out part of the dtm for out-of-sample likelihood

k_result_treads <- many_models_trd %>%
  mutate(exclusivity = map(topic_model, exclusivity),
         semantic_coherence = map(topic_model, semanticCoherence, treads_sparse),
         eval_heldout = map(topic_model, eval.heldout, heldout_treads$missing),
         residual = map(topic_model, checkResiduals, treads_sparse),
         bound = map_dbl(topic_model, function(x) max(x$convergence$bound)),
         lfact = map_dbl(topic_model, function(x) lfactorial(x$settings$dim$K)),
         lbound = bound + lfact,   # bound plus log(K!) corrects for label switching
         iterations = map_dbl(topic_model, function(x) length(x$convergence$bound)))


k_result_treads %>%
  transmute(K,
            `Lower bound` = lbound,
            Residuals = map_dbl(residual, "dispersion"),
            `Semantic coherence` = map_dbl(semantic_coherence, mean),
            `Held-out likelihood` = map_dbl(eval_heldout, "expected.heldout")) %>%
  gather(Metric, Value, -K) %>%
  ggplot(aes(K, Value, color = Metric)) +
  geom_line(size = 2.5, alpha = 0.7, show.legend = FALSE) +
  facet_wrap(~Metric, scales = "free_y") +
  labs(x = "K (number of topics)",
       y = NULL,
       title = "Model diagnostics by number of topics",
       subtitle = "These diagnostics indicate that a good number of topics would be around 25-30")+
  theme_minimal()

The elbow sits around K = 60, so exclusivity is compared against semantic coherence for K = 40, 60, and 80:

k_result_treads %>%
  select(K, exclusivity, semantic_coherence) %>%
  filter(K %in% c(40, 60, 80)) %>%
  unnest(cols = c(exclusivity, semantic_coherence)) %>%
  mutate(K = as.factor(K)) %>%
  ggplot(aes(semantic_coherence, exclusivity, color = K)) +
  geom_point(size = 2, alpha = 0.7) +
  labs(x = "Semantic coherence",
       y = "Exclusivity",
       title = "Comparing exclusivity and semantic coherence",
       subtitle = "Models with fewer topics have higher semantic coherence for more topics, but lower exclusivity")+
  theme_minimal()

topic_model_trd <- k_result_treads %>% 
  filter(K == 60) %>% 
  pull(topic_model) %>% 
  .[[1]]

topic_model_trd
## A topic model with 60 topics, 1306 documents and a 408 word dictionary.
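stm's labelTopics() is a handy cross-check on single topics, reporting top words under several weightings (highest probability, FREX, lift, score); for instance, for topic 16, which is inspected below:

labelTopics(topic_model_trd, topics = 16, n = 7)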

td_beta_trd <- tidy(topic_model_trd)

td_beta_trd %>% head()
## # A tibble: 6 x 3
##   topic term         beta
##   <int> <chr>       <dbl>
## 1     1 буд   0.00000358 
## 2     2 буд   0.00000131 
## 3     3 буд   0.000000375
## 4     4 буд   0.00560    
## 5     5 буд   0.0000180  
## 6     6 буд   0.000752
# gamma: per-document topic proportions, keyed back to the pseudo-document index
td_gamma_trd <- tidy(topic_model_trd, matrix = "gamma",
                     document_names = treads_1$index)

td_gamma_trd %>% 
  group_by(document) %>% 
  filter(gamma == max(gamma)) %>%   # keep each document's dominant topic...
  filter(topic == 16) %>%           # ...and list the documents dominated by topic 16
  arrange(-gamma)
## # A tibble: 16 x 3
## # Groups:   document [16]
##    document topic gamma
##       <dbl> <int> <dbl>
##  1     2370    16 0.794
##  2     4179    16 0.621
##  3     2632    16 0.587
##  4     2368    16 0.582
##  5     2369    16 0.533
##  6     2598    16 0.383
##  7     2807    16 0.318
##  8     1249    16 0.220
##  9     1231    16 0.198
## 10     1245    16 0.196
## 11     2309    16 0.187
## 12     1230    16 0.183
## 13     4118    16 0.165
## 14     4059    16 0.163
## 15     2556    16 0.130
## 16     2737    16 0.114
top_terms <- td_beta_trd %>%
  arrange(beta) %>%
  group_by(topic) %>%
  top_n(7, beta) %>%
  arrange(-beta) %>%
  select(topic, term) %>%
  summarise(terms = list(term)) %>%
  mutate(terms = map(terms, paste, collapse = ", ")) %>% 
  unnest(cols = c(terms))
## `summarise()` ungrouping output (override with `.groups` argument)
gamma_terms<- td_gamma_trd %>%
  group_by(topic) %>%
  summarise(gamma = mean(gamma)) %>%
  arrange(desc(gamma)) %>%
  left_join(top_terms, by = "topic") %>%
  mutate(topic = paste0("Topic ", topic),
         topic = reorder(topic, gamma))
## `summarise()` ungrouping output (override with `.groups` argument)
library(ggplot2)
library(scales)
gamma_terms %>%
  top_n(20, gamma) %>%
  ggplot(aes(topic, gamma, label = terms, fill = topic)) +
  geom_col(show.legend = FALSE, fill = "white", color = "black") +
  geom_text(hjust = 0, nudge_y = 0.0005, size = 3,
            family = "IBMPlexSans") +
  coord_flip() +
  scale_y_continuous(expand = c(0, 0),
                     limits = c(0, 0.09),
                     labels = percent_format()) +
  theme_minimal() +   # apply the base theme first, so the text tweaks below survive
  theme(plot.title = element_text(size = 16,
                                  family = "IBMPlexSans-Bold"),
        plot.subtitle = element_text(size = 13)) +
  labs(x = NULL, y = expression(gamma),
       title = "Top 20 topics by prevalence in the littleone threads",
       subtitle = "With the top stems that contribute to each topic")

library(kableExtra)
gamma_terms %>%
  select(topic, gamma, terms) %>%
  kable(digits = 3, 
        col.names = c("Topic", "Expected topic proportion", "Top 7 terms"))
Topic      Expected topic proportion   Top 7 terms
Topic 58   0.035                       очен, уч, нрав, сын, интересн, сложн, понрав
Topic 48   0.028                       студент, дет, сво, сам, родител, проблем, мам
Topic 52   0.028                       поздравля, спасиб, удач, всем, молодец, ваш, пуст
Topic 57   0.028                       добр, спасиб, ден, знает, ком, пожалуйст, девочк
Topic 44   0.027                       год, наш, след, прошл, плат, скольк, девочк
Topic 36   0.026                       ход, занят, физр, физкультур, зал, платн, занима
Topic 42   0.025                       физик, школ, математик, курс, информатик, репетитор, класс
Topic 5    0.024                       сдал, мата, физик, матан, сам, оста, говор
Topic 43   0.024                       вуз, друг, количеств, котор, например, имен, студент
Topic 18   0.023                       расписан, пар, утр, суббот, час, перв, ден
Topic 30   0.023                       экзам, зачет, экзамен, сдава, автомат, сдан, семестр
Topic 1    0.023                       балл, год, прошл, проходн, направлен, поступ, меньш
Topic 53   0.022                       работ, очен, хорош, студент, знан, имеет, желан
Topic 40   0.022                       пок, дом, как, врод, собира, дают, дела
Topic 46   0.021                       сесс, каникул, последн, го, перв, оста, экзам
Topic 29   0.020                       политех, институт, факультет, фтк, техническ, как, программирован
Topic 47   0.020                       групп, сын, фкти, остальн, друг, вмест, врод
Topic 35   0.020                       курс, законч, учеб, окончан, перв, прост, оста
Topic 27   0.019                       лэт, вуз, итм, уч, поступ, хочет, спбгу
Topic 15   0.019                       лекц, преподавател, учебник, занима, чита, студент, говор
Topic 54   0.018                       сын, преподавател, деканат, качеств, сдава, ден, сдат
Topic 45   0.017                       специальн, программ, бакалавриат, план, учебн, курс, направлен
Topic 21   0.017                       работа, магистратур, работ, дипл, диплом, врем, сво
Topic 6    0.017                       нов, будут, корпус, чтот, какт, завтр, полн
Topic 28   0.016                       хим, лаб, семестр, две, предмет, готов, очен
Topic 32   0.016                       курс, одн, контрольн, предмет, кажд, задан, сын
Topic 33   0.016                       втор, перв, волн, оста, помн, трет, могут
Topic 37   0.015                       кафедр, воен, направлен, факультет, друг, интересн, здоров
Topic 51   0.015                       вышк, вшэ, москв, истор, студент, информац, интерес
Topic 12   0.015                       стипенд, тройк, троек, семестр, получа, сесс, студент
Topic 22   0.015                       дума, сильн, фэл, ваш, одн, сын, зна
Topic 34   0.015                       бюджет, перевод, мест, платн, курс, те, реальн
Topic 7    0.015                       математик, вшэ, экономик, менеджмент, зна, программ, направлен
Topic 9    0.015                       куратор, курс, зна, сво, оп, учеб, групп
Topic 3    0.014                       рейтинг, скидк, оп, получ, студент, котор, курс
Topic 8    0.014                       подскаж, первокурсник, спасиб, карт, пожалуйст, сын, повод
Topic 31   0.014                       английск, язык, курс, школ, предмет, сам, дочк
Topic 41   0.014                       мест, заявлен, пода, документ, абитуриент, сво, мал
Topic 39   0.014                       вопрос, ответ, обучен, оплат, понима, семестр, учебн
Topic 49   0.013                       вчер, студенческ, жизн, билет, лет, сво, выда
Topic 24   0.013                       сайт, собран, спасиб, будут, написа, информац, посмотрет
Topic 38   0.013                       приказ, виж, смотр, появ, чтот, направлен, видим
Topic 50   0.013                       оценк, предмет, экзам, преподавател, писа, тем, наш
Topic 2    0.013                       сентябр, медосмотр, проход, нужн, здоров, сын, сказа
Topic 13   0.013                       егэ, подготовк, олимпиад, поступа, курс, поступлен, готов
Topic 10   0.013                       недел, зачетн, семестр, стал, час, меньш, сраз
Topic 25   0.012                       практик, курс, старш, младш, проход, го, год
Topic 17   0.012                       допс, долг, пересдач, комисс, сдал, зна, отличн
Topic 20   0.012                       списк, контракт, нам, тех, вообщ, сказа, стал
Topic 4    0.012                       вк, офицер, поступ, лет, сын, год, магистратур
Topic 23   0.011                       общежит, общаг, месяц, говор, общ, очен, информац
Topic 11   0.011                       приоритет, направлен, перв, сам, систем, специальн, сын
Topic 55   0.011                       поступлен, конкурс, средн, высок, нужн, ребят, общ
Topic 56   0.011                       очеред, постав, сдела, жела, сам, друг, люд
Topic 19   0.010                       врач, справк, сын, ден, завтр, дом, врем
Topic 26   0.010                       результат, тест, писа, написа, говор, программ, котор
Topic 16   0.009                       военкомат, сказа, сын, выда, ден, дела, дал
Topic 60   0.007                       получ, допуск, врем, всем, понятн, дел, дума
Topic 59   0.006                       отправл, реальн, дочк, сам, ещ, час, как
Topic 14   0.005                       tapatalk, нужн, пок, поэт, завис, дан, выбира