1 Loading, setting up

library(here)
library(tidyverse)
library(quanteda)
library(topicmodels)
library(ldatuning)
data_path <- here("data", "processed")
file_names <- list.files(data_path)
file_paths <- file.path(data_path, file_names)
d <- file_paths %>% set_names() %>% map_df(read_csv)
write_csv(d, here("data", "combined-data.csv"))
d$post <- tokens(d$post, remove_url = TRUE, remove_numbers = TRUE)

2 N-grams analysis

2.1 Words (grams)

my_dfm <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE)
my_dfm
## Document-feature matrix of: 10,898 documents, 14,182 features (99.8% sparse).
topfeatures(my_dfm, 20)
## server   game    can    get   like   just   work   make    use   need 
##   4694   4459   4076   2726   2637   2554   2257   2252   2131   2011 
##    one  skill player   time   play    now   also    tri    see  thank 
##   1800   1753   1733   1653   1628   1491   1478   1476   1400   1376

2.2 Pairs of words (bi-grams) - with and without stemming after creating the bi-gram

my_dfm_bi <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, ngrams = 2)
my_dfm_bi
## Document-feature matrix of: 10,898 documents, 178,531 features (100.0% sparse).
topfeatures(my_dfm_bi, 20)
##     in_the   the_game     of_the the_server     i_have    you_can 
##       2179       2157       1730       1498       1424       1295 
##     on_the      to_be     to_the     if_you    for_the    need_to 
##       1292       1270       1258       1213       1080        966 
##   would_be    want_to       i_am    have_to       is_a     have_a 
##        944        925        876        871        827        806 
##     abl_to      it_is 
##        794        772
my_dfm_bi_s <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, ngrams = 2) %>% 
  dfm_remove(pattern = c(paste0("^", stopwords("english"), "_"), 
                         paste0("_", stopwords("english"), "$")), 
             valuetype = "regex")
my_dfm_bi_s
## Document-feature matrix of: 10,898 documents, 81,553 features (100.0% sparse).
topfeatures(my_dfm_bi_s, 20)
##  skill_point    right_now    make_sure      can_see    look_like 
##          431          306          192          150          139 
##      can_get   skill_tree   everi_time  someth_like blast_furnac 
##          122          119          116          113          104 
##     can_make  server_list    seem_like    real_life   eco_server 
##          101          101           98           98           97 
##      can_use     can_also    feel_like singl_player server_admin 
##           94           87           86           83           80

2.3 Groups of three words (tri-grams)- with and without stemming after creating the tri-gram

my_dfm_tri <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, ngrams = 3)
my_dfm_tri
## Document-feature matrix of: 10,898 documents, 418,785 features (100.0% sparse).
topfeatures(my_dfm_tri, 20)
##     be_abl_to      a_lot_of   of_the_game   in_the_game   it_would_be 
##           538           306           284           278           260 
##     i_want_to    there_is_a   you_need_to      a_way_to on_the_server 
##           230           228           223           203           193 
##    need_to_be   you_have_to would_like_to   you_want_to   there_is_no 
##           192           188           180           178           175 
##   the_game_is  the_game_and    is_there_a     to_be_abl    seem_to_be 
##           173           166           163           159           159
my_dfm_tri_s <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, ngrams = 3) %>% 
  dfm_remove(pattern = c(paste0("^", stopwords("english"), "_"), 
                         paste0("_", stopwords("english"), "$")), 
             valuetype = "regex")
my_dfm_tri_s
## Document-feature matrix of: 10,898 documents, 130,405 features (100.0% sparse).
topfeatures(my_dfm_tri_s, 20)
##                         play_the_game 
##                                   114 
##                           let_me_know 
##                                    77 
##                           like_to_see 
##                                    71 
##                        run_the_server 
##                                    60 
##              boolean_publicon_boolean 
##                                    56 
##                       bought_the_game 
##                                    54 
##                    restart_the_server 
##                                    52 
##                            tri_to_get 
##                                    51 
##                        play_this_game 
##                                    49 
##                          want_to_play 
##                                    47 
##                            abl_to_get 
##                                    46 
## system.activator.createinst_type_type 
##                                    45 
##                         like_the_idea 
##                                    43 
##                            way_to_get 
##                                    42 
##                          seem_to_work 
##                                    42 
##                          want_to_make 
##                                    41 
##                           tri_to_make 
##                                    40 
##                           love_to_see 
##                                    37 
##                          need_to_make 
##                                    36 
##                          buy_the_game 
##                                    36

2.4 Groups of four words (quad-grams) - with and without stemming after creating the quad-gram

my_dfm_quad <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, ngrams = 4)
my_dfm_quad
## Document-feature matrix of: 10,898 documents, 523,062 features (100.0% sparse).
topfeatures(my_dfm_quad, 20)
##                             to_be_abl_to 
##                                      158 
##                          i_would_like_to 
##                                      125 
##                         should_be_abl_to 
##                                       95 
##                           is_there_a_way 
##                                       86 
##                           if_you_want_to 
##                                       80 
##                           there_a_way_to 
##                                       72 
##                         it_would_be_nice 
##                                       68 
##                         would_be_nice_to 
##                                       53 
##                            i_am_not_sure 
##                                       52 
##                        you_should_be_abl 
##                                       49 
##                         at_the_same_time 
##                                       46 
##                              as_far_as_i 
##                                       44 
##                             i_was_abl_to 
##                                       44 
##                            when_i_tri_to 
##                                       43 
## at_system.activator.createinst_type_type 
##                                       42 
##                    in_filenam_unknown_at 
##                                       42 
##                         you_will_need_to 
##                                       41 
##                         figur_out_how_to 
##                                       41 
##                          i_would_love_to 
##                                       41 
##                       thank_you_for_your 
##                                       39
my_dfm_quad_s <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, ngrams = 4) %>% 
  dfm_remove(pattern = c(paste0("^", stopwords("english"), "_"), 
                         paste0("_", stopwords("english"), "$")), 
             valuetype = "regex")
my_dfm_quad_s
## Document-feature matrix of: 10,898 documents, 147,713 features (>99.99% sparse).
topfeatures(my_dfm_quad_s, 20)
##                                            thank_you_veri_much 
##                                                             31 
##                                            version_of_the_game 
##                                                             31 
##                                            end_of_inner_except 
##                                                             30 
##                                       inner_except_stack_trace 
##                                                             30 
##                          boolean_publicon_boolean_skipcheckthi 
##                                                             29 
##                          publicon_boolean_skipcheckthi_boolean 
##                                                             29 
##                          boolean_skipcheckthi_boolean_fillcach 
##                                                             29 
##                                         except_has_been_thrown 
##                                                             28 
##                                           thrown_by_the_target 
##                                                             28 
## system.runtimetype.createinstanceslow_boolean_publicon_boolean 
##                                                             28 
##                                             target_of_an_invoc 
##                                                             27 
##    system.runtimetypehandle.createinst_runtimetyp_type_boolean 
##                                                             27 
##                               runtimetyp_type_boolean_publicon 
##                                                             27 
##                                  type_boolean_publicon_boolean 
##                                                             27 
##                               boolean_publicon_boolean_nocheck 
##                                                             27 
##                               publicon_boolean_nocheck_boolean 
##                                                             27 
##                              boolean_nocheck_boolean_canbecach 
##                                                             27 
##            nocheck_boolean_canbecach_runtimemethodhandleintern 
##                                                             27 
##               boolean_canbecach_runtimemethodhandleintern_ctor 
##                                                             27 
##               canbecach_runtimemethodhandleintern_ctor_boolean 
##                                                             27

3 Topic models

p <- function(model) {

topics <- tidytext::tidy(model, matrix = "beta")

top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

p <- top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

p
}

3.1 With words (grams)

my_dtm <- my_dfm[ntoken(my_dfm) > 0,]
my_dtm <- convert(my_dtm, to = "topicmodels")

result <- FindTopicsNumber(
  my_dtm,
  topics = c(2, 5, 10, 15, 20, 25, 30, 35),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 4L,
  verbose = TRUE
)
## fit models... done.
## calculate metrics:
##   Griffiths2004... done.
##   CaoJuan2009... done.
##   Arun2010... done.
##   Deveaud2014... done.
FindTopicsNumber_plot(result)

Looking at 15 topics based on the fit measures.

m_grams <- LDA(my_dtm, 15)
p(m_grams)

3.2 With pairs of words (bi-grams, stemmed)

my_dtm_1 <- my_dfm_bi_s[ntoken(my_dfm_bi_s) > 0,]
my_dtm_1 <- convert(my_dtm_1, to = "topicmodels")

result1 <- FindTopicsNumber(
  my_dtm_1,
  topics = c(2, 5, 10, 15, 20, 25, 30, 35),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 4L,
  verbose = TRUE
)
## fit models... done.
## calculate metrics:
##   Griffiths2004... done.
##   CaoJuan2009... done.
##   Arun2010... done.
##   Deveaud2014... done.
FindTopicsNumber_plot(result)

Looking at 15 topics based on the fit measures.

m_bigrams <- LDA(my_dtm_1, 15)
p(m_bigrams)