library(here)
library(tidyverse)
library(quanteda)
library(topicmodels)
library(ldatuning)
data_path <- here("data", "processed")
file_names <- list.files(data_path)
file_paths <- file.path(data_path, file_names)
d <- file_paths %>% set_names() %>% map_df(read_csv)
write_csv(d, here("data", "combined-data.csv"))
d$post <- tokens(d$post, remove_url = TRUE, remove_numbers = TRUE)
my_dfm <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE)
my_dfm
## Document-feature matrix of: 10,898 documents, 14,182 features (99.8% sparse).
topfeatures(my_dfm, 20)
## server game can get like just work make use need
## 4694 4459 4076 2726 2637 2554 2257 2252 2131 2011
## one skill player time play now also tri see thank
## 1800 1753 1733 1653 1628 1491 1478 1476 1400 1376
my_dfm_bi <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, ngrams = 2)
my_dfm_bi
## Document-feature matrix of: 10,898 documents, 178,531 features (100.0% sparse).
topfeatures(my_dfm_bi, 20)
## in_the the_game of_the the_server i_have you_can
## 2179 2157 1730 1498 1424 1295
## on_the to_be to_the if_you for_the need_to
## 1292 1270 1258 1213 1080 966
## would_be want_to i_am have_to is_a have_a
## 944 925 876 871 827 806
## abl_to it_is
## 794 772
my_dfm_bi_s <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, ngrams = 2) %>%
dfm_remove(pattern = c(paste0("^", stopwords("english"), "_"),
paste0("_", stopwords("english"), "$")),
valuetype = "regex")
my_dfm_bi_s
## Document-feature matrix of: 10,898 documents, 81,553 features (100.0% sparse).
topfeatures(my_dfm_bi_s, 20)
## skill_point right_now make_sure can_see look_like
## 431 306 192 150 139
## can_get skill_tree everi_time someth_like blast_furnac
## 122 119 116 113 104
## can_make server_list seem_like real_life eco_server
## 101 101 98 98 97
## can_use can_also feel_like singl_player server_admin
## 94 87 86 83 80
my_dfm_tri <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, ngrams = 3)
my_dfm_tri
## Document-feature matrix of: 10,898 documents, 418,785 features (100.0% sparse).
topfeatures(my_dfm_tri, 20)
## be_abl_to a_lot_of of_the_game in_the_game it_would_be
## 538 306 284 278 260
## i_want_to there_is_a you_need_to a_way_to on_the_server
## 230 228 223 203 193
## need_to_be you_have_to would_like_to you_want_to there_is_no
## 192 188 180 178 175
## the_game_is the_game_and is_there_a to_be_abl seem_to_be
## 173 166 163 159 159
my_dfm_tri_s <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, ngrams = 3) %>%
dfm_remove(pattern = c(paste0("^", stopwords("english"), "_"),
paste0("_", stopwords("english"), "$")),
valuetype = "regex")
my_dfm_tri_s
## Document-feature matrix of: 10,898 documents, 130,405 features (100.0% sparse).
topfeatures(my_dfm_tri_s, 20)
## play_the_game
## 114
## let_me_know
## 77
## like_to_see
## 71
## run_the_server
## 60
## boolean_publicon_boolean
## 56
## bought_the_game
## 54
## restart_the_server
## 52
## tri_to_get
## 51
## play_this_game
## 49
## want_to_play
## 47
## abl_to_get
## 46
## system.activator.createinst_type_type
## 45
## like_the_idea
## 43
## way_to_get
## 42
## seem_to_work
## 42
## want_to_make
## 41
## tri_to_make
## 40
## love_to_see
## 37
## need_to_make
## 36
## buy_the_game
## 36
my_dfm_quad <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, ngrams = 4)
my_dfm_quad
## Document-feature matrix of: 10,898 documents, 523,062 features (100.0% sparse).
topfeatures(my_dfm_quad, 20)
## to_be_abl_to
## 158
## i_would_like_to
## 125
## should_be_abl_to
## 95
## is_there_a_way
## 86
## if_you_want_to
## 80
## there_a_way_to
## 72
## it_would_be_nice
## 68
## would_be_nice_to
## 53
## i_am_not_sure
## 52
## you_should_be_abl
## 49
## at_the_same_time
## 46
## as_far_as_i
## 44
## i_was_abl_to
## 44
## when_i_tri_to
## 43
## at_system.activator.createinst_type_type
## 42
## in_filenam_unknown_at
## 42
## you_will_need_to
## 41
## figur_out_how_to
## 41
## i_would_love_to
## 41
## thank_you_for_your
## 39
my_dfm_quad_s <- dfm(d$post, remove = stopwords("english"), stem = TRUE, remove_punct = TRUE, ngrams = 4) %>%
dfm_remove(pattern = c(paste0("^", stopwords("english"), "_"),
paste0("_", stopwords("english"), "$")),
valuetype = "regex")
my_dfm_quad_s
## Document-feature matrix of: 10,898 documents, 147,713 features (>99.99% sparse).
topfeatures(my_dfm_quad_s, 20)
## thank_you_veri_much
## 31
## version_of_the_game
## 31
## end_of_inner_except
## 30
## inner_except_stack_trace
## 30
## boolean_publicon_boolean_skipcheckthi
## 29
## publicon_boolean_skipcheckthi_boolean
## 29
## boolean_skipcheckthi_boolean_fillcach
## 29
## except_has_been_thrown
## 28
## thrown_by_the_target
## 28
## system.runtimetype.createinstanceslow_boolean_publicon_boolean
## 28
## target_of_an_invoc
## 27
## system.runtimetypehandle.createinst_runtimetyp_type_boolean
## 27
## runtimetyp_type_boolean_publicon
## 27
## type_boolean_publicon_boolean
## 27
## boolean_publicon_boolean_nocheck
## 27
## publicon_boolean_nocheck_boolean
## 27
## boolean_nocheck_boolean_canbecach
## 27
## nocheck_boolean_canbecach_runtimemethodhandleintern
## 27
## boolean_canbecach_runtimemethodhandleintern_ctor
## 27
## canbecach_runtimemethodhandleintern_ctor_boolean
## 27
p <- function(model) {
topics <- tidytext::tidy(model, matrix = "beta")
top_terms <- topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
p <- top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
p
}
my_dtm <- my_dfm[ntoken(my_dfm) > 0,]
my_dtm <- convert(my_dtm, to = "topicmodels")
result <- FindTopicsNumber(
my_dtm,
topics = c(2, 5, 10, 15, 20, 25, 30, 35),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
mc.cores = 4L,
verbose = TRUE
)
## fit models... done.
## calculate metrics:
## Griffiths2004... done.
## CaoJuan2009... done.
## Arun2010... done.
## Deveaud2014... done.
FindTopicsNumber_plot(result)
Looking at 15 topics based on the fit measures.
m_grams <- LDA(my_dtm, 15)
p(m_grams)
my_dtm_1 <- my_dfm_bi_s[ntoken(my_dfm_bi_s) > 0,]
my_dtm_1 <- convert(my_dtm_1, to = "topicmodels")
result1 <- FindTopicsNumber(
my_dtm_1,
topics = c(2, 5, 10, 15, 20, 25, 30, 35),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
mc.cores = 4L,
verbose = TRUE
)
## fit models... done.
## calculate metrics:
## Griffiths2004... done.
## CaoJuan2009... done.
## Arun2010... done.
## Deveaud2014... done.
FindTopicsNumber_plot(result)
Looking at 15 topics based on the fit measures.
m_bigrams <- LDA(my_dtm_1, 15)
p(m_bigrams)