library(dplyr)
library(data.table)
library(ggplot2)
#install.packages('topicmodels')

6.1 Latent Dirichlet allocation

library(topicmodels)

data("AssociatedPress")
AssociatedPress
# set a seed so that the output of the model is predictable
ap_lda <- LDA(AssociatedPress, k = 2, control = list(seed = 1234))
ap_lda

6.1.1 Word-topic probabilities

library(tidytext)

ap_topics <- tidy(ap_lda, matrix = "beta")
ap_topics

library(ggplot2)
library(dplyr)

ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
library(tidyr)

beta_spread <- ap_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_spread

6.1.2 Document-topic probabilities

ap_documents <- tidy(ap_lda, matrix = "gamma")
ap_documents
tidy(AssociatedPress) %>%
  filter(document == 6) %>%
  arrange(desc(count))

6.2 Example: the great library heist

lyric_data=fread('../data/prince_raw_data.csv')
lyric_data %>% 
  group_by(album) %>%
  summarise(
    count=n()
  ) %>%
  as.data.frame %>%
  .[order(.$count,decreasing=T),]
albums=c('Other Songs','Crystal Ball','Emancipation','Rave Un2 the Joy Fantastic')
library(stringr)

# divide into documents, each representing one chapter
by_album <- lyric_data[,1:6] %>%
  filter(album %in% albums) %>%
  group_by(album) %>%
  mutate(linenumber=row_number()) %>%
  ungroup() %>%
  unite(document, album)

# split into words
by_album_word <- by_album %>%
  unnest_tokens(word, text)

# find document-word counts
word_counts <- by_album_word %>%
  anti_join(stop_words) %>%
  count(document, word, sort = TRUE) %>%
  ungroup()

word_counts

6.2.1 LDA on albums

albums_dtm <- word_counts %>%
  cast_dtm(document, word, n)

albums_dtm
albums_lda <- LDA(albums_dtm, k = 4, control = list(seed = 1234))
albums_lda
album_topics <- tidy(albums_lda, matrix = "beta")
album_topics
top_terms <- album_topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms
library(ggplot2)

top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

6.2.2 Per-document classification

albums_gamma <- tidy(albums_lda, matrix = "gamma")
albums_gamma
names(albums_gamma)[1]='album'
names(albums_gamma)
# reorder albums in order of topic 1, topic 2, etc before plotting
albums_gamma %>%
  mutate(album = reorder(album, gamma * topic)) %>%
  ggplot(aes(factor(topic), gamma)) +
  geom_boxplot() +
  facet_wrap(~ album)

6.3 Alternative LDA implementations

#install.packages('mallet')
library(mallet)

# create a vector with one string per chapter
collapsed <- by_album_word %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word = str_replace(word, "'", "")) %>%
  group_by(document) %>%
  summarize(text = paste(word, collapse = " "))

# create an empty file of "stopwords"
file.create(empty_file <- tempfile())
docs <- mallet.import(collapsed$document, collapsed$text, empty_file)

mallet_model <- MalletLDA(num.topics = 4)
mallet_model$loadDocuments(docs)
mallet_model$train(100)
# word-topic pairs
tidy(mallet_model)

# document-topic pairs
tidy(mallet_model, matrix = "gamma")

# column needs to be named "term" for "augment"
term_counts <- rename(word_counts, term = word)
augment(mallet_model, term_counts)
LS0tDQp0aXRsZTogIkNINiBUb3BpYyBtb2RlbGluZyINCmF1dGhvcjogJ+WKieiCsumKmCcNCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCg0KYGBge3J9DQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeShkYXRhLnRhYmxlKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KYGBgDQoNCg0KDQpgYGB7cn0NCiNpbnN0YWxsLnBhY2thZ2VzKCd0b3BpY21vZGVscycpDQpgYGANCg0KDQojIyM2LjEgTGF0ZW50IERpcmljaGxldCBhbGxvY2F0aW9uDQoNCg0KYGBge3J9DQpsaWJyYXJ5KHRvcGljbW9kZWxzKQ0KDQpkYXRhKCJBc3NvY2lhdGVkUHJlc3MiKQ0KQXNzb2NpYXRlZFByZXNzDQpgYGANCg0KDQoNCmBgYHtyfQ0KIyBzZXQgYSBzZWVkIHNvIHRoYXQgdGhlIG91dHB1dCBvZiB0aGUgbW9kZWwgaXMgcHJlZGljdGFibGUNCmFwX2xkYSA8LSBMREEoQXNzb2NpYXRlZFByZXNzLCBrID0gMiwgY29udHJvbCA9IGxpc3Qoc2VlZCA9IDEyMzQpKQ0KYXBfbGRhDQpgYGANCg0KDQoNCiMjIyM2LjEuMSBXb3JkLXRvcGljIHByb2JhYmlsaXRpZXMNCg0KYGBge3J9DQpsaWJyYXJ5KHRpZHl0ZXh0KQ0KDQphcF90b3BpY3MgPC0gdGlkeShhcF9sZGEsIG1hdHJpeCA9ICJiZXRhIikNCmFwX3RvcGljcw0KDQpgYGANCg0KYGBge3J9DQoNCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkoZHBseXIpDQoNCmFwX3RvcF90ZXJtcyA8LSBhcF90b3BpY3MgJT4lDQogIGdyb3VwX2J5KHRvcGljKSAlPiUNCiAgdG9wX24oMTAsIGJldGEpICU+JQ0KICB1bmdyb3VwKCkgJT4lDQogIGFycmFuZ2UodG9waWMsIC1iZXRhKQ0KDQphcF90b3BfdGVybXMgJT4lDQogIG11dGF0ZSh0ZXJtID0gcmVvcmRlcih0ZXJtLCBiZXRhKSkgJT4lDQogIGdncGxvdChhZXModGVybSwgYmV0YSwgZmlsbCA9IGZhY3Rvcih0b3BpYykpKSArDQogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsNCiAgZmFjZXRfd3JhcCh+IHRvcGljLCBzY2FsZXMgPSAiZnJlZSIpICsNCiAgY29vcmRfZmxpcCgpDQpgYGANCg0KDQoNCmBgYHtyfQ0KbGlicmFyeSh0aWR5cikNCg0KYmV0YV9zcHJlYWQgPC0gYXBfdG9waWNzICU+JQ0KICBtdXRhdGUodG9waWMgPSBwYXN0ZTAoInRvcGljIiwgdG9waWMpKSAlPiUNCiAgc3ByZWFkKHRvcGljLCBiZXRhKSAlPiUNCiAgZmlsdGVyKHRvcGljMSA+IC4wMDEgfCB0b3BpYzIgPiAuMDAxKSAlPiUNCiAgbXV0YXRlKGxvZ19yYXRpbyA9IGxvZzIodG9waWMyIC8gdG9waWMxKSkNCg0KYmV0YV9zcHJlYWQNCmBgYA0KDQoNCg0KIyMjIzYuMS4yIERvY3VtZW50LXRvcGljIHByb2JhYmlsaXRpZXMNCg0KYGBge3J9DQphcF9kb2N1bWVudHMgPC0gdGlkeShhcF9sZGEsIG1hdHJpeCA9ICJnYW1tYSIpDQphcF9kb2N1bWVudHMNCmBgYA0KDQoNCmBgYHtyfQ0KdGlkeShBc3NvY2lhdGVkUHJlc3MpICU+JQ0KICBmaWx0ZXIoZG9jdW1lbnQgPT0gNikgJT4lDQogIGFycmFuZ2UoZGVzYyhjb3VudCkpDQpgYGANCg0KDQoNCg0KIyMjNi4yIEV4YW1wbGU6IHRoZSBncmVhdCBsaWJyYXJ5IGhlaXN0DQoNCg0KDQpgYGB7cn0NCmx5cmljX2RhdGE9ZnJlYWQoJy4uL2RhdGEvcHJpbmNlX3Jhd19kYXRhLmNzdicpDQpgYGANCg0KDQpgYGB7cn0NCmx5cmljX2RhdGEgJT4lIA0KICBncm91cF9ieShhbGJ1bSkgJT4lDQogIHN1bW1hcmlzZSgNCiAgICBjb3VudD1uKCkNCiAgKSAlPiUNCiAgYXMuZGF0YS5mcmFtZSAlPiUNCiAgLltvcmRlciguJGNvdW50LGRlY3JlYXNpbmc9VCksXQ0KDQpgYGANCg0KDQoNCmBgYHtyfQ0KYWxidW1zPWMoJ090aGVyIFNvbmdzJywnQ3J5c3RhbCBCYWxsJywnRW1hbmNpcGF0aW9uJywnUmF2ZSBVbjIgdGhlIEpveSBGYW50YXN0aWMnKQ0KYGBgDQoNCg0KYGBge3J9DQpsaWJyYXJ5KHN0cmluZ3IpDQoNCiMgZGl2aWRlIGludG8gZG9jdW1lbnRzLCBlYWNoIHJlcHJlc2VudGluZyBvbmUgY2hhcHRlcg0KYnlfYWxidW0gPC0gbHlyaWNfZGF0YVssMTo2XSAlPiUNCiAgZmlsdGVyKGFsYnVtICVpbiUgYWxidW1zKSAlPiUNCiAgZ3JvdXBfYnkoYWxidW0pICU+JQ0KICBtdXRhdGUobGluZW51bWJlcj1yb3dfbnVtYmVyKCkpICU+JQ0KICB1bmdyb3VwKCkgJT4lDQogIHVuaXRlKGRvY3VtZW50LCBhbGJ1bSkNCg0KIyBzcGxpdCBpbnRvIHdvcmRzDQpieV9hbGJ1bV93b3JkIDwtIGJ5X2FsYnVtICU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpDQoNCiMgZmluZCBkb2N1bWVudC13b3JkIGNvdW50cw0Kd29yZF9jb3VudHMgPC0gYnlfYWxidW1fd29yZCAlPiUNCiAgYW50aV9qb2luKHN0b3Bfd29yZHMpICU+JQ0KICBjb3VudChkb2N1bWVudCwgd29yZCwgc29ydCA9IFRSVUUpICU+JQ0KICB1bmdyb3VwKCkNCg0Kd29yZF9jb3VudHMNCmBgYA0KDQoNCg0KDQojIyMjNi4yLjEgTERBIG9uIGFsYnVtcw0KDQpgYGB7cn0NCmFsYnVtc19kdG0gPC0gd29yZF9jb3VudHMgJT4lDQogIGNhc3RfZHRtKGRvY3VtZW50LCB3b3JkLCBuKQ0KDQphbGJ1bXNfZHRtDQpgYGANCg0KDQpgYGB7cn0NCmFsYnVtc19sZGEgPC0gTERBKGFsYnVtc19kdG0sIGsgPSA0LCBjb250cm9sID0gbGlzdChzZWVkID0gMTIzNCkpDQphbGJ1bXNfbGRhDQpgYGANCg0KDQpgYGB7cn0NCmFsYnVtX3RvcGljcyA8LSB0aWR5KGFsYnVtc19sZGEsIG1hdHJpeCA9ICJiZXRhIikNCmFsYnVtX3RvcGljcw0KYGBgDQoNCg0KYGBge3J9DQp0b3BfdGVybXMgPC0gYWxidW1fdG9waWNzICU+JQ0KICBncm91cF9ieSh0b3BpYykgJT4lDQogIHRvcF9uKDUsIGJldGEpICU+JQ0KICB1bmdyb3VwKCkgJT4lDQogIGFycmFuZ2UodG9waWMsIC1iZXRhKQ0KDQp0b3BfdGVybXMNCmBgYA0KDQoNCmBgYHtyfQ0KbGlicmFyeShnZ3Bsb3QyKQ0KDQp0b3BfdGVybXMgJT4lDQogIG11dGF0ZSh0ZXJtID0gcmVvcmRlcih0ZXJtLCBiZXRhKSkgJT4lDQogIGdncGxvdChhZXModGVybSwgYmV0YSwgZmlsbCA9IGZhY3Rvcih0b3BpYykpKSArDQogIGdlb21fY29sKHNob3cubGVnZW5kID0gRkFMU0UpICsNCiAgZmFjZXRfd3JhcCh+IHRvcGljLCBzY2FsZXMgPSAiZnJlZSIpICsNCiAgY29vcmRfZmxpcCgpDQpgYGANCg0KDQoNCiMjIyM2LjIuMiBQZXItZG9jdW1lbnQgY2xhc3NpZmljYXRpb24NCg0KYGBge3J9DQphbGJ1bXNfZ2FtbWEgPC0gdGlkeShhbGJ1bXNfbGRhLCBtYXRyaXggPSAiZ2FtbWEiKQ0KYWxidW1zX2dhbW1hDQpgYGANCg0KYGBge3J9DQpuYW1lcyhhbGJ1bXNfZ2FtbWEpWzFdPSdhbGJ1bScNCm5hbWVzKGFsYnVtc19nYW1tYSkNCmBgYA0KDQoNCmBgYHtyfQ0KIyByZW9yZGVyIGFsYnVtcyBpbiBvcmRlciBvZiB0b3BpYyAxLCB0b3BpYyAyLCBldGMgYmVmb3JlIHBsb3R0aW5nDQphbGJ1bXNfZ2FtbWEgJT4lDQogIG11dGF0ZShhbGJ1bSA9IHJlb3JkZXIoYWxidW0sIGdhbW1hICogdG9waWMpKSAlPiUNCiAgZ2dwbG90KGFlcyhmYWN0b3IodG9waWMpLCBnYW1tYSkpICsNCiAgZ2VvbV9ib3hwbG90KCkgKw0KICBmYWNldF93cmFwKH4gYWxidW0pDQpgYGANCg0KDQoNCiMjIzYuMyBBbHRlcm5hdGl2ZSBMREEgaW1wbGVtZW50YXRpb25zDQoNCmBgYHtyfQ0KI2luc3RhbGwucGFja2FnZXMoJ21hbGxldCcpDQpsaWJyYXJ5KG1hbGxldCkNCg0KIyBjcmVhdGUgYSB2ZWN0b3Igd2l0aCBvbmUgc3RyaW5nIHBlciBjaGFwdGVyDQpjb2xsYXBzZWQgPC0gYnlfYWxidW1fd29yZCAlPiUNCiAgYW50aV9qb2luKHN0b3Bfd29yZHMsIGJ5ID0gIndvcmQiKSAlPiUNCiAgbXV0YXRlKHdvcmQgPSBzdHJfcmVwbGFjZSh3b3JkLCAiJyIsICIiKSkgJT4lDQogIGdyb3VwX2J5KGRvY3VtZW50KSAlPiUNCiAgc3VtbWFyaXplKHRleHQgPSBwYXN0ZSh3b3JkLCBjb2xsYXBzZSA9ICIgIikpDQoNCiMgY3JlYXRlIGFuIGVtcHR5IGZpbGUgb2YgInN0b3B3b3JkcyINCmZpbGUuY3JlYXRlKGVtcHR5X2ZpbGUgPC0gdGVtcGZpbGUoKSkNCmRvY3MgPC0gbWFsbGV0LmltcG9ydChjb2xsYXBzZWQkZG9jdW1lbnQsIGNvbGxhcHNlZCR0ZXh0LCBlbXB0eV9maWxlKQ0KDQptYWxsZXRfbW9kZWwgPC0gTWFsbGV0TERBKG51bS50b3BpY3MgPSA0KQ0KbWFsbGV0X21vZGVsJGxvYWREb2N1bWVudHMoZG9jcykNCm1hbGxldF9tb2RlbCR0cmFpbigxMDApDQpgYGANCg0KDQpgYGB7cn0NCiMgd29yZC10b3BpYyBwYWlycw0KdGlkeShtYWxsZXRfbW9kZWwpDQoNCiMgZG9jdW1lbnQtdG9waWMgcGFpcnMNCnRpZHkobWFsbGV0X21vZGVsLCBtYXRyaXggPSAiZ2FtbWEiKQ0KDQojIGNvbHVtbiBuZWVkcyB0byBiZSBuYW1lZCAidGVybSIgZm9yICJhdWdtZW50Ig0KdGVybV9jb3VudHMgPC0gcmVuYW1lKHdvcmRfY291bnRzLCB0ZXJtID0gd29yZCkNCmF1Z21lbnQobWFsbGV0X21vZGVsLCB0ZXJtX2NvdW50cykNCmBgYA0KDQoNCg0KDQoNCg0KDQoNCg0KDQoNCg0KPHN0eWxlPg0KDQplbSB7DQogICAgY29sb3I6ICNGRkVBNkM7DQogICAgYmFja2dyb3VuZDogIzdEN0Q3RDsNCn0NCg0KLmNhcHRpb24gew0KICBjb2xvcjogIzc3NzsNCiAgbWFyZ2luLXRvcDogMTBweDsNCn0NCnAgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcHJlIHsNCiAgd29yZC1icmVhazogbm9ybWFsOw0KICB3b3JkLXdyYXA6IG5vcm1hbDsNCiAgbGluZS1oZWlnaHQ6IDE7DQp9DQpwcmUgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcCxsaSB7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoucnsNCiAgbGluZS1oZWlnaHQ6IDEuMjsNCn0NCg0KLnFpeiB7DQogIGxpbmUtaGVpZ2h0OiAxLjc1Ow0KICBiYWNrZ3JvdW5kOiAjZjBmMGYwOw0KICBib3JkZXItbGVmdDogMTJweCBzb2xpZCAjY2NmZmNjOw0KICBwYWRkaW5nOiA0cHg7DQogIHBhZGRpbmctbGVmdDogMTBweDsNCiAgY29sb3I6ICMwMDk5MDA7DQp9DQoNCnRpdGxlew0KICBjb2xvcjogI2NjMDAwMDsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCmJvZHl7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQpoMSxoMixoMyxoNCxoNXsNCiAgY29sb3I6ICMwMDY2ZmY7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoNCmgzew0KICBjb2xvcjogI2IzNmIwMDsNCiAgYmFja2dyb3VuZDogI2ZmZTBiMzsNCiAgbGluZS1oZWlnaHQ6IDI7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQpoNXsNCiAgY29sb3I6ICMwMDYwMDA7DQogIGJhY2tncm91bmQ6ICNmOGY4Zjg7DQogIGxpbmUtaGVpZ2h0OiAxLjU7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQpoNiB7DQogICAgY29sb3I6ICMwMDYwMDA7DQogICAgYmFja2dyb3VuZDogIzAwZmZmZjsNCiAgICBsaW5lLWhlaWdodDogMjsNCiAgICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KPC9zdHlsZT4NCg==