library(dplyr)
library(data.table)
library(ggplot2)

5.1 Tidying a document-term matrix

5.1.1 Tidying DocumentTermMatrix objects

library(tm)
data("AssociatedPress", package = "topicmodels")
AssociatedPress
<<DocumentTermMatrix (documents: 2246, terms: 10473)>>
Non-/sparse entries: 302031/23220327
Sparsity           : 99%
Maximal term length: 18
Weighting          : term frequency (tf)
terms <- Terms(AssociatedPress)
head(terms)
[1] "aaron"      "abandon"    "abandoned"  "abandoning" "abbott"     "abboud"    
library(ggplot2)
ap_sentiments %>%
  count(sentiment, term, wt = count) %>%
  ungroup() %>%
  filter(n >= 200) %>%
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  ylab("Contribution to sentiment") +
  coord_flip()

5.1.2 Tidying dfm objects

data("data_corpus_inaugural", package = "quanteda")
inaug_dfm <- quanteda::dfm(data_corpus_inaugural, verbose = FALSE)
inaug_dfm
Document-feature matrix of: 58 documents, 9,357 features (91.8% sparse).
inaug_td <- tidy(inaug_dfm)
inaug_td
inaug_tf_idf <- inaug_td %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
inaug_tf_idf
library(tidyr)
year_term_counts <- inaug_td %>%
  extract(document, "year", "(\\d+)", convert = TRUE) %>%
  complete(year, term, fill = list(count = 0)) %>%
  group_by(year) %>%
  mutate(year_total = sum(count))
year_term_counts %>%
  filter(term %in% c("god", "america", "foreign", "union", "constitution", "freedom")) %>%
  ggplot(aes(year, count / year_total)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~ term, scales = "free_y") +
  scale_y_continuous(labels = scales::percent_format()) +
  ylab("% frequency of word in inaugural address")

5.2 Casting tidy text data into a matrix

lyric_data=fread('../data/prince_raw_data.csv')
lyric_data %>% 
  group_by(album) %>%
  summarise(
    count=n()
  ) %>%
  as.data.frame %>%
  .[order(.$count,decreasing=T),]
albums=c('Other Songs','Crystal Ball','Emancipation','Rave Un2 the Joy Fantastic')
library(stringr)
# divide into documents, each representing one chapter
by_album <- lyric_data[,1:6] %>%
  filter(album %in% albums) %>%
  group_by(album) %>%
  mutate(linenumber=row_number()) %>%
  ungroup() %>%
  unite(document, album)
# split into words
by_album_word <- by_album %>%
  unnest_tokens(word, text)
# find document-word counts
word_counts <- by_album_word %>%
  anti_join(stop_words) %>%
  count(document, word, sort = TRUE) %>%
  ungroup()
Joining, by = "word"
word_counts
word_counts %>%
 cast_dtm(document, word, n)
Trying to compute distinct() for variables not found in the data:
- `row_col`, `column_col`
This is an error, but only a warning is raised for compatibility reasons.
The operation will return the input unchanged.
<<DocumentTermMatrix (documents: 4, terms: 3503)>>
Non-/sparse entries: 5081/8931
Sparsity           : 64%
Maximal term length: 17
Weighting          : term frequency (tf)
word_counts %>%
  cast_dfm(document,word,n)
Trying to compute distinct() for variables not found in the data:
- `row_col`, `column_col`
This is an error, but only a warning is raised for compatibility reasons.
The operation will return the input unchanged.
Document-feature matrix of: 4 documents, 3,503 features (63.7% sparse).
library(Matrix)
package 愼㸱愼㸵Matrix愼㸱愼㸶 was built under R version 3.4.4
Attaching package: 愼㸱愼㸵Matrix愼㸱愼㸶

The following object is masked from 愼㸱愼㸵package:tidyr愼㸱愼㸶:

    expand
# cast into a Matrix object
m <- word_counts %>%
  cast_sparse(document, word, n)
Trying to compute distinct() for variables not found in the data:
- `row_col`, `column_col`
This is an error, but only a warning is raised for compatibility reasons.
The operation will return the input unchanged.
class(m)
[1] "dgCMatrix"
attr(,"package")
[1] "Matrix"
dim(m)
[1]    4 3503

5.3 Tidying corpus objects with metadata

data("acq")
acq
<<VCorpus>>
Metadata:  corpus specific: 0, document level (indexed): 0
Content:  documents: 50
#first document
acq[[1]]
<<PlainTextDocument>>
Metadata:  15
Content:  chars: 1287
acq_td <- tidy(acq)
acq_td
acq_tokens <- acq_td %>%
  select(-places) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
# most common words
acq_tokens %>%
  count(word, sort = TRUE)
# tf-idf
acq_tokens %>%
  count(id, word) %>%
  bind_tf_idf(word, id, n) %>%
  arrange(desc(tf_idf))
LS0tDQp0aXRsZTogIkNvbnZlcnRpbmcgdG8gYW5kIGZyb20gbm9uLXRpZHkgZm9ybWF0cyINCmF1dGhvcjogJ+WKieiCsumKmCcNCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCg0KYGBge3J9DQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeShkYXRhLnRhYmxlKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KYGBgDQoNCg0KIyMjNS4xIFRpZHlpbmcgYSBkb2N1bWVudC10ZXJtIG1hdHJpeA0KDQoNCiMjIyM1LjEuMSBUaWR5aW5nIERvY3VtZW50VGVybU1hdHJpeCBvYmplY3RzDQoNCg0KYGBge3J9DQpsaWJyYXJ5KHRtKQ0KDQpkYXRhKCJBc3NvY2lhdGVkUHJlc3MiLCBwYWNrYWdlID0gInRvcGljbW9kZWxzIikNCkFzc29jaWF0ZWRQcmVzcw0KYGBgDQoNCg0KDQoNCmBgYHtyfQ0KdGVybXMgPC0gVGVybXMoQXNzb2NpYXRlZFByZXNzKQ0KaGVhZCh0ZXJtcykNCmBgYA0KDQoNCg0KYGBge3J9DQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeSh0aWR5dGV4dCkNCg0KYXBfdGQgPC0gdGlkeShBc3NvY2lhdGVkUHJlc3MpDQphcF90ZA0KYGBgDQoNCg0KYGBge3J9DQphcF9zZW50aW1lbnRzIDwtIGFwX3RkICU+JQ0KICBpbm5lcl9qb2luKGdldF9zZW50aW1lbnRzKCJiaW5nIiksIGJ5ID0gYyh0ZXJtID0gIndvcmQiKSkNCg0KYXBfc2VudGltZW50cw0KYGBgDQoNCg0KYGBge3J9DQpsaWJyYXJ5KGdncGxvdDIpDQoNCmFwX3NlbnRpbWVudHMgJT4lDQogIGNvdW50KHNlbnRpbWVudCwgdGVybSwgd3QgPSBjb3VudCkgJT4lDQogIHVuZ3JvdXAoKSAlPiUNCiAgZmlsdGVyKG4gPj0gMjAwKSAlPiUNCiAgbXV0YXRlKG4gPSBpZmVsc2Uoc2VudGltZW50ID09ICJuZWdhdGl2ZSIsIC1uLCBuKSkgJT4lDQogIG11dGF0ZSh0ZXJtID0gcmVvcmRlcih0ZXJtLCBuKSkgJT4lDQogIGdncGxvdChhZXModGVybSwgbiwgZmlsbCA9IHNlbnRpbWVudCkpICsNCiAgZ2VvbV9iYXIoc3RhdCA9ICJpZGVudGl0eSIpICsNCiAgeWxhYigiQ29udHJpYnV0aW9uIHRvIHNlbnRpbWVudCIpICsNCiAgY29vcmRfZmxpcCgpDQpgYGANCg0KIyMjIzUuMS4yIFRpZHlpbmcgZGZtIG9iamVjdHMNCg0KYGBge3J9DQpkYXRhKCJkYXRhX2NvcnB1c19pbmF1Z3VyYWwiLCBwYWNrYWdlID0gInF1YW50ZWRhIikNCmluYXVnX2RmbSA8LSBxdWFudGVkYTo6ZGZtKGRhdGFfY29ycHVzX2luYXVndXJhbCwgdmVyYm9zZSA9IEZBTFNFKQ0KaW5hdWdfZGZtDQpgYGANCg0KYGBge3J9DQppbmF1Z190ZCA8LSB0aWR5KGluYXVnX2RmbSkNCmluYXVnX3RkDQpgYGANCg0KYGBge3J9DQppbmF1Z190Zl9pZGYgPC0gaW5hdWdfdGQgJT4lDQogIGJpbmRfdGZfaWRmKHRlcm0sIGRvY3VtZW50LCBjb3VudCkgJT4lDQogIGFycmFuZ2UoZGVzYyh0Zl9pZGYpKQ0KDQppbmF1Z190Zl9pZGYNCmBgYA0KDQpgYGB7cn0NCmxpYnJhcnkodGlkeXIpDQoNCnllYXJfdGVybV9jb3VudHMgPC0gaW5hdWdfdGQgJT4lDQogIGV4dHJhY3QoZG9jdW1lbnQsICJ5ZWFyIiwgIihcXGQrKSIsIGNvbnZlcnQgPSBUUlVFKSAlPiUNCiAgY29tcGxldGUoeWVhciwgdGVybSwgZmlsbCA9IGxpc3QoY291bnQgPSAwKSkgJT4lDQogIGdyb3VwX2J5KHllYXIpICU+JQ0KICBtdXRhdGUoeWVhcl90b3RhbCA9IHN1bShjb3VudCkpDQpgYGANCg0KDQpgYGB7cn0NCnllYXJfdGVybV9jb3VudHMgJT4lDQogIGZpbHRlcih0ZXJtICVpbiUgYygiZ29kIiwgImFtZXJpY2EiLCAiZm9yZWlnbiIsICJ1bmlvbiIsICJjb25zdGl0dXRpb24iLCAiZnJlZWRvbSIpKSAlPiUNCiAgZ2dwbG90KGFlcyh5ZWFyLCBjb3VudCAvIHllYXJfdG90YWwpKSArDQogIGdlb21fcG9pbnQoKSArDQogIGdlb21fc21vb3RoKCkgKw0KICBmYWNldF93cmFwKH4gdGVybSwgc2NhbGVzID0gImZyZWVfeSIpICsNCiAgc2NhbGVfeV9jb250aW51b3VzKGxhYmVscyA9IHNjYWxlczo6cGVyY2VudF9mb3JtYXQoKSkgKw0KICB5bGFiKCIlIGZyZXF1ZW5jeSBvZiB3b3JkIGluIGluYXVndXJhbCBhZGRyZXNzIikNCmBgYA0KDQoNCg0KIyMjNS4yIENhc3RpbmcgdGlkeSB0ZXh0IGRhdGEgaW50byBhIG1hdHJpeA0KDQoNCmBgYHtyfQ0KbHlyaWNfZGF0YT1mcmVhZCgnLi4vZGF0YS9wcmluY2VfcmF3X2RhdGEuY3N2JykNCmBgYA0KDQoNCg0KYGBge3J9DQpseXJpY19kYXRhICU+JSANCiAgZ3JvdXBfYnkoYWxidW0pICU+JQ0KICBzdW1tYXJpc2UoDQogICAgY291bnQ9bigpDQogICkgJT4lDQogIGFzLmRhdGEuZnJhbWUgJT4lDQogIC5bb3JkZXIoLiRjb3VudCxkZWNyZWFzaW5nPVQpLF0NCg0KYGBgDQoNCg0KYGBge3J9DQphbGJ1bXM9YygnT3RoZXIgU29uZ3MnLCdDcnlzdGFsIEJhbGwnLCdFbWFuY2lwYXRpb24nLCdSYXZlIFVuMiB0aGUgSm95IEZhbnRhc3RpYycpDQpgYGANCg0KDQoNCmBgYHtyfQ0KbGlicmFyeShzdHJpbmdyKQ0KDQojIGRpdmlkZSBpbnRvIGRvY3VtZW50cywgZWFjaCByZXByZXNlbnRpbmcgb25lIGNoYXB0ZXINCmJ5X2FsYnVtIDwtIGx5cmljX2RhdGFbLDE6Nl0gJT4lDQogIGZpbHRlcihhbGJ1bSAlaW4lIGFsYnVtcykgJT4lDQogIGdyb3VwX2J5KGFsYnVtKSAlPiUNCiAgbXV0YXRlKGxpbmVudW1iZXI9cm93X251bWJlcigpKSAlPiUNCiAgdW5ncm91cCgpICU+JQ0KICB1bml0ZShkb2N1bWVudCwgYWxidW0pDQoNCiMgc3BsaXQgaW50byB3b3Jkcw0KYnlfYWxidW1fd29yZCA8LSBieV9hbGJ1bSAlPiUNCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KQ0KDQojIGZpbmQgZG9jdW1lbnQtd29yZCBjb3VudHMNCndvcmRfY291bnRzIDwtIGJ5X2FsYnVtX3dvcmQgJT4lDQogIGFudGlfam9pbihzdG9wX3dvcmRzKSAlPiUNCiAgY291bnQoZG9jdW1lbnQsIHdvcmQsIHNvcnQgPSBUUlVFKSAlPiUNCiAgdW5ncm91cCgpDQoNCndvcmRfY291bnRzDQpgYGANCg0KDQpgYGB7cn0NCndvcmRfY291bnRzICU+JQ0KIGNhc3RfZHRtKGRvY3VtZW50LCB3b3JkLCBuKQ0KDQoNCmBgYA0KDQoNCmBgYHtyfQ0KDQp3b3JkX2NvdW50cyAlPiUNCiAgY2FzdF9kZm0oZG9jdW1lbnQsd29yZCxuKQ0KDQpgYGANCg0KYGBge3J9DQpsaWJyYXJ5KE1hdHJpeCkNCg0KIyBjYXN0IGludG8gYSBNYXRyaXggb2JqZWN0DQptIDwtIHdvcmRfY291bnRzICU+JQ0KICBjYXN0X3NwYXJzZShkb2N1bWVudCwgd29yZCwgbikNCg0KY2xhc3MobSkNCmBgYA0KDQpgYGB7cn0NCmRpbShtKQ0KYGBgDQoNCg0KDQojIyM1LjMgVGlkeWluZyBjb3JwdXMgb2JqZWN0cyB3aXRoIG1ldGFkYXRhDQoNCmBgYHtyfQ0KZGF0YSgiYWNxIikNCmFjcQ0KYGBgDQoNCg0KDQoNCmBgYHtyfQ0KI2ZpcnN0IGRvY3VtZW50DQphY3FbWzFdXQ0KDQoNCmBgYA0KDQoNCg0KYGBge3J9DQphY3FfdGQgPC0gdGlkeShhY3EpDQphY3FfdGQNCmBgYA0KDQoNCg0KYGBge3J9DQphY3FfdG9rZW5zIDwtIGFjcV90ZCAlPiUNCiAgc2VsZWN0KC1wbGFjZXMpICU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpICU+JQ0KICBhbnRpX2pvaW4oc3RvcF93b3JkcywgYnkgPSAid29yZCIpDQoNCiMgbW9zdCBjb21tb24gd29yZHMNCmFjcV90b2tlbnMgJT4lDQogIGNvdW50KHdvcmQsIHNvcnQgPSBUUlVFKQ0KYGBgDQoNCg0KYGBge3J9DQojIHRmLWlkZg0KYWNxX3Rva2VucyAlPiUNCiAgY291bnQoaWQsIHdvcmQpICU+JQ0KICBiaW5kX3RmX2lkZih3b3JkLCBpZCwgbikgJT4lDQogIGFycmFuZ2UoZGVzYyh0Zl9pZGYpKQ0KYGBgDQoNCg0KDQoNCg0KPHN0eWxlPg0KDQplbSB7DQogICAgY29sb3I6ICNGRkVBNkM7DQogICAgYmFja2dyb3VuZDogIzdEN0Q3RDsNCn0NCg0KLmNhcHRpb24gew0KICBjb2xvcjogIzc3NzsNCiAgbWFyZ2luLXRvcDogMTBweDsNCn0NCnAgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcHJlIHsNCiAgd29yZC1icmVhazogbm9ybWFsOw0KICB3b3JkLXdyYXA6IG5vcm1hbDsNCiAgbGluZS1oZWlnaHQ6IDE7DQp9DQpwcmUgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcCxsaSB7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoucnsNCiAgbGluZS1oZWlnaHQ6IDEuMjsNCn0NCg0KLnFpeiB7DQogIGxpbmUtaGVpZ2h0OiAxLjc1Ow0KICBiYWNrZ3JvdW5kOiAjZjBmMGYwOw0KICBib3JkZXItbGVmdDogMTJweCBzb2xpZCAjY2NmZmNjOw0KICBwYWRkaW5nOiA0cHg7DQogIHBhZGRpbmctbGVmdDogMTBweDsNCiAgY29sb3I6ICMwMDk5MDA7DQp9DQoNCnRpdGxlew0KICBjb2xvcjogI2NjMDAwMDsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCmJvZHl7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQpoMSxoMixoMyxoNCxoNXsNCiAgY29sb3I6ICMwMDY2ZmY7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoNCmgzew0KICBjb2xvcjogI2IzNmIwMDsNCiAgYmFja2dyb3VuZDogI2ZmZTBiMzsNCiAgbGluZS1oZWlnaHQ6IDI7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQpoNXsNCiAgY29sb3I6ICMwMDYwMDA7DQogIGJhY2tncm91bmQ6ICNmOGY4Zjg7DQogIGxpbmUtaGVpZ2h0OiAxLjU7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQpoNiB7DQogICAgY29sb3I6ICMwMDYwMDA7DQogICAgYmFja2dyb3VuZDogIzAwZmZmZjsNCiAgICBsaW5lLWhlaWdodDogMjsNCiAgICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KPC9zdHlsZT4NCg==