library(data.table)
library(data.table)
library(dplyr) #data manipulation
library(ggplot2) #visualizations
library(gridExtra) #viewing multiple plots together
library(tidytext) #text mining
#library(wordcloud2) #creative visualizations
load data
lyric_data=fread('../data/prince_raw_data.csv')
3.1 Term frequency
library(dplyr)
library(janeaustenr)
library(tidytext)
lyric_words <- lyric_data[,1:6] %>%
unnest_tokens(word, text) %>%
count(album, word, sort = TRUE) %>%
ungroup()
total_words <- lyric_words %>%
group_by(album) %>%
summarize(total = sum(n))
lyric_words <- left_join(lyric_words, total_words)
Joining, by = "album"
lyric_words
library(ggplot2)
lyric_words_4albums<-lyric_words %>%
filter(.,album %in%
c('Other Songs',
'Emancipation',
'Crystal Ball',
'Crystal Ball'))
ggplot(lyric_words_4albums, aes(n/total, fill = album)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.0009) +
facet_wrap(~album, ncol = 2, scales = "free_y")

3.2 Zipf’s law
freq_by_rank <- lyric_words_4albums %>%
group_by(album) %>%
mutate(rank = row_number(),
`term frequency` = n/total)
freq_by_rank
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = album)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()

rank_subset <- freq_by_rank %>%
filter(rank < 500,
rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
Call:
lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
Coefficients:
(Intercept) log10(rank)
-0.4922 -1.1332
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color =album)) +
geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()

lyric_words_4albums <- lyric_words_4albums %>%
bind_tf_idf(word, album, n)
lyric_words_4albums
lyric_words_4albums %>%
select(-total) %>%
arrange(desc(tf_idf))
lyric_words_4albums %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(album) %>%
top_n(15) %>%
ungroup %>%
ggplot(aes(word, tf_idf, fill = album)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~album, ncol = 2, scales = "free") +
coord_flip()
Selecting by tf_idf

3.4 A corpus of physics texts
LS0tDQp0aXRsZTogIkNIMyBBbmFseXppbmcgd29yZCBhbmQgZG9jdW1lbnQgZnJlcXVlbmN5OiB0Zi1pZGYiDQphdXRob3I6ICflionogrLpipgnDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQoNCmBgYHtyfQ0KbGlicmFyeShkYXRhLnRhYmxlKQ0KbGlicmFyeShkYXRhLnRhYmxlKQ0KbGlicmFyeShkcGx5cikgI2RhdGEgbWFuaXB1bGF0aW9uDQpsaWJyYXJ5KGdncGxvdDIpICN2aXN1YWxpemF0aW9ucw0KbGlicmFyeShncmlkRXh0cmEpICN2aWV3aW5nIG11bHRpcGxlIHBsb3RzIHRvZ2V0aGVyDQpsaWJyYXJ5KHRpZHl0ZXh0KSAjdGV4dCBtaW5pbmcNCiNsaWJyYXJ5KHdvcmRjbG91ZDIpICNjcmVhdGl2ZSB2aXN1YWxpemF0aW9ucw0KYGBgDQoNCg0KIyMjIyNsb2FkIGRhdGENCg0KYGBge3J9DQpseXJpY19kYXRhPWZyZWFkKCcuLi9kYXRhL3ByaW5jZV9yYXdfZGF0YS5jc3YnKQ0KYGBgDQoNCg0KIyMjMy4xIFRlcm0gZnJlcXVlbmN5IA0KDQpgYGB7cn0NCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KGphbmVhdXN0ZW5yKQ0KbGlicmFyeSh0aWR5dGV4dCkNCg0KDQpseXJpY193b3JkcyA8LSBseXJpY19kYXRhWywxOjZdICU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpICU+JQ0KICBjb3VudChhbGJ1bSwgd29yZCwgc29ydCA9IFRSVUUpICU+JQ0KICB1bmdyb3VwKCkNCg0KdG90YWxfd29yZHMgPC0gbHlyaWNfd29yZHMgJT4lIA0KICBncm91cF9ieShhbGJ1bSkgJT4lIA0KICBzdW1tYXJpemUodG90YWwgPSBzdW0obikpDQoNCmx5cmljX3dvcmRzIDwtIGxlZnRfam9pbihseXJpY193b3JkcywgdG90YWxfd29yZHMpDQoNCmx5cmljX3dvcmRzDQoNCg0KDQpgYGANCg0KDQoNCmBgYHtyfQ0KbGlicmFyeShnZ3Bsb3QyKQ0KDQpseXJpY193b3Jkc180YWxidW1zPC1seXJpY193b3JkcyAlPiUNCiAgZmlsdGVyKC4sYWxidW0gJWluJSANCiAgICAgICAgICAgYygnT3RoZXIgU29uZ3MnLA0KICAgICAgICAgICAgICdFbWFuY2lwYXRpb24nLA0KICAgICAgICAgICAgICdDcnlzdGFsIEJhbGwnLA0KICAgICAgICAgICAgICdDcnlzdGFsIEJhbGwnKSkNCg0KZ2dwbG90KGx5cmljX3dvcmRzXzRhbGJ1bXMsIGFlcyhuL3RvdGFsLCBmaWxsID0gYWxidW0pKSArDQogIGdlb21faGlzdG9ncmFtKHNob3cubGVnZW5kID0gRkFMU0UpICsNCiAgeGxpbShOQSwgMC4wMDA5KSArDQogIGZhY2V0X3dyYXAofmFsYnVtLCBuY29sID0gMiwgc2NhbGVzID0gImZyZWVfeSIpDQpgYGANCg0KDQojIyMzLjIgWmlwZuKAmXMgbGF3DQoNCmBgYHtyfQ0KZnJlcV9ieV9yYW5rIDwtIGx5cmljX3dvcmRzXzRhbGJ1bXMgJT4lIA0KICBncm91cF9ieShhbGJ1bSkgJT4lIA0KICBtdXRhdGUocmFuayA9IHJvd19udW1iZXIoKSwgDQogICAgICAgICBgdGVybSBmcmVxdWVuY3lgID0gbi90b3RhbCkNCg0KZnJlcV9ieV9yYW5rDQpgYGANCg0KDQoNCmBgYHtyfQ0KZnJlcV9ieV9yYW5rICU+JSANCiAgZ2dwbG90KGFlcyhyYW5rLCBgdGVybSBmcmVxdWVuY3lgLCBjb2xvciA9IGFsYnVtKSkgKyANCiAgZ2VvbV9saW5lKHNpemUgPSAxLjEsIGFscGhhID0gMC44LCBzaG93LmxlZ2VuZCA9IEZBTFNFKSArIA0KICBzY2FsZV94X2xvZzEwKCkgKw0KICBzY2FsZV95X2xvZzEwKCkNCmBgYA0KDQoNCg0KYGBge3J9DQpyYW5rX3N1YnNldCA8LSBmcmVxX2J5X3JhbmsgJT4lIA0KICBmaWx0ZXIocmFuayA8IDUwMCwNCiAgICAgICAgIHJhbmsgPiAxMCkNCg0KbG0obG9nMTAoYHRlcm0gZnJlcXVlbmN5YCkgfiBsb2cxMChyYW5rKSwgZGF0YSA9IHJhbmtfc3Vic2V0KQ0KYGBgDQoNCg0KYGBge3J9DQpmcmVxX2J5X3JhbmsgJT4lIA0KICBnZ3Bsb3QoYWVzKHJhbmssIGB0ZXJtIGZyZXF1ZW5jeWAsIGNvbG9yID1hbGJ1bSkpICsgDQogIGdlb21fYWJsaW5lKGludGVyY2VwdCA9IC0wLjYyLCBzbG9wZSA9IC0xLjEsIGNvbG9yID0gImdyYXk1MCIsIGxpbmV0eXBlID0gMikgKw0KICBnZW9tX2xpbmUoc2l6ZSA9IDEuMSwgYWxwaGEgPSAwLjgsIHNob3cubGVnZW5kID0gRkFMU0UpICsgDQogIHNjYWxlX3hfbG9nMTAoKSArDQogIHNjYWxlX3lfbG9nMTAoKQ0KYGBgDQoNCg0KYGBge3J9DQpseXJpY193b3Jkc180YWxidW1zIDwtIGx5cmljX3dvcmRzXzRhbGJ1bXMgJT4lDQogIGJpbmRfdGZfaWRmKHdvcmQsIGFsYnVtLCBuKQ0KbHlyaWNfd29yZHNfNGFsYnVtcw0KYGBgDQoNCg0KYGBge3J9DQpseXJpY193b3Jkc180YWxidW1zICU+JQ0KICBzZWxlY3QoLXRvdGFsKSAlPiUNCiAgYXJyYW5nZShkZXNjKHRmX2lkZikpDQpgYGANCg0KDQoNCmBgYHtyfQ0KbHlyaWNfd29yZHNfNGFsYnVtcyAlPiUNCiAgYXJyYW5nZShkZXNjKHRmX2lkZikpICU+JQ0KICBtdXRhdGUod29yZCA9IGZhY3Rvcih3b3JkLCBsZXZlbHMgPSByZXYodW5pcXVlKHdvcmQpKSkpICU+JSANCiAgZ3JvdXBfYnkoYWxidW0pICU+JSANCiAgdG9wX24oMTUpICU+JSANCiAgdW5ncm91cCAlPiUNCiAgZ2dwbG90KGFlcyh3b3JkLCB0Zl9pZGYsIGZpbGwgPSBhbGJ1bSkpICsNCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKw0KICBsYWJzKHggPSBOVUxMLCB5ID0gInRmLWlkZiIpICsNCiAgZmFjZXRfd3JhcCh+YWxidW0sIG5jb2wgPSAyLCBzY2FsZXMgPSAiZnJlZSIpICsNCiAgY29vcmRfZmxpcCgpDQpgYGANCg0KDQoNCiMjIyMzLjQgQSBjb3JwdXMgb2YgcGh5c2ljcyB0ZXh0cw0KDQpgYGB7cn0NCg0KYGBgDQoNCg0KPHN0eWxlPg0KDQplbSB7DQogICAgY29sb3I6ICNGRkVBNkM7DQogICAgYmFja2dyb3VuZDogIzdEN0Q3RDsNCn0NCg0KLmNhcHRpb24gew0KICBjb2xvcjogIzc3NzsNCiAgbWFyZ2luLXRvcDogMTBweDsNCn0NCnAgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcHJlIHsNCiAgd29yZC1icmVhazogbm9ybWFsOw0KICB3b3JkLXdyYXA6IG5vcm1hbDsNCiAgbGluZS1oZWlnaHQ6IDE7DQp9DQpwcmUgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcCxsaSB7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoucnsNCiAgbGluZS1oZWlnaHQ6IDEuMjsNCn0NCg0KLnFpeiB7DQogIGxpbmUtaGVpZ2h0OiAxLjc1Ow0KICBiYWNrZ3JvdW5kOiAjZjBmMGYwOw0KICBib3JkZXItbGVmdDogMTJweCBzb2xpZCAjY2NmZmNjOw0KICBwYWRkaW5nOiA0cHg7DQogIHBhZGRpbmctbGVmdDogMTBweDsNCiAgY29sb3I6ICMwMDk5MDA7DQp9DQoNCnRpdGxlew0KICBjb2xvcjogI2NjMDAwMDsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCmJvZHl7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQpoMSxoMixoMyxoNCxoNXsNCiAgY29sb3I6ICMwMDY2ZmY7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoNCmgzew0KICBjb2xvcjogI2IzNmIwMDsNCiAgYmFja2dyb3VuZDogI2ZmZTBiMzsNCiAgbGluZS1oZWlnaHQ6IDI7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQpoNXsNCiAgY29sb3I6ICMwMDYwMDA7DQogIGJhY2tncm91bmQ6ICNmOGY4Zjg7DQogIGxpbmUtaGVpZ2h0OiAxLjU7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQpoNiB7DQogICAgY29sb3I6ICMwMDYwMDA7DQogICAgYmFja2dyb3VuZDogIzAwZmZmZjsNCiAgICBsaW5lLWhlaWdodDogMjsNCiAgICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KPC9zdHlsZT4NCg==