library(tidyverse)
library(tidytext)
library(wordcloud2)
# Read the lyrics as plain text, one row per line. The file is free text, not
# CSV: read_csv() treats commas and quote characters inside a lyric line as
# CSV syntax, which is the most likely cause of its "parsing issues" warning.
# NOTE(review): assumes the first line of the file is the header "lyrics"
# (read_csv() reported that as the column name) -- confirm before merging.
# Blank lines are skipped, as read_csv() did.
marina <- tibble(
  lyrics = read_lines("matd.txt", skip = 1, skip_empty_rows = TRUE)
)
Warning: One or more parsing issues, see `problems()` for details
Rows: 658 Columns: 1
-- Column specification --------------------------------------------------------------------------------------------------------------
Delimiter: ","
chr (1): lyrics
i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
- Here I chose the album “Froot” by Marina and the Diamonds and unnested the lyrics into individual words.
# Split every lyric line into individual words (one word per row) and keep
# only the resulting `word` column.
marina_words <- select(
  unnest_tokens(marina, output = word, input = lyrics),
  word
)

marina_words
There are 10 songs on the album, each over 3 minutes long, so there are lots of words here.
- Here I removed the stop words, and then created a table and word cloud of the word counts.
# Word cloud of the 100 most frequent words after removing stop words.
# Words tied at the cut-off are all kept, so slightly more than 100 may show.
marina_words %>%
  # Name the join column explicitly rather than relying on the guessed key.
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n() and states the ranking column.
  slice_max(order_by = n, n = 100) %>%
  wordcloud2(size = 0.5)
Joining, by = "word"
Selecting by n
“Yeah” is used the most at 56 times.
- Here is a sentiment analysis using bing and a graph of the words that contribute most to each sentiment.
# Fetch the Bing sentiment lexicon and show it.
bing <- get_sentiments(lexicon = "bing")

bing
# Bar chart of the ten words (ties kept) that contribute most to each Bing
# sentiment, drawn as one facet per sentiment.
marina_words %>%
  # Name the join column explicitly rather than relying on the guessed key.
  inner_join(bing, by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  # slice_max() replaces the superseded top_n() and states the ranking column.
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  # Order the bars by frequency instead of alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free") +
  labs(
    y = "Marina's Froot album: Words that contribute the most to each sentiment ",
    x = NULL
  ) +
  scale_fill_viridis_d() +
  # Flip the axes so the words read horizontally.
  coord_flip() +
  theme_minimal()
Joining, by = "word"
Selecting by n

By the scales at the bottom you can see that there are many more positive words than negative words.
- Here is a table and word cloud of the most common bigrams.
# Tokenise the lyrics into bigrams (pairs of consecutive words). This object
# was used below without ever being created in the notebook. Lines with fewer
# than two words yield a missing bigram; drop those so they are not later
# pasted back together as the literal text "NA NA".
marina_bigrams <- marina %>%
  unnest_tokens(bigram, lyrics, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram))

# Table of bigrams in which neither word is a stop word, most frequent first.
marina_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  ) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)
NA
# Word cloud of the bigrams that occur more than once and contain no stop
# words.
marina_bigrams %>%
  # Lines too short to form a bigram produce missing values; without this
  # filter they survive the stop-word checks and are united into "NA NA".
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  ) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE) %>%
  filter(n > 1) %>%
  wordcloud2(size = 0.5)
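I find it interesting that there are many bigrams used 6 times. Also, “NA NA” is a funny bigram; note that any such entry is most likely not a real lyric but an artifact of lines too short to form a bigram, whose missing values get pasted together as text.
- Here I used the bigram method to find the most common words that come after the words “I” and “you”.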
# For each of the lead words "i" and "you", plot the five words (ties kept)
# that most often follow it, one facet per lead word.
first_word <- c("i", "you")

marina_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(word1 %in% first_word) %>%
  # A single count over both words replaces the earlier count-then-recount.
  count(word1, word2, sort = TRUE) %>%
  # Fix the bar order by descending frequency; the levels are reversed
  # because the axes are flipped below.
  mutate(word2 = factor(word2, levels = rev(unique(word2)))) %>%
  group_by(word1) %>%
  # slice_max() replaces the superseded top_n() and states the ranking column.
  slice_max(order_by = n, n = 5) %>%
  ungroup() %>%
  ggplot(aes(word2, n, fill = word1)) +
  scale_fill_viridis_d() +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL, title = "Word following:") +
  facet_wrap(~word1, scales = "free") +
  coord_flip() +
  theme_minimal()
Selecting by n

I find it interesting that both “can” and “can’t” follow “you”.
LS0tDQp0aXRsZTogIlRleHQgQW5hbHlzaXMiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeSh0aWR5dGV4dCkNCmxpYnJhcnkod29yZGNsb3VkMikNCmBgYA0KDQoNCg0KYGBge3J9DQptYXJpbmEgPC0gcmVhZF9jc3YoIm1hdGQudHh0IikNCmBgYA0KDQoxLiBIZXJlIEkgY2hvc2UgdGhlIGFsYnVtICdGcm9vdCIgYnkgTWFyaW5hIGFuZCB0aGUgZGlhbW9uZHMgYW5kIHVubmVzdGVkIHRoZSBseXJpY3MuIA0KYGBge3J9DQptYXJpbmFfd29yZHMgPC0gbWFyaW5hICU+JQ0KICB1bm5lc3RfdG9rZW5zKHdvcmQsIGx5cmljcykgJT4lIA0KICBzZWxlY3Qod29yZCkNCg0KbWFyaW5hX3dvcmRzDQpgYGANClRoZXJlIGFyZSAxMCBzb25ncyBlYWNoIG9mIG92ZXIgMyBtaW51dGVzIGxvbmcgb24gdGhlIGFsYnVtIHNvIHRoZXJlIGFyZSBsb3RzIG9mIHdvcmRzIGhlcmUuDQoNCg0KDQoyLiBIZXJlIEkgcmVtb3ZlZCB0aGUgc3RvcHdvcmRzLCBhbmQgdGhlbiBjcmVhdGVkIGEgdGFibGUgYW5kIHdvcmQgY2xvdWQgd2l0aCB0aGUgd29yZHMgY291bnRzLiANCg0KYGBge3J9DQptYXJpbmFfd29yZHMgJT4lIA0KICBhbnRpX2pvaW4oc3RvcF93b3JkcykgJT4lIA0KICBjb3VudCh3b3JkLCBzb3J0ID0gVCkgJT4lDQogIHRvcF9uKDEwMCkgJT4lDQogIHdvcmRjbG91ZDIoc2l6ZSA9IC41KQ0KDQpgYGANCiJZZWFoIiBpcyB1c2VkIHRoZSBtb3N0IGF0IDU2IHRpbWVzLg0KDQoNCjMuIEhlcmUgaXMgYSBzZW50aW1lbnQgYW5hbHlzaXMgdXNpbmcgYmluZyBhbmQgYSBncmFwaCBvZiB0aGUgd29yZHMgdGhhdCBjb250cmlidXRlIG1vc3QgdG8gZWFjaCBzZW50aW1lbnQuIA0KYGBge3J9DQpiaW5nIDwtIGdldF9zZW50aW1lbnRzKCJiaW5nIikNCmJpbmcNCmBgYA0KDQpgYGB7cn0NCm1hcmluYV93b3JkcyAlPiUgDQogIGlubmVyX2pvaW4oYmluZykgJT4lIA0KICBjb3VudCh3b3JkLCBzZW50aW1lbnQsIHNvcnQgPSBUUlVFKSAlPiUNCiAgZ3JvdXBfYnkoc2VudGltZW50KSAlPiUNCiAgdG9wX24oMTApICU+JQ0KICB1bmdyb3VwKCkgJT4lDQogIG11dGF0ZSh3b3JkID0gcmVvcmRlcih3b3JkLCBuKSkgJT4lDQogIGdncGxvdChhZXMod29yZCwgbiwgZmlsbCA9IHNlbnRpbWVudCkpICsNCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKw0KICBmYWNldF93cmFwKHZhcnMoc2VudGltZW50KSwgc2NhbGVzID0gImZyZWUiKSArDQogIGxhYnMoeSA9ICJNYXJpbmEncyBGcm9vdCBhbGJ1bTogV29yZHMgdGhhdCBjb250cmlidXRlIHRoZSBtb3N0IHRvIGVhY2ggc2VudGltZW50ICIsDQogICAgICAgeCA9IE5VTEwpICsNCiAgc2NhbGVfZmlsbF92aXJpZGlzX2QoKSArDQogIGNvb3JkX2ZsaXAoKSArDQogIHRoZW1lX21pbmltYWwoKQ0KYGBgDQpCeSB0aGUgc2NhbGVzIGF0IHRoZSBib3R0b20geW91IGNhbiBzZWUgdGhhdCB0aGVyZSBhcmUgbWFueSBtb3JlIHBvc2l0aXZlIHdv
cmRzIHRoYW4gbmVnYXRpdmUgd29yZHMuDQoNCjQuIEhlcmUgaXMgYSB0YWJsZSBhbmQgd29yZCBjbG91ZCBvZiB0aGUgbW9zdCBjb21tb24gYmlncmFtcy4gIA0KDQpgYGB7cn0NCm1hcmluYV9iaWdyYW1zICU+JQ0KICAgIHNlcGFyYXRlKGJpZ3JhbSwgYygid29yZDEiLCAid29yZDIiKSwgc2VwID0gIiAiKSAlPiUgDQogIGZpbHRlcighd29yZDEgJWluJSBzdG9wX3dvcmRzJHdvcmQpICU+JQ0KICBmaWx0ZXIoIXdvcmQyICVpbiUgc3RvcF93b3JkcyR3b3JkKSAlPiUgDQogIHVuaXRlKGJpZ3JhbSwgd29yZDEsIHdvcmQyLCBzZXAgPSAiICIpICU+JQ0KICBjb3VudChiaWdyYW0sIHNvcnQgPSBUKQ0KDQpgYGANCg0KDQoNCmBgYHtyfQ0KbWFyaW5hX2JpZ3JhbXMgJT4lIA0KICBzZXBhcmF0ZShiaWdyYW0sIGMoIndvcmQxIiwgIndvcmQyIiksIHNlcCA9ICIgIikgJT4lIA0KICBmaWx0ZXIoIXdvcmQxICVpbiUgc3RvcF93b3JkcyR3b3JkKSAlPiUNCiAgZmlsdGVyKCF3b3JkMiAlaW4lIHN0b3Bfd29yZHMkd29yZCkgJT4lIA0KICB1bml0ZShiaWdyYW0sIHdvcmQxLCB3b3JkMiwgc2VwID0gIiAiKSAlPiUNCiAgY291bnQoYmlncmFtLCBzb3J0ID1UKSAlPiUNCiAgZmlsdGVyKG4gPiAxKSAlPiUgDQogIHdvcmRjbG91ZDIoc2l6ZSA9IC41KQ0KYGBgDQpJIGZpbmQgaXQgaW50ZXJlc3RpbmcgdGhhdCB0aGVyZSBhcmUgbWFueSBiaWdyYW1zIHVzZWQgNiB0aW1lcy4gQWxzbyAiTkEgTkEiIGlzIGEgZnVubnkgYmlncmFtLg0KDQo1LiBIZXJlIEkgdXNlZCB0aGUgYmlncmFtIG1ldGhvZCB0byBmaW5kIHRoZSBtb3N0IGNvbW1vbiB3b3JkcyB0aGF0IGNvbWUgYWZ0ZXIgdGhlIHdvcmRzIEkgYW5kIHlvdQ0KYGBge3J9DQpmaXJzdF93b3JkIDwtIGMoImkiLCAieW91IikgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgDQoNCm1hcmluYV9iaWdyYW1zICU+JSANCiAgY291bnQoYmlncmFtLCBzb3J0ID0gVCkgJT4lIA0KICBzZXBhcmF0ZShiaWdyYW0sIGMoIndvcmQxIiwgIndvcmQyIiksIHNlcCA9ICIgIikgJT4lICAgICAgIA0KICBmaWx0ZXIod29yZDEgJWluJSBmaXJzdF93b3JkKSAlPiUgICAgICAgICAgICAgICAgICAgICAgICAgIA0KICBjb3VudCh3b3JkMSwgd29yZDIsIHd0ID0gbiwgc29ydCA9IFRSVUUpICU+JQ0KICBtdXRhdGUod29yZDIgPSBmYWN0b3Iod29yZDIsIGxldmVscyA9IHJldih1bmlxdWUod29yZDIpKSkpICU+JSAgICAgDQogIGdyb3VwX2J5KHdvcmQxKSAlPiUgDQogIHRvcF9uKDUpICU+JSANCiAgZ2dwbG90KGFlcyh3b3JkMiwgbiwgZmlsbCA9IHdvcmQxKSkgKyAgICAgICAgICAgICAgICAgICAgICAgICAgDQogIHNjYWxlX2ZpbGxfdmlyaWRpc19kKCkgKyAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICANCiAgZ2VvbV9jb2woc2hvdy5sZWdlbmQgPSBGQUxTRSkgKw0KICBsYWJzKHggPSBOVUxMLCB5ID0gTlVMTCwgdGl0bGUgPSAiV29yZCBmb2xsb3dpbmc6IikgKw0KICBmYWNl
dF93cmFwKH53b3JkMSwgc2NhbGVzID0gImZyZWUiKSArDQogIGNvb3JkX2ZsaXAoKSArDQogIHRoZW1lX21pbmltYWwoKQ0KDQpgYGANCkkgZmluZCBpdCBpbnRlcmVzdGluZyBib3RoIGNhbiBhbmQgY2FuJ3QgZm9sbG93ICJ5b3UiLg==