Load libraries
library(data.table)
library(dplyr) #data manipulation
library(ggplot2) #visualizations
library(gridExtra) #viewing multiple plots together
library(tidytext) #text mining
#library(wordcloud2) #creative visualizations
load data
# Read the raw lyrics CSV (path is relative to the working directory).
lyric_data <- fread("../data/prince_raw_data.csv")
1.1 Contrasting tidy text with other data structures
1.2 The unnest_tokens function
把以下4個句子轉成tidy data
# Four short lines of verse used as a toy corpus for the tokenising example.
text <- c(
  "Because I could not stop for Death -",
  "He kindly stopped for me -",
  "The Carriage held but just Ourselves -",
  "and Immortality"
)

text
[1] "Because I could not stop for Death -"
[2] "He kindly stopped for me -"
[3] "The Carriage held but just Ourselves -"
[4] "and Immortality"
轉成tidy data前先轉成dataframe
# Wrap the sample lines in a tibble, one row per line; `line` holds each
# line's position within `text`.
# tibble() (re-exported by dplyr) replaces the deprecated data_frame(), and
# seq_along() keeps `line` in step with `text` instead of hard-coding 1:4.
text_df <- tibble(line = seq_along(text), text = text)

text_df
library(tidytext)
1.3 Tidying the words of lyric data
library(janeaustenr)
library(dplyr)
library(stringr)
# Keep the first six columns of the raw table and add `linenumber`, a running
# row index that restarts for every album. The result is ungrouped so that
# later verbs operate on the whole table.
original_songs <- ungroup(
  mutate(
    group_by(lyric_data[, 1:6], album),
    linenumber = row_number()
  )
)

original_songs
one-token-per-row format:
library(tidytext)
# Split the `text` column into tokens, one row per token, stored in a new
# `word` column (unnest_tokens() tokenising defaults are used).
tidy_songs <- unnest_tokens(original_songs, output = word, input = text)

tidy_songs
# Load the stop-word table, then drop every token that has a match in it.
# The join key is left implicit, so dplyr joins on the shared column(s) and
# reports which ones it used.
data(stop_words)

tidy_songs <- anti_join(tidy_songs, stop_words)
Joining, by = "word"
# Word frequencies across all remaining tokens, most common first.
count(tidy_songs, word, sort = TRUE)
# Bar chart of the most frequent words.
# Data prep: count each word, keep those occurring more than 600 times, and
# reorder the `word` factor by count so the bars are sorted by frequency.
ggplot(
  data = tidy_songs %>%
    count(word, sort = TRUE) %>%
    filter(n > 600) %>%
    mutate(word = reorder(word, n)),
  mapping = aes(x = word, y = n)
) +
  geom_col() +
  xlab(NULL) +   # no axis title for the word axis
  coord_flip()   # flip axes so words read horizontally

LS0tDQp0aXRsZTogIkNIMSBUaGUgdGlkeSB0ZXh0IGZvcm1hdCINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KYXV0aG9yOiAn5YqJ6IKy6YqYJw0KLS0tDQoNCg0KDQojIyMjI2xvYWQgbGlicmFyeQ0KDQpgYGB7cn0NCmxpYnJhcnkoZGF0YS50YWJsZSkNCmxpYnJhcnkoZHBseXIpICNkYXRhIG1hbmlwdWxhdGlvbg0KbGlicmFyeShnZ3Bsb3QyKSAjdmlzdWFsaXphdGlvbnMNCmxpYnJhcnkoZ3JpZEV4dHJhKSAjdmlld2luZyBtdWx0aXBsZSBwbG90cyB0b2dldGhlcg0KbGlicmFyeSh0aWR5dGV4dCkgI3RleHQgbWluaW5nDQojbGlicmFyeSh3b3JkY2xvdWQyKSAjY3JlYXRpdmUgdmlzdWFsaXphdGlvbnMNCmBgYA0KDQojIyMjI2xvYWQgZGF0YQ0KDQpgYGB7cn0NCmx5cmljX2RhdGE9ZnJlYWQoJy4uL2RhdGEvcHJpbmNlX3Jhd19kYXRhLmNzdicpDQpgYGANCg0KDQoNCg0KIyMjMS4xIENvbnRyYXN0aW5nIHRpZHkgdGV4dCB3aXRoIG90aGVyIGRhdGEgc3RydWN0dXJlcw0KDQoNCg0KDQojIyMxLjIgVGhlIHVubmVzdF90b2tlbnMgZnVuY3Rpb24NCg0K5oqK5Lul5LiLNOWAi+WPpeWtkOi9ieaIkHRpZHkgZGF0YQ0KDQpgYGB7cn0NCnRleHQgPC0gYygiQmVjYXVzZSBJIGNvdWxkIG5vdCBzdG9wIGZvciBEZWF0aCAtIiwNCiAgICAgICAgICAiSGUga2luZGx5IHN0b3BwZWQgZm9yIG1lIC0iLA0KICAgICAgICAgICJUaGUgQ2FycmlhZ2UgaGVsZCBidXQganVzdCBPdXJzZWx2ZXMgLSIsDQogICAgICAgICAgImFuZCBJbW1vcnRhbGl0eSIpDQoNCnRleHQNCmBgYA0K6L2J5oiQdGlkeSBkYXRh5YmN5YWI6L2J5oiQZGF0YWZyYW1lDQoNCmBgYHtyfQ0KdGV4dF9kZiA8LSBkYXRhX2ZyYW1lKGxpbmUgPSAxOjQsIHRleHQgPSB0ZXh0KQ0KDQp0ZXh0X2RmDQpgYGANCg0KYGBge3J9DQpsaWJyYXJ5KHRpZHl0ZXh0KQ0KYGBgDQoNCiMjIyMjb25lLXRva2VuLXBlci1yb3cgZm9ybWF0Og0KDQpgYGB7cn0NCg0KdGV4dF9kZiAlPiUNCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KQ0KYGBgDQoNCiMjIzEuMyBUaWR5aW5nIHRoZSB3b3JkcyBvZiBseXJpYyBkYXRhDQoNCmBgYHtyfQ0KbGlicmFyeShqYW5lYXVzdGVucikNCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KHN0cmluZ3IpDQpgYGANCg0KDQoNCg0KYGBge3J9DQpvcmlnaW5hbF9zb25ncyA8LSBseXJpY19kYXRhWywxOjZdICU+JQ0KICBncm91cF9ieShhbGJ1bSkgJT4lDQogIG11dGF0ZShsaW5lbnVtYmVyID0gcm93X251bWJlcigpKSAlPiUgDQogIHVuZ3JvdXAoKQ0KDQpvcmlnaW5hbF9zb25ncw0KDQpgYGANCg0KDQpvbmUtdG9rZW4tcGVyLXJvdyBmb3JtYXQ6DQoNCg0KYGBge3J9DQpsaWJyYXJ5KHRpZHl0ZXh0KQ0KdGlkeV9zb25ncyA8LSBvcmlnaW5hbF9zb25ncyAlPiUNCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0KQ0KDQp0aWR5X3NvbmdzDQpgYGANCg0KDQoNCmBgYHtyfQ0KZGF0YShzdG9wX3dvcmRzKQ0KDQp0aWR5X3NvbmdzIDwtIHRpZHlfc29uZ3MgJT4lDQogIGFu
dGlfam9pbihzdG9wX3dvcmRzKQ0KYGBgDQoNCmBgYHtyfQ0KdGlkeV9zb25ncyAlPiUNCiAgY291bnQod29yZCwgc29ydCA9IFRSVUUpIA0KYGBgDQoNCg0KYGBge3J9DQp0aWR5X3NvbmdzICU+JQ0KICBjb3VudCh3b3JkLCBzb3J0ID0gVFJVRSkgJT4lDQogIGZpbHRlcihuID4gNjAwKSAlPiUNCiAgbXV0YXRlKHdvcmQgPSByZW9yZGVyKHdvcmQsIG4pKSAlPiUNCiAgZ2dwbG90KGFlcyh3b3JkLCBuKSkgKw0KICBnZW9tX2NvbCgpICsNCiAgeGxhYihOVUxMKSArDQogIGNvb3JkX2ZsaXAoKQ0KYGBgDQoNCg0KDQoNCg0KDQoNCg0KPHN0eWxlPg0KDQplbSB7DQogICAgY29sb3I6ICNGRkVBNkM7DQogICAgYmFja2dyb3VuZDogIzdEN0Q3RDsNCn0NCg0KLmNhcHRpb24gew0KICBjb2xvcjogIzc3NzsNCiAgbWFyZ2luLXRvcDogMTBweDsNCn0NCnAgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcHJlIHsNCiAgd29yZC1icmVhazogbm9ybWFsOw0KICB3b3JkLXdyYXA6IG5vcm1hbDsNCiAgbGluZS1oZWlnaHQ6IDE7DQp9DQpwcmUgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcCxsaSB7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoucnsNCiAgbGluZS1oZWlnaHQ6IDEuMjsNCn0NCg0KLnFpeiB7DQogIGxpbmUtaGVpZ2h0OiAxLjc1Ow0KICBiYWNrZ3JvdW5kOiAjZjBmMGYwOw0KICBib3JkZXItbGVmdDogMTJweCBzb2xpZCAjY2NmZmNjOw0KICBwYWRkaW5nOiA0cHg7DQogIHBhZGRpbmctbGVmdDogMTBweDsNCiAgY29sb3I6ICMwMDk5MDA7DQp9DQoNCnRpdGxlew0KICBjb2xvcjogI2NjMDAwMDsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCmJvZHl7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQpoMSxoMixoMyxoNCxoNXsNCiAgY29sb3I6ICMwMDY2ZmY7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoNCmgzew0KICBjb2xvcjogI2IzNmIwMDsNCiAgYmFja2dyb3VuZDogI2ZmZTBiMzsNCiAgbGluZS1oZWlnaHQ6IDI7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQpoNXsNCiAgY29sb3I6ICMwMDYwMDA7DQogIGJhY2tncm91bmQ6ICNmOGY4Zjg7DQogIGxpbmUtaGVpZ2h0OiAxLjU7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQpoNiB7DQogICAgY29sb3I6ICMwMDYwMDA7DQogICAgYmFja2dyb3VuZDogIzAwZmZmZjsNCiAgICBsaW5lLWhlaWdodDogMjsNCiAgICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KPC9zdHlsZT4=