# Load the Beyoncé lyrics table from the TidyTuesday 2020-09-29 release.
b_lyrics <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/beyonce_lyrics.csv"
)
## Rows: 22616 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): line, song_name, artist_name
## dbl (3): song_id, artist_id, song_line
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the Taylor Swift lyrics table from the TidyTuesday 2020-09-29 release.
ts_lyrics <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/taylor_swift_lyrics.csv"
)
## Rows: 132 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Artist, Album, Title, Lyrics
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the album sales table from the TidyTuesday 2020-09-29 release.
sales <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/sales.csv"
)
## Rows: 48 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): artist, title, country, released, re_release, label, formats
## dbl (1): sales
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(here)
## here() starts at /Users/ying/Desktop
# Install tidyverse only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", repos = "http://cran.us.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/s5/2203ms4n7rd94rl_mjh0mt_80000gn/T//RtmpD4ALt0/downloaded_packages
library(stringr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ readr 2.1.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(forcats)
# Install tidytext only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("tidytext", quietly = TRUE)) {
  install.packages("tidytext", repos = "http://cran.us.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/s5/2203ms4n7rd94rl_mjh0mt_80000gn/T//RtmpD4ALt0/downloaded_packages
# Install textdata only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("textdata", quietly = TRUE)) {
  install.packages("textdata", repos = "http://cran.us.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/s5/2203ms4n7rd94rl_mjh0mt_80000gn/T//RtmpD4ALt0/downloaded_packages
# Clean the free-text `released` column and parse it as a Date:
#   1. drop the first parenthesised note (with any whitespace before it),
#   2. drop the first bracketed marker,
#   3. parse what is left as month-day-year.
sales$released_new <- sales$released %>%
  str_remove("\\s*\\([^\\)]+\\)") %>%
  str_remove("\\[[^\\]]*\\]") %>%
  mdy()
# Turn `country` into a factor over the known codes, merge duplicate spellings
# ("FR" into "FRA", "WW" into "World"), express sales in millions, and keep
# only the US, UK and World rows.
country_factors <- c("AUS", "CAN", "FRA", "FR", "JPN", "UK", "US", "World", "WW")
sales$country <- factor(sales$country, country_factors)
final_sales <- sales %>%
  mutate(
    country = fct_collapse(
      country,
      "FRA" = c("FRA", "FR"),
      "World" = c("World", "WW")
    ),
    sales_in_millions = sales / 1000000
  ) %>%
  # `%in%` tests set membership row by row. The previous
  # `country == c("US", "UK", "World")` recycled the three values down the
  # column, so a row was kept only when its country happened to line up with
  # the recycled position, and many matching rows were silently dropped.
  filter(country %in% c("US", "UK", "World"))
# NOTE(review): any printed output captured with the old `==` filter (19 rows)
# is out of date; re-run the script to refresh it.
final_sales
## # A tibble: 19 × 10
## artist title country sales relea…¹ re_re…² label formats released…³ sales…⁴
## <chr> <chr> <fct> <dbl> <chr> <chr> <chr> <chr> <date> <dbl>
## 1 Taylor… Tayl… US 5.72e6 Octobe… March … Big … CD, CD… 2006-10-24 5.72
## 2 Taylor… Fear… UK 6.09e5 Novemb… Octobe… Big … CD, CD… 2008-11-11 0.609
## 3 Taylor… Spea… World 5 e6 Octobe… <NA> Big … CD, CD… 2010-10-25 5
## 4 Taylor… Spea… US 4.69e6 Octobe… <NA> Big … CD, CD… 2010-10-25 4.69
## 5 Taylor… Spea… UK 1.69e5 Octobe… <NA> Big … CD, CD… 2010-10-25 0.169
## 6 Taylor… Red World 6 e6 Octobe… <NA> Big … CD, CD… 2012-10-22 6
## 7 Taylor… Red US 4.46e6 Octobe… <NA> Big … CD, CD… 2012-10-22 4.46
## 8 Taylor… Repu… UK 3.78e5 Novemb… <NA> Big … CD, CD… 2017-11-10 0.378
## 9 Taylor… Lover World 3.2 e6 August… <NA> Repu… CD, LP… 2019-08-23 3.2
## 10 Taylor… Lover US 1.08e6 August… <NA> Repu… CD, LP… 2019-08-23 1.08
## 11 Taylor… Lover UK 2.22e5 August… <NA> Repu… CD, LP… 2019-08-23 0.222
## 12 Beyoncé 4 US 1.5 e6 June 2… <NA> Park… CD, di… 2011-06-24 1.5
## 13 Beyoncé 4 UK 7.91e5 June 2… <NA> Park… CD, di… 2011-06-24 0.791
## 14 Beyoncé Beyo… World 5 e6 Decemb… <NA> Park… CD, CD… 2013-12-13 5
## 15 Beyoncé Beyo… US 2.51e6 Decemb… <NA> Park… CD, CD… 2013-12-13 2.51
## 16 Beyoncé Beyo… UK 4.18e5 Decemb… <NA> Park… CD, CD… 2013-12-13 0.418
## 17 Beyoncé Lemo… World 2.5 e6 April … <NA> Park… CD/DVD… 2016-04-23 2.5
## 18 Beyoncé Lemo… US 1.55e6 April … <NA> Park… CD/DVD… 2016-04-23 1.55
## 19 Beyoncé Lemo… UK 3.28e5 April … <NA> Park… CD/DVD… 2016-04-23 0.328
## # … with abbreviated variable names ¹released, ²re_release, ³released_new,
## # ⁴sales_in_millions
# Reference date for the album ages below: the day the script is run.
x <- today()
# US rows only, plus each album's age in whole (rounded) years as of `x`.
final_sales_US_only <- filter(final_sales, country == "US")
final_sales_US_only <- mutate(
  final_sales_US_only,
  years_since_release = round(time_length(difftime(x, released_new), "years"))
)
view(final_sales_US_only)
# Per artist: age in years of the oldest and most recent US album, and the
# median album age. `years_since_release` is already a column of
# `final_sales_US_only`, so it is not recomputed here.
final_sales_US_only %>%
  group_by(artist) %>%
  summarise(
    oldest = max(years_since_release),
    most_recent = min(years_since_release),
    median = median(years_since_release)
  )
## # A tibble: 2 × 4
## artist oldest most_recent median
## <chr> <dbl> <dbl> <dbl>
## 1 Beyoncé 11 6 9
## 2 Taylor Swift 16 3 11
# Number of sales rows per country (result stays grouped by country).
count(group_by(final_sales, country))
## # A tibble: 3 × 2
## # Groups: country [3]
## country n
## <fct> <int>
## 1 UK 7
## 2 US 7
## 3 World 5
# Stacked bars normalised to 1 per artist: each country's share of that
# artist's album sales.
ggplot(final_sales, aes(x = artist, y = sales_in_millions, fill = country)) +
  geom_col(position = "fill") +
  labs(
    title = "Percentage of sales of studio albums",
    x = "Artist",
    # position = "fill" rescales every bar to 0-1, so the axis shows a
    # proportion rather than sales in millions.
    y = "Proportion of sales",
    # The legend belongs to the `fill` aesthetic; a `color` label has no
    # effect here because no colour aesthetic is mapped.
    fill = " ",
    subtitle = "In general, Taylor Swift is more popular in the US and Beyonce is more popular worldwide.",
    caption = "Created by Ying Zhang"
  )
## Part 1D
# Distinct album titles in order of first appearance.
all_titles <- unique(final_sales[["title"]])
all_titles
## [1] "Taylor Swift" "Fearless" "Speak Now" "Red" "Reputation"
## [6] "Lover" "4" "Beyoncé" "Lemonade"
# Store titles as a factor whose levels follow the order in `all_titles`.
final_sales <- final_sales %>%
  mutate(title = factor(title, levels = all_titles))
final_sales[["title"]]
## [1] Taylor Swift Fearless Speak Now Speak Now Speak Now
## [6] Red Red Reputation Lover Lover
## [11] Lover 4 4 Beyoncé Beyoncé
## [16] Beyoncé Lemonade Lemonade Lemonade
## 9 Levels: Taylor Swift Fearless Speak Now Red Reputation Lover 4 ... Lemonade
# Worldwide sales per album, ordered by sales and coloured by artist.
final_sales %>%
  filter(country == "World") %>%
  mutate(title = fct_reorder(title, sales_in_millions)) %>%
  ggplot(aes(x = sales_in_millions, y = title, fill = artist)) +
  geom_col() +
  labs(
    title = "Sales of studio albums",
    x = "Sales (in millions)",
    y = "Album title",
    # The legend belongs to the `fill` aesthetic; a `color` label has no
    # effect here because no colour aesthetic is mapped.
    fill = " ",
    subtitle = "Worldwide, TS's Red is the best-selling album among all the albums of TS and Beyonce",
    caption = "Created by Ying Zhang"
  )
## Part 1E
# Sales against release date, one facet row per country, coloured by artist.
final_sales %>%
  ggplot(aes(x = released_new, y = sales_in_millions, color = artist)) +
  geom_point() +
  facet_grid(country ~ .) +
  # Rotate the date labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(
    title = "Albums sales and Date",
    # The x axis shows `released_new`, a date; the label previously read
    # "released rate".
    x = "released date",
    y = "sales (in millions)",
    color = " ",
    subtitle = "Album sales in the UK is lower than that in the US",
    caption = "Created by Ying Zhang"
  )
library(tidytext)
# Split each Taylor Swift song's `Lyrics` into one row per lyric line.
ts_token <- unnest_tokens(
  ts_lyrics,
  output = line,
  input = Lyrics,
  token = "lines"
)
ts_token %>% filter(grepl(pattern = "hello", x = line)) ## 6 lines
## # A tibble: 6 × 4
## Artist Album Title line
## <chr> <chr> <chr> <chr>
## 1 Taylor Swift Fearless Love Story "and say, \"hello\""
## 2 Taylor Swift Red I Almost Do "that i can't say \"hello\" to y…
## 3 Taylor Swift Red Everything Has Changed "'cause all i know is we said he…
## 4 Taylor Swift Red Everything Has Changed "'cause all i know is we said he…
## 5 Taylor Swift Red Everything Has Changed "all i know is we said hello"
## 6 Taylor Swift Red Everything Has Changed "all i know is we said hello"
ts_token %>% filter(grepl(pattern = "goodbye", x = line)) ## 12 lines
## # A tibble: 12 × 4
## Artist Album Title line
## <chr> <chr> <chr> <chr>
## 1 Taylor Swift Taylor Swift Tied Together With A Smile "goodbye, baby"
## 2 Taylor Swift Speak Now Mine "braced myself for the …
## 3 Taylor Swift Speak Now Back to December "you gave me all your l…
## 4 Taylor Swift Speak Now Long Live "and force us into a go…
## 5 Taylor Swift Red I Almost Do "and risk another goodb…
## 6 Taylor Swift Red Come Back Be Here "stumbled through the l…
## 7 Taylor Swift 1989 All You Had to Do Was Stay "but people like me are…
## 8 Taylor Swift reputation Getaway Car "said goodbye in "
## 9 Taylor Swift reputation Getaway Car "said goodbye in "
## 10 Taylor Swift Lover Death By A Thousand Cuts "saying goodbye is deat…
## 11 Taylor Swift Lover Death By A Thousand Cuts "'cause saying goodbye …
## 12 Taylor Swift Lover Daylight "i'll tell you truth, b…
# Tokenise the Beyoncé `line` column by lines into `line_new`.
b_token <- unnest_tokens(
  b_lyrics,
  output = line_new,
  input = line,
  token = "lines"
)
b_token %>% filter(grepl(pattern = "hello", x = line_new)) ## 91 lines
## # A tibble: 91 × 6
## song_id song_name artis…¹ artis…² song_…³ line_…⁴
## <dbl> <chr> <dbl> <chr> <dbl> <chr>
## 1 2220711 "Dreamgirls Medley (The Beyonce Expe… 498 Beyoncé 5 hello …
## 2 1981227 "Fingertips/Master Blaster (Jammin')… 498 Beyoncé 6 hello …
## 3 2715227 "FREEDOM (2016 BET Awards) (Ft. Kend… 498 Beyoncé 52 fellow…
## 4 80249 "Hello" 498 Beyoncé 15 you ha…
## 5 80249 "Hello" 498 Beyoncé 16 hello …
## 6 80249 "Hello" 498 Beyoncé 17 hello …
## 7 80249 "Hello" 498 Beyoncé 18 you ha…
## 8 80249 "Hello" 498 Beyoncé 19 hello …
## 9 80249 "Hello" 498 Beyoncé 20 hello …
## 10 80249 "Hello" 498 Beyoncé 24 'cause…
## # … with 81 more rows, and abbreviated variable names ¹artist_id, ²artist_name,
## # ³song_line, ⁴line_new
b_token %>% filter(grepl(pattern = "goodbye", x = line_new)) ## 12 lines
## # A tibble: 12 × 6
## song_id song_name artis…¹ artis…² song_…³ line_…⁴
## <dbl> <chr> <dbl> <chr> <dbl> <chr>
## 1 139043 Back to Black (Ft. André 3000) 498 Beyoncé 12 we onl…
## 2 139043 Back to Black (Ft. André 3000) 498 Beyoncé 21 we onl…
## 3 139043 Back to Black (Ft. André 3000) 498 Beyoncé 24 we onl…
## 4 51492 Best Thing I Never Had 498 Beyoncé 38 thank …
## 5 1946060 Best Thing I Never Had (Lars B Remix) 498 Beyoncé 42 thank …
## 6 4241137 Best Thing I Never Had [Original Ver… 498 Beyoncé 38 thank …
## 7 435491 Gift from Virgo 498 Beyoncé 23 it's s…
## 8 435491 Gift from Virgo 498 Beyoncé 24 i neve…
## 9 435491 Gift from Virgo 498 Beyoncé 25 i neve…
## 10 1844620 Hard To Say Goodbye 498 Beyoncé 29 we've …
## 11 1224115 Slow Love 498 Beyoncé 42 don't …
## 12 141848 Yes 498 Beyoncé 27 somewh…
## # … with abbreviated variable names ¹artist_id, ²artist_name, ³song_line,
## # ⁴line_new
# One row per word of Beyoncé lyrics, with stop words removed.
b_token_w <- anti_join(
  unnest_tokens(b_lyrics, output = word, input = line, token = "words"),
  stop_words
)
## Joining, by = "word"
# Frequency of every remaining word.
count(b_token_w, word)
## # A tibble: 5,937 × 2
## word n
## <chr> <int>
## 1 03 1
## 2 1 20
## 3 10 5
## 4 100 1
## 5 11 3
## 6 12 5
## 7 13 2
## 8 14 4
## 9 15 6
## 10 16 4
## # … with 5,927 more rows
# Frequency of each Beyoncé word that appears in the bing sentiment lexicon.
# The lexicon is joined once, on an explicit key, and occurrences are counted
# per (word, sentiment). Counting by word alone and then joining the lexicon a
# second time double-counted any word the lexicon lists under both sentiments.
# `bws` now carries the `sentiment` column as well.
bws <- b_token_w %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment) %>%
  select(word, n, sentiment)
# The 25 most frequent sentiment words.
bw <- bws %>%
  arrange(desc(n)) %>%
  head(25)
print(bw)
## # A tibble: 25 × 3
## word n sentiment
## <chr> <int> <chr>
## 1 love 1362 positive
## 2 crazy 308 negative
## 3 top 241 positive
## 4 bad 132 negative
## 5 beautiful 131 positive
## 6 whoa 121 positive
## 7 damn 106 negative
## 8 hurt 90 negative
## 9 hard 87 negative
## 10 ready 85 positive
## # … with 15 more rows
# Bar chart of the top Beyoncé sentiment words, ordered by frequency and
# coloured by sentiment.
bw %>%
  ggplot(aes(x = n, y = reorder(word, n), fill = sentiment)) +
  geom_col() +
  labs(
    title = "lyrics and sentiment from Beyonce",
    x = "word frequency",
    y = "word",
    # The legend belongs to the `fill` aesthetic; a `color` label has no
    # effect here because no colour aesthetic is mapped.
    fill = " ",
    subtitle = "Beyonce loves love",
    caption = "Created by Ying Zhang"
  )
## Part 2D
# One row per word of Taylor Swift lyrics, with stop words removed.
ts_token_w <- anti_join(
  unnest_tokens(ts_lyrics, output = word, input = Lyrics, token = "words"),
  stop_words
)
## Joining, by = "word"
# Frequency of every remaining word.
count(ts_token_w, word)
## # A tibble: 2,579 × 2
## word n
## <chr> <int>
## 1 1 2
## 2 16 1
## 3 16th 3
## 4 2 4
## 5 3 1
## 6 45 1
## 7 4am 1
## 8 58 1
## 9 7 1
## 10 a.m 8
## # … with 2,569 more rows
# Frequency of each Taylor Swift word that appears in the bing sentiment
# lexicon. The lexicon is joined once, on an explicit key, and occurrences are
# counted per (word, sentiment). Counting by word alone and then joining the
# lexicon a second time double-counted any word the lexicon lists under both
# sentiments. `tsws` now carries the `sentiment` column as well.
tsws <- ts_token_w %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment) %>%
  select(word, n, sentiment)
# The 25 most frequent sentiment words.
tsw <- tsws %>%
  arrange(desc(n)) %>%
  head(25)
print(tsw)
## # A tibble: 25 × 3
## word n sentiment
## <chr> <int> <chr>
## 1 love 248 positive
## 2 bad 80 negative
## 3 shake 73 negative
## 4 break 59 negative
## 5 mad 48 negative
## 6 beautiful 46 positive
## 7 smile 45 positive
## 8 hate 44 negative
## 9 fall 43 negative
## 10 whoa 36 positive
## # … with 15 more rows
# Bar chart of the top Taylor Swift sentiment words, ordered by frequency and
# coloured by sentiment.
tsw %>%
  ggplot(aes(x = n, y = reorder(word, n), fill = sentiment)) +
  geom_col() +
  labs(
    title = "lyrics and sentiment from Taylor Swift",
    x = "word frequency",
    y = "word",
    # The legend belongs to the `fill` aesthetic; a `color` label has no
    # effect here because no colour aesthetic is mapped.
    fill = " ",
    subtitle = "Taylor Swift also loves love",
    caption = "Created by Ying Zhang"
  )
## Part 2E
# Word frequency within each album (result stays grouped by Album).
count(group_by(ts_token_w, Album), word)
## # A tibble: 4,968 × 3
## # Groups: Album [8]
## Album word n
## <chr> <chr> <int>
## 1 1989 2 3
## 2 1989 a.m 3
## 3 1989 ace 1
## 4 1989 admit 1
## 5 1989 afraid 1
## 6 1989 ah 24
## 7 1989 ahead 1
## 8 1989 aids 2
## 9 1989 airplanes 1
## 10 1989 alright 2
## # … with 4,958 more rows
# Keep only the words found in the afinn lexicon, together with their scores.
ts_token_w1 <- inner_join(ts_token_w, get_sentiments("afinn"))
## Joining, by = "word"
# Mean afinn score per album; the album column is renamed to `title` so it
# lines up with the sales table.
ts_token_w2 <- ts_token_w1 %>%
  group_by(Album) %>%
  summarise(average = mean(value)) %>%
  rename(title = Album)
# Attach each album's average sentiment to its sales rows.
# Titles are matched case-insensitively: the sales table spells "Reputation"
# while the lyrics table spells "reputation", so an exact-match join silently
# dropped that album. An inner join on an explicit key replaces the earlier
# right_join followed by removing the unmatched rows.
# NOTE(review): final_sales can hold several rows per album (one per country),
# so an album may appear more than once in `df`. Confirm whether a single
# market was intended.
df <- final_sales %>%
  mutate(title_key = str_to_lower(as.character(title))) %>%
  inner_join(
    ts_token_w2 %>%
      mutate(title_key = str_to_lower(title)) %>%
      select(title_key, average),
    by = "title_key"
  ) %>%
  select(-title_key)
## Joining, by = "title"
# Inspect the joined sales/sentiment table in the data viewer.
view(df)
# Average sentiment per album against release date; point size encodes sales,
# and the horizontal line marks a neutral score of 0.
df %>%
  ggplot(aes(x = released_new, y = average)) +
  geom_point(aes(size = sales_in_millions)) +
  geom_hline(yintercept = 0) +
  labs(
    title = "Taylor Swift's lyrics sentiment overtime",
    x = "Released date",
    y = "Average sentiment score",
    color = " ",
    subtitle = "Taylor Swift's songs have been going negative in the past decade. However, the album sales are not affected.",
    caption = "Created by Ying Zhang"
  )