Project3

 # Beyoncé lyrics from the TidyTuesday 2020-09-29 release.
 b_lyrics <- readr::read_csv(
   file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/beyonce_lyrics.csv"
 )

## Rows: 22616 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): line, song_name, artist_name
## dbl (3): song_id, artist_id, song_line
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Taylor Swift lyrics from the TidyTuesday 2020-09-29 release.
ts_lyrics <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/taylor_swift_lyrics.csv"
)

## Rows: 132 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Artist, Album, Title, Lyrics
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Album sales for both artists from the TidyTuesday 2020-09-29 release.
sales <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/sales.csv"
)

## Rows: 48 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): artist, title, country, released, re_release, label, formats
## dbl (1): sales
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

library(here)

## here() starts at /Users/ying/Desktop

# Install tidyverse only when it is missing, so re-running or knitting this
# script does not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", repos = "http://cran.us.r-project.org")
}

## 
## The downloaded binary packages are in
##  /var/folders/s5/2203ms4n7rd94rl_mjh0mt_80000gn/T//RtmpD4ALt0/downloaded_packages

library(stringr)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ forcats 0.5.2 
## ✔ readr   2.1.3      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(lubridate)

## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(forcats)
# Install tidytext only when it is missing, so re-running or knitting this
# script does not download and reinstall the package every time.
if (!requireNamespace("tidytext", quietly = TRUE)) {
  install.packages("tidytext", repos = "http://cran.us.r-project.org")
}

## 
## The downloaded binary packages are in
##  /var/folders/s5/2203ms4n7rd94rl_mjh0mt_80000gn/T//RtmpD4ALt0/downloaded_packages

# Install textdata only when it is missing, so re-running or knitting this
# script does not download and reinstall the package every time.
if (!requireNamespace("textdata", quietly = TRUE)) {
  install.packages("textdata", repos = "http://cran.us.r-project.org")
}

## 
## The downloaded binary packages are in
##  /var/folders/s5/2203ms4n7rd94rl_mjh0mt_80000gn/T//RtmpD4ALt0/downloaded_packages

Part 1A

# Clean the raw release-date strings before parsing them as dates:
# first drop one parenthesised annotation (with any whitespace before it),
# then drop one square-bracketed annotation.
released_clean <- sales$released %>%
  str_remove("\\s*\\([^\\)]+\\)") %>%
  str_remove("\\[[^\\]]*\\]")

# Parse the cleaned "Month day, year" strings into Date values.
sales$released_new <- mdy(released_clean)

# Country codes expected in the sales data. "FR"/"FRA" and "WW"/"World" are
# spelling variants that are merged into a single level below.
country_factors <- c("AUS", "CAN", "FRA", "FR", "JPN", "UK", "US", "World", "WW")
sales$country <- factor(sales$country, country_factors)

# Harmonise the country labels, express sales in millions, and keep only the
# three markets analysed in the rest of the report.
# `%in%` is required for the set-membership test: `country == c(...)` recycles
# the three-element vector down the column and compares each row against only
# one of the three values, silently dropping rows that should be kept.
final_sales <- sales %>%
  mutate(
    country = fct_collapse(
      country,
      "FRA" = c("FRA", "FR"),
      "World" = c("World", "WW")
    ),
    sales_in_millions = sales / 1000000
  ) %>%
  filter(country %in% c("US", "UK", "World"))
final_sales

## # A tibble: 19 × 10
##    artist  title country  sales relea…¹ re_re…² label formats released…³ sales…⁴
##    <chr>   <chr> <fct>    <dbl> <chr>   <chr>   <chr> <chr>   <date>       <dbl>
##  1 Taylor… Tayl… US      5.72e6 Octobe… March … Big … CD, CD… 2006-10-24   5.72 
##  2 Taylor… Fear… UK      6.09e5 Novemb… Octobe… Big … CD, CD… 2008-11-11   0.609
##  3 Taylor… Spea… World   5   e6 Octobe… <NA>    Big … CD, CD… 2010-10-25   5    
##  4 Taylor… Spea… US      4.69e6 Octobe… <NA>    Big … CD, CD… 2010-10-25   4.69 
##  5 Taylor… Spea… UK      1.69e5 Octobe… <NA>    Big … CD, CD… 2010-10-25   0.169
##  6 Taylor… Red   World   6   e6 Octobe… <NA>    Big … CD, CD… 2012-10-22   6    
##  7 Taylor… Red   US      4.46e6 Octobe… <NA>    Big … CD, CD… 2012-10-22   4.46 
##  8 Taylor… Repu… UK      3.78e5 Novemb… <NA>    Big … CD, CD… 2017-11-10   0.378
##  9 Taylor… Lover World   3.2 e6 August… <NA>    Repu… CD, LP… 2019-08-23   3.2  
## 10 Taylor… Lover US      1.08e6 August… <NA>    Repu… CD, LP… 2019-08-23   1.08 
## 11 Taylor… Lover UK      2.22e5 August… <NA>    Repu… CD, LP… 2019-08-23   0.222
## 12 Beyoncé 4     US      1.5 e6 June 2… <NA>    Park… CD, di… 2011-06-24   1.5  
## 13 Beyoncé 4     UK      7.91e5 June 2… <NA>    Park… CD, di… 2011-06-24   0.791
## 14 Beyoncé Beyo… World   5   e6 Decemb… <NA>    Park… CD, CD… 2013-12-13   5    
## 15 Beyoncé Beyo… US      2.51e6 Decemb… <NA>    Park… CD, CD… 2013-12-13   2.51 
## 16 Beyoncé Beyo… UK      4.18e5 Decemb… <NA>    Park… CD, CD… 2013-12-13   0.418
## 17 Beyoncé Lemo… World   2.5 e6 April … <NA>    Park… CD/DVD… 2016-04-23   2.5  
## 18 Beyoncé Lemo… US      1.55e6 April … <NA>    Park… CD/DVD… 2016-04-23   1.55 
## 19 Beyoncé Lemo… UK      3.28e5 April … <NA>    Park… CD/DVD… 2016-04-23   0.328
## # … with abbreviated variable names ¹released, ²re_release, ³released_new,
## #   ⁴sales_in_millions

Part 1B

# Reference date used to compute how long ago each album was released.
x <- today()

# US rows only, with the number of whole years (rounded) since release.
final_sales_US_only <- final_sales %>%
  filter(country == "US") %>%
  mutate(
    years_since_release = round(
      time_length(difftime(x, released_new), "years")
    )
  )

view(final_sales_US_only)

# Per-artist age of the oldest and most recent album, and the median age.
# `years_since_release` already exists on `final_sales_US_only`, so it is not
# recomputed here.
final_sales_US_only %>%
  group_by(artist) %>%
  summarise(
    oldest = max(years_since_release),
    most_recent = min(years_since_release),
    median = median(years_since_release)
  )

## # A tibble: 2 × 4
##   artist       oldest most_recent median
##   <chr>         <dbl>       <dbl>  <dbl>
## 1 Beyoncé          11           6      9
## 2 Taylor Swift     16           3     11

Part 1C

# Number of sales rows per country (the result stays grouped by country).
count(group_by(final_sales, country))

## # A tibble: 3 × 2
## # Groups:   country [3]
##   country     n
##   <fct>   <int>
## 1 UK          7
## 2 US          7
## 3 World       5

# Stacked bars normalised to 1 per artist: each segment is the share of that
# artist's summed sales coming from one country.
# The legend is driven by `fill`, so its title must be set through `fill`
# (setting `color` has no effect here), and because position = "fill" rescales
# the bars the y axis shows proportions rather than sales in millions.
ggplot(final_sales, aes(x = artist, y = sales_in_millions, fill = country)) +
  geom_col(position = "fill") +
  labs(
    title = "Percentage of sales of studio albums",
    x = "Artist",
    y = "Proportion of sales",
    fill = " ",
    subtitle = "In general, Taylor Swift is more popular in the US and Beyonce is more popular worldwide.",
    caption = "Created by Ying Zhang"
  )

## Part 1D

# Distinct album titles, in order of first appearance in final_sales.
all_titles <- unique(final_sales[["title"]])
all_titles

## [1] "Taylor Swift" "Fearless"     "Speak Now"    "Red"          "Reputation"  
## [6] "Lover"        "4"            "Beyoncé"      "Lemonade"

# Turn the album title into a factor whose levels follow `all_titles`.
final_sales[["title"]] <- factor(final_sales[["title"]], levels = all_titles)
final_sales[["title"]]

##  [1] Taylor Swift Fearless     Speak Now    Speak Now    Speak Now   
##  [6] Red          Red          Reputation   Lover        Lover       
## [11] Lover        4            4            Beyoncé      Beyoncé     
## [16] Beyoncé      Lemonade     Lemonade     Lemonade    
## 9 Levels: Taylor Swift Fearless Speak Now Red Reputation Lover 4 ... Lemonade

# Worldwide sales per album, bars ordered by sales and coloured by artist.
# The legend comes from the `fill` aesthetic, so the blank legend title is set
# via `fill` (setting `color` has no effect on this plot).
final_sales %>%
  filter(country == "World") %>%
  mutate(title = fct_reorder(title, sales_in_millions)) %>%
  ggplot(aes(x = sales_in_millions, y = title, fill = artist)) +
  geom_col() +
  labs(
    title = "Sales of studio albums",
    x = "Sales (in millions)",
    y = "Album title",
    fill = " ",
    subtitle = "Worldwide, TS's Red is the best-selling album among all the albums of TS and Beyonce",
    caption = "Created by Ying Zhang"
  )

## Part 1E

# Album sales against release date, one facet row per country, points coloured
# by artist. The x-axis label previously read "released rate" (typo).
final_sales %>%
  ggplot(aes(x = released_new, y = sales_in_millions, color = artist)) +
  geom_point() +
  facet_grid(country ~ .) +
  # Rotate the date labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(
    title = "Albums sales and Date",
    x = "release date",
    y = "sales (in millions)",
    color = " ",
    subtitle = "Album sales in the UK is lower than that in the US",
    caption = "Created by Ying Zhang"
  )

Part 2A

library(tidytext) 
 
# Split each Taylor Swift song's lyrics into one row per lyric line.
ts_token <- unnest_tokens(
  ts_lyrics,
  output = line,
  input = Lyrics,
  token = "lines"
)

# Lyric lines that contain "hello".
ts_token %>%
  filter(grepl("hello", line)) ## 6 lines

## # A tibble: 6 × 4
##   Artist       Album    Title                  line                             
##   <chr>        <chr>    <chr>                  <chr>                            
## 1 Taylor Swift Fearless Love Story             "and say, \"hello\""             
## 2 Taylor Swift Red      I Almost Do            "that i can't say \"hello\" to y…
## 3 Taylor Swift Red      Everything Has Changed "'cause all i know is we said he…
## 4 Taylor Swift Red      Everything Has Changed "'cause all i know is we said he…
## 5 Taylor Swift Red      Everything Has Changed "all i know is we said hello"    
## 6 Taylor Swift Red      Everything Has Changed "all i know is we said hello"

# Lyric lines that contain "goodbye".
ts_token %>%
  filter(grepl("goodbye", line)) ## 12 lines

## # A tibble: 12 × 4
##    Artist       Album        Title                      line                    
##    <chr>        <chr>        <chr>                      <chr>                   
##  1 Taylor Swift Taylor Swift Tied Together With A Smile "goodbye, baby"         
##  2 Taylor Swift Speak Now    Mine                       "braced myself for the …
##  3 Taylor Swift Speak Now    Back to December           "you gave me all your l…
##  4 Taylor Swift Speak Now    Long Live                  "and force us into a go…
##  5 Taylor Swift Red          I Almost Do                "and risk another goodb…
##  6 Taylor Swift Red          Come Back Be Here          "stumbled through the l…
##  7 Taylor Swift 1989         All You Had to Do Was Stay "but people like me are…
##  8 Taylor Swift reputation   Getaway Car                "said goodbye in "      
##  9 Taylor Swift reputation   Getaway Car                "said goodbye in "      
## 10 Taylor Swift Lover        Death By A Thousand Cuts   "saying goodbye is deat…
## 11 Taylor Swift Lover        Death By A Thousand Cuts   "'cause saying goodbye …
## 12 Taylor Swift Lover        Daylight                   "i'll tell you truth, b…

Part 2B

# Tokenise Beyoncé's lyric lines with the "lines" tokenizer; the result is
# stored in a new column, `line_new`.
b_token <- unnest_tokens(
  b_lyrics,
  output = line_new,
  input = line,
  token = "lines"
)

# Lyric lines that contain "hello".
b_token %>%
  filter(grepl("hello", line_new)) ## 91 lines

## # A tibble: 91 × 6
##    song_id song_name                             artis…¹ artis…² song_…³ line_…⁴
##      <dbl> <chr>                                   <dbl> <chr>     <dbl> <chr>  
##  1 2220711 "Dreamgirls Medley (The Beyonce Expe…     498 Beyoncé       5 hello …
##  2 1981227 "Fingertips/Master Blaster (Jammin')…     498 Beyoncé       6 hello …
##  3 2715227 "FREEDOM (2016 BET Awards) (Ft. Kend…     498 Beyoncé      52 fellow…
##  4   80249 "Hello"                                   498 Beyoncé      15 you ha…
##  5   80249 "Hello"                                   498 Beyoncé      16 hello …
##  6   80249 "Hello"                                   498 Beyoncé      17 hello …
##  7   80249 "Hello"                                   498 Beyoncé      18 you ha…
##  8   80249 "Hello"                                   498 Beyoncé      19 hello …
##  9   80249 "Hello"                                   498 Beyoncé      20 hello …
## 10   80249 "Hello"                                   498 Beyoncé      24 'cause…
## # … with 81 more rows, and abbreviated variable names ¹artist_id, ²artist_name,
## #   ³song_line, ⁴line_new

# Lyric lines that contain "goodbye".
b_token %>%
  filter(grepl("goodbye", line_new)) ## 12 lines

## # A tibble: 12 × 6
##    song_id song_name                             artis…¹ artis…² song_…³ line_…⁴
##      <dbl> <chr>                                   <dbl> <chr>     <dbl> <chr>  
##  1  139043 Back to Black (Ft. André 3000)            498 Beyoncé      12 we onl…
##  2  139043 Back to Black (Ft. André 3000)            498 Beyoncé      21 we onl…
##  3  139043 Back to Black (Ft. André 3000)            498 Beyoncé      24 we onl…
##  4   51492 Best Thing I Never Had                    498 Beyoncé      38 thank …
##  5 1946060 Best Thing I Never Had (Lars B Remix)     498 Beyoncé      42 thank …
##  6 4241137 Best Thing I Never Had [Original Ver…     498 Beyoncé      38 thank …
##  7  435491 Gift from Virgo                           498 Beyoncé      23 it's s…
##  8  435491 Gift from Virgo                           498 Beyoncé      24 i neve…
##  9  435491 Gift from Virgo                           498 Beyoncé      25 i neve…
## 10 1844620 Hard To Say Goodbye                       498 Beyoncé      29 we've …
## 11 1224115 Slow Love                                 498 Beyoncé      42 don't …
## 12  141848 Yes                                       498 Beyoncé      27 somewh…
## # … with abbreviated variable names ¹artist_id, ²artist_name, ³song_line,
## #   ⁴line_new

Part 2C

# One row per word of Beyoncé's lyrics, with the words listed in `stop_words`
# removed.
b_token_w <- b_lyrics %>%
  unnest_tokens(word, line, token = "words") %>%
  anti_join(stop_words)

## Joining, by = "word"

# Frequency of every remaining word.
count(b_token_w, word)

## # A tibble: 5,937 × 2
##    word      n
##    <chr> <int>
##  1 03        1
##  2 1        20
##  3 10        5
##  4 100       1
##  5 11        3
##  6 12        5
##  7 13        2
##  8 14        4
##  9 15        6
## 10 16        4
## # … with 5,927 more rows

# Keep only words present in the Bing sentiment lexicon, then tally each word.
bws <- count(
  inner_join(b_token_w, get_sentiments("bing")),
  word
)

## Joining, by = "word"

# Sort words by descending frequency, re-attach their Bing sentiment label,
# and keep the first 25 rows.
bw <- head(
  inner_join(
    arrange(bws, desc(n)),
    get_sentiments("bing")
  ),
  25
)

## Joining, by = "word"

# Show the top sentiment words as a tibble.
bw %>%
  as_tibble() %>%
  print()

## # A tibble: 25 × 3
##    word          n sentiment
##    <chr>     <int> <chr>    
##  1 love       1362 positive 
##  2 crazy       308 negative 
##  3 top         241 positive 
##  4 bad         132 negative 
##  5 beautiful   131 positive 
##  6 whoa        121 positive 
##  7 damn        106 negative 
##  8 hurt         90 negative 
##  9 hard         87 negative 
## 10 ready        85 positive 
## # … with 15 more rows

# Horizontal bars of the most frequent sentiment words, ordered by frequency
# and coloured by sentiment.
# The legend comes from the `fill` aesthetic, so the blank legend title is set
# via `fill` (setting `color` has no effect on this plot).
bw %>%
  ggplot(aes(x = n, y = reorder(word, n), fill = sentiment)) +
  geom_col() +
  labs(
    title = "lyrics and sentiment from Beyonce",
    x = "word frequency",
    y = "word",
    fill = " ",
    subtitle = "Beyonce loves love",
    caption = "Created by Ying Zhang"
  )

## Part 2D

# One row per word of Taylor Swift's lyrics, with the words listed in
# `stop_words` removed.
ts_token_w <- ts_lyrics %>%
  unnest_tokens(word, Lyrics, token = "words") %>%
  anti_join(stop_words)

## Joining, by = "word"

# Frequency of every remaining word.
count(ts_token_w, word)

## # A tibble: 2,579 × 2
##    word      n
##    <chr> <int>
##  1 1         2
##  2 16        1
##  3 16th      3
##  4 2         4
##  5 3         1
##  6 45        1
##  7 4am       1
##  8 58        1
##  9 7         1
## 10 a.m       8
## # … with 2,569 more rows

# Keep only words present in the Bing sentiment lexicon, then tally each word.
tsws <- count(
  inner_join(ts_token_w, get_sentiments("bing")),
  word
)

## Joining, by = "word"

# Sort words by descending frequency, re-attach their Bing sentiment label,
# and keep the first 25 rows.
tsw <- head(
  inner_join(
    arrange(tsws, desc(n)),
    get_sentiments("bing")
  ),
  25
)

## Joining, by = "word"

# Show the top sentiment words as a tibble.
tsw %>%
  as_tibble() %>%
  print()

## # A tibble: 25 × 3
##    word          n sentiment
##    <chr>     <int> <chr>    
##  1 love        248 positive 
##  2 bad          80 negative 
##  3 shake        73 negative 
##  4 break        59 negative 
##  5 mad          48 negative 
##  6 beautiful    46 positive 
##  7 smile        45 positive 
##  8 hate         44 negative 
##  9 fall         43 negative 
## 10 whoa         36 positive 
## # … with 15 more rows

# Horizontal bars of the most frequent sentiment words, ordered by frequency
# and coloured by sentiment.
# The legend comes from the `fill` aesthetic, so the blank legend title is set
# via `fill` (setting `color` has no effect on this plot).
tsw %>%
  ggplot(aes(x = n, y = reorder(word, n), fill = sentiment)) +
  geom_col() +
  labs(
    title = "lyrics and sentiment from Taylor Swift",
    x = "word frequency",
    y = "word",
    fill = " ",
    subtitle = "Taylor Swift also loves love",
    caption = "Created by Ying Zhang"
  )

## Part 2E

# Word frequencies within each album (the result stays grouped by Album).
count(group_by(ts_token_w, Album), word)

## # A tibble: 4,968 × 3
## # Groups:   Album [8]
##    Album word          n
##    <chr> <chr>     <int>
##  1 1989  2             3
##  2 1989  a.m           3
##  3 1989  ace           1
##  4 1989  admit         1
##  5 1989  afraid        1
##  6 1989  ah           24
##  7 1989  ahead         1
##  8 1989  aids          2
##  9 1989  airplanes     1
## 10 1989  alright       2
## # … with 4,958 more rows

# Attach the AFINN sentiment score (`value`) to every word that has one;
# words missing from the lexicon are dropped by the inner join.
ts_token_w1 <- inner_join(ts_token_w, get_sentiments("afinn"))

## Joining, by = "word"

# Mean AFINN score per album, with the album column renamed to `title` so it
# lines up with the sales table.
ts_token_w2 <- ts_token_w1 %>%
  group_by(Album) %>%
  summarise(average = mean(value)) %>%
  rename(title = Album)

# Album names differ in letter case between the two data sets (the lyrics use
# "reputation", the sales table "Reputation"), so an exact-match join silently
# drops that album. Join on a lower-cased key instead, and state the key
# explicitly rather than relying on the implicit common column.
ts_album_sentiment <- ts_token_w2 %>%
  transmute(title_key = str_to_lower(title), average)

# Keep only sales rows that have a matching album sentiment score. This is the
# same row set the previous right_join + filter(!is.na(artist)) was after.
df <- final_sales %>%
  mutate(title_key = str_to_lower(as.character(title))) %>%
  inner_join(ts_album_sentiment, by = "title_key") %>%
  select(-title_key)

## Joining, by = "title"

view(df)

# Average album sentiment against release date; point size encodes sales.
# The horizontal line at 0 separates net-positive from net-negative albums.
# The only legend on this plot comes from `size`, so it is labelled through
# `size` (setting `color` has no effect here). The title typo "overtime" is
# corrected to "over time".
ggplot(df, aes(x = released_new, y = average)) +
  geom_point(aes(size = sales_in_millions)) +
  geom_hline(yintercept = 0) +
  labs(
    title = "Taylor Swift's lyrics sentiment over time",
    x = "Released date",
    y = "Average sentiment score",
    size = "Sales (in millions)",
    subtitle = "Taylor Swift's songs have been going negative in the past decade. However, the album sales are not affected.",
    caption = "Created by Ying Zhang"
  )

Project3

Ying

10/18/2022

Part 1A

Part 1B

Part 1C

Part 2A

Part 2B

Part 2C