Loading Packages

library(tidyverse)
library(stringr)
library(lubridate)
library(skimr)
library(ggpubr)
library(ggimage)

Dates and times

Import the swiftSongs.csv

# Columns to retain from the raw song data
keeps <- c(
  "track_name", "album_name", "youtube_url", "youtube_title",
  "youtube_publish_date", "youtube_duration",
  "song_release_date_year", "song_release_date_month",
  "song_release_date_day"
)

# Read the CSV from GitHub and keep only the listed columns.
# all_of() marks `keeps` as an external character vector: passing it bare is
# deprecated in tidyselect and would be ambiguous if the data ever had a
# column that was itself named "keeps".
swiftSongs <- read_csv("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/swiftSongs.csv") %>%
  dplyr::select(dplyr::all_of(keeps))

Explore high-level characteristics of the data using the glimpse() function.

# Print a compact column-by-column overview (name, type, first values)
swiftSongs %>%
  glimpse()
## Rows: 151
## Columns: 9
## $ track_name              <chr> "...Ready For It?", "‘tis the damn season", "a…
## $ album_name              <chr> "reputation", "evermore", "folklore", "folklor…
## $ youtube_url             <chr> "http://www.youtube.com/watch?v=wIft-t-MQuE", …
## $ youtube_title           <chr> "Taylor Swift - …Ready For It?", "Taylor Swift…
## $ youtube_publish_date    <dttm> 2017-10-27 04:00:03, 2020-12-11 05:00:05, 202…
## $ youtube_duration        <chr> "PT3M31S", "PT3M56S", "PT4M24S", "PT4M56S", "P…
## $ song_release_date_year  <dbl> 2017, 2020, 2020, 2020, 2020, 2020, 2020, 2020…
## $ song_release_date_month <dbl> 9, 12, 7, 7, 7, 12, 12, 12, 12, 12, 7, 12, 7, …
## $ song_release_date_day   <dbl> 3, 11, 24, 24, 24, 11, 11, 11, 11, 11, 24, 11,…

Are there any noticeable missing value patterns?

Answer: There are no missing value patterns; every variable has 0 missing values (complete_rate = 1).

# Summarise every column by type: missing counts, completeness, and
# distribution statistics
skim(swiftSongs)
Data summary
Name swiftSongs
Number of rows 151
Number of columns 9
_______________________
Column type frequency:
character 5
numeric 3
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
track_name 0 1 2 70 0 151 0
album_name 0 1 3 12 0 10 0
youtube_url 0 1 42 42 0 151 0
youtube_title 0 1 5 79 0 151 0
youtube_duration 0 1 4 7 0 92 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
song_release_date_year 0 1 2014.95 5.17 2006 2010 2017 2020 2022 ▃▅▂▂▇
song_release_date_month 0 1 9.46 1.85 3 8 10 11 12 ▁▁▅▇▅
song_release_date_day 0 1 18.38 6.85 2 11 21 24 28 ▁▅▁▅▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
youtube_publish_date 0 1 2009-06-16 21:42:34 2022-10-25 04:00:09 2018-11-21 08:34:37 134

Coercing strings to a date

# ymd() parses text written in year-month-day order
ymd("1989-12-13")
## [1] "1989-12-13"
# mdy() parses month-day-year text, including a month name and ordinal suffix
mdy("December 13th, 1989")
## [1] "1989-12-13"
# dmy() parses day-month-year text with an abbreviated month name
dmy("13-Dec-1989")
## [1] "1989-12-13"
# ymd() also accepts a bare number in yyyymmdd form
ymd(19891213)
## [1] "1989-12-13"

Create a new character variable song_release_date_char in the swiftSongs data set using the mutate() and str_c() functions.

# Glue the numeric year, month and day columns into one "year-month-day" string
swiftSongs <- mutate(
  swiftSongs,
  song_release_date_char = str_c(
    song_release_date_year,
    song_release_date_month,
    song_release_date_day,
    sep = "-"
  )
)

Create a new date / time variable song_release_date using the newly created song_release_date_char variable and the appropriate lubridate helper function.

# Parse the year-month-day string into a proper date column
swiftSongs <- mutate(
  swiftSongs,
  song_release_date = ymd(song_release_date_char)
)

Reproduce the scatter plot below showing the relationship between the release date of each of Taylor’s songs, and the release date of the corresponding YouTube video.

# Song release date versus YouTube publish date, one point per track,
# coloured by album
ggplot(
  data = swiftSongs,
  mapping = aes(
    x = song_release_date,
    y = youtube_publish_date,
    color = album_name
  )
) +
  geom_point() +
  labs(
    title = "Taylor swift release dates",
    x = "Song release date",
    y = "YouTube video release date",
    caption = "Data source: Genius API & YouTube API",
    color = "Album"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    text = element_text(face = "bold")
  )

Creating a date from individual components

Recreate the date / time variable song_release_date this time directly using the year, month, and day components with the make_datetime() function.

# Rebuild song_release_date directly from its numeric components
swiftSongs <- mutate(
  swiftSongs,
  song_release_date = make_datetime(
    year = song_release_date_year,
    month = song_release_date_month,
    day = song_release_date_day
  )
)

Extracting date / time components

Extract the year, month, and day of the release date of the YouTube videos using the youtube_publish_date variable.

# Split the YouTube publish timestamp into year, month and day-of-month columns
swiftSongs <- mutate(
  swiftSongs,
  youtube_release_year = year(youtube_publish_date),
  youtube_release_month = month(youtube_publish_date),
  youtube_release_day = day(youtube_publish_date)
)

Extract the day of the week as a string (e.g., Monday, Tuesday, etc.) of the release date of the videos using the youtube_publish_date variable.

# Full weekday name of each video's publish date
# (label = TRUE returns names instead of numbers; abbr = FALSE spells them out)
swiftSongs <- mutate(
  swiftSongs,
  youtube_release_day_of_week = wday(
    youtube_publish_date,
    label = TRUE,
    abbr = FALSE
  )
)

Reproduce the bar chart below showing the number of Taylor Swift YouTube videos released on each day of the week. The background image is located here, and can be included using the background_image() function from the ggpubr package.

# Album artwork drawn behind the bars
backImage <- png::readPNG("lover-album.png")

# Number of YouTube videos published on each day of the week
ggplot(data = swiftSongs, aes(x = youtube_release_day_of_week)) +
  background_image(backImage) +
  geom_bar(color = "#fc94bc", fill = "#69b4dc") +
  labs(
    title = "Taylor Swift Youtube videos: day of release",
    x = "Release day",
    y = "Number of videos",
    caption = "Data source: YouTube API"
  ) +
  # A complete theme such as theme_bw() resets every theme element, so it has
  # to be added before the custom theme() tweak or the tweak is discarded.
  theme_bw() +
  # `face` must be named: the first positional argument of element_text() is
  # `family`, so element_text("bold") would request a font family called "bold".
  theme(text = element_text(face = "bold"))

Calculating difference between date / times

# Age in days: elapsed time between a date of birth and today
dob <- ymd(19891213)
ts_age <- difftime(today(), dob, units = "days")
ts_age
## Time difference of 12210 days
# Age in years: divide the birth-to-today interval by a one-year period
(dob %--% today()) / years(1)
## [1] 33.43014

Using the song_release_date variable, calculate how many days it has been since the most recent Taylor Swift song was released.

current_date <- Sys.Date()  # Get the current date
# as_date() drops any time-of-day component so both operands are Dates;
# subtracting a Date from a date-time does not yield a day count.
most_recent_release <- as_date(max(swiftSongs$song_release_date))  # Most recent release date
# Later date minus earlier date gives a positive number of elapsed days
days_since_release <- current_date - most_recent_release
days_since_release
## Time difference of 210 days

Bonus (optional)

Using the song_release_date variable, calculate how many hours it has been since the most recent Taylor Swift song was released.

# Timestamp of "now" and of the latest song release
current_datetime <- Sys.time()
most_recent_release <- max(swiftSongs$song_release_date)

# Elapsed time between the two, expressed in hours
hours_since_release <- difftime(
  time1 = current_datetime,
  time2 = most_recent_release,
  units = "hours"
)

hours_since_release
## Time difference of 5056.087 hours

Calculate how many years it has been since Taylor Swift released her first song.

current_year <- as.integer(format(Sys.Date(), "%Y"))  # Get the current year
# Take the earliest release year from the data instead of hard-coding it
first_song_release_year <- min(swiftSongs$song_release_date_year)
# Difference in calendar years between now and the first release
years_since_first_song <- current_year - first_song_release_year

years_since_first_song
## [1] 17

How old was Taylor when she released her first song? Hint: looking up her birthday will be helpful.

# Taylor Swift's date of birth
birth_date <- ymd("1989-12-13")
# Earliest song release in the data, reduced to a plain date
first_song_release_date <- as_date(min(swiftSongs$song_release_date))

# Age in completed years. Subtracting calendar years alone overstates the age
# whenever the release falls before that year's birthday (13 December).
age_when_first_song_released <- floor(
  interval(birth_date, first_song_release_date) / years(1)
)

age_when_first_song_released
## [1] 16

Recreate the scatter plot below using the geom_emoji(image = '1f3b5') function from the ggimage package and the vector of colors c('#7f6070', '#964c32', '#bb9559', '#8c8c8c', '#eeadcf', '#7193ac', '#a81e47', '#0c0c0c', '#7d488e', '#01a7d9').

# Album colour palette, one hex code per album
colors <- c(
  '#7f6070', '#964c32', '#bb9559', '#8c8c8c', '#eeadcf',
  '#7193ac', '#a81e47', '#0c0c0c', '#7d488e', '#01a7d9'
)

# Same release-date comparison as before, drawn with music-note emoji
# and the manual album palette
ggplot(
  data = swiftSongs,
  mapping = aes(
    x = song_release_date,
    y = youtube_publish_date,
    color = album_name
  )
) +
  geom_emoji(image = '1f3b5') +
  scale_color_manual(values = colors) +
  labs(
    title = "Taylor Swift Release Dates",
    x = "Song Release Date",
    y = "YouTube Video Release Date",
    caption = "Data source: Genius API & YouTube API",
    color = "Album"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    text = element_text(face = "bold")
  )