Loading Packages
library(tidyverse)
library(stringr)
library(lubridate)
library(skimr)
library(ggpubr)
library(ggimage)Dates and times
Import the swiftSongs.csv
# Variables to keep
keeps <- c(
  "track_name", "album_name", "youtube_url", "youtube_title",
  "youtube_publish_date", "youtube_duration", "song_release_date_year",
  "song_release_date_month", "song_release_date_day"
)
# Importing CSV file and keeping only the columns named in `keeps`.
# all_of() marks `keeps` as an external character vector of column names;
# passing the bare vector to select() is deprecated and ambiguous if the data
# ever gains a column called "keeps".
swiftSongs <- read_csv("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/swiftSongs.csv") %>%
  dplyr::select(all_of(keeps))
Explore high-level characteristics of the data using the glimpse() function.
glimpse(swiftSongs)## Rows: 151
## Columns: 9
## $ track_name <chr> "...Ready For It?", "‘tis the damn season", "a…
## $ album_name <chr> "reputation", "evermore", "folklore", "folklor…
## $ youtube_url <chr> "http://www.youtube.com/watch?v=wIft-t-MQuE", …
## $ youtube_title <chr> "Taylor Swift - …Ready For It?", "Taylor Swift…
## $ youtube_publish_date <dttm> 2017-10-27 04:00:03, 2020-12-11 05:00:05, 202…
## $ youtube_duration <chr> "PT3M31S", "PT3M56S", "PT4M24S", "PT4M56S", "P…
## $ song_release_date_year <dbl> 2017, 2020, 2020, 2020, 2020, 2020, 2020, 2020…
## $ song_release_date_month <dbl> 9, 12, 7, 7, 7, 12, 12, 12, 12, 12, 7, 12, 7, …
## $ song_release_date_day <dbl> 3, 11, 24, 24, 24, 11, 11, 11, 11, 11, 24, 11,…
Are there any noticeable missing value patterns?
Answer: There are no missing value patterns; every variable has 0 missing values (a complete rate of 1).
skim(swiftSongs)| Name | swiftSongs |
| Number of rows | 151 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 3 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| track_name | 0 | 1 | 2 | 70 | 0 | 151 | 0 |
| album_name | 0 | 1 | 3 | 12 | 0 | 10 | 0 |
| youtube_url | 0 | 1 | 42 | 42 | 0 | 151 | 0 |
| youtube_title | 0 | 1 | 5 | 79 | 0 | 151 | 0 |
| youtube_duration | 0 | 1 | 4 | 7 | 0 | 92 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| song_release_date_year | 0 | 1 | 2014.95 | 5.17 | 2006 | 2010 | 2017 | 2020 | 2022 | ▃▅▂▂▇ |
| song_release_date_month | 0 | 1 | 9.46 | 1.85 | 3 | 8 | 10 | 11 | 12 | ▁▁▅▇▅ |
| song_release_date_day | 0 | 1 | 18.38 | 6.85 | 2 | 11 | 21 | 24 | 28 | ▁▅▁▅▇ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| youtube_publish_date | 0 | 1 | 2009-06-16 21:42:34 | 2022-10-25 04:00:09 | 2018-11-21 08:34:37 | 134 |
Coercing strings to a date
# Each lubridate parser below is named for the order of the date components in
# its input (y = year, m = month, d = day); all four calls print the same date.
ymd("1989-12-13")## [1] "1989-12-13"
mdy("December 13th, 1989")## [1] "1989-12-13"
dmy("13-Dec-1989")## [1] "1989-12-13"
ymd(19891213)## [1] "1989-12-13"
Create a new character variable song_release_date_char in the swiftSongs data set using the mutate() and str_c() functions.
# Join the numeric year, month, and day columns into one dash-separated string
swiftSongs <- mutate(
  swiftSongs,
  song_release_date_char = str_c(
    song_release_date_year,
    song_release_date_month,
    song_release_date_day,
    sep = "-"
  )
)
Create a new date / time variable song_release_date using the newly created song_release_date_char variable and the appropriate lubridate helper function.
# Parse the year-month-day string into a date
swiftSongs <- mutate(
  swiftSongs,
  song_release_date = ymd(song_release_date_char)
)
Reproduce the scatter plot below showing the relationship between the release date of each of Taylor’s songs, and the release date of the corresponding YouTube video.
# Song release date against YouTube publish date, one point per track,
# coloured by album, with the legend placed underneath the panel
swiftSongs %>%
  ggplot(
    aes(
      x = song_release_date,
      y = youtube_publish_date,
      color = album_name
    )
  ) +
  geom_point() +
  labs(
    title = "Taylor swift release dates",
    x = "Song release date",
    y = "YouTube video release date",
    caption = "Data source: Genius API & YouTube API",
    color = "Album"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    text = element_text(face = "bold")
  )
Creating a date from individual components
Recreate the date / time variable song_release_date this time directly using the year, month, and day components with the make_datetime() function.
# Assemble the release date directly from its numeric year / month / day parts
swiftSongs <- mutate(
  swiftSongs,
  song_release_date = make_datetime(
    year = song_release_date_year,
    month = song_release_date_month,
    day = song_release_date_day
  )
)
Extracting date / time components
Extract the year, month, and day of the release date of the YouTube videos using the youtube_publish_date variable.
# Split the video publish timestamp into year, month, and day-of-month columns
swiftSongs <- mutate(
  swiftSongs,
  youtube_release_year = year(youtube_publish_date),
  youtube_release_month = month(youtube_publish_date),
  youtube_release_day = day(youtube_publish_date)
)
Extract the day of the week as a string (e.g., Monday, Tuesday, etc.) of the release date of the videos using the youtube_publish_date variable.
# Weekday of the publish timestamp as a full (unabbreviated) label
swiftSongs <- mutate(
  swiftSongs,
  youtube_release_day_of_week = wday(
    youtube_publish_date,
    label = TRUE,
    abbr = FALSE
  )
)
Reproduce the bar chart below showing the number of Taylor Swift YouTube videos released on each day of the week. The background image is located here, and can be included using the background_image() function from the ggpubr package.
# Background picture drawn behind the bars
backImage <- png::readPNG("lover-album.png")
# Count of videos per release weekday.
# theme_bw() is a complete theme and replaces any theme() settings added before
# it, so it must come first. `face` is named explicitly because the first
# positional argument of element_text() is the font family, not the face.
ggplot(data = swiftSongs, aes(x = youtube_release_day_of_week)) +
  background_image(backImage) +
  geom_bar(color = "#fc94bc", fill = "#69b4dc") +
  labs(
    title = "Taylor Swift Youtube videos: day of release",
    x = "Release day",
    y = "Number of videos",
    caption = "Data source: YouTube API"
  ) +
  theme_bw() +
  theme(text = element_text(face = "bold"))
Calculating difference between date / times
# Calculating someone's age in days: subtracting one date from another
# yields a time difference expressed in days
dob <- ymd(19891213)
ts_age <- today() - dob
ts_age## Time difference of 12210 days
# Calculating in years: dividing the interval between the two dates by a
# one-year period gives the age as a fractional number of years
interval(dob, today()) / years(1)## [1] 33.43014
Using the song_release_date variable, calculate how many days it has been since the most recent Taylor Swift song was released.
current_date <- Sys.Date() # Get the current date
# song_release_date was rebuilt with make_datetime(), so it is a date-time.
# Subtracting a Date from a date-time mixes units (days vs. seconds) and
# returns a timestamp instead of a duration, so convert to a Date first.
most_recent_release <- as_date(max(swiftSongs$song_release_date)) # Get the most recent release date
# Today minus the release date, so the elapsed time is positive
days_since_release <- current_date - most_recent_release
days_since_release## Time difference of 210 days
Bonus (optional)
Using the song_release_date variable, calculate how many hours it has been since the most recent Taylor Swift song was released.
# Timestamp for "now" and for the latest song release in the data
current_datetime <- Sys.time()
most_recent_release <- swiftSongs %>%
  pull(song_release_date) %>%
  max()
# Elapsed time between the two timestamps, expressed in hours
hours_since_release <- difftime(
  time1 = current_datetime,
  time2 = most_recent_release,
  units = "hours"
)
hours_since_release## Time difference of 5056.087 hours
Calculate how many years it has been since Taylor Swift released her first song.
current_year <- year(today()) # Get the current year
# Take the first release year from the data instead of assuming it
first_song_release_year <- year(min(swiftSongs$song_release_date))
# Calendar-year difference (month and day are ignored)
years_since_first_song <- current_year - first_song_release_year
years_since_first_song## [1] 17
How old was Taylor when she released her first song? Hint: looking up her birthday will be helpful.
# Subtracting the birth year from the release year overstates the age whenever
# the release falls before that year's birthday, so compare full dates instead.
birth_date <- ymd("1989-12-13") # Taylor Swift's date of birth
first_song_release_date <- as_date(min(swiftSongs$song_release_date)) # Earliest release in the data
# Whole years completed between the birth date and the first release
# (16 for any 2006 release dated before December 13)
age_when_first_song_released <- interval(birth_date, first_song_release_date) %/% years(1)
age_when_first_song_released## [1] 16
Recreate the scatter plot below using the geom_emoji(image = '1f3b5') function from the ggimage package and the vector of colors c('#7f6070', '#964c32', '#bb9559', '#8c8c8c', '#eeadcf', '#7193ac', '#a81e47', '#0c0c0c', '#7d488e', '#01a7d9').
# Hex colours supplied to the manual colour scale for the album legend
colors <- c(
  "#7f6070", "#964c32", "#bb9559", "#8c8c8c", "#eeadcf",
  "#7193ac", "#a81e47", "#0c0c0c", "#7d488e", "#01a7d9"
)
# Same release-date comparison as the earlier scatter plot, drawn with an
# emoji marker per track and the custom album colours
swiftSongs %>%
  ggplot(
    aes(
      x = song_release_date,
      y = youtube_publish_date,
      color = album_name
    )
  ) +
  geom_emoji(image = "1f3b5") +
  scale_color_manual(values = colors) +
  labs(
    title = "Taylor Swift Release Dates",
    x = "Song Release Date",
    y = "YouTube Video Release Date",
    caption = "Data source: Genius API & YouTube API",
    color = "Album"
  ) +
  theme_bw() +
  theme(
    legend.position = "bottom",
    text = element_text(face = "bold")
  )