Loading Packages
library(tidyverse)
library(skimr)
library(stringr)
library(lubridate)
Character Strings
Importing Dataset
# Columns needed for the string-manipulation exercises below
keeps <- c("track_name", "youtube_title", "youtube_duration", "full_lyrics")

# Read the songs CSV from GitHub and keep only the columns named in `keeps`.
# all_of() states explicitly that `keeps` is an external character vector:
# passing a bare external vector to select() is deprecated in tidyselect and
# would be ambiguous if the data ever contained a column called "keeps".
swiftSongs <- read_csv(
  "https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/swiftSongs.csv"
) %>%
  select(all_of(keeps))
Explore high-level characteristics of the data using the glimpse() and skim() functions.
## Rows: 151
## Columns: 4
## $ track_name <chr> "...Ready For It?", "‘tis the damn season", "august",…
## $ youtube_title <chr> "Taylor Swift - …Ready For It?", "Taylor Swift - ‘tis…
## $ youtube_duration <chr> "PT3M31S", "PT3M56S", "PT4M24S", "PT4M56S", "PT4M35S"…
## $ full_lyrics <chr> "Knew he was a killer first time that I saw him Wonde…
Data summary
| Name |
swiftSongs |
| Number of rows |
151 |
| Number of columns |
4 |
| _______________________ |
|
| Column type frequency: |
|
| character |
4 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| track_name |
0 |
1 |
2 |
70 |
0 |
151 |
0 |
| youtube_title |
0 |
1 |
5 |
79 |
0 |
151 |
0 |
| youtube_duration |
0 |
1 |
4 |
7 |
0 |
92 |
0 |
| full_lyrics |
0 |
1 |
786 |
3505 |
0 |
151 |
0 |
Matching Strings
# Print the full lyrics of the track named "It’s Nice To Have A Friend"
with(
  swiftSongs,
  full_lyrics[which(track_name == "It’s Nice To Have A Friend")]
)
## [1] "Ooh Ooh School bell rings, walk me home Sidewalk chalk covered in snow Lost my gloves, you give me one \"Wanna hang out?\" Yeah, sounds like fun Video games, you pass me a note Sleeping in tents It's nice to have a friend (Ooh) It's nice to have a friend (Ooh) Light pink sky, up on the roof Sun sinks down, no curfew 20 questions, we tell the truth You've been stressed out lately, yeah, me too Something gave you the nerve To touch my hand It's nice to have a friend (Ooh) It's nice to have a friend (Ooh) Church bells ring, carry me home Rice on the ground looks like snow Call my bluff, call you \"Babe\" Have my back, yeah, every day Feels like home, stay in bed The whole weekend It's nice to have a friend (Ooh) It's nice to have a friend (Ooh) It's nice to have a friend (Ooh) (Ooh)"
# For each name, report whether it contains the substring 'Taylor'
c("Taylor Swift", "Taylor Lautner", "Harry Styles") %>%
  str_detect(pattern = "Taylor")
## [1] TRUE TRUE FALSE
Using the str_detect() and mutate() functions, add a new boolean variable called contains_midnight to swiftSongs that indicates whether or not a song’s lyrics contain the word “midnight”.
# Flag songs whose lyrics contain "midnight" as a whole word (case-sensitive)
swiftSongs <- swiftSongs %>%
  mutate(
    contains_midnight = str_detect(
      string = full_lyrics,
      pattern = "\\bmidnight\\b"
    )
  )
How many of Taylor’s songs mention the word “midnight”?
Answer: 5
# Number of songs flagged by contains_midnight
swiftSongs %>%
  pull(contains_midnight) %>%
  sum()
## [1] 5
How many of Taylor’s songs mention the word “midnight” or “Midnight”?
Answer: 6
# Flag songs whose lyrics contain the whole word "midnight" or "Midnight"
swiftSongs <- swiftSongs %>%
  mutate(
    contains_midnight_or_Midnight = str_detect(
      string = full_lyrics,
      pattern = regex("\\bmidnight\\b|\\bMidnight\\b")
    )
  )

# Number of songs flagged by either capitalisation
swiftSongs %>%
  pull(contains_midnight_or_Midnight) %>%
  sum()
## [1] 6
Using the str_count() and mutate() functions, add a new variable called love_count to swiftSongs that indicates how many times each song mentions the word “love”.
# Count occurrences of "love" in each song's lyrics. The pattern is a plain,
# case-sensitive substring match, so it also counts "love" inside longer words.
swiftSongs <- swiftSongs %>%
  mutate(
    love_count = str_count(string = full_lyrics, pattern = "love")
  )
Which song mentions love the most times, and how many times is it mentioned?
# Row(s) for the song(s) with the largest love_count. slice_max() keeps ties
# by default, so a separate filter() on max(love_count) is unnecessary.
song_with_max_love <- swiftSongs %>%
  slice_max(love_count, n = 1)

# Extract the song name and the love count
song_name <- song_with_max_love$youtube_title
max_love_count <- song_with_max_love$love_count

# Print both answers: which song, and how many times "love" is mentioned
song_name
max_love_count
## [1] "This Love"
## [1] 52
Modifying Strings
Create a new variable called youtube_time that is the same as youtube_duration, but with a : symbol replacing the M.
# Build youtube_time from youtube_duration, swapping the first "M" for ":"
swiftSongs <- swiftSongs %>%
  mutate(
    youtube_time = str_replace(
      string = youtube_duration,
      pattern = "M",
      replacement = ":"
    )
  )
Modify youtube_time by removing the P, T, and S letters.
# Drop every "P", "T", and "S" from youtube_time
swiftSongs <- swiftSongs %>%
  mutate(
    youtube_time = str_remove_all(string = youtube_time, pattern = "P|T|S")
  )
Modify youtube_time to add 0’s when needed using the case_when() function together with str_replace_all() and str_length().
# Zero-pad youtube_time, choosing the fix from the length of the string
swiftSongs <- swiftSongs %>%
  mutate(
    youtube_time = case_when(
      # Two characters: append "00"
      str_length(youtube_time) == 2 ~ str_c(youtube_time, "00"),
      # Three characters: insert a "0" directly after every ":"
      str_length(youtube_time) == 3 ~ str_replace_all(
        string = youtube_time,
        pattern = ":",
        replacement = ":0"
      ),
      # Any other length is kept unchanged
      TRUE ~ youtube_time
    )
  )
Coerce youtube_time to a special date/time variable with the parse_date_time() function from the lubridate package, using the code below.
# Convert the youtube_time strings to date-time values (minutes:seconds)
swiftSongs <- swiftSongs %>%
  mutate(
    youtube_time = parse_date_time(x = youtube_time, orders = "%M:%S")
  )
Using the minute() and second() functions from the lubridate package, create a new variable, song_duration_s, that gives the song duration in seconds using the code below.
# Song length in seconds: 60 per minute component plus the seconds component
swiftSongs <- swiftSongs %>%
  mutate(
    song_duration_s = 60 * minute(youtube_time) + second(youtube_time)
  )
The regular expression \w+ matches a run of one or more ‘word’ characters, so it can be used to count words (although it very slightly overcounts). Create a new variable song_words equal to the number of words in the song using the str_count() function and the full_lyrics variable.
# Count runs of word characters in the lyrics as the song's word count
swiftSongs <- swiftSongs %>%
  mutate(
    song_words = str_count(string = full_lyrics, pattern = "\\w+")
  )
Reproduce the plot below showing the relationship between the duration of each song in seconds and its number of words. Hint: to match the style of the points, use fill = ‘#01a7d9’, pch = 23, color = ‘#7d488e’ inside of the geom_point() layer.
# Scatter plot of lyric word count against song duration, one point per song
swiftSongs %>%
  ggplot(mapping = aes(x = song_duration_s, y = song_words)) +
  geom_point(pch = 23, color = "#7d488e", fill = "#01a7d9") +
  labs(
    title = "Number of words by Taylor Swift song duration",
    x = "Song duration (seconds)",
    y = "Number of words in lyrics",
    caption = "Data source: geniusr R package"
  ) +
  theme_bw()

Capitalization and spacing
# Convert every character of the string to lowercase
"It’s nice to have a friend" %>%
  str_to_lower()
## [1] "it’s nice to have a friend"
## [1] "it’s nice to have a friend"
# Convert every character of the string to uppercase
"It’s nice to have a friend" %>%
  str_to_upper()
## [1] "IT’S NICE TO HAVE A FRIEND"
## [1] "IT’S NICE TO HAVE A FRIEND"
# Convert the string to title case
"It’s nice to have a friend" %>%
  str_to_title()
## [1] "It’s Nice To Have A Friend"
# Strip whitespace from both ends of the string
" Best believe I'm still bejeweled When I walk in the room I can still make the whole place shimmer " %>%
  str_trim()
## [1] "Best believe I'm still bejeweled When I walk in the room I can still make the whole place shimmer"
## [1] "Best believe I'm still bejeweled When I walk in the room I can still make the whole place shimmer"
# Strip whitespace from both ends and collapse repeated internal spaces
" Best believe I'm still bejeweled When I walk in the room I can still make the whole place shimmer " %>%
  str_squish()
## [1] "Best believe I'm still bejeweled When I walk in the room I can still make the whole place shimmer"
Bonus (optional)
Using the title of the YouTube video for each song, create a variable indicating whether or not the video is an official music video, official lyric video, or other type of video.
# Classify each video from its YouTube title. Matching is case-insensitive
# via the inline (?i) flag; the first matching branch wins.
swiftSongs <- swiftSongs %>%
  mutate(
    video_type = case_when(
      str_detect(
        string = youtube_title,
        pattern = "(?i)official\\s+music\\s+video"
      ) ~ "Official Music Video",
      str_detect(
        string = youtube_title,
        pattern = "(?i)official\\s+lyric\\s+video"
      ) ~ "Official Lyric Video",
      # Titles matching neither pattern
      TRUE ~ "Other"
    )
  )
Use the str_glue() function in tandem with ggplot to create a scatter plot showing the relationship between the total number of characters in each song’s lyrics (full_lyrics) and the total number of characters in each song’s title (track_name), including the correlation between the two variables dynamically in the subtitle.
# Character counts for each song's lyrics and title. The title length comes
# from track_name (the song title), not youtube_title (the video title).
swiftSongs <- swiftSongs %>%
  mutate(
    total_lyrics_chars = str_length(full_lyrics),
    total_title_chars = str_length(track_name)
  )

# Calculate the correlation between the two variables
correlation <- cor(swiftSongs$total_lyrics_chars, swiftSongs$total_title_chars)

# Scatter plot with the correlation inserted into the subtitle by str_glue();
# rounding to two decimals keeps the subtitle readable.
ggplot(swiftSongs, aes(x = total_lyrics_chars, y = total_title_chars)) +
  geom_point() +
  labs(
    x = "Total Lyrics Characters",
    y = "Total Title Characters",
    subtitle = str_glue("Correlation: {round(correlation, 2)}")
  ) +
  theme_bw()

Making a ☁️Word Cloud☁️
library(wordcloud2)
library(tidytext)
# Tally how often each word occurs across all songs' lyrics
wordFreqs <- swiftSongs %>%
  unnest_tokens(word, full_lyrics) %>%
  count(word)

# Remove 'stop words' (common but not very meaningful words). The join key is
# given explicitly so the join does not depend on whichever column names the
# two tables happen to share, and no "Joining with" message is emitted.
wordFreqs <- wordFreqs %>%
  anti_join(stop_words, by = "word")

# Draw the word cloud from the remaining word counts
wordcloud2(wordFreqs, size = 1.6, color = "random-dark", shape = "star")