String Manipulation with stringr

Learning Objectives

  • Match strings

  • Modify and concatenate strings

  • Create a word cloud

  • Understand the basics of sentiment analysis

First, load the necessary libraries

library(tidyverse)
library(skimr)
library(tidytext)
library(showtext)
library(htmlwidgets)
library(webshot)
library(flextable)

Then, import the swiftSongs.csv file located here using the code below:

# Columns of the raw file that the rest of the tutorial uses
keeps <- c(
  "track_name", "youtube_title", "album_name",
  "youtube_duration", "full_lyrics"
)

# Import the CSV file and keep only those columns.
# all_of() marks `keeps` as an external character vector of column names;
# passing the bare vector to select() is deprecated in tidyselect and becomes
# ambiguous if the data ever contains a column that is itself named "keeps".
swift_songs <- read_csv(
  "https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/swiftSongs.csv"
) |>
  dplyr::select(all_of(keeps))

Explore high-level characteristics of the data using the glimpse() and skim() functions.

# Column-by-column overview via glimpse() (left disabled; uncomment to run)
# t(glimpse(swift_songs))
# Per-column summary of swift_songs (missing values, string lengths, unique counts)
skim(swift_songs)
Data summary
Name swift_songs
Number of rows 151
Number of columns 5
_______________________
Column type frequency:
character 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
track_name 0 1 2 70 0 151 0
youtube_title 0 1 5 79 0 151 0
album_name 0 1 3 12 0 10 0
youtube_duration 0 1 4 7 0 92 0
full_lyrics 0 1 786 3505 0 151 0

Matching strings

We can use the str_detect() function to return a boolean value indicating whether a string contains a particular substring or not like below.

# Test each name for the substring 'Taylor'; returns one TRUE/FALSE per element
str_detect(
  c("Taylor Swift", "Taylor Lautner", "Harry Styles"),
  "Taylor"
)
## [1]  TRUE  TRUE FALSE

Using the str_detect() and mutate() functions, add a new boolean variable called contains_midnight to swift_songs that indicates whether or not a song’s lyrics contain the word “midnight”. How many of Taylor’s songs mention the word “midnight”?

# Flag songs whose lyrics mention 'midnight'; the lyrics are lowercased first
# so that the match ignores capitalization
swift_songs <- mutate(
  swift_songs,
  contains_midnight = full_lyrics |>
    str_to_lower() |>
    str_detect(pattern = "midnight")
)

# Tabulate how many songs do and do not mention it
flextable(count(swift_songs, contains_midnight))

contains_midnight

n

FALSE

143

TRUE

8

The | symbol can be used to match multiple strings, which can be helpful for matching different variations of a string. How many of Taylor’s songs mention the word “midnight” or “Midnight”?

# Detecting if a string contains the substring 'midnight' or 'Midnight'.
# The raw lyrics are matched here: lowercasing them first would mean the
# 'Midnight' alternative could never match, so `|` would have no effect.
swift_songs <- swift_songs |>
  mutate(contains_midnight = str_detect(string = full_lyrics,
                                        pattern = "midnight|Midnight"))

# Number of songs with and without a match
swift_songs |>
  count(contains_midnight) |>
  flextable()

contains_midnight

n

FALSE

143

TRUE

8

Using the str_count() and mutate() functions, add a new variable called love_count to swift_songs that indicates how many times each song mentions the word “love”.

# Count occurrences of 'love' in each song's lowercased lyrics
swift_songs <- mutate(
  swift_songs,
  love_count = str_count(str_to_lower(full_lyrics), pattern = "love")
)

# Show the four songs with the highest counts, leaving out the long lyrics column
swift_songs |>
  dplyr::select(-full_lyrics) |>
  arrange(desc(love_count)) |>
  slice_head(n = 4) |>
  flextable()

track_name

youtube_title

album_name

youtube_duration

contains_midnight

love_count

This Love

This Love

1989

PT4M11S

FALSE

52

Blank Space

Taylor Swift - Blank Space

1989

PT4M33S

FALSE

16

Labyrinth

Taylor Swift - Labyrinth (Official Lyric Video)

Midnights

PT4M12S

FALSE

15

London Boy

Taylor Swift - London Boy (Official Audio)

Lover

PT3M12S

FALSE

11

Total count of the word ‘love’ across all songs

# Add up the per-song counts across the whole data set
sum(pull(swift_songs, love_count))
## [1] 331

Modifying strings

There are also several functions for mutating or modifying character strings. For example, the str_c() function concatenates or combines multiple strings together.

# Pair each lowercase letter with its uppercase counterpart, element by element
str_c(letters, LETTERS, sep = "")
##  [1] "aA" "bB" "cC" "dD" "eE" "fF" "gG" "hH" "iI" "jJ" "kK" "lL" "mM" "nN" "oO"
## [16] "pP" "qQ" "rR" "sS" "tT" "uU" "vV" "wW" "xX" "yY" "zZ"

One of the most common string operations that is needed when cleaning data is to extract substrings. This can be achieved using the str_sub() function.

# Characters 1 through 3, counted from the start of the string
str_sub("Eras Tour", 1, 3)
## [1] "Era"
# Last four characters: negative positions count back from the end
str_sub("Eras Tour", -4, -1)
## [1] "Tour"

Another common operation is to replace a particular pattern in strings. We can use the str_replace_all() function to replace certain patterns with a replacement of our choice.

# Swap every occurrence of "man" in the lyric for "@@@"
str_replace_all(
  string = "I’m so sick of running as fast as I can Wondering if I'd get there quicker if I was a man And I'm so sick of them coming at me again 'Cause if I was a man, then I'd be the man I'd be the man I'd be the man",
  pattern = "man",
  replacement = "@@@"
)
## [1] "I’m so sick of running as fast as I can Wondering if I'd get there quicker if I was a @@@ And I'm so sick of them coming at me again 'Cause if I was a @@@, then I'd be the @@@ I'd be the @@@ I'd be the @@@"

Create a new variable called youtube_time that is the same as youtube_duration, but with a : symbol replacing the M.

# Start youtube_time as a copy of youtube_duration with each "M" turned into ":"
swift_songs <- mutate(
  swift_songs,
  youtube_time = str_replace_all(youtube_duration, "M", ":")
)

Modify youtube_time by removing the P, T, and S letters.

# Convert the duration strings (e.g. "PT4M11S") into "M:SS" text.
# The seconds part is padded with end-anchored patterns rather than by string
# length, so values with two-digit minutes (e.g. "10:") are padded correctly
# too, and every argument is spelled out in full (`pattern`, not `patter`).
swift_songs <- swift_songs |>
  mutate(
    # Minutes marker becomes the separator: "PT4M11S" -> "PT4:11S"
    youtube_time = str_replace_all(youtube_duration,
                                   pattern = "M",
                                   replacement = ":"),
    # Drop the "PT" prefix and the "S" suffix: "PT4:11S" -> "4:11"
    youtube_time = str_remove_all(youtube_time,
                                  pattern = "PT|S"),
    # Nothing after the separator ("4:") -> "4:00"
    youtube_time = str_replace(youtube_time,
                               pattern = ":$",
                               replacement = ":00"),
    # Single-digit seconds ("4:5") -> "4:05"
    youtube_time = str_replace(youtube_time,
                               pattern = ":(\\d)$",
                               replacement = ":0\\1")
  )

Next, we coerce the cleaned YouTube time to a date-time object.

# Parse the minutes:seconds text into a date-time value
swift_songs <- dplyr::mutate(
  swift_songs,
  youtube_time = lubridate::parse_date_time(youtube_time, orders = "%M:%S")
)

Using the minute() and second() functions from the lubridate package, create a new variable, song_duration_s, that gives the song duration in seconds.

# Total length in seconds: minutes converted to seconds, plus the seconds part
swift_songs <- dplyr::mutate(
  swift_songs,
  song_duration_s = 60 * lubridate::minute(youtube_time) +
    lubridate::second(youtube_time)
)

The escape sequence \w matches any single ‘word’ character, so the pattern \w+ matches a run of one or more word characters, i.e., a whole word (although it very slightly overcounts). Create a new variable song_words equal to the number of words in the song using the str_count() function and the full_lyrics variable.

# Count runs of word characters in each song's lyrics as its word total
swift_songs <- dplyr::mutate(
  swift_songs,
  song_words = str_count(string = full_lyrics, pattern = "\\w+")
)

Reproduce the plot below showing the relationship between the duration of each song in seconds and its number of words. Hint: to match the style of the points, use fill = '#01a7d9', pch = 23, color = '#7d488e' inside of the geom_point() layer.

# Scatter plot of lyric word count (x) against YouTube duration in seconds (y)
ggplot(swift_songs, aes(x = song_words, y = song_duration_s)) +
  geom_point(
    fill = "#01a7d9",
    pch = 23,
    color = "#7d488e"
  ) +
  labs(
    title = "Number of Words by Taylor Swift YouTube Song Duration",
    x = "Number of Words in Lyrics",
    y = "YouTube Song Duration (seconds)",
    caption = "Data Source: geniusr and tuberr packages"
  ) +
  theme_bw()

Capitalization and spacing

Lowercase

# Convert every letter to lowercase
str_to_lower(string = "It’s nice to have a friend")
## [1] "it’s nice to have a friend"

Uppercase

# Convert every letter to uppercase
str_to_upper(string = "It’s nice to have a friend")
## [1] "IT’S NICE TO HAVE A FRIEND"

Title case

# Title case: capitalize the first letter of each word
str_to_title(string = "It’s nice to have a friend")
## [1] "It’s Nice To Have A Friend"

Leading and trailing white space

# Strip white space from both ends; spacing inside the string is left alone
str_trim(string = " Best believe I'm still bejeweled     When I walk in the room     I can still make the whole place shimmer ")
## [1] "Best believe I'm still bejeweled     When I walk in the room     I can still make the whole place shimmer"

Spaces within the text/string

# Strip white space from both ends and collapse repeated inner spaces to one
str_squish(string = " Best believe I'm still bejeweled     When I walk in the room     I can still make the whole place shimmer ")
## [1] "Best believe I'm still bejeweled When I walk in the room I can still make the whole place shimmer"

Making a Word Cloud

library(wordcloud2) 

# Split the lyrics into one word per row, then count how often each word
# occurs across all songs
wordFreqs <- count(
  unnest_tokens(swift_songs, output = word, input = full_lyrics),
  word
)

Since words like ‘the’ and ‘and’ (stop words) are very common, but often not of interest, we can remove these before tallying up the song words.

# Removing 'stop words' (common but not very meaningful words).
# The join key is given explicitly so the result does not depend on dplyr
# inferring the shared column, and no "Joining with" message is printed.
wordFreqs <- wordFreqs |>
  anti_join(stop_words, by = "word") |>
  # Also drop a few filler sounds
  filter(!word %in% c("ah", "ooh", "yeah"))

Finally, we can create the word cloud using the wordcloud2() function from the wordcloud2 package.

# Draw the word cloud from the word-frequency table using random dark colors
wordcloud2(
  data = wordFreqs,
  size = 1.6,
  color = "random-dark"
)

showtext package

We can also customize the font using Google fonts, customize the colors used in the plot, and save it to an external image file as well. For more customization options using wordcloud2, see the Wordcloud2 library.

Customizing colors and font

library(showtext) # for custom fonts

# Font family name used for the words in the cloud
font_family <- "satisfy"

# Custom color palette for the words
word_colors <- c(
  "#7f6070", "#964c32", "#bb9559", "#8c8c8c", "#eeadcf",
  "#7193ac", "#a81e47", "#0c0c0c", "#7d488e", "#01a7d9"
)

# Download the Google font (looked up by its title-cased name) and register
# it under `font_family`
font_add_google(str_to_title(font_family), family = font_family)

# Fix the seed so the random color assignment below is reproducible
set.seed(1989)

# Word cloud with one randomly drawn palette color per word
wordcloud2(
  wordFreqs,
  size = 1.6,
  color = sample(word_colors, size = nrow(wordFreqs), replace = TRUE),
  fontFamily = font_family
)

Sentiment analysis

To begin, similar to the process of creating a word cloud, we can tokenize pieces of text into words using the unnest_tokens() function.

# One row per word per song: split full_lyrics into individual words
swift_tidy <- unnest_tokens(swift_songs, output = word, input = full_lyrics)

First, we need a set of rules describing the sentiment of words, called a sentiment lexicon.

# Load the bing lexicon (one row per word with its sentiment label) and print it
bing_sentiments <- get_sentiments(lexicon = "bing")
bing_sentiments
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

Using the bing sentiments lexicon, let’s explore the sentiment of songs from Taylor’s critically acclaimed 1989 album.

# Summarizing sentiment of each song from 1989
swift_1989_sentiment <- swift_tidy |>
  dplyr::filter(album_name == "1989") |>
  dplyr::select(track_name, album_name, song_words, word) |>
  # Keep only words that appear in the lexicon; the join key is named
  # explicitly rather than left for dplyr to infer
  inner_join(bing_sentiments, by = "word") |>
  group_by(track_name) |>
  # Position of each scored word within its song; row_number() is the
  # idiomatic, zero-row-safe equivalent of 1:n()
  dplyr::mutate(word_num = row_number()) |>
  ungroup()

# Creating wide version of data to plot net sentiment:
# one row per track and word position, with positive/negative counts
swift_1989_sentiment_wide <- swift_1989_sentiment |>
  count(track_name, index = word_num, sentiment) |>
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |>
  mutate(sentiment = positive - negative)

Use the vector below to reproduce the plot visualizing the sentiment for songs from 1989, and use a scale_fill_manual() layer with the colors c('#7193ac', '#01a7d9') to match the coloring as well.

# Track names from 1989, listed in the desired order
songs_1989 <- c(
  "Welcome To New York", "Blank Space", "Style", "Out Of The Woods",
  "All You Had To Do Was Stay", "Shake It Off", "I Wish You Would",
  "Bad Blood", "Wildest Dreams", "How You Get The Girl",
  "This Love", "I Know Places", "Clean"
)

# Net sentiment per scored word, with track_name releveled to follow songs_1989
swift_1989_sentiment_wide <- swift_1989_sentiment |>
  count(track_name, index = word_num, sentiment) |>
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |>
  mutate(
    sentiment = positive - negative,
    track_name = fct_relevel(track_name, songs_1989)
  )
# One bar per scored word, filled by whether its sentiment is positive,
# with one panel per track
ggplot(
  swift_1989_sentiment_wide,
  aes(x = index, y = sentiment, fill = sentiment > 0)
) +
  geom_col() +
  scale_fill_manual(values = c("#7193ac", "#01a7d9")) +
  scale_y_continuous(breaks = c(-1, 0, 1)) +
  facet_wrap(~track_name, ncol = 4, scales = "free_x") +
  labs(
    title = "Sentiment of Songs from Taylor Swift 1989 album",
    x = "",
    y = "Sentiment of each Word",
    caption = "Data Source: geniusr R package \n Sentiment calculated using bing sentiment lexicon"
  ) +
  ggthemes::theme_few() +
  # Hide the legend and the x-axis text and ticks
  theme(
    legend.position = "none",
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )