03 Data Wrangling

Author

Fu Wei Hsu

Executive Summary

Purpose: Standardize three datasets(data1,2,3) to enable join matching in subsequent steps.

Tasks performed in this document:

  • Artist field standardization (remove collaboration markers, unify case)

  • Song/Track field standardization (remove version markers, unify case)

  • Column name unification (align three datasets) - Generate Join Key (preparation for 04_Data_Joining)


Setup

library(tidyverse)
library(janitor)
library(gt)

# Load cleaned data
D1_clean_base <- readRDS("../Data/cleaned_D1.rds")
D2_clean_base <- readRDS("../Data/cleaned_D2.rds")
D3_clean_base <- readRDS("../Data/cleaned_D3.rds")

cat("D1 (Billboard):", nrow(D1_clean_base), "rows\n")
D1 (Billboard): 330087 rows
cat("D2 (Spotify):", nrow(D2_clean_base), "rows\n")
D2 (Spotify): 41106 rows
cat("D3 (Music Dataset):", nrow(D3_clean_base), "rows\n")
D3 (Music Dataset): 28372 rows

Part 1: Dataset 1 (Billboard)

Dataset 1 Wrangling

# Artist cleaning function 
clean_d1_artist <- function(x) {
  x %>%
    str_to_lower() %>%
    str_trim() %>%
    # Remove parentheses containing featuring / feat. / duet with
    str_remove_all(regex("\\s*\\([^)]*(featuring|feat\\.|duet with)[^)]*\\)", ignore_case = TRUE)) %>%
    # Remove "featuring" and everything after it
    str_remove(regex("\\s+featuring.*$", ignore_case = TRUE)) %>%
    # Remove "feat" and everything after it
    str_remove(regex("\\s*\\(?\\s*feat\\..*$", ignore_case = TRUE)) %>%
    # Remove "ft." and everything after it
    str_remove(regex("\\s+ft\\..*$", ignore_case = TRUE)) %>%
    # Remove "with" and everything after it
    str_remove(regex("\\s+with\\s+.*$", ignore_case = TRUE)) %>%
    str_trim()
}

# Song cleaning function 
clean_d1_song <- function(x) {
  x %>%
    str_to_lower() %>%
    str_trim() %>%
    # Remove all square brackets and their contents
    str_remove_all("\\s*\\[.*?\\]") %>%
    # Remove version markers inside parentheses
    str_remove_all(regex("\\s*\\([^)]*remix[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*remaster[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*(live version|acoustic|unplugged)[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*(featuring|feat\\.|ft\\.)[^)]*\\)", ignore_case = TRUE)) %>%
    # Remove Remix marker after dash
    str_remove(regex("\\s*-\\s*.*remix.*$", ignore_case = TRUE)) %>%
    str_trim()
}

# Apply cleaning 
D1_wrangled <- D1_clean_base %>%
  mutate(
    artist_clean = clean_d1_artist(artist),
    song_clean   = clean_d1_song(song)
  )

cat("D1 cleaning data total rows:", nrow(D1_wrangled), "\n")
D1 cleaning data total rows: 330087 

Dataset 1 Validation

# Artist changed records
D1_wrangled %>%
  filter(artist != artist_clean) %>%
  distinct(artist, artist_clean) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "D1 Artist Cleaning Effect (Changed Records Top 20)")
D1 Artist Cleaning Effect (Changed Records Top 20)
artist artist_clean
Adele adele
The Kid LAROI & Justin Bieber the kid laroi & justin bieber
Lil Nas X & Jack Harlow lil nas x & jack harlow
Walker Hayes walker hayes
Ed Sheeran ed sheeran
Drake Featuring Future & Young Thug drake
Olivia Rodrigo olivia rodrigo
Doja Cat doja cat
Dua Lipa dua lipa
Wizkid Featuring Justin Bieber & Tems wizkid
Doja Cat Featuring SZA doja cat
Glass Animals glass animals
Maneskin maneskin
Elton John & Dua Lipa elton john & dua lipa
Doja Cat & The Weeknd doja cat & the weeknd
The Weeknd & Ariana Grande the weeknd & ariana grande
Jason Aldean & Carrie Underwood jason aldean & carrie underwood
Coldplay x BTS coldplay x bts
Nardo Wick Featuring G Herbo, Lil Durk & 21 Savage nardo wick
Drake Featuring 21 Savage & Project Pat drake
# Artist changed records
D1_wrangled %>%
  filter(song != song_clean) %>%
  distinct(song, song_clean) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "D1 Song Cleaning Effect (Changed Records Top 20)")
D1 Song Cleaning Effect (Changed Records Top 20)
song song_clean
Easy On Me easy on me
Stay stay
Industry Baby industry baby
Fancy Like fancy like
Bad Habits bad habits
Way 2 Sexy way 2 sexy
Shivers shivers
Good 4 U good 4 u
Need To Know need to know
Levitating levitating
Essence essence
Kiss Me More kiss me more
Heat Waves heat waves
Beggin' beggin'
Cold Heart (PNAU Remix) cold heart
You Right you right
Save Your Tears save your tears
If I Didn't Love You if i didn't love you
Traitor traitor
My Universe my universe
artist_changed <- D1_wrangled %>%
  filter(artist != artist_clean) %>%
  nrow()

song_changed <- D1_wrangled %>%
  filter(song != song_clean) %>%
  nrow()

cat("Artist records changed:", artist_changed, "rows (",
    round(artist_changed / nrow(D1_wrangled) * 100, 2), "%)\n")
Artist records changed: 328951 rows ( 99.66 %)
cat("Song records changed:  ", song_changed, "rows (",
    round(song_changed / nrow(D1_wrangled) * 100, 2), "%)\n\n")
Song records changed:   328971 rows ( 99.66 %)

Part 2: Dataset 2 (Spotify)

Dataset 2 Wrangling

# Artist cleaning function
clean_d2_artist <- function(x) {
  x %>%
    str_to_lower() %>%
    str_trim() %>%
    # Remove parentheses containing featuring / feat. / duet with
    str_remove_all(regex("\\s*\\([^)]*(featuring|feat\\.|duet with)[^)]*\\)", ignore_case = TRUE)) %>%
    # Remove "featuring" and everything after it
    str_remove(regex("\\s+featuring.*$", ignore_case = TRUE)) %>%
    # Remove "feat." and everything after it
    str_remove(regex("\\s*\\(?\\s*feat\\..*$", ignore_case = TRUE)) %>%
    # Remove "ft." and everything after it
    str_remove(regex("\\s+ft\\..*$", ignore_case = TRUE)) %>%
    # Remove "with" and everything after it
    str_remove(regex("\\s+with\\s+.*$", ignore_case = TRUE)) %>%
    str_trim()
}

# Track cleaning function
clean_d2_track <- function(x) {
  x %>%
    str_to_lower() %>%
    str_trim() %>%
    # Remove version markers inside parentheses
    str_remove_all("\\s*\\[.*?\\]") %>%
    # Remove version markers inside parentheses
    str_remove_all(regex("\\s*\\([^)]*remix[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*remaster[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*(live version|acoustic|unplugged)[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*(featuring|feat\\.|ft\\.)[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*radio edit[^)]*\\)", ignore_case = TRUE)) %>%
    # Version markers after dash
    str_remove(regex("\\s*-\\s*.*(remaster|remastered|remasteris[eé]).*$", ignore_case = TRUE)) %>%
    str_remove(regex("\\s*-\\s*(live|version live|live version)\\s*$", ignore_case = TRUE)) %>%
    str_remove(regex("\\s*-\\s*.*remix.*$", ignore_case = TRUE)) %>%
    str_remove(regex("\\s*-\\s*radio edit.*$", ignore_case = TRUE)) %>%
    str_remove(regex("\\s*-\\s*feat\\..*$", ignore_case = TRUE)) %>%
    str_trim()
}

# Apply cleaning + unify column names
D2_wrangled <- D2_clean_base %>%
  rename(
    artist = artist,   
    song   = track     # rename track to song, aling with D1
  ) %>%
  mutate(
    artist_clean = clean_d2_artist(artist),
    song_clean   = clean_d2_track(song)
  )

cat("D2 cleaning data total rows:", nrow(D2_wrangled), "\n")
D2 cleaning data total rows: 41106 

Dataset 2 Validation

#D2 Artist Cleaning Validation\n
D2_wrangled %>%
  filter(artist != artist_clean) %>%
  distinct(artist, artist_clean) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "D2 Artist Cleaning Effect (Changed Records Top 20)")
D2 Artist Cleaning Effect (Changed Records Top 20)
artist artist_clean
Garland Green garland green
Serge Gainsbourg serge gainsbourg
Lord Melody lord melody
Celia Cruz celia cruz
P. Susheela p. susheela
Ennio Morricone ennio morricone
Antônio Carlos Jobim antônio carlos jobim
Marv Johnson marv johnson
Caetano Veloso caetano veloso
The Beach Boys the beach boys
Jerry Goldsmith jerry goldsmith
Roy Orbison roy orbison
Luiz Gonzaga luiz gonzaga
Jimmy Dean jimmy dean
Mary Wells mary wells
Leño leño
Jackie Wilson jackie wilson
Staatskapelle Berlin staatskapelle berlin
Shirley & Lee shirley & lee
The Everly Brothers the everly brothers
#D2 Song Cleaning Validation
D2_wrangled %>%
  filter(song != song_clean) %>%
  distinct(song, song_clean) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "D2 Song Cleaning Effect (Changed Records Top 20)")
D2 Song Cleaning Effect (Changed Records Top 20)
song song_clean
Jealous Kind Of Fella jealous kind of fella
Initials B.B. initials b.b.
Melody Twist melody twist
Mi Bomba Sonó mi bomba sonó
Uravu Solla uravu solla
Beat n. 3 beat n. 3
Samba De Uma Nota So (One Note Samba) samba de uma nota so (one note samba)
Happy Days happy days
Carolina - Remastered 2006 carolina
I Can Hear Music i can hear music
The Aftermath (From "Stagecoach") the aftermath (from "stagecoach")
Ride Away ride away
Caboclo Nordestino caboclo nordestino
P.T. 109 p.t. 109
Bye Bye Baby bye bye baby
A Festa dos Seus 15 Anos a festa dos seus 15 anos
My Empty Arms my empty arms
Alfômega - Remastered 2006 alfômega
Parsifal: "Wehevolles Erbe, dem ich verfallen" parsifal: "wehevolles erbe, dem ich verfallen"
Let The Good Times Roll let the good times roll
# D2 Cleaning Summary
artist_changed <- D2_wrangled %>%
  filter(artist != artist_clean) %>%
  nrow()

song_changed <- D2_wrangled %>%
  filter(song != song_clean) %>%
  nrow()

cat("Artist records changed:", artist_changed, "rows (",
    round(artist_changed / nrow(D2_wrangled) * 100, 2), "%)\n")
Artist records changed: 40937 rows ( 99.59 %)
cat("Songs records changed:", song_changed, "rows (",
    round(song_changed / nrow(D2_wrangled) * 100, 2), "%)\n\n")
Songs records changed: 40899 rows ( 99.5 %)

Part 3: Dataset 3 (Music Dataset)

Dataset 3 Wrangling

# D3 (Music Dataset) Wrangling
# Artist cleaning function 
clean_d3_artist <- function(x) {
  x %>%
    str_trim()
}

# Song cleaning function
clean_d3_song <- function(x) {
  x %>%
    str_trim() %>%
    # Remove all square brackets and their contents
    str_remove_all("\\s*\\[.*?\\]") %>%
    # Remove version markers inside parentheses
    str_remove_all(regex("\\s*\\([^)]*remix[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*remaster[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*(feat\\.|featuring)[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*radio edit[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*(acoustic|unplugged)[^)]*\\)", ignore_case = TRUE)) %>%
    # For "live", only remove explicit (live) or (live version) patterns in parentheses
    # Do not touch "live" appearing in song titles (e.g., 'as long as i live')
    str_remove_all(regex("\\s*\\(live version\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\(live\\)", ignore_case = TRUE)) %>%
    str_trim()
}

# Apply cleaning + unify column names
D3_wrangled <- D3_clean_base %>%
  rename(
    artist = artist_name,  # rename artist_name to artist, align with D1
    song   = track_name    # rename track_name to song, align with D1
  ) %>%
  mutate(
    artist_clean = clean_d3_artist(artist),
    song_clean   = clean_d3_song(song)
  )

cat("D3 cleaning data total rows:", nrow(D3_wrangled), "\n")
D3 cleaning data total rows: 28372 

Dataset 3 Validation

# D3 artist
artist_changed <- D3_wrangled %>%
  filter(artist != artist_clean) %>%
  nrow()

cat("Artist records changed:", artist_changed, "rows\n")
Artist records changed: 1 rows
D3_wrangled %>%
  filter(artist != artist_clean) %>%
  distinct(artist, artist_clean) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "3 Artist Cleaning Effect (Changed Records)")
3 Artist Cleaning Effect (Changed Records)
artist artist_clean
babes in toyland  babes in toyland
# D3 Song Cleaning Validation
D3_wrangled %>%
  filter(song != song_clean) %>%
  distinct(song, song_clean) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "D3 Song Cleaning Effect (Changed Records Top 20)")
D3 Song Cleaning Effect (Changed Records Top 20)
song song_clean
don't look back (feat. van morrison) don't look back
so far away (live) so far away
i feel the earth move (live) i feel the earth move
he don't love you [like i love you] he don't love you
you're the song [that i can't stop singing] you're the song
whenever i call you "friend" (feat. stevie nicks) whenever i call you "friend"
i just can't stop loving you (feat. siedah garrett) i just can't stop loving you
speed demon (remix by nero) speed demon
two occasions (live) two occasions
gin and juice (feat. dat nigga daz) gin and juice
ain't no fun (if the homies cant have none) (feat. nate dogg, warren g & kurupt) ain't no fun (if the homies cant have none)
whatta man (feat. en vogue) whatta man
ice cream (feat. ghostface killah, method man & cappadonna) ice cream
this is for the lover in you (feat. ll cool j, howard hewett, jody watley & jeffrey daniels) this is for the lover in you
ill vibe (feat. q-tip) ill vibe
keep on, keepin' on (feat. xscape) keep on, keepin' on
a thin line between love & hate (feat. shirley murdock) a thin line between love & hate
mo money mo problems (feat. mase & puff daddy) mo money mo problems
make it hot (feat. missy elliott & mocha) make it hot
heartbreaker (remix) (feat. da brat & missy elliott) heartbreaker
# D3 Cleaning Summary
artist_changed <- D3_wrangled %>%
  filter(artist != artist_clean) %>%
  nrow()

song_changed <- D3_wrangled %>%
  filter(song != song_clean) %>%
  nrow()

cat("Artist records changed: ", artist_changed, "rows (",
    round(artist_changed / nrow(D3_wrangled) * 100, 2), "%)\n")
Artist records changed:  1 rows ( 0 %)
cat("Song records changed: ", song_changed, "rows (",
    round(song_changed / nrow(D3_wrangled) * 100, 2), "%)\n\n")
Song records changed:  430 rows ( 1.52 %)

Part 4: Join Key Creation

All Data create Join Key

# Generate Join Key
D1_wrangled <- D1_wrangled %>%
  mutate(
    join_key = paste(artist_clean, song_clean, sep = "|")
  )

D2_wrangled <- D2_wrangled %>%
  mutate(
    join_key = paste(artist_clean, song_clean, sep = "|")
  )

D3_wrangled <- D3_wrangled %>%
  mutate(
    join_key = paste(artist_clean, song_clean, sep = "|")
  )

D1 Join Key

D1_wrangled %>%
  select(artist, song, artist_clean, song_clean, join_key) %>%
  distinct(join_key, .keep_all = TRUE) %>%
  slice_head(n = 5) %>%
  gt() %>%
  tab_header(title = "D1 Join Key")
D1 Join Key
artist song artist_clean song_clean join_key
Adele Easy On Me adele easy on me adele|easy on me
The Kid LAROI & Justin Bieber Stay the kid laroi & justin bieber stay the kid laroi & justin bieber|stay
Lil Nas X & Jack Harlow Industry Baby lil nas x & jack harlow industry baby lil nas x & jack harlow|industry baby
Walker Hayes Fancy Like walker hayes fancy like walker hayes|fancy like
Ed Sheeran Bad Habits ed sheeran bad habits ed sheeran|bad habits

D2 Join Key

D2_wrangled %>%
  select(artist, song, artist_clean, song_clean, join_key) %>%
  distinct(join_key, .keep_all = TRUE) %>%
  slice_head(n = 5) %>%
  gt() %>%
  tab_header(title = "D2 Join Key")
D2 Join Key
artist song artist_clean song_clean join_key
Garland Green Jealous Kind Of Fella garland green jealous kind of fella garland green|jealous kind of fella
Serge Gainsbourg Initials B.B. serge gainsbourg initials b.b. serge gainsbourg|initials b.b.
Lord Melody Melody Twist lord melody melody twist lord melody|melody twist
Celia Cruz Mi Bomba Sonó celia cruz mi bomba sonó celia cruz|mi bomba sonó
P. Susheela Uravu Solla p. susheela uravu solla p. susheela|uravu solla

D3 Join Key

D3_wrangled %>%
  select(artist, song, artist_clean, song_clean, join_key) %>%
  distinct(join_key, .keep_all = TRUE) %>%
  slice_head(n = 5) %>%
  gt() %>%
  tab_header(title = "D3 Join Key")
D3 Join Key
artist song artist_clean song_clean join_key
mukesh mohabbat bhi jhoothi mukesh mohabbat bhi jhoothi mukesh|mohabbat bhi jhoothi
frankie laine i believe frankie laine i believe frankie laine|i believe
johnnie ray cry johnnie ray cry johnnie ray|cry
pérez prado patricia pérez prado patricia pérez prado|patricia
giorgos papadopoulos apopse eida oneiro giorgos papadopoulos apopse eida oneiro giorgos papadopoulos|apopse eida oneiro

Part 5: Data join key overlap predication

# Before the formal join, quickly verify key overlap across the three datasets
# D1 unique join keys
d1_keys <- D1_wrangled %>% distinct(join_key) %>% pull(join_key)
d2_keys <- D2_wrangled %>% distinct(join_key) %>% pull(join_key)
d3_keys <- D3_wrangled %>% distinct(join_key) %>% pull(join_key)

d1_d2_overlap <- sum(d1_keys %in% d2_keys)
d1_d3_overlap <- sum(d1_keys %in% d3_keys)

cat("D1 unique keys:", length(d1_keys), "\n")
D1 unique keys: 29671 
cat("D2 unique keys:", length(d2_keys), "\n")
D2 unique keys: 39831 
cat("D3 unique keys:", length(d3_keys), "\n\n")
D3 unique keys: 28338 
cat("D1 ∩ D2 overlap:", d1_d2_overlap, "rows (",
    round(d1_d2_overlap / length(d1_keys) * 100, 2), "% of D1)\n")
D1 ∩ D2 overlap: 20123 rows ( 67.82 % of D1)
cat("D1 ∩ D3 overlap:", d1_d3_overlap, "rows (",
    round(d1_d3_overlap / length(d1_keys) * 100, 2), "% of D1)\n\n")
D1 ∩ D3 overlap: 3154 rows ( 10.63 % of D1)

Part 6 saveRDS

saveRDS(D1_wrangled, "../Data/wrangled_D1.rds")
saveRDS(D2_wrangled, "../Data/wrangled_D2.rds")
saveRDS(D3_wrangled, "../Data/wrangled_D3.rds")

Note Due to formatting inconsistencies between D3 and the Billboard dataset, the exact match rate between D1 and D3 is approximately 10.6%. As a result, D3 is used primarily as a supplementary source for genre classification and lyrical sentiment analysis, rather than as a fully integrated dataset. Findings derived from the D1–D3 joined subset should be interpreted with caution given the limited coverage.