03 Data Wrangling

Author

Fu Wei Hsu

Executive Summary

Purpose: Standardize three datasets(data1,2,3) to enable join matching in subsequent steps.

Tasks performed in this document:

Artist field standardization (remove collaboration markers, unify case)
Song/Track field standardization (remove version markers, unify case)
Column name unification (align three datasets) - Generate Join Key (preparation for 04_Data_Joining)

Setup

library(tidyverse)
library(janitor)
library(gt)

# Load cleaned data
D1_clean_base <- readRDS("../Data/cleaned_D1.rds")
D2_clean_base <- readRDS("../Data/cleaned_D2.rds")
D3_clean_base <- readRDS("../Data/cleaned_D3.rds")

cat("D1 (Billboard):", nrow(D1_clean_base), "rows\n")

D1 (Billboard): 330087 rows

cat("D2 (Spotify):", nrow(D2_clean_base), "rows\n")

D2 (Spotify): 41106 rows

cat("D3 (Music Dataset):", nrow(D3_clean_base), "rows\n")

D3 (Music Dataset): 28372 rows

Part 1: Dataset 1 (Billboard)

Dataset 1 Wrangling

# Artist cleaning function 
clean_d1_artist <- function(x) {
  x %>%
    str_to_lower() %>%
    str_trim() %>%
    # Remove parentheses containing featuring / feat. / duet with
    str_remove_all(regex("\\s*\\([^)]*(featuring|feat\\.|duet with)[^)]*\\)", ignore_case = TRUE)) %>%
    # Remove "featuring" and everything after it
    str_remove(regex("\\s+featuring.*$", ignore_case = TRUE)) %>%
    # Remove "feat" and everything after it
    str_remove(regex("\\s*\\(?\\s*feat\\..*$", ignore_case = TRUE)) %>%
    # Remove "ft." and everything after it
    str_remove(regex("\\s+ft\\..*$", ignore_case = TRUE)) %>%
    # Remove "with" and everything after it
    str_remove(regex("\\s+with\\s+.*$", ignore_case = TRUE)) %>%
    str_trim()
}

# Song cleaning function 
clean_d1_song <- function(x) {
  x %>%
    str_to_lower() %>%
    str_trim() %>%
    # Remove all square brackets and their contents
    str_remove_all("\\s*\\[.*?\\]") %>%
    # Remove version markers inside parentheses
    str_remove_all(regex("\\s*\\([^)]*remix[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*remaster[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*(live version|acoustic|unplugged)[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*(featuring|feat\\.|ft\\.)[^)]*\\)", ignore_case = TRUE)) %>%
    # Remove Remix marker after dash
    str_remove(regex("\\s*-\\s*.*remix.*$", ignore_case = TRUE)) %>%
    str_trim()
}

# Apply cleaning 
D1_wrangled <- D1_clean_base %>%
  mutate(
    artist_clean = clean_d1_artist(artist),
    song_clean   = clean_d1_song(song)
  )

cat("D1 cleaning data total rows:", nrow(D1_wrangled), "\n")

D1 cleaning data total rows: 330087

Dataset 1 Validation

# Artist changed records
D1_wrangled %>%
  filter(artist != artist_clean) %>%
  distinct(artist, artist_clean) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "D1 Artist Cleaning Effect (Changed Records Top 20)")

artist	artist_clean
D1 Artist Cleaning Effect (Changed Records Top 20)
Adele	adele
The Kid LAROI & Justin Bieber	the kid laroi & justin bieber
Lil Nas X & Jack Harlow	lil nas x & jack harlow
Walker Hayes	walker hayes
Ed Sheeran	ed sheeran
Drake Featuring Future & Young Thug	drake
Olivia Rodrigo	olivia rodrigo
Doja Cat	doja cat
Dua Lipa	dua lipa
Wizkid Featuring Justin Bieber & Tems	wizkid
Doja Cat Featuring SZA	doja cat
Glass Animals	glass animals
Maneskin	maneskin
Elton John & Dua Lipa	elton john & dua lipa
Doja Cat & The Weeknd	doja cat & the weeknd
The Weeknd & Ariana Grande	the weeknd & ariana grande
Jason Aldean & Carrie Underwood	jason aldean & carrie underwood
Coldplay x BTS	coldplay x bts
Nardo Wick Featuring G Herbo, Lil Durk & 21 Savage	nardo wick
Drake Featuring 21 Savage & Project Pat	drake

# Artist changed records
D1_wrangled %>%
  filter(song != song_clean) %>%
  distinct(song, song_clean) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "D1 Song Cleaning Effect (Changed Records Top 20)")

song	song_clean
D1 Song Cleaning Effect (Changed Records Top 20)
Easy On Me	easy on me
Stay	stay
Industry Baby	industry baby
Fancy Like	fancy like
Bad Habits	bad habits
Way 2 Sexy	way 2 sexy
Shivers	shivers
Good 4 U	good 4 u
Need To Know	need to know
Levitating	levitating
Essence	essence
Kiss Me More	kiss me more
Heat Waves	heat waves
Beggin'	beggin'
Cold Heart (PNAU Remix)	cold heart
You Right	you right
Save Your Tears	save your tears
If I Didn't Love You	if i didn't love you
Traitor	traitor
My Universe	my universe

artist_changed <- D1_wrangled %>%
  filter(artist != artist_clean) %>%
  nrow()

song_changed <- D1_wrangled %>%
  filter(song != song_clean) %>%
  nrow()

cat("Artist records changed:", artist_changed, "rows (",
    round(artist_changed / nrow(D1_wrangled) * 100, 2), "%)\n")

Artist records changed: 328951 rows ( 99.66 %)

cat("Song records changed:  ", song_changed, "rows (",
    round(song_changed / nrow(D1_wrangled) * 100, 2), "%)\n\n")

Song records changed:   328971 rows ( 99.66 %)

Part 2: Dataset 2 (Spotify)

Dataset 2 Wrangling

# Artist cleaning function
clean_d2_artist <- function(x) {
  x %>%
    str_to_lower() %>%
    str_trim() %>%
    # Remove parentheses containing featuring / feat. / duet with
    str_remove_all(regex("\\s*\\([^)]*(featuring|feat\\.|duet with)[^)]*\\)", ignore_case = TRUE)) %>%
    # Remove "featuring" and everything after it
    str_remove(regex("\\s+featuring.*$", ignore_case = TRUE)) %>%
    # Remove "feat." and everything after it
    str_remove(regex("\\s*\\(?\\s*feat\\..*$", ignore_case = TRUE)) %>%
    # Remove "ft." and everything after it
    str_remove(regex("\\s+ft\\..*$", ignore_case = TRUE)) %>%
    # Remove "with" and everything after it
    str_remove(regex("\\s+with\\s+.*$", ignore_case = TRUE)) %>%
    str_trim()
}

# Track cleaning function
clean_d2_track <- function(x) {
  x %>%
    str_to_lower() %>%
    str_trim() %>%
    # Remove version markers inside parentheses
    str_remove_all("\\s*\\[.*?\\]") %>%
    # Remove version markers inside parentheses
    str_remove_all(regex("\\s*\\([^)]*remix[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*remaster[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*(live version|acoustic|unplugged)[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*(featuring|feat\\.|ft\\.)[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*radio edit[^)]*\\)", ignore_case = TRUE)) %>%
    # Version markers after dash
    str_remove(regex("\\s*-\\s*.*(remaster|remastered|remasteris[eé]).*$", ignore_case = TRUE)) %>%
    str_remove(regex("\\s*-\\s*(live|version live|live version)\\s*$", ignore_case = TRUE)) %>%
    str_remove(regex("\\s*-\\s*.*remix.*$", ignore_case = TRUE)) %>%
    str_remove(regex("\\s*-\\s*radio edit.*$", ignore_case = TRUE)) %>%
    str_remove(regex("\\s*-\\s*feat\\..*$", ignore_case = TRUE)) %>%
    str_trim()
}

# Apply cleaning + unify column names
D2_wrangled <- D2_clean_base %>%
  rename(
    artist = artist,   
    song   = track     # rename track to song, aling with D1
  ) %>%
  mutate(
    artist_clean = clean_d2_artist(artist),
    song_clean   = clean_d2_track(song)
  )

cat("D2 cleaning data total rows:", nrow(D2_wrangled), "\n")

D2 cleaning data total rows: 41106

Dataset 2 Validation

#D2 Artist Cleaning Validation\n
D2_wrangled %>%
  filter(artist != artist_clean) %>%
  distinct(artist, artist_clean) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "D2 Artist Cleaning Effect (Changed Records Top 20)")

artist	artist_clean
D2 Artist Cleaning Effect (Changed Records Top 20)
Garland Green	garland green
Serge Gainsbourg	serge gainsbourg
Lord Melody	lord melody
Celia Cruz	celia cruz
P. Susheela	p. susheela
Ennio Morricone	ennio morricone
Antônio Carlos Jobim	antônio carlos jobim
Marv Johnson	marv johnson
Caetano Veloso	caetano veloso
The Beach Boys	the beach boys
Jerry Goldsmith	jerry goldsmith
Roy Orbison	roy orbison
Luiz Gonzaga	luiz gonzaga
Jimmy Dean	jimmy dean
Mary Wells	mary wells
Leño	leño
Jackie Wilson	jackie wilson
Staatskapelle Berlin	staatskapelle berlin
Shirley & Lee	shirley & lee
The Everly Brothers	the everly brothers

#D2 Song Cleaning Validation
D2_wrangled %>%
  filter(song != song_clean) %>%
  distinct(song, song_clean) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "D2 Song Cleaning Effect (Changed Records Top 20)")

song	song_clean
D2 Song Cleaning Effect (Changed Records Top 20)
Jealous Kind Of Fella	jealous kind of fella
Initials B.B.	initials b.b.
Melody Twist	melody twist
Mi Bomba Sonó	mi bomba sonó
Uravu Solla	uravu solla
Beat n. 3	beat n. 3
Samba De Uma Nota So (One Note Samba)	samba de uma nota so (one note samba)
Happy Days	happy days
Carolina - Remastered 2006	carolina
I Can Hear Music	i can hear music
The Aftermath (From "Stagecoach")	the aftermath (from "stagecoach")
Ride Away	ride away
Caboclo Nordestino	caboclo nordestino
P.T. 109	p.t. 109
Bye Bye Baby	bye bye baby
A Festa dos Seus 15 Anos	a festa dos seus 15 anos
My Empty Arms	my empty arms
Alfômega - Remastered 2006	alfômega
Parsifal: "Wehevolles Erbe, dem ich verfallen"	parsifal: "wehevolles erbe, dem ich verfallen"
Let The Good Times Roll	let the good times roll

# D2 Cleaning Summary
artist_changed <- D2_wrangled %>%
  filter(artist != artist_clean) %>%
  nrow()

song_changed <- D2_wrangled %>%
  filter(song != song_clean) %>%
  nrow()

cat("Artist records changed:", artist_changed, "rows (",
    round(artist_changed / nrow(D2_wrangled) * 100, 2), "%)\n")

Artist records changed: 40937 rows ( 99.59 %)

cat("Songs records changed:", song_changed, "rows (",
    round(song_changed / nrow(D2_wrangled) * 100, 2), "%)\n\n")

Songs records changed: 40899 rows ( 99.5 %)

Part 3: Dataset 3 (Music Dataset)

Dataset 3 Wrangling

# D3 (Music Dataset) Wrangling
# Artist cleaning function 
clean_d3_artist <- function(x) {
  x %>%
    str_trim()
}

# Song cleaning function
clean_d3_song <- function(x) {
  x %>%
    str_trim() %>%
    # Remove all square brackets and their contents
    str_remove_all("\\s*\\[.*?\\]") %>%
    # Remove version markers inside parentheses
    str_remove_all(regex("\\s*\\([^)]*remix[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*remaster[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*(feat\\.|featuring)[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*radio edit[^)]*\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\([^)]*(acoustic|unplugged)[^)]*\\)", ignore_case = TRUE)) %>%
    # For "live", only remove explicit (live) or (live version) patterns in parentheses
    # Do not touch "live" appearing in song titles (e.g., 'as long as i live')
    str_remove_all(regex("\\s*\\(live version\\)", ignore_case = TRUE)) %>%
    str_remove_all(regex("\\s*\\(live\\)", ignore_case = TRUE)) %>%
    str_trim()
}

# Apply cleaning + unify column names
D3_wrangled <- D3_clean_base %>%
  rename(
    artist = artist_name,  # rename artist_name to artist, align with D1
    song   = track_name    # rename track_name to song, align with D1
  ) %>%
  mutate(
    artist_clean = clean_d3_artist(artist),
    song_clean   = clean_d3_song(song)
  )

cat("D3 cleaning data total rows:", nrow(D3_wrangled), "\n")

D3 cleaning data total rows: 28372

Dataset 3 Validation

# D3 artist
artist_changed <- D3_wrangled %>%
  filter(artist != artist_clean) %>%
  nrow()

cat("Artist records changed:", artist_changed, "rows\n")

Artist records changed: 1 rows

D3_wrangled %>%
  filter(artist != artist_clean) %>%
  distinct(artist, artist_clean) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "3 Artist Cleaning Effect (Changed Records)")

artist	artist_clean
3 Artist Cleaning Effect (Changed Records)
babes in toyland	babes in toyland

# D3 Song Cleaning Validation
D3_wrangled %>%
  filter(song != song_clean) %>%
  distinct(song, song_clean) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "D3 Song Cleaning Effect (Changed Records Top 20)")

song	song_clean
D3 Song Cleaning Effect (Changed Records Top 20)
don't look back (feat. van morrison)	don't look back
so far away (live)	so far away
i feel the earth move (live)	i feel the earth move
he don't love you [like i love you]	he don't love you
you're the song [that i can't stop singing]	you're the song
whenever i call you "friend" (feat. stevie nicks)	whenever i call you "friend"
i just can't stop loving you (feat. siedah garrett)	i just can't stop loving you
speed demon (remix by nero)	speed demon
two occasions (live)	two occasions
gin and juice (feat. dat nigga daz)	gin and juice
ain't no fun (if the homies cant have none) (feat. nate dogg, warren g & kurupt)	ain't no fun (if the homies cant have none)
whatta man (feat. en vogue)	whatta man
ice cream (feat. ghostface killah, method man & cappadonna)	ice cream
this is for the lover in you (feat. ll cool j, howard hewett, jody watley & jeffrey daniels)	this is for the lover in you
ill vibe (feat. q-tip)	ill vibe
keep on, keepin' on (feat. xscape)	keep on, keepin' on
a thin line between love & hate (feat. shirley murdock)	a thin line between love & hate
mo money mo problems (feat. mase & puff daddy)	mo money mo problems
make it hot (feat. missy elliott & mocha)	make it hot
heartbreaker (remix) (feat. da brat & missy elliott)	heartbreaker

# D3 Cleaning Summary
artist_changed <- D3_wrangled %>%
  filter(artist != artist_clean) %>%
  nrow()

song_changed <- D3_wrangled %>%
  filter(song != song_clean) %>%
  nrow()

cat("Artist records changed: ", artist_changed, "rows (",
    round(artist_changed / nrow(D3_wrangled) * 100, 2), "%)\n")

Artist records changed:  1 rows ( 0 %)

cat("Song records changed: ", song_changed, "rows (",
    round(song_changed / nrow(D3_wrangled) * 100, 2), "%)\n\n")

Song records changed:  430 rows ( 1.52 %)

Part 4: Join Key Creation

All Data create Join Key

# Generate Join Key
D1_wrangled <- D1_wrangled %>%
  mutate(
    join_key = paste(artist_clean, song_clean, sep = "|")
  )

D2_wrangled <- D2_wrangled %>%
  mutate(
    join_key = paste(artist_clean, song_clean, sep = "|")
  )

D3_wrangled <- D3_wrangled %>%
  mutate(
    join_key = paste(artist_clean, song_clean, sep = "|")
  )

D1 Join Key

D1_wrangled %>%
  select(artist, song, artist_clean, song_clean, join_key) %>%
  distinct(join_key, .keep_all = TRUE) %>%
  slice_head(n = 5) %>%
  gt() %>%
  tab_header(title = "D1 Join Key")

artist	song	artist_clean	song_clean	join_key
D1 Join Key
Adele	Easy On Me	adele	easy on me	adele\|easy on me
The Kid LAROI & Justin Bieber	Stay	the kid laroi & justin bieber	stay	the kid laroi & justin bieber\|stay
Lil Nas X & Jack Harlow	Industry Baby	lil nas x & jack harlow	industry baby	lil nas x & jack harlow\|industry baby
Walker Hayes	Fancy Like	walker hayes	fancy like	walker hayes\|fancy like
Ed Sheeran	Bad Habits	ed sheeran	bad habits	ed sheeran\|bad habits

D2 Join Key

D2_wrangled %>%
  select(artist, song, artist_clean, song_clean, join_key) %>%
  distinct(join_key, .keep_all = TRUE) %>%
  slice_head(n = 5) %>%
  gt() %>%
  tab_header(title = "D2 Join Key")

artist	song	artist_clean	song_clean	join_key
D2 Join Key
Garland Green	Jealous Kind Of Fella	garland green	jealous kind of fella	garland green\|jealous kind of fella
Serge Gainsbourg	Initials B.B.	serge gainsbourg	initials b.b.	serge gainsbourg\|initials b.b.
Lord Melody	Melody Twist	lord melody	melody twist	lord melody\|melody twist
Celia Cruz	Mi Bomba Sonó	celia cruz	mi bomba sonó	celia cruz\|mi bomba sonó
P. Susheela	Uravu Solla	p. susheela	uravu solla	p. susheela\|uravu solla

D3 Join Key

D3_wrangled %>%
  select(artist, song, artist_clean, song_clean, join_key) %>%
  distinct(join_key, .keep_all = TRUE) %>%
  slice_head(n = 5) %>%
  gt() %>%
  tab_header(title = "D3 Join Key")

artist	song	artist_clean	song_clean	join_key
D3 Join Key
mukesh	mohabbat bhi jhoothi	mukesh	mohabbat bhi jhoothi	mukesh\|mohabbat bhi jhoothi
frankie laine	i believe	frankie laine	i believe	frankie laine\|i believe
johnnie ray	cry	johnnie ray	cry	johnnie ray\|cry
pérez prado	patricia	pérez prado	patricia	pérez prado\|patricia
giorgos papadopoulos	apopse eida oneiro	giorgos papadopoulos	apopse eida oneiro	giorgos papadopoulos\|apopse eida oneiro

Part 5: Data join key overlap predication

# Before the formal join, quickly verify key overlap across the three datasets
# D1 unique join keys
d1_keys <- D1_wrangled %>% distinct(join_key) %>% pull(join_key)
d2_keys <- D2_wrangled %>% distinct(join_key) %>% pull(join_key)
d3_keys <- D3_wrangled %>% distinct(join_key) %>% pull(join_key)

d1_d2_overlap <- sum(d1_keys %in% d2_keys)
d1_d3_overlap <- sum(d1_keys %in% d3_keys)

cat("D1 unique keys:", length(d1_keys), "\n")

D1 unique keys: 29671

cat("D2 unique keys:", length(d2_keys), "\n")

D2 unique keys: 39831

cat("D3 unique keys:", length(d3_keys), "\n\n")

D3 unique keys: 28338

cat("D1 ∩ D2 overlap:", d1_d2_overlap, "rows (",
    round(d1_d2_overlap / length(d1_keys) * 100, 2), "% of D1)\n")

D1 ∩ D2 overlap: 20123 rows ( 67.82 % of D1)

cat("D1 ∩ D3 overlap:", d1_d3_overlap, "rows (",
    round(d1_d3_overlap / length(d1_keys) * 100, 2), "% of D1)\n\n")

D1 ∩ D3 overlap: 3154 rows ( 10.63 % of D1)

Part 6 saveRDS

saveRDS(D1_wrangled, "../Data/wrangled_D1.rds")
saveRDS(D2_wrangled, "../Data/wrangled_D2.rds")
saveRDS(D3_wrangled, "../Data/wrangled_D3.rds")

Note Due to formatting inconsistencies between D3 and the Billboard dataset, the exact match rate between D1 and D3 is approximately 10.6%. As a result, D3 is used primarily as a supplementary source for genre classification and lyrical sentiment analysis, rather than as a fully integrated dataset. Findings derived from the D1–D3 joined subset should be interpreted with caution given the limited coverage.