02 Pattern Analysis

Author

Fu Wei Hsu

Executive Summary

Objective:

To conduct a systematic exploration of formatting patterns and special characters across the three datasets, ensuring a unified cleaning protocol prior to data integration.

importing 01 cleaned data

library(tidyverse)
library(janitor)
library(gt) 

# Load cleaned datasets from 01_data_exploration
D1_clean_base <- readRDS("../Data/cleaned_D1.rds")
D2_clean_base <- readRDS("../Data/cleaned_D2.rds")
D3_clean_base <- readRDS("../Data/cleaned_D3.rds")

cat("Datasets loaded successfully\n")

Datasets loaded successfully

cat("D1 (Billboard):", nrow(D1_clean_base), "records\n")

D1 (Billboard): 330087 records

cat("D2 (Spotify):", nrow(D2_clean_base), "records\n")

D2 (Spotify): 41106 records

cat("D3 (Music Dataset):", nrow(D3_clean_base), "records\n")

D3 (Music Dataset): 28372 records

Part 1: Dataset 1 (Billboard) Pattern Analysis

1.1 Artist Column Pattern Analysis

# Define patterns to analyze
d1_artist_stats <- tibble(
  Pattern = c(
    "Contains 'Featuring'",
    "Contains 'feat.' or 'ft.'",
    "Contains '&'",
    "Contains ','",
    "Contains '()' or '[]'",
    "Contains '/'",
    "Contains '\\'",
    "Contains 'x' or 'X' (Collab)",
    "Contains '+' (Collab)"
  ),
  Count = c(
    sum(str_detect(D1_clean_base$artist, regex("Featuring", ignore_case = TRUE))),
    sum(str_detect(D1_clean_base$artist, regex("feat\\.|ft\\.", ignore_case = TRUE))),
    sum(str_detect(D1_clean_base$artist, "&")),
    sum(str_detect(D1_clean_base$artist, ",")),
    sum(str_detect(D1_clean_base$artist, "[\\(\\)\\[\\]]")),
    sum(str_detect(D1_clean_base$artist, "/")),
    sum(str_detect(D1_clean_base$artist, "\\\\")),
    sum(str_detect(D1_clean_base$artist, regex("\\sx\\s", ignore_case = TRUE))),
    sum(str_detect(D1_clean_base$artist, regex("\\s\\+\\s", ignore_case = TRUE)))
  )
)

# Display table
d1_artist_stats %>%
  gt() %>%
  tab_header(
    title = "Artist Column Pattern Statistics (D1)"
  ) %>%
  cols_label(
    Pattern = "Pattern Description"
  ) %>%
  fmt_number(
    columns = Count,
    decimals = 0,
    use_seps = TRUE 
  )

Pattern Description	Count
Artist Column Pattern Statistics (D1)
Contains 'Featuring'	32,274
Contains 'feat.' or 'ft.'	441
Contains '&'	25,155
Contains ','	4,206
Contains '()' or '[]'	1,324
Contains '/'	766
Contains '\'	0
Contains 'x' or 'X' (Collab)	844
Contains '+' (Collab)	651

Representative Examples

# define a function to avoid repeating the filtering logic
show_examples <- function(data, pattern_name, regex_pattern) {
  cat(paste0("\nExample: contain ", pattern_name, "\n"))
  
  data %>%
    filter(str_detect(artist, regex(regex_pattern, ignore_case = TRUE))) %>%
    distinct(artist) %>%
    slice_head(n = 2) %>%
    pull(artist) %>%
    print()
}

show_examples(D1_clean_base, "Featuring", "featuring")


Example: contain Featuring
[1] "Drake Featuring Future & Young Thug"  
[2] "Wizkid Featuring Justin Bieber & Tems"

show_examples(D1_clean_base, "feat./ft.", "feat\\.|ft\\.")


Example: contain feat./ft.
[1] "Ariana Grande Feat. Doja Cat & Megan Thee Stallion"                                     
[2] "Lil Wayne, Wiz Khalifa & Imagine Dragons With Logic & Ty Dolla $ign Feat. X Ambassadors"

show_examples(D1_clean_base, "Ampersand (&)", "&")


Example: contain Ampersand (&)
[1] "The Kid LAROI & Justin Bieber" "Lil Nas X & Jack Harlow"

show_examples(D1_clean_base, "Comma (,)", ",")


Example: contain Comma (,)
[1] "Nardo Wick Featuring G Herbo, Lil Durk & 21 Savage"
[2] "Yung Bleu, Chris Brown & 2 Chainz"

show_examples(D1_clean_base, "Brackets () []", "[\\(\\)\\[\\]]")


Example: contain Brackets () []
[1] "Silk Sonic (Bruno Mars & Anderson .Paak)"
[2] "F.L.Y. (Fast Life Yungstaz)"

show_examples(D1_clean_base, "Forward Slash (/)", "/")


Example: contain Forward Slash (/)
[1] "John & Yoko/The Plastic Ono Band With The Harlem Community Choir"
[2] "Zayn / Taylor Swift"

show_examples(D1_clean_base, "Backslash (\\)", "\\\\")


Example: contain Backslash (\)
character(0)

show_examples(D1_clean_base, "x or X (Collab)", "\\sx\\s")


Example: contain x or X (Collab)
[1] "Lil Nas X & Jack Harlow" "Coldplay x BTS"

show_examples(D1_clean_base, "Plus (+)", "\\s\\+\\s")


Example: contain Plus (+)
[1] "Dan + Shay"               "Chris Young + Kane Brown"

Deep Analysis

Analysis 1: Usage Context of ‘&’ Symbol

ampersand_data <- D1_clean_base %>% 
  filter(str_detect(artist, "&")) %>% 
  distinct(artist) %>% 
  mutate(
    # Check for collaboration markers
    has_collab_marker = str_detect(artist, 
      regex("featuring|feat\\.|ft\\.|with", ignore_case = TRUE)),
    
    # Count occurrences of '&'
    ampersand_count = str_count(artist, "&"),
    
    # Categorize
    type = case_when(
      has_collab_marker ~ "Has collab marker (Safe to clean)",
      ampersand_count >= 2 ~ "Multiple '&' (Complex collab)",
      TRUE ~ "Simple '&' (Likely duo or group)"
    )
  )

cat("Total artists containing '&':", nrow(ampersand_data), "\n\n")

Total artists containing '&': 1485

# Summary
type_summary <- ampersand_data %>% 
  count(type) %>% 
  mutate(Percentage = round(n / nrow(ampersand_data) * 100, 1))

type_summary %>% 
  gt() %>% 
  tab_header(title = "Analysis of '&' Symbol Usage Types") %>%
  fmt_number(columns = n, decimals = 0, use_seps = TRUE)

type	n	Percentage
Analysis of '&' Symbol Usage Types
Has collab marker (Safe to clean)	612	41.2
Multiple '&' (Complex collab)	7	0.5
Simple '&' (Likely duo or group)	866	58.3

Analysis 1: Representative Examples

# Define a helper function to display samples
display_samples <- function(data, type_name, title, strategy) {
  cat("\nCategory:", title, "\n")
  cat("Cleaning Strategy:", strategy, "\n\n")
  
  data %>% 
    filter(type == type_name) %>% 
    slice_head(n = 2) %>% 
    pull(artist) %>% 
    print()
}

# Display samples based on categories
display_samples(ampersand_data, 
                "Has collab marker (Safe to clean)", 
                "Collab marker detected (featuring/with/feat)", 
                "Remove all content following the collab marker.")


Category: Collab marker detected (featuring/with/feat) 
Cleaning Strategy: Remove all content following the collab marker. 

[1] "Drake Featuring Future & Young Thug"  
[2] "Wizkid Featuring Justin Bieber & Tems"

display_samples(ampersand_data, 
                "Simple '&' (Likely duo or group)", 
                "Simple '&' (No collab marker)", 
                "Conservative approach; retain the primary artist before '&'.")


Category: Simple '&' (No collab marker) 
Cleaning Strategy: Conservative approach; retain the primary artist before '&'. 

[1] "The Kid LAROI & Justin Bieber" "Lil Nas X & Jack Harlow"

display_samples(ampersand_data, 
                "Multiple '&' (Complex collab)", 
                "Multiple '&' (Complex cases)", 
                "Similar to Category 1; remove content after the collab marker.")


Category: Multiple '&' (Complex cases) 
Cleaning Strategy: Similar to Category 1; remove content after the collab marker. 

[1] "Bad Bunny, Jowell & Randy & Nengo Flow"
[2] "Luke Combs & Brooks & Dunn"

Analysis 1: Cleaning Strategy

cat("\n========================================\n")


========================================

cat("  Cleaning Strategy for '&' Symbol\n")

  Cleaning Strategy for '&' Symbol

cat("========================================\n\n")

========================================

cat("   Conservative Cleaning\n")

   Conservative Cleaning

cat("   - Remove all content after 'featuring', 'feat.', 'ft.', or 'with'.\n")

   - Remove all content after 'featuring', 'feat.', 'ft.', or 'with'.

cat("   - Preserve '&' (do not remove).\n\n")

   - Preserve '&' (do not remove).

cat("   Examples:\n")

   Examples:

cat("   'Drake Featuring Future & Young Thug' -> 'drake'\n")

   'Drake Featuring Future & Young Thug' -> 'drake'

cat("   'The Kid LAROI & Justin Bieber'       -> 'the kid laroi & justin bieber' (Retained)\n")

   'The Kid LAROI & Justin Bieber'       -> 'the kid laroi & justin bieber' (Retained)

Analysis 2: Parentheses Content Analysis

# Prepare the data
paren_data <- D1_clean_base %>%
  filter(str_detect(artist, "\\(.*\\)")) %>%
  mutate(
    inside_paren = str_extract(artist, "\\(([^)]+)\\)"),
    inside_paren_clean = str_remove_all(inside_paren, "[\\(\\)]"),
    # Create categories to enable the display_samples function
    type = if_else(str_detect(inside_paren_clean, regex("feat|duet|featuring", ignore_case = TRUE)),
                   "Collab-related (Safe to remove)",
                   "Group Info/Metadata (Preserve)")
  )

cat("Total artists containing parentheses:", nrow(paren_data), "\n\n")

Total artists containing parentheses: 1271

# Summary table of content inside parentheses
paren_data %>%
  count(inside_paren_clean, sort = TRUE) %>%
  slice_head(n = 10) %>%
  gt() %>%
  tab_header(title = "Distribution of Content inside Parentheses") %>%
  fmt_number(columns = n, decimals = 0, use_seps = TRUE)

inside_paren_clean	n
Distribution of Content inside Parentheses
Featuring Aaron Neville	47
Bruno Mars & Anderson .Paak	42
He's the King	42
Not Was	39
Of Major Coinz	37
Question Mark	33
Featuring Dr. Dre	31
Featuring Susan Jacks	30
Featuring Mase	28
Featuring Carolyn Willis	26

Analysis 2: Representative Examples

display_samples(paren_data, 
                "Collab-related (Safe to remove)", 
                "Collab-related (featuring/duet)", 
                "Remove parentheses and content.")


Category: Collab-related (featuring/duet) 
Cleaning Strategy: Remove parentheses and content. 

[1] "R. Kelly Or Bow Wow (Featuring T.I. & T-Pain)"
[2] "R. Kelly Or Bow Wow (Featuring T.I. & T-Pain)"

display_samples(paren_data, 
                "Group Info/Metadata (Preserve)", 
                "Group Names/Metadata", 
                "Retain full string (preserve for context).")


Category: Group Names/Metadata 
Cleaning Strategy: Retain full string (preserve for context). 

[1] "Silk Sonic (Bruno Mars & Anderson .Paak)"
[2] "Silk Sonic (Bruno Mars & Anderson .Paak)"

Analysis 2: Cleaning Strategy

cat("\n========================================\n")


========================================

cat("Cleaning Strategy for Parentheses\n")

Cleaning Strategy for Parentheses

cat("========================================\n\n")

========================================

cat("   Sective Removal\n")

   Sective Removal

cat("   - Remove parentheses containing 'featuring', 'feat.', or 'duet with'.\n")

   - Remove parentheses containing 'featuring', 'feat.', or 'duet with'.

cat("   - Preserve other parentheses (e.g., band members, full group names).\n\n")

   - Preserve other parentheses (e.g., band members, full group names).

cat("   Examples:\n")

   Examples:

cat("   'Artist (Featuring Guest)'              -> 'artist' (Removed)\n")

   'Artist (Featuring Guest)'              -> 'artist' (Removed)

cat("   'Artist (Duet With Guest)'              -> 'artist' (Removed)\n")

   'Artist (Duet With Guest)'              -> 'artist' (Removed)

cat("   'Silk Sonic (Bruno Mars & Anderson .Paak)' -> 'Silk Sonic (Bruno Mars & Anderson .Paak)' (Preserved)\n")

   'Silk Sonic (Bruno Mars & Anderson .Paak)' -> 'Silk Sonic (Bruno Mars & Anderson .Paak)' (Preserved)

cat("   'F.L.Y. (Fast Life Yungstaz)'           -> 'F.L.Y. (Fast Life Yungstaz)' (Preserved)\n")

   'F.L.Y. (Fast Life Yungstaz)'           -> 'F.L.Y. (Fast Life Yungstaz)' (Preserved)

Analysis 3: Comma Usage Patterns

comma_data <- D1_clean_base %>% 
  filter(str_detect(artist, ",")) %>% 
  distinct(artist) %>% 
  mutate(
    # Check for collaboration markers
    has_collab_marker = str_detect(artist, 
      regex("featuring|feat\\.|ft\\.|with", ignore_case = TRUE)),
    
    # Count commas
    comma_count = str_count(artist, ","),
    
    # Categorize
    type = case_when(
      has_collab_marker ~ "Collab marker (Automated processing)",
      comma_count >= 2 ~ "Multiple commas (Complex cases)",
      TRUE ~ "Simple comma (Group/Part of name)"
    )
  )

cat("Total artists containing commas:", nrow(comma_data), "\n\n")

Total artists containing commas: 290

# Summary table
comma_data %>% 
  count(type) %>% 
  mutate(Percentage = round(n / sum(n) * 100, 2)) %>% 
  gt() %>% 
  tab_header(title = "Usage Context Statistics for Commas") %>%
  fmt_number(columns = n, decimals = 0, use_seps = TRUE)

type	n	Percentage
Usage Context Statistics for Commas
Collab marker (Automated processing)	148	51.03
Multiple commas (Complex cases)	34	11.72
Simple comma (Group/Part of name)	108	37.24

Analysis 3: Representative Examples

# Using the helper function defined in previous steps
display_samples(comma_data, 
                "Collab marker (Automated processing)", 
                "Collab Marker Detected", 
                "Remove collab marker and following content; comma will be handled automatically.")


Category: Collab Marker Detected 
Cleaning Strategy: Remove collab marker and following content; comma will be handled automatically. 

[1] "Nardo Wick Featuring G Herbo, Lil Durk & 21 Savage"                     
[2] "Tyler, The Creator Featuring YoungBoy Never Broke Again & Ty Dolla $ign"

display_samples(comma_data, 
                "Simple comma (Group/Part of name)", 
                "Simple Comma (No collab marker)", 
                "Preserve fully; likely multiple artists or part of the name.")


Category: Simple Comma (No collab marker) 
Cleaning Strategy: Preserve fully; likely multiple artists or part of the name. 

[1] "Yung Bleu, Chris Brown & 2 Chainz"   "Lil Tjay, Fivio Foreign & Kay Flock"

display_samples(comma_data, 
                "Multiple commas (Complex cases)", 
                "Multiple Commas (Complex)", 
                "Similar to collab cases; remove following content if collab marker exists.")


Category: Multiple Commas (Complex) 
Cleaning Strategy: Similar to collab cases; remove following content if collab marker exists. 

[1] "Skylar Grey, Polo G, Mozzy & Eminem"  
[2] "J Balvin, Dua Lipa, Bad Bunny & Tainy"

Analysis 3: Cleaning Strategy

# Check for ", The" pattern (common in names like "Tyler, The Creator")
name_comma <- D1_clean_base %>% 
  filter(str_detect(artist, ", The")) %>% 
  distinct(artist) %>% 
  slice_head(n = 10)

cat("Artists containing ', The' (likely part of the name):\n")

Artists containing ', The' (likely part of the name):

name_comma %>% 
  pull(artist) %>% 
  print()

 [1] "Tyler, The Creator Featuring YoungBoy Never Broke Again & Ty Dolla $ign"
 [2] "Belly, The Weeknd & Young Thug"                                         
 [3] "Tyler, The Creator Featuring Lil Uzi Vert & Pharrell Williams"          
 [4] "Tyler, The Creator Featuring 42 Dugg"                                   
 [5] "Tyler, The Creator"                                                     
 [6] "Tyler, The Creator Featuring Lil Wayne"                                 
 [7] "Tyler, The Creator Featuring DJ Drama"                                  
 [8] "Tyler, The Creator Featuring Brent Faiyaz & Fana Hues"                  
 [9] "Tyler, The Creator Featuring Teezo Touchdown"                           
[10] "Tyler, The Creator Featuring Domo Genesis"

cat("These commas are part of the artist's name is not be removed\n")

These commas are part of the artist's name is not be removed

1.2 Song Column Pattern Analysis

d1_song_stats <- tibble(
  Pattern = c(
    "Contains Parentheses ()",
    "Contains Brackets []",
    "Contains ' - ' (dash)",
    "Contains 'feat.'",
    "Contains 'Remaster'",
    "Contains 'Live'",
    "Contains 'Remix'",
    "Contains 'Radio Edit'",
    "Contains 'feat' inside ()",
    "Contains '+' (Part of song title)",
    "Contains 'Pt.' or 'Part'",
    "Contains 'Take'"
  ),
  Count = c(
    sum(str_detect(D1_clean_base$song, "\\(.*\\)")),
    sum(str_detect(D1_clean_base$song, "\\[.*\\]")),
    sum(str_detect(D1_clean_base$song, " - ")),
    sum(str_detect(D1_clean_base$song, regex("feat\\.", ignore_case = TRUE))),
    sum(str_detect(D1_clean_base$song, regex("Remaster", ignore_case = TRUE))),
    sum(str_detect(D1_clean_base$song, regex("Live", ignore_case = TRUE))),
    sum(str_detect(D1_clean_base$song, regex("Remix", ignore_case = TRUE))),
    sum(str_detect(D1_clean_base$song, regex("Radio Edit", ignore_case = TRUE))),
    sum(str_detect(D1_clean_base$song, regex("\\(.*feat", ignore_case = TRUE))),
    sum(str_detect(D1_clean_base$song, regex("\\s\\+\\s", ignore_case = TRUE))),
    sum(str_detect(D1_clean_base$song, regex("Pt\\.|Part", ignore_case = TRUE))),
    sum(str_detect(D1_clean_base$song, regex("Take", ignore_case = TRUE)))
  )
) %>% 
  mutate(Percentage = round(Count / nrow(D1_clean_base) * 100, 1))

# Display table
d1_song_stats %>% 
  gt() %>% 
  tab_header(title = "D1 Song Column Pattern Statistics") %>% 
  fmt_number(columns = Count, decimals = 0, use_seps = TRUE)

Pattern	Count	Percentage
D1 Song Column Pattern Statistics
Contains Parentheses ()	22,882	6.9
Contains Brackets []	112	0.0
Contains ' - ' (dash)	872	0.3
Contains 'feat.'	0	0.0
Contains 'Remaster'	0	0.0
Contains 'Live'	1,770	0.5
Contains 'Remix'	94	0.0
Contains 'Radio Edit'	0	0.0
Contains 'feat' inside ()	6	0.0
Contains '+' (Part of song title)	102	0.0
Contains 'Pt.' or 'Part'	4,019	1.2
Contains 'Take'	3,193	1.0

Pattern Examples

show_song_examples <- function(data, pattern_regex, label) {
  cat("\n=== Example:", label, "===\n")
  data %>% 
    filter(str_detect(song, regex(pattern_regex, ignore_case = TRUE))) %>% 
    distinct(song) %>% 
    slice_head(n = 3) %>% 
    pull(song) %>% 
    print()
}

# Run the function for each pattern
show_song_examples(D1_clean_base, "\\(.*\\)", "Contains Parentheses ()")


=== Example: Contains Parentheses () ===
[1] "Cold Heart (PNAU Remix)"        "Montero (Call Me By Your Name)"
[3] "Love Nwantiti (Ah Ah Ah)"

show_song_examples(D1_clean_base, "\\[.*\\]", "Contains Brackets []")


=== Example: Contains Brackets [] ===
[1] "We Are One (Ole Ola) [The 2014 FIFA World Cup Official Song]"
[2] "Turn It Up [Remix]/Fire It Up"                               
[3] "Everybody [Backstreet's Back]"

show_song_examples(D1_clean_base, " - ", "Contains ' - ' (dash)")


=== Example: Contains ' - ' (dash) ===
[1] "Savage Love (Laxed - Siren Beat)"   "Never Leave You - Uh Ooh, Uh Oooh!"
[3] "Roxanne `97 - Puff Daddy Remix"

show_song_examples(D1_clean_base, "feat\\.", "Contains 'feat.'")


=== Example: Contains 'feat.' ===
character(0)

show_song_examples(D1_clean_base, "Remaster", "Contains 'Remaster'")


=== Example: Contains 'Remaster' ===
character(0)

show_song_examples(D1_clean_base, "Live", "Contains 'Live'")


=== Example: Contains 'Live' ===
[1] "You Only Live Twice"  "Keep My Spirit Alive" "Long Live"

show_song_examples(D1_clean_base, "Remix", "Contains 'Remix'")


=== Example: Contains 'Remix' ===
[1] "Cold Heart (PNAU Remix)"     "No New Friends (SFTB Remix)"
[3] "Karate Chop (Remix)"

show_song_examples(D1_clean_base, "Radio Edit", "Contains 'Radio Edit'")


=== Example: Contains 'Radio Edit' ===
character(0)

show_song_examples(D1_clean_base, "\\(.*feat", "Contains 'feat' inside ()")


=== Example: Contains 'feat' inside () ===
[1] "Black Lassie (Featuring Johnny Stash)"

show_song_examples(D1_clean_base, "\\s\\+\\s", "Contains '+' (Part of title)")


=== Example: Contains '+' (Part of title) ===
[1] "Pink + White" "19 You + Me"  "U + Ur Hand"

show_song_examples(D1_clean_base, "Pt\\.|Part", "Contains 'Pt.' or 'Part'")


=== Example: Contains 'Pt.' or 'Part' ===
[1] "No More Parties" "Party Lyfe"      "Go Part 1"

show_song_examples(D1_clean_base, "Take", "Contains 'Take'")


=== Example: Contains 'Take' ===
[1] "Take My Breath"       "I Can't Take It Back" "Beautiful Mistakes"

Deep Analysis

Analysis 1: Parentheses Content Analysis

# Extract and categorize content inside parentheses
paren_data <- D1_clean_base %>%
  filter(str_detect(song, "\\(.*\\)")) %>%
  distinct(song) %>%
  mutate(
    inside_paren = str_extract(song, "\\(([^)]+)\\)"),
    inside_clean = str_remove_all(inside_paren, "[\\(\\)]"),
    
    # Categorization
    type = case_when(
      str_detect(inside_clean, regex("remix", ignore_case = TRUE)) ~ "Remix Version",
      str_detect(inside_clean, regex("^live$|live version|acoustic|unplugged", ignore_case = TRUE)) ~ "Performance Version",
      str_detect(inside_clean, regex("radio edit|album version|single version", ignore_case = TRUE)) ~ "Release Version Tag",
      str_detect(inside_clean, regex("remaster", ignore_case = TRUE)) ~ "Remastered Version",
      str_detect(inside_clean, regex("feat\\.|featuring", ignore_case = TRUE)) ~ "Featuring Info",
      TRUE ~ "Subtitle or Title Part (Keep)"
    )
  )

cat("Total songs containing parentheses:", nrow(paren_data), "\n\n")

Total songs containing parentheses: 1995

# Statistics table
paren_data %>%
  count(type) %>%
  arrange(desc(n)) %>%
  mutate(Percentage = round(n / sum(n) * 100, 1)) %>%
  gt() %>%
  tab_header(title = "Parentheses Content Classification") %>%
  fmt_number(columns = n, decimals = 0, use_seps = TRUE)

type	n	Percentage
Parentheses Content Classification
Subtitle or Title Part (Keep)	1,983	99.4
Performance Version	6	0.3
Remix Version	5	0.3
Featuring Info	1	0.1

Analysis 1: Representative Examples

# Define a cleaner display function
display_samples <- function(data, category_name) {
  cat("\n=== Category:", category_name, "===\n")
  data %>%
    filter(type == category_name) %>%
    slice_head(n = 8) %>%
    pull(song) %>%
    {cat(paste0("  ", seq_along(.), ". '", ., "'\n"), sep = "")}
}

# Run display for each category
display_samples(paren_data, "Remix Version")


=== Category: Remix Version ===
  1. 'Cold Heart (PNAU Remix)'
  2. 'No New Friends (SFTB Remix)'
  3. 'Karate Chop (Remix)'
  4. 'Outta Control (Remix)'
  5. 'Sympathy For The Devil (Remixes)'

display_samples(paren_data, "Performance Version")


=== Category: Performance Version ===
  1. 'As Long As You Love Me (Acoustic)'
  2. 'The Prayer (Live)'
  3. 'Freak On A Leash (Unplugged)'
  4. 'I'll Be Home For Christmas (Live)'
  5. 'I Will Remember You (Live)'
  6. 'Rock And Roll All Nite (live)'

display_samples(paren_data, "Release Version Tag")


=== Category: Release Version Tag ===
  . ''

display_samples(paren_data, "Remastered Version")


=== Category: Remastered Version ===
  . ''

display_samples(paren_data, "Featuring Info")


=== Category: Featuring Info ===
  1. 'Black Lassie (Featuring Johnny Stash)'

display_samples(paren_data, "Subtitle or Title Part (Keep)")


=== Category: Subtitle or Title Part (Keep) ===
  1. 'Montero (Call Me By Your Name)'
  2. 'Love Nwantiti (Ah Ah Ah)'
  3. 'Get Into It (Yuh)'
  4. 'Ya Superame (En Vivo Desde Culiacan, Sinaloa)'
  5. 'Intro (Hate On Me)'
  6. 'Drunk (And I Don't Wanna Go Home)'
  7. 'Outside (100 MPH)'
  8. 'Me (FWM)'

Top 20 Parentheses Content

paren_data %>%
  count(inside_clean, sort = TRUE) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "Top 20 Most Frequent Parentheses Content") %>%
  fmt_number(columns = n, decimals = 0, use_seps = TRUE)

inside_clean	n
Top 20 Most Frequent Parentheses Content
Part 1	56
Part I	24
Taylor's Version	9
Part One	6
Theme From	6
Part 2	5
Part II	5
Pt. I	5
Theme from	5
From "Beverly Hills Cop II"	4
From "Boomerang"	4
From "Cocktail"	4
From "Space Jam"	4
From "Waiting To Exhale"	4
Interlude	4
Parts I & II	4
Pt. 1	4
Without You	4
Forever	3
Forget About Me	3

Analysis 2: Dash Content Analysis

# Extract and categorize content after the dash
dash_data <- D1_clean_base %>%
  filter(str_detect(song, " - ")) %>%
  distinct(song) %>%
  mutate(
    # Extract content after the dash
    after_dash = str_extract(song, " - (.*)$"),
    after_dash_clean = str_remove(after_dash, "^ - "),
    
    # Categorization
    type = case_when(
      str_detect(after_dash_clean, regex("remix", ignore_case = TRUE)) ~ "Remix Version",
      str_detect(after_dash_clean, regex("remaster", ignore_case = TRUE)) ~ "Remastered Version",
      str_detect(after_dash_clean, regex("live|acoustic", ignore_case = TRUE)) ~ "Performance Version",
      str_detect(after_dash_clean, regex("radio edit|album version|single", ignore_case = TRUE)) ~ "Release Version Tag",
      TRUE ~ "Subtitle or Title Part (Keep)"
    )
  )

cat("Total songs containing dashes:", nrow(dash_data), "\n\n")

Total songs containing dashes: 96

# Statistics table
dash_data %>%
  count(type) %>%
  arrange(desc(n)) %>%
  mutate(Percentage = round(n / sum(n) * 100, 1)) %>%
  gt() %>%
  tab_header(title = "Dash Content Classification") %>%
  fmt_number(columns = n, decimals = 0, use_seps = TRUE)

type	n	Percentage
Dash Content Classification
Subtitle or Title Part (Keep)	95	99
Remix Version	1	1

Analysis 2: Representative Examples

# display_samples(dash_data, "Category Name")

display_samples(dash_data, "Remix Version")


=== Category: Remix Version ===
  1. 'Roxanne `97 - Puff Daddy Remix'

display_samples(dash_data, "Remastered Version")


=== Category: Remastered Version ===
  . ''

display_samples(dash_data, "Performance Version")


=== Category: Performance Version ===
  . ''

display_samples(dash_data, "Release Version Tag")


=== Category: Release Version Tag ===
  . ''

display_samples(dash_data, "Subtitle or Title Part (Keep)")


=== Category: Subtitle or Title Part (Keep) ===
  1. 'Savage Love (Laxed - Siren Beat)'
  2. 'Never Leave You - Uh Ooh, Uh Oooh!'
  3. 'Undone - The Sweater Song'
  4. 'Don't Look Down - The Sequel'
  5. 'Dance Wit' Me - Part 1'
  6. 'Chariots Of Fire - Titles'
  7. 'Tell Me Tomorrow - Part I'
  8. '(Not Just) Knee Deep - Part 1'

Top 20 Dash Content

dash_data %>%
  count(after_dash_clean, sort = TRUE) %>%
  slice_head(n = 20) %>%
  gt() %>%
  tab_header(title = "Top 20 Most Frequent Dash Content") %>%
  fmt_number(columns = n, decimals = 0, use_seps = TRUE)

after_dash_clean	n
Top 20 Most Frequent Dash Content
Part I	21
Part 1	20
Pt. 1	5
Pt. I	5
"Muhammad Ali"	1
(Give Me Peace On Earth)	1
1965	1
A Want You	1
Boogie Woogie Flu	1
Bossa Nova	1
Breaker	1
By The Beatles	1
City Man	1
Don't Be A Hero	1
Get A Ticket	1
Half Heartache	1
Hey Boy	1
I Quit - I'm Movin' On	1
I'm Black And I'm Proud (Part 1)	1
I'm Coming, Coming, I'm Coming	1

Analysis 3: Bracket Content Analysis

# Extract content inside brackets
bracket_data <- D1_clean_base %>%
  filter(str_detect(song, "\\[.*\\]")) %>%
  distinct(song) %>%
  mutate(
    inside_bracket = str_extract(song, "\\[([^]]+)\\]"),
    inside_clean = str_remove_all(inside_bracket, "[\\[\\]]")
  )

cat("Total songs containing brackets:", nrow(bracket_data), "\n\n")

Total songs containing brackets: 7

# Statistics table
bracket_data %>%
  count(inside_clean, sort = TRUE) %>%
  slice_head(n = 10) %>%
  gt() %>%
  tab_header(title = "Frequency of Content Inside Brackets") %>%
  fmt_number(columns = n, decimals = 0, use_seps = TRUE)

inside_clean	n
Frequency of Content Inside Brackets
Backstreet's Back	1
Cherish You	1
Flash The Message	1
Freak	1
Remix	1
The 2014 FIFA World Cup Official Song	1
Uptown Baby	1

Analysis 3: Representative Examples

bracket_data %>%
  slice_head(n = 10) %>%
  pull(song) %>%
  {cat(paste0("  ", seq_along(.), ". '", ., "'\n"), sep = "")}

  1. 'We Are One (Ole Ola) [The 2014 FIFA World Cup Official Song]'
  2. 'Turn It Up [Remix]/Fire It Up'
  3. 'Everybody [Backstreet's Back]'
  4. 'Ninety Nine [Flash The Message]'
  5. 'I Do [Cherish You]'
  6. 'Deja Vu [Uptown Baby]'
  7. '[Freak] And U Know It'

False Positive Checks

# Check 'Live'
## 'Live' as a lyric word
live_in_lyrics <- D1_clean_base %>%
  filter(str_detect(song, regex("\\blive\\b", ignore_case = TRUE))) %>%
  filter(!str_detect(song, regex("\\(.*live.*\\)| - .*live", ignore_case = TRUE))) %>%
  distinct(song) %>%
  slice_head(n = 10)

## 'Live' as part of the song title (Not a Live version)
live_in_lyrics %>%
  pull(song) %>%
  {cat(paste0("  ", seq_along(.), ". '", ., "'\n"), sep = "")}

  1. 'You Only Live Twice'
  2. 'Long Live'
  3. 'Scared To Live'
  4. 'Live Off My Closet'
  5. 'I Don't Wanna Live Forever (Fifty Shades Darker)'
  6. 'Live From The Gutter'
  7. 'Live For The Night'
  8. 'Live It Up'
  9. 'Long Live A$AP'
  10. 'Live While We're Young'

cat("Using regex('live') would result in false positives!\n")

Using regex('live') would result in false positives!

# Check 'Part'
cat("\n─── Check 2: 'Part' as a lyric word ───\n\n")


─── Check 2: 'Part' as a lyric word ───

party_words <- D1_clean_base %>%
  filter(str_detect(song, regex("party|part", ignore_case = TRUE))) %>%
  filter(!str_detect(song, regex("part\\s+\\d|pt\\.\\s*\\d", ignore_case = TRUE))) %>%
  distinct(song) %>%
  slice_head(n = 10)

## 'Party/Part' as part of the song title
party_words %>%
  pull(song) %>%
  {cat(paste0("  ", seq_along(.), ". '", ., "'\n"), sep = "")}

  1. 'No More Parties'
  2. 'Party Lyfe'
  3. 'Party Up (Up In Here)'
  4. 'Party Girl'
  5. 'After Party'
  6. 'Six Feet Apart'
  7. 'Best Part Of Me'
  8. 'When The Party's Over'
  9. 'Best Part'
  10. 'Welcome To The Party'

# Check 'Take'
## 'Take' as a lyric word
take_words <- D1_clean_base %>%
  filter(str_detect(song, regex("\\btake\\b", ignore_case = TRUE))) %>%
  filter(!str_detect(song, regex("take\\s+\\d", ignore_case = TRUE))) %>%
  distinct(song) %>%
  slice_head(n = 10)

## 'Take' as part of the song title
take_words %>%
  pull(song) %>%
  {cat(paste0("  ", seq_along(.), ". '", ., "'\n"), sep = "")}

  1. 'Take My Breath'
  2. 'I Can't Take It Back'
  3. 'Take You Dancing'
  4. 'Take Me Home For Christmas'
  5. 'Take What You Want'
  6. 'The Take'
  7. 'Take It From Me'
  8. 'Take One'
  9. 'Take Back Home Girl'
  10. 'Can't Take A Joke'

Cleaning Strategy

cat("\n========================================\n")


========================================

cat("D1 Song Column Cleaning Strategy\n")

D1 Song Column Cleaning Strategy

cat("========================================\n\n")

========================================

cat("Extreme Conservative Cleaning - Remove only explicit version tags.\n\n")

Extreme Conservative Cleaning - Remove only explicit version tags.

cat("Data-Driven Rationale:\n")

Data-Driven Rationale:

cat("  - Parentheses (): 99.40% are part of the song title -> Preserve most.\n")

  - Parentheses (): 99.40% are part of the song title -> Preserve most.

cat("  - Dash -:        98.96% are part of the song title -> Preserve most.\n")

  - Dash -:        98.96% are part of the song title -> Preserve most.

cat("  - Brackets []:  100% are descriptive metadata    -> Remove all.\n\n")

  - Brackets []:  100% are descriptive metadata    -> Remove all.

cat("Cleaning Objectives:\n")

Cleaning Objectives:

cat("  1. Remove explicit version tags (Remix, Remaster, Live Version, etc.).\n")

  1. Remove explicit version tags (Remix, Remaster, Live Version, etc.).

cat("  2. Remove descriptive brackets [].\n")

  2. Remove descriptive brackets [].

cat("  3. Preserve song subtitles and full titles.\n")

  3. Preserve song subtitles and full titles.

cat("  4. Avoid removing common words in lyrics (Live, Part, Take).\n\n")

  4. Avoid removing common words in lyrics (Live, Part, Take).

cat("Exclusions (I Do NOT clean):\n")

Exclusions (I Do NOT clean):

cat("  - Standard parentheses (likely part of the title).\n")

  - Standard parentheses (likely part of the title).

cat("  - Standard dashes (likely subtitles).\n")

  - Standard dashes (likely subtitles).

cat("  - Words like 'live', 'part', 'take' (usually part of the song lyrics).\n")

  - Words like 'live', 'part', 'take' (usually part of the song lyrics).

Part 2: Dataset 2 (Spotify) Pattern Analysis

2.1 Artist Column Pattern Analysis

d2_artist_stats <- tibble(
  Pattern = c(
    "Contains 'Featuring'",
    "Contains 'feat.' or 'ft.'",
    "Contains '&'",
    "Contains comma ','",
    "Contains brackets () or []",
    "Contains forward slash /",
    "Contains backslash \\",
    "Contains 'x' or 'X' (Collab marker)",
    "Contains '+' (Collab marker)"
  ),
  Count = c(
    sum(str_detect(D2_clean_base$artist, regex("Featuring", ignore_case = TRUE))),
    sum(str_detect(D2_clean_base$artist, regex("feat\\.|ft\\.", ignore_case = TRUE))),
    sum(str_detect(D2_clean_base$artist, "&")),
    sum(str_detect(D2_clean_base$artist, ",")),
    sum(str_detect(D2_clean_base$artist, "[\\(\\)\\[\\]]")),
    sum(str_detect(D2_clean_base$artist, "/")),
    sum(str_detect(D2_clean_base$artist, "\\\\")),
    sum(str_detect(D2_clean_base$artist, regex("\\sx\\s", ignore_case = TRUE))),
    sum(str_detect(D2_clean_base$artist, regex("\\s\\+\\s", ignore_case = TRUE)))
  )
) %>% 
  mutate(
    Percentage = round(Count / nrow(D2_clean_base) * 100, 1)
  )

d2_artist_stats %>% 
  gt() %>% 
  tab_header(title = "D2 Artist Field Pattern Statistics") %>% 
  fmt_number(columns = Count, decimals = 0, use_seps = TRUE)

Pattern	Count	Percentage
D2 Artist Field Pattern Statistics
Contains 'Featuring'	1,658	4.0
Contains 'feat.' or 'ft.'	10	0.0
Contains '&'	1,786	4.3
Contains comma ','	255	0.6
Contains brackets () or []	63	0.2
Contains forward slash /	40	0.1
Contains backslash \	0	0.0
Contains 'x' or 'X' (Collab marker)	40	0.1
Contains '+' (Collab marker)	24	0.1

Representative Examples

display_samples <- function(data, pattern_regex, label) {
  cat("\n=== Example:", label, "===\n")
  data %>% 
    filter(str_detect(artist, regex(pattern_regex, ignore_case = TRUE))) %>% 
    distinct(artist) %>% 
    slice_head(n = 2) %>% 
    pull(artist) %>% 
    print()
}

# Run the function for each pattern
display_samples(D2_clean_base, "Featuring", "Contains 'Featuring'")


=== Example: Contains 'Featuring' ===
[1] "The Partridge Family Starring Shirley Jones Featuring David Cassidy"
[2] "Elvin Bishop Featuring Mickey Thomas"

display_samples(D2_clean_base, "feat\\.|ft\\.", "Contains 'feat.' or 'ft.'")


=== Example: Contains 'feat.' or 'ft.' ===
[1] "Quincy Jones (feat. The Brothers Johnson)"                
[2] "Master P Feat. Fiend, Silkk The Shocker, Mia X & Mystikal"

display_samples(D2_clean_base, "&", "Contains '&'")


=== Example: Contains '&' ===
[1] "Shirley & Lee"     "Ike & Tina Turner"

display_samples(D2_clean_base, ",", "Contains comma ','")


=== Example: Contains comma ',' ===
[1] "Blood, Sweat & Tears" "John Wesley Ryles, I"

display_samples(D2_clean_base, "[\\(\\)\\[\\]]", "Contains brackets () or []")


=== Example: Contains brackets () or [] ===
[1] "Ernie (Jim Henson)" "Sylvia (r&b)"

display_samples(D2_clean_base, "/", "Contains forward slash /")


=== Example: Contains forward slash / ===
[1] "Glen Campbell/Anne Murray"     "Cheryl Barnes/Hair Soundtrack"

display_samples(D2_clean_base, "\\\\", "Contains backslash \\")


=== Example: Contains backslash \ ===
character(0)

display_samples(D2_clean_base, "\\sx\\s", "Contains 'x' or 'X'")


=== Example: Contains 'x' or 'X' ===
[1] "Gina X Performance"                                       
[2] "Master P Feat. Fiend, Silkk The Shocker, Mia X & Mystikal"

display_samples(D2_clean_base, "\\s\\+\\s", "Contains '+'")


=== Example: Contains '+' ===
[1] "2Pac + Outlawz"      "Jay-Z + Alicia Keys"

Cleaning Strategy

cat("\n========================================\n")


========================================

cat("D2 Artist Field Cleaning Strategy\n")

D2 Artist Field Cleaning Strategy

cat("========================================\n\n")

========================================

cat("Core Principle: Conservative cleaning - Only remove explicit collaboration markers.\n\n")

Core Principle: Conservative cleaning - Only remove explicit collaboration markers.

cat("Data-Driven Rationale:\n")

Data-Driven Rationale:

cat("  - Featuring: 4.03%  -> Remove it and all following content.\n")

  - Featuring: 4.03%  -> Remove it and all following content.

cat("  - feat./ft.: 0.02%  -> Remove it and all following content.\n")

  - feat./ft.: 0.02%  -> Remove it and all following content.

cat("  - &:         4.34%  -> Preserve (part of the artist name).\n")

  - &:         4.34%  -> Preserve (part of the artist name).

cat("  - Comma:     0.62%  -> Preserve (let featuring logic naturally handle it).\n")

  - Comma:     0.62%  -> Preserve (let featuring logic naturally handle it).

cat("  - Brackets:  0.15%  -> Preserve (often artist aliases or metadata).\n")

  - Brackets:  0.15%  -> Preserve (often artist aliases or metadata).

cat("  - / + x:     <0.2%  -> Preserve (part of the artist name).\n\n")

  - / + x:     <0.2%  -> Preserve (part of the artist name).

cat("Cleaning Objectives:\n")

Cleaning Objectives:

cat("  1. Remove 'Featuring' and all following content.\n")

  1. Remove 'Featuring' and all following content.

cat("  2. Remove 'feat.' / 'ft.' and all following content.\n")

  2. Remove 'feat.' / 'ft.' and all following content.

cat("  3. Preserve '&', '/', '+', 'x' (these are intrinsic to the artist names).\n\n")

  3. Preserve '&', '/', '+', 'x' (these are intrinsic to the artist names).

cat("Exclusions (I Do Not Clean):\n")

Exclusions (I Do Not Clean):

cat("  - '&' symbol (may indicate a collaborative group name).\n")

  - '&' symbol (may indicate a collaborative group name).

cat("  - Brackets (may contain artist aliases, e.g., 'Ernie (Jim Henson)').\n")

  - Brackets (may contain artist aliases, e.g., 'Ernie (Jim Henson)').

cat("  - Commas (may be part of a band name, e.g., 'Blood, Sweat & Tears').\n")

  - Commas (may be part of a band name, e.g., 'Blood, Sweat & Tears').

2.2 Track Column Pattern Analysis

cat("\n========================================\n")


========================================

cat("D2 (Spotify) - Track Field Analysis\n")

D2 (Spotify) - Track Field Analysis

cat("========================================\n\n")

========================================

d2_track_stats <- tibble(
  Pattern = c(
    "Contains Parentheses ()",
    "Contains Brackets []",
    "Contains ' - ' (dash)",
    "Contains 'feat.'",
    "Contains 'Remaster'",
    "Contains 'Live'",
    "Contains 'Remix'",
    "Contains 'Radio Edit'",
    "Contains 'feat' inside ()",
    "Contains 'Pt.' or 'Part'",
    "Contains 'Take'"
  ),
  Count = c(
    sum(str_detect(D2_clean_base$track, "\\(.*\\)")),
    sum(str_detect(D2_clean_base$track, "\\[.*\\]")),
    sum(str_detect(D2_clean_base$track, " - ")),
    sum(str_detect(D2_clean_base$track, regex("feat\\.", ignore_case = TRUE))),
    sum(str_detect(D2_clean_base$track, regex("Remaster", ignore_case = TRUE))),
    sum(str_detect(D2_clean_base$track, regex("Live", ignore_case = TRUE))),
    sum(str_detect(D2_clean_base$track, regex("Remix", ignore_case = TRUE))),
    sum(str_detect(D2_clean_base$track, regex("Radio Edit", ignore_case = TRUE))),
    sum(str_detect(D2_clean_base$track, regex("\\(.*feat", ignore_case = TRUE))),
    sum(str_detect(D2_clean_base$track, regex("Pt\\.|Part", ignore_case = TRUE))),
    sum(str_detect(D2_clean_base$track, regex("Take", ignore_case = TRUE)))
  )
) %>% 
  mutate(Percentage = round(Count / nrow(D2_clean_base) * 100, 1))

d2_track_stats %>% 
  gt() %>% 
  tab_header(title = "D2 Track Field Pattern Statistics") %>% 
  fmt_number(columns = Count, decimals = 0, use_seps = TRUE)

Pattern	Count	Percentage
D2 Track Field Pattern Statistics
Contains Parentheses ()	2,831	6.9
Contains Brackets []	92	0.2
Contains ' - ' (dash)	2,857	7.0
Contains 'feat.'	148	0.4
Contains 'Remaster'	865	2.1
Contains 'Live'	604	1.5
Contains 'Remix'	159	0.4
Contains 'Radio Edit'	40	0.1
Contains 'feat' inside ()	142	0.3
Contains 'Pt.' or 'Part'	357	0.9
Contains 'Take'	341	0.8

Representative Examples

# Reusing the helper function for clean output
show_track_examples <- function(data, pattern_regex, label) {
  cat("\n=== Example:", label, "===\n")
  data %>% 
    filter(str_detect(track, regex(pattern_regex, ignore_case = TRUE))) %>% 
    distinct(track) %>% 
    slice_head(n = 5) %>% 
    pull(track) %>% 
    print()
}

show_track_examples(D2_clean_base, "\\(.*\\)", "Contains Parentheses ()")


=== Example: Contains Parentheses () ===
[1] "Samba De Uma Nota So (One Note Samba)"                             
[2] "The Aftermath (From \"Stagecoach\")"                               
[3] "Montague Terrace (In Blue)"                                        
[4] "Madama Butterfly, Act II: Or vienmi ad adornar (Suzuki, Butterfly)"
[5] "Cien Kilos de Barro (100 Pounds of Clay)"

show_track_examples(D2_clean_base, "\\[.*\\]", "Contains Brackets []")


=== Example: Contains Brackets [] ===
[1] "Rigoletto [Acts II & III]: V'ho ingannato....Lassù in cielo (Gilda, Rigoletto)"                     
[2] "Rigoletto [Acts II & III]: Un dì, se ben rammentomi (Duke, Gilda, Maddalena, Rigoletto)"            
[3] "Rigoletto [Act I]: Pari siamo! (Rigoletto)"                                                         
[4] "Rigoletto [Acts II & III]: Compiuto pur quanto a fare mi resta (Rigoletto, Gilda, Usher, Monterone)"
[5] "Mi Corazón Canta (All of a Sudden) [My Heart Sings]"

show_track_examples(D2_clean_base, " - ", "Contains ' - ' (dash)")


=== Example: Contains ' - ' (dash) ===
[1] "Carolina - Remastered 2006"                 
[2] "Alfômega - Remastered 2006"                 
[3] "Hip City - Pt. 2"                           
[4] "Black Verse - Part 2"                       
[5] "Comment te dire adieu - Remasterisé en 2016"

show_track_examples(D2_clean_base, "feat\\.", "Contains 'feat.'")


=== Example: Contains 'feat.' ===
[1] "In Love Wit Chu (feat. Cherish) - Radio Edit"                
[2] "Lonesome River (feat. Ricky Skaggs & Keith Whitley)"         
[3] "Le massage des doigts - feat. Anne Germain"                  
[4] "Rock Bottom (feat. Ricky Skaggs & Keith Whitley)"            
[5] "Canción de Cuna para Despertar un Hijo (feat. Marilina Ross)"

show_track_examples(D2_clean_base, "Remaster", "Contains 'Remaster'")


=== Example: Contains 'Remaster' ===
[1] "Carolina - Remastered 2006"                                                                                           
[2] "Alfômega - Remastered 2006"                                                                                           
[3] "Comment te dire adieu - Remasterisé en 2016"                                                                          
[4] "Madama Butterfly, Act I: Quanto cielo!....Ancora un passo or via (Coro/Butterfly/Sharpless) - 1986 Remastered Version"
[5] "We Will Fall (John Cale Mix) - 2019 Remaster"

show_track_examples(D2_clean_base, "Live", "Contains 'Live'")


=== Example: Contains 'Live' ===
[1] "In My Father's House - Live"                                             
[2] "Our Man Flint: Man Does Not Live By Bread Alone - From \"Our Man Flint\""
[3] "Yambu - Live"                                                            
[4] "Western générique - Version Live"                                        
[5] "Sailor Man - Live"

show_track_examples(D2_clean_base, "Remix", "Contains 'Remix'")


=== Example: Contains 'Remix' ===
[1] "Stranger (Remix)"            "Metal Generation - 97 Remix"
[3] "Run From Love - Remix"       "Heatwave - Remix"           
[5] "A Better Way - Club Remix"

show_track_examples(D2_clean_base, "Radio Edit", "Contains 'Radio Edit'")


=== Example: Contains 'Radio Edit' ===
[1] "In Love Wit Chu (feat. Cherish) - Radio Edit"           
[2] "Eternity - Radio Edit"                                  
[3] "Freedom (feat. Freedom) - Radio Edit"                   
[4] "Transylvanian Concubine (The Manson Mix (Radio Edit))**"
[5] "Feeling Good - No Apologize Radio Edit"

show_track_examples(D2_clean_base, "\\(.*feat", "Contains 'feat' inside ()")


=== Example: Contains 'feat' inside () ===
[1] "In Love Wit Chu (feat. Cherish) - Radio Edit"                       
[2] "Lonesome River (feat. Ricky Skaggs & Keith Whitley)"                
[3] "Rock Bottom (feat. Ricky Skaggs & Keith Whitley)"                   
[4] "Canción de Cuna para Despertar un Hijo (feat. Marilina Ross)"       
[5] "Shouting on the Hills of Glory (feat. Ricky Skaggs & Keith Whitley)"

show_track_examples(D2_clean_base, "Pt\\.|Part", "Contains 'Pt.' or 'Part'")


=== Example: Contains 'Pt.' or 'Part' ===
[1] "Hip City - Pt. 2"                 "Ya Parte el Galgo Terrible"      
[3] "Black Verse - Part 2"             "Départ de Renaud"                
[5] "Yuri Approaches Lara's Apartment"

show_track_examples(D2_clean_base, "Take", "Contains 'Take'")


=== Example: Contains 'Take' ===
[1] "Take The \"A\" Train"                          
[2] "Trust Us - Take 9"                             
[3] "All I Have To Do Is Take A Bite Of Your Apple?"
[4] "Electrotherapy from The Caretakers"            
[5] "Party In The Ward from The Caretakers"

Cleaning Strategy

cat("\n========================================\n")


========================================

cat("D2 Track Field Cleaning Strategy\n")

D2 Track Field Cleaning Strategy

cat("========================================\n\n")

========================================

cat("Core Principle: Active Dash Cleaning - D2 dash usage differs significantly from D1.\n\n")

Core Principle: Active Dash Cleaning - D2 dash usage differs significantly from D1.

cat("Data-Driven Rationale:\n")

Data-Driven Rationale:

cat("  - Parentheses ():   6.89%  -> Selective removal (target version tags only).\n")

  - Parentheses ():   6.89%  -> Selective removal (target version tags only).

cat("  - Brackets []:      0.22%  -> Remove all.\n")

  - Brackets []:      0.22%  -> Remove all.

cat("  - Dash -:           6.95%  -> Active cleaning (often indicates version tags).\n")

  - Dash -:           6.95%  -> Active cleaning (often indicates version tags).

cat("  - Remaster:         2.10%  -> Remove (often found after dash).\n")

  - Remaster:         2.10%  -> Remove (often found after dash).

cat("  - Live:             1.47%  -> Remove (often found after dash).\n")

  - Live:             1.47%  -> Remove (often found after dash).

cat("  - Remix:            0.39%  -> Remove.\n")

  - Remix:            0.39%  -> Remove.

cat("  - Radio Edit:       0.10%  -> Remove.\n")

  - Radio Edit:       0.10%  -> Remove.

cat("  - feat.:            0.36%  -> Remove (in parentheses or after dash).\n\n")

  - feat.:            0.36%  -> Remove (in parentheses or after dash).

cat("Cleaning Objectives:\n")

Cleaning Objectives:

cat("  1. Remove all brackets and their content.\n")

  1. Remove all brackets and their content.

cat("  2. Remove version tags inside parentheses (remix, remaster, live, feat., etc.).\n")

  2. Remove version tags inside parentheses (remix, remaster, live, feat., etc.).

cat("  3. Remove version tags following a dash (Remaster, Live, Remix, Radio Edit, etc.).\n\n")

  3. Remove version tags following a dash (Remaster, Live, Remix, Radio Edit, etc.).

cat("Exclusions (I Do Not Clean):\n")

Exclusions (I Do Not Clean):

cat("  - Standard parentheses (likely part of the title, e.g., 'Samba De Uma Nota So (One Note Samba)').\n")

  - Standard parentheses (likely part of the title, e.g., 'Samba De Uma Nota So (One Note Samba)').

cat("  - 'Part' or 'Pt.' following a dash (likely song identification, e.g., 'Hip City - Pt. 2').\n\n")

  - 'Part' or 'Pt.' following a dash (likely song identification, e.g., 'Hip City - Pt. 2').

cat("Note on D2 vs. D1 Dash Differences:\n")

Note on D2 vs. D1 Dash Differences:

cat("  - D1 Dash: 98.5% are subtitles -> Conservative approach (remove Remix only).\n")

  - D1 Dash: 98.5% are subtitles -> Conservative approach (remove Remix only).

cat("  - D2 Dash: Frequent version tagging -> Active approach (remove Remaster, Live, Remix, etc.).\n")

  - D2 Dash: Frequent version tagging -> Active approach (remove Remaster, Live, Remix, etc.).

Part 3: Dataset 3 (Music Dataset) Pattern Analysis

3.1 Artist Column Pattern Analysis

cat("\n========================================\n")


========================================

cat("D3 (Music Dataset) - Column Field Analysis\n")

D3 (Music Dataset) - Column Field Analysis

cat("========================================\n\n")

========================================

d3_artist_stats <- tibble(
  Pattern = c(
    "Contains 'Featuring'",
    "Contains 'feat.' or 'ft.'",
    "Contains '&'",
    "Contains comma ','",
    "Contains brackets () or []",
    "Contains forward slash /",
    "Contains backslash \\",
    "Contains 'x' or 'X' (Collab marker)",
    "Contains '+' (Collab marker)"
  ),
  Count = c(
    sum(str_detect(D3_clean_base$artist_name, regex("Featuring", ignore_case = TRUE))),
    sum(str_detect(D3_clean_base$artist_name, regex("feat\\.|ft\\.", ignore_case = TRUE))),
    sum(str_detect(D3_clean_base$artist_name, "&")),
    sum(str_detect(D3_clean_base$artist_name, ",")),
    sum(str_detect(D3_clean_base$artist_name, "[\\(\\)\\[\\]]")),
    sum(str_detect(D3_clean_base$artist_name, "/")),
    sum(str_detect(D3_clean_base$artist_name, "\\\\")),
    sum(str_detect(D3_clean_base$artist_name, regex("\\sx\\s", ignore_case = TRUE))),
    sum(str_detect(D3_clean_base$artist_name, regex("\\s\\+\\s", ignore_case = TRUE)))
  )
) %>% 
  mutate(
    Percentage = round(Count / nrow(D3_clean_base) * 100, 2)
  )

d3_artist_stats %>% 
  gt() %>% 
  tab_header(
    title = "D3 Artist Field Pattern Statistics"
  ) %>% 
  fmt_number(
    columns = Count,
    decimals = 0,
    use_seps = TRUE
  )

Pattern	Count	Percentage
D3 Artist Field Pattern Statistics
Contains 'Featuring'	0	0.00
Contains 'feat.' or 'ft.'	2	0.01
Contains '&'	844	2.97
Contains comma ','	224	0.79
Contains brackets () or []	3	0.01
Contains forward slash /	45	0.16
Contains backslash \	0	0.00
Contains 'x' or 'X' (Collab marker)	9	0.03
Contains '+' (Collab marker)	16	0.06

Representative Examples

# Reusing the display_samples helper function
# Make sure to use 'artist_name' column for D3
display_d3_samples <- function(data, pattern_regex, label) {
  cat("\n=== Example:", label, "===\n")
  data %>% 
    filter(str_detect(artist_name, regex(pattern_regex, ignore_case = TRUE))) %>% 
    distinct(artist_name) %>% 
    slice_head(n = 2) %>% 
    pull(artist_name) %>% 
    print()
}

display_d3_samples(D3_clean_base, "Featuring", "Contains 'Featuring'")


=== Example: Contains 'Featuring' ===
character(0)

display_d3_samples(D3_clean_base, "feat\\.|ft\\.", "Contains 'feat.' or 'ft.'")


=== Example: Contains 'feat.' or 'ft.' ===
[1] "10 ft. ganja plant"

display_d3_samples(D3_clean_base, "&", "Contains '&'")


=== Example: Contains '&' ===
[1] "bill haley & his comets"     "cliff richard & the shadows"

display_d3_samples(D3_clean_base, ",", "Contains comma ','")


=== Example: Contains comma ',' ===
[1] "blood, sweat & tears" "peter, paul and mary"

display_d3_samples(D3_clean_base, "[\\(\\)\\[\\]]", "Contains brackets () or []")


=== Example: Contains brackets () or [] ===
[1] "sunn 0)))"  "(hed) p.e."

display_d3_samples(D3_clean_base, "/", "Contains forward slash /")


=== Example: Contains forward slash / ===
[1] "au/ra"               "yusuf / cat stevens"

display_d3_samples(D3_clean_base, "\\\\", "Contains backslash \\")


=== Example: Contains backslash \ ===
character(0)

display_d3_samples(D3_clean_base, "\\sx\\s", "Contains 'x' or 'X'")


=== Example: Contains 'x' or 'X' ===
[1] "terror x crew"

display_d3_samples(D3_clean_base, "\\s\\+\\s", "Contains '+'")


=== Example: Contains '+' ===
[1] "florence + the machine" "dan + shay"

Cleaning Strategy

cat("\n========================================\n")


========================================

cat("D3 Artist Field Cleaning Strategy\n")

D3 Artist Field Cleaning Strategy

cat("========================================\n\n")

========================================

cat("Core Principle: Almost no cleaning required.\n\n")

Core Principle: Almost no cleaning required.

cat("Data-Driven Rationale:\n")

Data-Driven Rationale:

cat("  - Featuring: 0.00%  -> Does not exist.\n")

  - Featuring: 0.00%  -> Does not exist.

cat("  - feat./ft.: 0.01%  -> These are part of the artist names, not collaboration markers.\n")

  - feat./ft.: 0.01%  -> These are part of the artist names, not collaboration markers.

cat("  - &:         2.97%  -> Preserve (part of the artist name).\n")

  - &:         2.97%  -> Preserve (part of the artist name).

cat("  - Comma:     0.79%  -> Preserve (part of band names).\n")

  - Comma:     0.79%  -> Preserve (part of band names).

cat("  - Brackets:  0.01%  -> Preserve (part of band names).\n")

  - Brackets:  0.01%  -> Preserve (part of band names).

cat("  - / + x:     <0.2%  -> Preserve (part of artist names).\n\n")

  - / + x:     <0.2%  -> Preserve (part of artist names).

cat("Cleaning Objectives:\n")

Cleaning Objectives:

cat("  1. Only execute str_trim() to remove leading/trailing whitespace.\n")

  1. Only execute str_trim() to remove leading/trailing whitespace.

cat("  2. Do not remove any content.\n\n")

  2. Do not remove any content.

3.2 Track Column Pattern Analysis

cat("\n========================================\n")


========================================

cat("D3 (Music Dataset) - Track Coulmn Analysis\n")

D3 (Music Dataset) - Track Coulmn Analysis

cat("========================================\n\n")

========================================

d3_track_stats <- tibble(
  Pattern = c(
    "Contains Parentheses ()",
    "Contains Brackets []",
    "Contains ' - ' (dash)",
    "Contains 'feat.'",
    "Contains 'Remaster'",
    "Contains 'Live'",
    "Contains 'Remix'",
    "Contains 'Radio Edit'",
    "Contains 'feat' inside ()",
    "Contains '+' (Part of title)",
    "Contains 'Pt.' or 'Part'",
    "Contains 'Take'"
  ),
  Count = c(
    sum(str_detect(D3_clean_base$track_name, "\\(.*\\)")),
    sum(str_detect(D3_clean_base$track_name, "\\[.*\\]")),
    sum(str_detect(D3_clean_base$track_name, " - ")),
    sum(str_detect(D3_clean_base$track_name, regex("feat\\.", ignore_case = TRUE))),
    sum(str_detect(D3_clean_base$track_name, regex("Remaster", ignore_case = TRUE))),
    sum(str_detect(D3_clean_base$track_name, regex("Live", ignore_case = TRUE))),
    sum(str_detect(D3_clean_base$track_name, regex("Remix", ignore_case = TRUE))),
    sum(str_detect(D3_clean_base$track_name, regex("Radio Edit", ignore_case = TRUE))),
    sum(str_detect(D3_clean_base$track_name, regex("\\(.*feat", ignore_case = TRUE))),
    sum(str_detect(D3_clean_base$track_name, regex("\\s\\+\\s", ignore_case = TRUE))),
    sum(str_detect(D3_clean_base$track_name, regex("Pt\\.|Part", ignore_case = TRUE))),
    sum(str_detect(D3_clean_base$track_name, regex("Take", ignore_case = TRUE)))
  )
) %>% 
  mutate(Percentage = round(Count / nrow(D3_clean_base) * 100, 2))

d3_track_stats %>% 
  gt() %>% 
  tab_header(title = "D3 Track Field Pattern Statistics") %>% 
  fmt_number(columns = Count, decimals = 0, use_seps = TRUE)

Pattern	Count	Percentage
D3 Track Field Pattern Statistics
Contains Parentheses ()	1,480	5.22
Contains Brackets []	24	0.08
Contains ' - ' (dash)	0	0.00
Contains 'feat.'	357	1.26
Contains 'Remaster'	24	0.08
Contains 'Live'	234	0.82
Contains 'Remix'	28	0.10
Contains 'Radio Edit'	5	0.02
Contains 'feat' inside ()	351	1.24
Contains '+' (Part of title)	2	0.01
Contains 'Pt.' or 'Part'	188	0.66
Contains 'Take'	231	0.81

Representative Examples

show_d3_track_examples <- function(data, pattern_regex, label) {
  cat("\n=== Example:", label, "===\n")
  data %>% 
    filter(str_detect(track_name, regex(pattern_regex, ignore_case = TRUE))) %>% 
    distinct(track_name) %>% 
    slice_head(n = 5) %>% 
    pull(track_name) %>% 
    print()
}

show_d3_track_examples(D3_clean_base, "\\(.*\\)", "Contains Parentheses ()")


=== Example: Contains Parentheses () ===
[1] "round and round (with mitchell ayres and his orchestra & the ray charles singers)"
[2] "chupke chupke (from ''paying guest'')"                                            
[3] "viljen i veta (would you like to know?)"                                          
[4] "just my imagination (running away with me)"                                       
[5] "volare (nel blu dipinto di blu)"

show_d3_track_examples(D3_clean_base, "\\[.*\\]", "Contains Brackets []")


=== Example: Contains Brackets [] ===
[1] "he don't love you [like i love you]"                                                                         
[2] "you're the song [that i can't stop singing]"                                                                 
[3] "waka waka (this time for africa) [the official 2010 fifa world cup (tm) song]"                               
[4] "mic drop (steve aoki remix) [full length edition]"                                                           
[5] "dar um jeito (we will find a way) [the official 2014 fifa world cup anthem] (feat. avicii & alexandre pires)"

show_d3_track_examples(D3_clean_base, " - ", "Contains ' - ' (dash)")


=== Example: Contains ' - ' (dash) ===
character(0)

show_d3_track_examples(D3_clean_base, "feat\\.", "Contains 'feat.'")


=== Example: Contains 'feat.' ===
[1] "don't look back (feat. van morrison)"                                            
[2] "whenever i call you \"friend\" (feat. stevie nicks)"                             
[3] "i just can't stop loving you (feat. siedah garrett)"                             
[4] "gin and juice (feat. dat nigga daz)"                                             
[5] "ain't no fun (if the homies cant have none) (feat. nate dogg, warren g & kurupt)"

show_d3_track_examples(D3_clean_base, "Remaster", "Contains 'Remaster'")


=== Example: Contains 'Remaster' ===
[1] "i was there when it happened (2017 remaster)"                                      
[2] "cry! cry! cry! (alternate version, extended take 1) (2017 remaster)"               
[3] "angel of the morning (as heard in charlie wilson's war) (re-recorded / remastered)"
[4] "queen of hearts (re-recorded / remastered)"                                        
[5] "stand up (remastered)"

show_d3_track_examples(D3_clean_base, "Live", "Contains 'Live'")


=== Example: Contains 'Live' ===
[1] "as long as i live"        "i'm alive"               
[3] "nothing was delivered"    "i live one day at a time"
[5] "so far away (live)"

show_d3_track_examples(D3_clean_base, "Remix", "Contains 'Remix'")


=== Example: Contains 'Remix' ===
[1] "speed demon (remix by nero)"                                                                                       
[2] "heartbreaker (remix) (feat. da brat & missy elliott)"                                                              
[3] "dance (a$$) remix"                                                                                                 
[4] "mic drop (steve aoki remix) [full length edition]"                                                                 
[5] "mud digger mega remix (feat. lenny cooper, colt ford, moonshine bandits, demun jones, moccasin creek & cap bailey)"

show_d3_track_examples(D3_clean_base, "Radio Edit", "Contains 'Radio Edit'")


=== Example: Contains 'Radio Edit' ===
[1] "alabama blues (todd edwards vocal radio edit mix)" 
[2] "heart shaped box (radio edit) [feat. julia church]"
[3] "upside down (radio edit)"                          
[4] "tipsy (j skillz remix) [radio edit]"               
[5] "tipsy 09 (radio edit)"

show_d3_track_examples(D3_clean_base, "\\(.*feat", "Contains 'feat' inside ()")


=== Example: Contains 'feat' inside () ===
[1] "don't look back (feat. van morrison)"                                            
[2] "whenever i call you \"friend\" (feat. stevie nicks)"                             
[3] "i just can't stop loving you (feat. siedah garrett)"                             
[4] "gin and juice (feat. dat nigga daz)"                                             
[5] "ain't no fun (if the homies cant have none) (feat. nate dogg, warren g & kurupt)"

show_d3_track_examples(D3_clean_base, "\\s\\+\\s", "Contains '+' (Part of title)")


=== Example: Contains '+' (Part of title) ===
[1] "good + bad times" "19 you + me"

show_d3_track_examples(D3_clean_base, "Pt\\.|Part", "Contains 'Pt.' or 'Part'")


=== Example: Contains 'Pt.' or 'Part' ===
[1] "it's all a part of love"             "changing partners"                  
[3] "having a party"                      "garden party"                       
[5] "everything put together falls apart"

show_d3_track_examples(D3_clean_base, "Take", "Contains 'Take'")


=== Example: Contains 'Take' ===
[1] "take the “a” train"                "take your time"                   
[3] "i'll take you home again kathleen" "won't you take me back"           
[5] "take good care of her"

Cleaning Strategy

cat("\n========================================\n")


========================================

cat("D3 Track Field Cleaning Strategy\n")

D3 Track Field Cleaning Strategy

cat("========================================\n\n")

========================================

cat("Core Principle: Only clean version tags inside parentheses; no dash handling required.\n\n")

Core Principle: Only clean version tags inside parentheses; no dash handling required.

cat("Data-Driven Rationale:\n")

Data-Driven Rationale:

cat("  - Parentheses ():   5.22%  -> Selective removal (target version tags only).\n")

  - Parentheses ():   5.22%  -> Selective removal (target version tags only).

cat("  - Brackets []:      0.08%  -> Remove all.\n")

  - Brackets []:      0.08%  -> Remove all.

cat("  - Dash -:           0.00%  -> Non-existent; no action needed.\n")

  - Dash -:           0.00%  -> Non-existent; no action needed.

cat("  - feat.:            1.26%  -> Remove (almost entirely within parentheses).\n")

  - feat.:            1.26%  -> Remove (almost entirely within parentheses).

cat("  - Remaster:         0.08%  -> Remove (all within parentheses).\n")

  - Remaster:         0.08%  -> Remove (all within parentheses).

cat("  - Live:             0.82%  -> Caution required (large number are lyric verbs).\n")

  - Live:             0.82%  -> Caution required (large number are lyric verbs).

cat("  - Remix:            0.10%  -> Remove.\n")

  - Remix:            0.10%  -> Remove.

cat("  - Radio Edit:       0.02%  -> Remove.\n\n")

  - Radio Edit:       0.02%  -> Remove.

cat("Cleaning Objectives:\n")

Cleaning Objectives:

cat("  1. Remove all brackets and their content.\n")

  1. Remove all brackets and their content.

cat("  2. Remove version tags inside parentheses (remix, remaster, feat., etc.).\n")

  2. Remove version tags inside parentheses (remix, remaster, feat., etc.).

cat("  3. For 'Live', only remove explicit (live) formats inside parentheses.\n\n")

  3. For 'Live', only remove explicit (live) formats inside parentheses.

cat("Exclusions (I Do Not Clean):\n")

Exclusions (I Do Not Clean):

cat("  - Standard parentheses (likely part of the title).\n")

  - Standard parentheses (likely part of the title).

cat("  - 'live' used in lyrics ('as long as i live', 'i'm alive').\n")

  - 'live' used in lyrics ('as long as i live', 'i'm alive').

cat("  - '+' symbol ('good + bad times' is the title itself).\n\n")

  - '+' symbol ('good + bad times' is the title itself).