04 Data Joining.qmd

Author

Fu Wei Hsu

Executive Summary

Purpose: Merge three standardized datasets (data1,2,3) to produce the final dataset for subsequent analysis.

Tasks performed in this document:

D1 + D2 exact match (primary join)
D1+D2 match evaluation (match rate, decade distribution)
D1 + D3 exact match (auxiliary join) - Output final dataset for analysis

Data Source: 03_Data_Wrangling.qmdthree RDSs

# Setup
library(tidyverse)
library(janitor)
library(gt)

# Load cleaned data from 03
D1_wrangled <- readRDS("../Data/wrangled_D1.rds")
D2_wrangled <- readRDS("../Data/wrangled_D2.rds")
D3_wrangled <- readRDS("../Data/wrangled_D3.rds")

cat("D1 (Billboard):", nrow(D1_wrangled), "rows\n")

D1 (Billboard): 330087 rows

cat("D2 (Spotify):", nrow(D2_wrangled), "rows\n")

D2 (Spotify): 41106 rows

cat("D3 (Music Dataset):", nrow(D3_wrangled), "rows\n\n")

D3 (Music Dataset): 28372 rows

Part 1: D1 + D2 Join (Primary Join)

# Prepare D2 join columns
# Keep one row per join_key in D2 (deduplication)
# Retain all audio features and target
D2_for_join <- D2_wrangled %>%
  distinct(join_key, .keep_all = TRUE)

cat("Original:", nrow(D2_wrangled), "rows, duplicate join_keys removed\n")

Original: 41106 rows, duplicate join_keys removed

# Execute Left Join
# Use left join to keep all D1 records
# Matched records will include D2 audio features
# Unmatched records will have NA for audio feature columns
D1_D2_joined <- D1_wrangled %>%
  left_join(
    D2_for_join %>% select(
      join_key,
      # audio features
      danceability, energy, key, loudness, mode,
      speechiness, acousticness, instrumentalness,
      liveness, valence, tempo, duration_ms,
      time_signature, chorus_hit, sections,
      # target label
      target,
      # decade
      decade
    ),
    by = "join_key"
  )

#D1 + D2 Join complete
cat("Total rows after join:", nrow(D1_D2_joined), "\n")

Total rows after join: 330087

Part 2: D1+D2 Match Rate Evaluation

# Overall Match Rate
matched    <- D1_D2_joined %>% filter(!is.na(target)) %>% nrow()
unmatched  <- D1_D2_joined %>% filter(is.na(target)) %>% nrow()
total      <- nrow(D1_D2_joined)

cat("Total records:    ", total, "rows\n")

Total records:     330087 rows

cat("Matched:    ", matched, "rows (",
    round(matched / total * 100, 2), "%)\n")

Matched:     264478 rows ( 80.12 %)

cat("unmatched :      ", unmatched, "rows (",
    round(unmatched / total * 100, 2), "%)\n\n")

unmatched :       65609 rows ( 19.88 %)

# Match Rate by Decade
D1_D2_joined %>%
  mutate(year = as.integer(format(date, "%Y")),
         decade_d1 = paste0(floor(year / 10) * 10, "s")) %>%
  group_by(decade_d1) %>%
  summarise(
    total     = n(),
    matched   = sum(!is.na(target)),
    match_pct = round(matched / total * 100, 2)
  ) %>%
  arrange(decade_d1) %>%
  gt() %>%
  tab_header(title = "D1+D2 Match Rate by Dacades") %>%
  fmt_number(columns = c(total, matched), decimals = 0, use_seps = TRUE)

decade_d1	total	matched	match_pct
D1+D2 Match Rate by Dacades
1950s	7,400	489	6.61
1960s	52,100	36,826	70.68
1970s	52,187	40,933	78.44
1980s	52,200	44,945	86.10
1990s	52,100	42,576	81.72
2000s	52,200	48,301	92.53
2010s	52,200	49,156	94.17
2020s	9,700	1,252	12.91

Part 3: D1-D2 Unmatched Records Sample Check

D1_D2_joined %>%
  filter(is.na(target)) %>%
  mutate(year = as.integer(format(date, "%Y"))) %>%
  distinct(artist_clean, song_clean, year) %>%
  arrange(year) %>%
  slice_head(n = 10) %>%
  gt() %>%
  tab_header(title = "D1+D2 Unmatched Records Sample")

artist_clean	song_clean	year
D1+D2 Unmatched Records Sample
the chipmunks	the chipmunk song	1958
the platters	smoke gets in your eyes	1958
the teddy bears	to know him, is to love him	1958
elvis presley	one night	1958
the everly brothers	problems	1958
the kingston trio	tom dooley	1958
ricky nelson	lonesome town	1958
the playmates	beep beep	1958
clyde mcphatter	a lover's question	1958
fats domino	whole lotta loving	1958

Part 4: D1 + D2 + D3 Join (Auxiliary Join)

D3_for_join <- D3_wrangled %>%
  distinct(join_key, .keep_all = TRUE)

cat("D3 distinct rows:", nrow(D3_for_join), "\n")

D3 distinct rows: 28338

D1_D2_D3_joined <- D1_D2_joined %>%
  left_join(
    D3_for_join %>% select(
      join_key,
      genre,
      dating, violence, `world/life`, `night/time`,
      romantic, communication, obscene, music,
      sadness, feelings, topic
    ),
    by = "join_key"
  )


# Validation 
total      <- nrow(D1_D2_D3_joined)
d2_matched <- sum(!is.na(D1_D2_D3_joined$target))
d3_matched <- sum(!is.na(D1_D2_D3_joined$genre))
both_matched <- sum(!is.na(D1_D2_D3_joined$target) & 
                    !is.na(D1_D2_D3_joined$genre))

cat("Total rows: ", total, "\n")

Total rows:  330087

cat("D2 matched (has target):  ", d2_matched,
    "(", round(d2_matched / total * 100, 2), "%)\n")

D2 matched (has target):   264478 ( 80.12 %)

cat("D3 matched (has genre):   ", d3_matched,
    "(", round(d3_matched / total * 100, 2), "%)\n")

D3 matched (has genre):    46012 ( 13.94 %)

cat("Both D2 + D3 matched:     ", both_matched,
    "(", round(both_matched / total * 100, 2), "%)\n\n")

Both D2 + D3 matched:      44543 ( 13.49 %)

Part 5: Save Final Analysis Dataset

# Save single merged file
saveRDS(D1_D2_D3_joined, "../Data/D1_D2_D3_joined.rds")

cat("Dataset size check:\n")

Dataset size check:

cat("Total rows:", nrow(D1_D2_D3_joined), "\n")

Total rows: 330087

cat("Total columns:", ncol(D1_D2_D3_joined), "\n")

Total columns: 39