04 Data Joining.qmd

Author

Fu Wei Hsu

Executive Summary

Purpose: Merge three standardized datasets (data1,2,3) to produce the final dataset for subsequent analysis.

Tasks performed in this document:

  • D1 + D2 exact match (primary join)

  • D1+D2 match evaluation (match rate, decade distribution)

  • D1 + D3 exact match (auxiliary join) - Output final dataset for analysis

Data Source: 03_Data_Wrangling.qmdthree RDSs


# Setup
library(tidyverse)
library(janitor)
library(gt)

# Load cleaned data from 03
D1_wrangled <- readRDS("../Data/wrangled_D1.rds")
D2_wrangled <- readRDS("../Data/wrangled_D2.rds")
D3_wrangled <- readRDS("../Data/wrangled_D3.rds")

cat("D1 (Billboard):", nrow(D1_wrangled), "rows\n")
D1 (Billboard): 330087 rows
cat("D2 (Spotify):", nrow(D2_wrangled), "rows\n")
D2 (Spotify): 41106 rows
cat("D3 (Music Dataset):", nrow(D3_wrangled), "rows\n\n")
D3 (Music Dataset): 28372 rows

Part 1: D1 + D2 Join (Primary Join)

# Prepare D2 join columns
# Keep one row per join_key in D2 (deduplication)
# Retain all audio features and target
D2_for_join <- D2_wrangled %>%
  distinct(join_key, .keep_all = TRUE)

cat("Original:", nrow(D2_wrangled), "rows, duplicate join_keys removed\n")
Original: 41106 rows, duplicate join_keys removed
# Execute Left Join
# Use left join to keep all D1 records
# Matched records will include D2 audio features
# Unmatched records will have NA for audio feature columns
D1_D2_joined <- D1_wrangled %>%
  left_join(
    D2_for_join %>% select(
      join_key,
      # audio features
      danceability, energy, key, loudness, mode,
      speechiness, acousticness, instrumentalness,
      liveness, valence, tempo, duration_ms,
      time_signature, chorus_hit, sections,
      # target label
      target,
      # decade
      decade
    ),
    by = "join_key"
  )

#D1 + D2 Join complete
cat("Total rows after join:", nrow(D1_D2_joined), "\n")
Total rows after join: 330087 

Part 2: D1+D2 Match Rate Evaluation

# Overall Match Rate
matched    <- D1_D2_joined %>% filter(!is.na(target)) %>% nrow()
unmatched  <- D1_D2_joined %>% filter(is.na(target)) %>% nrow()
total      <- nrow(D1_D2_joined)

cat("Total records:    ", total, "rows\n")
Total records:     330087 rows
cat("Matched:    ", matched, "rows (",
    round(matched / total * 100, 2), "%)\n")
Matched:     264478 rows ( 80.12 %)
cat("unmatched :      ", unmatched, "rows (",
    round(unmatched / total * 100, 2), "%)\n\n")
unmatched :       65609 rows ( 19.88 %)
# Match Rate by Decade
D1_D2_joined %>%
  mutate(year = as.integer(format(date, "%Y")),
         decade_d1 = paste0(floor(year / 10) * 10, "s")) %>%
  group_by(decade_d1) %>%
  summarise(
    total     = n(),
    matched   = sum(!is.na(target)),
    match_pct = round(matched / total * 100, 2)
  ) %>%
  arrange(decade_d1) %>%
  gt() %>%
  tab_header(title = "D1+D2 Match Rate by Dacades") %>%
  fmt_number(columns = c(total, matched), decimals = 0, use_seps = TRUE)
D1+D2 Match Rate by Dacades
decade_d1 total matched match_pct
1950s 7,400 489 6.61
1960s 52,100 36,826 70.68
1970s 52,187 40,933 78.44
1980s 52,200 44,945 86.10
1990s 52,100 42,576 81.72
2000s 52,200 48,301 92.53
2010s 52,200 49,156 94.17
2020s 9,700 1,252 12.91

Part 3: D1-D2 Unmatched Records Sample Check

D1_D2_joined %>%
  filter(is.na(target)) %>%
  mutate(year = as.integer(format(date, "%Y"))) %>%
  distinct(artist_clean, song_clean, year) %>%
  arrange(year) %>%
  slice_head(n = 10) %>%
  gt() %>%
  tab_header(title = "D1+D2 Unmatched Records Sample")
D1+D2 Unmatched Records Sample
artist_clean song_clean year
the chipmunks the chipmunk song 1958
the platters smoke gets in your eyes 1958
the teddy bears to know him, is to love him 1958
elvis presley one night 1958
the everly brothers problems 1958
the kingston trio tom dooley 1958
ricky nelson lonesome town 1958
the playmates beep beep 1958
clyde mcphatter a lover's question 1958
fats domino whole lotta loving 1958

Part 4: D1 + D2 + D3 Join (Auxiliary Join)

D3_for_join <- D3_wrangled %>%
  distinct(join_key, .keep_all = TRUE)

cat("D3 distinct rows:", nrow(D3_for_join), "\n")
D3 distinct rows: 28338 
D1_D2_D3_joined <- D1_D2_joined %>%
  left_join(
    D3_for_join %>% select(
      join_key,
      genre,
      dating, violence, `world/life`, `night/time`,
      romantic, communication, obscene, music,
      sadness, feelings, topic
    ),
    by = "join_key"
  )


# Validation 
total      <- nrow(D1_D2_D3_joined)
d2_matched <- sum(!is.na(D1_D2_D3_joined$target))
d3_matched <- sum(!is.na(D1_D2_D3_joined$genre))
both_matched <- sum(!is.na(D1_D2_D3_joined$target) & 
                    !is.na(D1_D2_D3_joined$genre))

cat("Total rows: ", total, "\n")
Total rows:  330087 
cat("D2 matched (has target):  ", d2_matched,
    "(", round(d2_matched / total * 100, 2), "%)\n")
D2 matched (has target):   264478 ( 80.12 %)
cat("D3 matched (has genre):   ", d3_matched,
    "(", round(d3_matched / total * 100, 2), "%)\n")
D3 matched (has genre):    46012 ( 13.94 %)
cat("Both D2 + D3 matched:     ", both_matched,
    "(", round(both_matched / total * 100, 2), "%)\n\n")
Both D2 + D3 matched:      44543 ( 13.49 %)

Part 5: Save Final Analysis Dataset

# Save single merged file
saveRDS(D1_D2_D3_joined, "../Data/D1_D2_D3_joined.rds")

cat("Dataset size check:\n")
Dataset size check:
cat("Total rows:", nrow(D1_D2_D3_joined), "\n")
Total rows: 330087 
cat("Total columns:", ncol(D1_D2_D3_joined), "\n")
Total columns: 39