# Week 4 Mini Project: Exploring Spotify Data with data.table
# Performance and Visualization
#
# Author: DSA_406_001_SP25_wk4_msrodrig

#| label: setup
#| include: false

# Install and load required packages for the mini-project
required_packages <- c(
  "data.table", "dplyr", "ggplot2", "microbenchmark", "readr"
)

# Install only the packages that are not already available, so re-running
# the script does not reinstall packages that are present.
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Attach every package; invisible() hides the list that lapply() returns.
invisible(lapply(required_packages, library, character.only = TRUE))

#| label: load-data
#| warning: false
#| message: false

# Load data via data.table
spotify_dt <- fread("week4/data/spotify_clean.csv")

# Also load via readr (we'll use it for performance comparisons)
spotify_tbl <- read_csv("week4/data/spotify_clean.csv")

# Convert read_csv data frame to tibble
# (just ensures consistent type for dplyr)
spotify_tbl <- as_tibble(spotify_tbl)

# Quick check: both loaders should report the same dimensions
dim(spotify_dt)
dim(spotify_tbl)

# Number of rows (.N is data.table's row-count symbol)
spotify_dt[, .N]

# Column names/types
# sapply() is kept for interactive inspection: class() may return more than
# one value for a column, so a fixed-width vapply() template would not fit.
sapply(spotify_dt, class)

# Peek at the first few rows
spotify_dt[1:5]

# Equivalent filter using dplyr: Rock tracks with popularity above 80
popular_rock_tbl <- spotify_tbl %>%
  filter(popularity > 80, main_genre == "Rock")

# Number of tracks that matched the filter
nrow(popular_rock_tbl)

# Calculate mean popularity, mean danceability, and track_count by main_genre
genre_stats_dt <- spotify_dt[
  ,
  .(
    avg_popularity = mean(popularity, na.rm = TRUE),
    avg_danceability = mean(danceability, na.rm = TRUE),
    track_count = .N
  ),
  by = main_genre
]

# Print the per-genre summary
genre_stats_dt

# Equivalent grouping/summarizing using dplyr
genre_stats_tbl <- spotify_tbl %>%
  group_by(main_genre) %>%
  summarize(
    avg_popularity = mean(popularity, na.rm = TRUE),
    avg_danceability = mean(danceability, na.rm = TRUE),
    track_count = n()
  ) %>%
  ungroup()

# Print the per-genre summary
genre_stats_tbl

# Make a small data.table of categories for main_genre.
# One placeholder label ("Cat1", "Cat2", ...) is generated per distinct
# genre, so the lookup table always has exactly one row per genre instead
# of relying on a fixed list of eight labels.
genre_info_dt <- data.table(main_genre = unique(spotify_dt$main_genre))
genre_info_dt[, category := paste0("Cat", seq_len(.N))]

# data.table merge: all.x = TRUE keeps every row of genre_stats_dt
# (left join) and attaches the category from genre_info_dt.
merged_dt <- merge(
  genre_stats_dt,
  genre_info_dt,
  by = "main_genre",
  all.x = TRUE
)

# Peek at the first few merged rows
merged_dt[1:5]

# dplyr counterpart of the genre lookup table: one placeholder category
# label per distinct genre. tibble() evaluates columns in order, so
# `category` can refer to the `main_genre` column created just before it.
genre_info_tbl <- tibble(
  main_genre = unique(spotify_tbl$main_genre),
  category = paste0("Cat", seq_along(main_genre))
)

# Left join keeps every row of genre_stats_tbl and adds its category
merged_tbl <- left_join(genre_stats_tbl, genre_info_tbl, by = "main_genre")

# Peek at the first few merged rows
head(merged_tbl, 5)

# Benchmark the data.table and dplyr versions of each operation
# (filter, grouped summary, join). Each expression is evaluated 5 times.
perf <- microbenchmark(
  dt_filter = {
    spotify_dt[popularity > 80 & main_genre == "Rock"]
  },
  dplyr_filter = {
    spotify_tbl %>%
      filter(popularity > 80, main_genre == "Rock")
  },
  dt_group = {
    spotify_dt[
      ,
      .(
        avg_popularity = mean(popularity, na.rm = TRUE),
        avg_danceability = mean(danceability, na.rm = TRUE),
        track_count = .N
      ),
      by = main_genre
    ]
  },
  dplyr_group = {
    spotify_tbl %>%
      group_by(main_genre) %>%
      summarize(
        avg_popularity = mean(popularity, na.rm = TRUE),
        avg_danceability = mean(danceability, na.rm = TRUE),
        track_count = n()
      ) %>%
      ungroup()
  },
  dt_join = {
    merge(genre_stats_dt, genre_info_dt, by = "main_genre", all.x = TRUE)
  },
  dplyr_join = {
    left_join(genre_stats_tbl, genre_info_tbl, by = "main_genre")
  },
  times = 5
)

# Print the timing summary
perf