Week 4 Mini Project: Exploring Spotify Data with data.table
Performance and Visualization
#| label: setup
#| include: false
# Install (if missing) and attach the packages used in this mini-project.
required_packages <- c(
  "data.table", "dplyr", "ggplot2", "microbenchmark", "readr"
)

# requireNamespace() checks availability without attaching the package, so
# only packages that are actually missing get installed.
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# library() needs character.only = TRUE because the names are passed as
# strings; invisible() hides the list that lapply() would otherwise print.
invisible(lapply(required_packages, library, character.only = TRUE))
#| label: load-data
#| warning: false
#| message: false
# Load the data via data.table.
spotify_dt <- fread("week4/data/spotify_clean.csv")

# Also load the same file via readr (used for the performance comparisons).
spotify_tbl <- read_csv("week4/data/spotify_clean.csv")

# read_csv() already returns a tibble; the explicit as_tibble() just
# guarantees a consistent type for the dplyr pipelines.
spotify_tbl <- as_tibble(spotify_tbl)
# Quick check: dimensions of the data as read by each package.
dim(spotify_dt)
dim(spotify_tbl)

# Number of rows (.N is data.table's built-in row count).
spotify_dt[, .N]

# Column names/types.
sapply(spotify_dt, class)

# Peek at the first few rows.
spotify_dt[1:5]
# Filter to popular songs (popularity > 80) in the "Rock" main_genre.
popular_rock_dt <- spotify_dt[popularity > 80 & main_genre == "Rock"]
nrow(popular_rock_dt)

# Equivalent filter using dplyr (comma-separated conditions are AND-ed).
popular_rock_tbl <- spotify_tbl %>%
  filter(popularity > 80, main_genre == "Rock")
nrow(popular_rock_tbl)
# Calculate mean popularity, mean danceability, and track_count by main_genre.
genre_stats_dt <- spotify_dt[
  ,
  .(
    avg_popularity = mean(popularity, na.rm = TRUE),
    avg_danceability = mean(danceability, na.rm = TRUE),
    track_count = .N
  ),
  by = main_genre
]
genre_stats_dt
# Equivalent grouping/summarizing using dplyr.
genre_stats_tbl <- spotify_tbl %>%
  group_by(main_genre) %>%
  summarize(
    avg_popularity = mean(popularity, na.rm = TRUE),
    avg_danceability = mean(danceability, na.rm = TRUE),
    track_count = n()
  ) %>%
  ungroup()
genre_stats_tbl

# Make a small data.table of categories for main_genre.
# Labels are generated per genre ("Cat1", "Cat2", ...) so the lookup stays
# complete for any number of genres, including zero, instead of indexing a
# fixed list of eight labels with 1:length(...).
dt_genres <- unique(spotify_dt$main_genre)
genre_info_dt <- data.table(
  main_genre = dt_genres,
  category = paste0("Cat", seq_along(dt_genres))
)
# data.table merge; all.x = TRUE keeps every row of genre_stats_dt (left join).
merged_dt <- merge(
  genre_stats_dt,
  genre_info_dt,
  by = "main_genre",
  all.x = TRUE
)
merged_dt[1:5]
# Tibble counterpart of the genre-to-category lookup table.
# tibble() evaluates columns in order, so `category` can refer to the
# `main_genre` column defined just above it. Labels are generated per genre
# ("Cat1", "Cat2", ...) so the lookup stays complete for any number of genres.
genre_info_tbl <- tibble(
  main_genre = unique(spotify_tbl$main_genre),
  category = paste0("Cat", seq_along(main_genre))
)

# Equivalent join using dplyr.
merged_tbl <- left_join(genre_stats_tbl, genre_info_tbl, by = "main_genre")
head(merged_tbl, 5)
# Benchmark the data.table and dplyr versions of each operation (filter,
# grouped summary, join). Each expression is evaluated 5 times. The join
# cases reuse the genre_stats_* and genre_info_* objects built earlier.
perf <- microbenchmark(
  dt_filter = {
    spotify_dt[popularity > 80 & main_genre == "Rock"]
  },
  dplyr_filter = {
    spotify_tbl %>%
      filter(popularity > 80, main_genre == "Rock")
  },
  dt_group = {
    spotify_dt[
      ,
      .(
        avg_popularity = mean(popularity, na.rm = TRUE),
        avg_danceability = mean(danceability, na.rm = TRUE),
        track_count = .N
      ),
      by = main_genre
    ]
  },
  dplyr_group = {
    spotify_tbl %>%
      group_by(main_genre) %>%
      summarize(
        avg_popularity = mean(popularity, na.rm = TRUE),
        avg_danceability = mean(danceability, na.rm = TRUE),
        track_count = n()
      ) %>%
      ungroup()
  },
  dt_join = {
    merge(genre_stats_dt, genre_info_dt, by = "main_genre", all.x = TRUE)
  },
  dplyr_join = {
    left_join(genre_stats_tbl, genre_info_tbl, by = "main_genre")
  },
  times = 5
)

# Print the timing summary.
perf