Speed Up Your Data Analysis with data.table

Gentle introduction

Author

DSA_406_001_SP25_wk4_ebbopp

Today’s Lesson: Efficient Big Data Analysis with data.table

Objectives

  • Understand why data.table is crucial for big data analysis

  • Learn the syntax and key features of data.table

  • Practice with real-world Spotify data

  • Compare performance with traditional methods

Why data.table?

data.table is designed for handling large datasets (1GB to 100GB) with:

  • Lightning-fast operations

  • Memory efficiency

  • Concise syntax

  • Enhanced features for data manipulation

Setting Up

First, let’s install and load our required packages:

# Install and load required packages
required_packages <- c("data.table", "tidyverse", "microbenchmark", "pryr")

# Install missing packages
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Load libraries individually
#library(data.table)
#library(tidyverse)
#library(microbenchmark) # For precise timing
#library(pryr) # For memory tracking

#Load libraries
lapply(required_packages, library, character.only = TRUE)
[[1]]
[1] "data.table" "stats"      "graphics"   "grDevices"  "utils"     
[6] "datasets"   "methods"    "base"      

[[2]]
 [1] "lubridate"  "forcats"    "stringr"    "dplyr"      "purrr"     
 [6] "readr"      "tidyr"      "tibble"     "ggplot2"    "tidyverse" 
[11] "data.table" "stats"      "graphics"   "grDevices"  "utils"     
[16] "datasets"   "methods"    "base"      

[[3]]
 [1] "microbenchmark" "lubridate"      "forcats"        "stringr"       
 [5] "dplyr"          "purrr"          "readr"          "tidyr"         
 [9] "tibble"         "ggplot2"        "tidyverse"      "data.table"    
[13] "stats"          "graphics"       "grDevices"      "utils"         
[17] "datasets"       "methods"        "base"          

[[4]]
 [1] "pryr"           "microbenchmark" "lubridate"      "forcats"       
 [5] "stringr"        "dplyr"          "purrr"          "readr"         
 [9] "tidyr"          "tibble"         "ggplot2"        "tidyverse"     
[13] "data.table"     "stats"          "graphics"       "grDevices"     
[17] "utils"          "datasets"       "methods"        "base"          

Let’s read our Spotify data using data.table:

# Read the dataset using fread (much faster than read.csv() or read_csv())
#spotify_dt <- fread("https://docs.google.com/spreadsheets/d/e/2PACX-1vT9iNNoFknWSmgIglkiEXnmXkJplHHeEf9wPn07tvtOV-5eTh-MdYnLS3eyST_Bx2OXpd0GC9dw5gBt/pub?gid=1801845240&single=true&output=csv")

#read in teh data from data folder
spotify_dt <- fread("data/spotify_clean.csv")

spotify_csv <- read_csv("data/spotify_clean.csv")

Lets test it out with the fread function

# Let's measure memory usage
mem_before <- pryr::mem_used()
system.time(spotify_dt <- data.table::fread("data/spotify_clean.csv"))
   user  system elapsed 
   0.01    0.00    0.01 
mem_after_fread <- pryr::mem_used()
fread_memory <- mem_after_fread - mem_before

✅Now you try:

Now your turn try it out with the readr function read_csv

#Measuring and comparing memory usage of read_csv:
mem_before <- pryr::mem_used()
system.time(spotify_dt <- readr::read_csv("data/spotify_clean.csv"))
Rows: 9999 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (11): track_name, artist_name_s, album_name, album_artist_name_s, album...
dbl  (16): disc_number, track_number, track_duration_ms, popularity, danceab...
lgl   (2): explicit, album_genres
dttm  (1): added_at

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
   user  system elapsed 
   0.03    0.14    0.29 
mem_after_fread <- pryr::mem_used()
fread_memory <- mem_after_fread - mem_before

When you have an existing data frame (like one from tidyverse or base R like spotify_csv), you can convert it to a data.table using setDT().

# Option 1: setDT() - converts in place (memory efficient)
#setDT(spotify_csv)  # Converts the existing object to data.table

# Option 2: as.data.table() - creates a copy (uses more memory)
#spotify_dt <- as.data.table(spotify_csv)
  1. setDT() is more memory efficient because it modifies the existing object
  2. as.data.table() creates a copy, using twice the memory
  3. For big data, it’s best to read directly with fread() rather than converting
  4. The conversion only needs to happen if you start with a regular data frame

Here is a breakdown for comparison:

compare_methods <- function(file_path) {
 # Print initial memory state
 cat("Initial memory state:\n")
 print(gc())
 
 # Create container for results
 results <- list()
 
 # 1. fread method
 cat("\nTesting fread method...\n")
 mem_before <- pryr::mem_used()
 start_time <- Sys.time()
 dt1 <- data.table::fread(file_path)
 end_time <- Sys.time()
 mem_after <- pryr::mem_used()
 results$fread <- list(
   memory = (mem_after - mem_before) / 1024^2,  # Convert to MB
   time = end_time - start_time,
   size = format(object.size(dt1), units = "MB")
 )
 cat("fread object size:", results$fread$size, "\n")
 
 # 2. read_csv method
 cat("\nTesting read_csv method...\n")
 mem_before <- pryr::mem_used()
 start_time <- Sys.time()
 csv <- readr::read_csv(file_path)
 end_time <- Sys.time()
 mem_after <- pryr::mem_used()
 results$read_csv <- list(
   memory = (mem_after - mem_before) / 1024^2,
   time = end_time - start_time,
   size = format(object.size(csv), units = "MB")
 )
 cat("read_csv object size:", results$read_csv$size, "\n")
 
 # 3. setDT method
 cat("\nTesting setDT method...\n")
 mem_before <- pryr::mem_used()
 start_time <- Sys.time()
 csv_copy <- data.table::copy(csv)
 data.table::setDT(csv_copy)
 end_time <- Sys.time()
 mem_after <- pryr::mem_used()
 results$setDT <- list(
   memory = (mem_after - mem_before) / 1024^2,
   time = end_time - start_time,
   size = format(object.size(csv_copy), units = "MB")
 )
 cat("setDT object size:", results$setDT$size, "\n")
 
 # 4. as.data.table method
 cat("\nTesting as.data.table method...\n")
 mem_before <- pryr::mem_used()
 start_time <- Sys.time()
 dt2 <- data.table::as.data.table(csv)
 end_time <- Sys.time()
 mem_after <- pryr::mem_used()
 results$as_data_table <- list(
   memory = (mem_after - mem_before) / 1024^2,
   time = end_time - start_time,
   size = format(object.size(dt2), units = "MB")
 )
 cat("as.data.table object size:", results$as_data_table$size, "\n")
 
 # Final memory state
 cat("\nFinal memory state:\n")
 print(gc())
 
 # Create summary data.frame
 summary_df <- data.frame(
   method = names(results),
   memory_mb = sapply(results, function(x) round(x$memory, 2)),
   time_secs = sapply(results, function(x) round(as.numeric(x$time), 4)),
   final_size = sapply(results, function(x) x$size)
 )
 
 return(summary_df)
}

# Use the function
results <- compare_methods("data/spotify_clean.csv")
Initial memory state:
          used (Mb) gc trigger  (Mb) max used  (Mb)
Ncells 1147481 61.3    2331130 124.5  2331130 124.5
Vcells 2705580 20.7    8388608  64.0  3578589  27.4

Testing fread method...
fread object size: 5.5 Mb 

Testing read_csv method...
Rows: 9999 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (11): track_name, artist_name_s, album_name, album_artist_name_s, album...
dbl  (16): disc_number, track_number, track_duration_ms, popularity, danceab...
lgl   (2): explicit, album_genres
dttm  (1): added_at

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
read_csv object size: 5.8 Mb 

Testing setDT method...
setDT object size: 5.8 Mb 

Testing as.data.table method...
as.data.table object size: 5.8 Mb 

Final memory state:
          used (Mb) gc trigger  (Mb) max used  (Mb)
Ncells 1155808 61.8    2331130 124.5  2331130 124.5
Vcells 4109632 31.4    8388608  64.0  4345694  33.2
print("\nFinal Results:")
[1] "\nFinal Results:"
print(results)
                     method memory_mb time_secs final_size
fread                 fread      4.13    0.0168     5.5 Mb
read_csv           read_csv      2.23    0.2756     5.8 Mb
setDT                 setDT      2.37    0.0011     5.8 Mb
as_data_table as_data_table      2.33    0.0009     5.8 Mb

🔍 Investigation Exercise:

Without using any external resources, what can you discover about this dataset using data.table methods?

💡Think about….

  • How many observations and variables are there?

  • What types of data are present?

"
# Use these data.table commands to gather clues:

# Clue 1: Basic structure and dimensions
spotify_dt[, .N]  # Number of rows
ncol(spotify_dt)  # Number of columns

# Clue 2: Column names and types
spotify_dt[, lapply(.SD, class)]  # Shows class of each column

#Clure 3: look at data content
spotify_dt[1:5]

# Clue 4: Unique values in categorical columns
spotify_dt[, lapply(.SD, function(x) uniqueN(x)), .SDcols = is.character]
"
[1] "\n# Use these data.table commands to gather clues:\n\n# Clue 1: Basic structure and dimensions\nspotify_dt[, .N]  # Number of rows\nncol(spotify_dt)  # Number of columns\n\n# Clue 2: Column names and types\nspotify_dt[, lapply(.SD, class)]  # Shows class of each column\n\n#Clure 3: look at data content\nspotify_dt[1:5]\n\n# Clue 4: Unique values in categorical columns\nspotify_dt[, lapply(.SD, function(x) uniqueN(x)), .SDcols = is.character]\n"

Data.table Operations - data.table Syntax: [ i, j, by ]

Now let’s explore data.table’s powerful syntax for data manipulation. Think of these operations as your detective’s toolkit:

  • i (WHERE): Your filter 🔍 (selects specific rows)(like filter() & selec())

  • j (SELECT): Your magnifying glass 🔬 (focuses on specific columns or computations) (like select() or summarize())

  • by (GROUP BY): Your filing system 📁 (organizes findings)(like group_by())

1.) Basic Operations

  1. Selecting Rows (i)

When you only specify i, data.table returns a data.table (like a data frame), not a vector.

'
# Select first 5 rows
spotify_dt[1:5]

# Filter by popularity
spotify_dt[popularity > 75]

# Multiple conditions
spotify_dt[popularity > 75 & main_genre == "Rock"]

# Excluding certain values
spotify_dt[key != 10]

# Select tracks by genre
spotify_dt[main_genre %in% c("Rock", "Pop")]
'
[1] "\n# Select first 5 rows\nspotify_dt[1:5]\n\n# Filter by popularity\nspotify_dt[popularity > 75]\n\n# Multiple conditions\nspotify_dt[popularity > 75 & main_genre == \"Rock\"]\n\n# Excluding certain values\nspotify_dt[key != 10]\n\n# Select tracks by genre\nspotify_dt[main_genre %in% c(\"Rock\", \"Pop\")]\n"

✅Now you try:

Selecting Rows (i)

"
# Selecting data with track_duration_ms of > 300000
spotify_dt[track_duration_ms > 300000 ]
"
[1] "\n# Selecting data with track_duration_ms of > 300000\nspotify_dt[track_duration_ms > 300000 ]\n"

AS a reminder

Comparison Operators

  • == means “exactly equal to”
  • != means “not equal to”
  • > means “greater than”
  • < means “less than”
  • >= means “greater than or equal to”
  • <= means “less than or equal to”

Logical Operators

  • & means “AND” (both conditions must be true)
  • | means “OR” (at least one condition must be true)
  • ! means “NOT” (reverses the condition)

2.) Column Operations (j)

The j argument lets you select columns and perform computations:

"
# Select specific columns
spotify_dt[, .(track_name, artist_name_s, popularity)]

# Compute summary statistics
spotify_dt[, .(avg_popularity = mean(popularity),
              max_energy = max(energy))]

# Multiple computations
spotify_dt[, .(
  track_count = .N,
  avg_duration = mean(track_duration_ms)/1000,
  avg_popularity = mean(popularity)
)]

# Select specific columns
spotify_dt[, .(artist_name_s, track_name)]

# Create new columns in output
spotify_dt[, .(
  song = track_name,
  artist = artist_name_s,
  popularity_score = popularity/100
)]

# Calculate summary statistics
spotify_dt[, .(
  avg_popularity = mean(popularity),
  total_tracks = .N,
  avg_duration = mean(track_duration_ms)/1000  # in seconds
)]
"
[1] "\n# Select specific columns\nspotify_dt[, .(track_name, artist_name_s, popularity)]\n\n# Compute summary statistics\nspotify_dt[, .(avg_popularity = mean(popularity),\n              max_energy = max(energy))]\n\n# Multiple computations\nspotify_dt[, .(\n  track_count = .N,\n  avg_duration = mean(track_duration_ms)/1000,\n  avg_popularity = mean(popularity)\n)]\n\n# Select specific columns\nspotify_dt[, .(artist_name_s, track_name)]\n\n# Create new columns in output\nspotify_dt[, .(\n  song = track_name,\n  artist = artist_name_s,\n  popularity_score = popularity/100\n)]\n\n# Calculate summary statistics\nspotify_dt[, .(\n  avg_popularity = mean(popularity),\n  total_tracks = .N,\n  avg_duration = mean(track_duration_ms)/1000  # in seconds\n)]\n"

✅Now you try:

Selecting and computing (i)

'
# Selecting the popularity and track duration columns
spotify_dt[, .(popularity, track_duration_ms)]
'
[1] "\n# Selecting the popularity and track duration columns\nspotify_dt[, .(popularity, track_duration_ms)]\n"
  1. Grouping Operations (by)
'
# Group by genre
spotify_dt[, .(
  track_count = .N,
  avg_popularity = mean(popularity),
  avg_danceability = mean(danceability)
), by = main_genre]

# Multiple grouping variables
spotify_dt[, .(
  track_count = .N,
  avg_popularity = mean(popularity)
), by = .(main_genre, explicit)]

# Group by genre for popularity analysis
genre_stats <- spotify_dt[, .(
  avg_popularity = mean(popularity, na.rm = TRUE),
  track_count = .N,
  avg_duration = mean(track_duration_ms)/1000
), by = main_genre][]

'
[1] "\n# Group by genre\nspotify_dt[, .(\n  track_count = .N,\n  avg_popularity = mean(popularity),\n  avg_danceability = mean(danceability)\n), by = main_genre]\n\n# Multiple grouping variables\nspotify_dt[, .(\n  track_count = .N,\n  avg_popularity = mean(popularity)\n), by = .(main_genre, explicit)]\n\n# Group by genre for popularity analysis\ngenre_stats <- spotify_dt[, .(\n  avg_popularity = mean(popularity, na.rm = TRUE),\n  track_count = .N,\n  avg_duration = mean(track_duration_ms)/1000\n), by = main_genre][]\n\n"

✅Now you try:

Grouping (i)

'
#comment and write your code below:
spotify_dt[, .(
  track_count = .N,
  avg_popularity = mean(popularity)
), by = explicit]
'
[1] "\n#comment and write your code below:\nspotify_dt[, .(\n  track_count = .N,\n  avg_popularity = mean(popularity)\n), by = explicit]\n"

Think of it like a sentence:

i = “Which rows do you want?”

j = “What do you want to do with them?”

by = “How should they be grouped?”

5. Advanced Features

  1. Adding/Updating Columns (:=)

We use the empty brackets at the end to print but you could have also

"
# Add duration in seconds
spotify_dt[, duration_sec := track_duration_ms/1000][]

# Multiple column updates
spotify_dt[, ':='(
  popularity_scaled = popularity/100,
  is_popular = popularity > 75
)]

spotify_dt
"
[1] "\n# Add duration in seconds\nspotify_dt[, duration_sec := track_duration_ms/1000][]\n\n# Multiple column updates\nspotify_dt[, ':='(\n  popularity_scaled = popularity/100,\n  is_popular = popularity > 75\n)]\n\nspotify_dt\n"

✅Now you try:

"
#comment and write your code below:

# Create new columns
spotify_dt[, `:=`(
  is_popular = popularity > 75,
  loudness_energy_ratio = loudness / energy
)]

# Verify new columns
names(spotify_dt) 

#from data.table
spotify_dt[, names(.SD)]
"
[1] "\n#comment and write your code below:\n\n# Create new columns\nspotify_dt[, `:=`(\n  is_popular = popularity > 75,\n  loudness_energy_ratio = loudness / energy\n)]\n\n# Verify new columns\nnames(spotify_dt) \n\n#from data.table\nspotify_dt[, names(.SD)]\n"

Working with Dates

Here’s how to handle dates in data.table:

"
# Load lubridate
library(lubridate)

# Convert and extract date components
spotify_dt <- spotify_dt[, `:=`(
  release_date = as.Date(album_release_date),
  release_year = year(as.Date(album_release_date)),
  release_month = month(as.Date(album_release_date)),
  release_day = day(as.Date(album_release_date))
)]

# View the new date columns
spotify_dt[, .(release_date, release_year, release_month, release_day)][1:5]

class(spotify_dt$album_release_date)
"
[1] "\n# Load lubridate\nlibrary(lubridate)\n\n# Convert and extract date components\nspotify_dt <- spotify_dt[, `:=`(\n  release_date = as.Date(album_release_date),\n  release_year = year(as.Date(album_release_date)),\n  release_month = month(as.Date(album_release_date)),\n  release_day = day(as.Date(album_release_date))\n)]\n\n# View the new date columns\nspotify_dt[, .(release_date, release_year, release_month, release_day)][1:5]\n\nclass(spotify_dt$album_release_date)\n"

Ordering Results

Data.table provides efficient ways to sort:

'
# Sort genres by popularity (descending)
genre_stats[order(-avg_popularity)]

# Get top 5 most popular genres
genre_stats[order(-avg_popularity)][1:5]
'
[1] "\n# Sort genres by popularity (descending)\ngenre_stats[order(-avg_popularity)]\n\n# Get top 5 most popular genres\ngenre_stats[order(-avg_popularity)][1:5]\n"
'
year_genre_stats <- spotify_dt[, .(
    avg_popularity = mean(popularity),
    track_count = .N,
    avg_energy = mean(energy)
), by = .(release_year)]

# Sort by multiple columns
year_genre_stats[order(release_year, -avg_popularity)]
'
[1] "\nyear_genre_stats <- spotify_dt[, .(\n    avg_popularity = mean(popularity),\n    track_count = .N,\n    avg_energy = mean(energy)\n), by = .(release_year)]\n\n# Sort by multiple columns\nyear_genre_stats[order(release_year, -avg_popularity)]\n"

🔍 Practice Exercise:

Using the Spotify dataset, create an analysis that:

  1. Filters for songs after 2010

  2. Calculates average audio features by genre

  3. Identifies the top 3 genres by popularity

'
# solution here
spotify_dt[year(as.Date(album_release_date)) >= 2010,
          .(avg_popularity = mean(popularity),
            avg_danceability = mean(danceability),
            avg_energy = mean(energy),
            track_count = .N),
          by = main_genre
          ][order(-avg_popularity)
          ][1:3]
          '
[1] "\n# solution here\nspotify_dt[year(as.Date(album_release_date)) >= 2010,\n          .(avg_popularity = mean(popularity),\n            avg_danceability = mean(danceability),\n            avg_energy = mean(energy),\n            track_count = .N),\n          by = main_genre\n          ][order(-avg_popularity)\n          ][1:3]\n          "

Ordering Results

Data.table provides efficient ways to sort:

'
# Sort genres by popularity (descending)
genre_stats[order(-avg_popularity)]

# Sort by multiple columns
year_genre_stats[order(release_year, -avg_popularity)]

# Get top 5 most popular genres
genre_stats[order(-avg_popularity)][1:5]
'
[1] "\n# Sort genres by popularity (descending)\ngenre_stats[order(-avg_popularity)]\n\n# Sort by multiple columns\nyear_genre_stats[order(release_year, -avg_popularity)]\n\n# Get top 5 most popular genres\ngenre_stats[order(-avg_popularity)][1:5]\n"

Chaining Operations

Data.table allows you to chain operations efficiently:

'
# Complex analysis chain
spotify_dt[popularity > 70
          ][, .(avg_energy = mean(energy),
                track_count = .N),
            by = main_genre
          ][order(-avg_energy)]
          '
[1] "\n# Complex analysis chain\nspotify_dt[popularity > 70\n          ][, .(avg_energy = mean(energy),\n                track_count = .N),\n            by = main_genre\n          ][order(-avg_energy)]\n          "
  1. Special Symbols
  • .N: Counts rows (like n() in tidyverse)
  • .SD: Subset of data (useful for applying functions to multiple columns)
  • .(): Create a list (like select() or for multiple aggregations)
'
# Use .SD for multiple column operations
spotify_dt[, lapply(.SD, mean), 
          .SDcols = c("danceability", "energy", "valence"),
          by = main_genre]
          '
[1] "\n# Use .SD for multiple column operations\nspotify_dt[, lapply(.SD, mean), \n          .SDcols = c(\"danceability\", \"energy\", \"valence\"),\n          by = main_genre]\n          "

Performance Tips (Extra)

  1. set key columns for faster subsetting:

# Set key for faster lookups setkey(spotify_dt, main_genre) # Fast subset using key spotify_dt["Rock"]

'
# Set key for faster lookups 

setkey(spotify_dt, main_genre)  

# Fast subset using key 

spotify_dt["Rock"]
'
[1] "\n# Set key for faster lookups \n\nsetkey(spotify_dt, main_genre)  \n\n# Fast subset using key \n\nspotify_dt[\"Rock\"]\n"
  1. Avoid copies with :=:
'
# More efficient than creating new objects 
spotify_dt[, efficiency_ratio := energy/danceability]
'
[1] "\n# More efficient than creating new objects \nspotify_dt[, efficiency_ratio := energy/danceability]\n"
  1. Use .SDcols for column operations:
'
# Efficient multiple column operations 
audio_features <- c("danceability", "energy", "valence") 

spotify_dt[, lapply(.SD, mean), .SDcols = audio_features, by = main_genre]
'
[1] "\n# Efficient multiple column operations \naudio_features <- c(\"danceability\", \"energy\", \"valence\") \n\nspotify_dt[, lapply(.SD, mean), .SDcols = audio_features, by = main_genre]\n"

Watch out for these common mistakes:

  • Forgetting the .() when selecting multiple columns
  • Using $ instead of := for assignments
  • Forgetting to add [] at the end when you want to see results

Remember:

  • tidyverse is great for readability and smaller datasets
  • data.table shines with big data and performance needs
  • Both have their place in your R toolkit

🎊🎉 Congratulations you finished week 4 and now on to a mini project analysis using data.table functions.

💡 Mini-Project Guidelines:

Create a comprehensive analysis using data.table that:

Demonstrates at least three different data.table operations Includes performance comparisons with equivalent dplyr operations Creates insightful visualizations from the results

Remember to:

  • Document your code

  • Explain your analysis choices

  • Consider performance implications

  • Think about scalability

Want more….

  • Explore more advanced data.table features

Prompt for Chat GPT/Claude/Gemini

You are a data scientist using exploring a large data set using exploratory data analysis principles. Your task is to create code in R using data.table functions to …. {Add what you want it to do}

Mini Project

Project Description: For this project, I wanted to replicate the project done in week 3, just utilizing data.table instead of tidyverse. To reiterate the goal of this project, I would like to look at the popularity of songs by decade, and how it corresponds to whether or not a song is explicit.

I chose to analyze whether there is a trend between whether a song is explicit or not and its popularity, as well as how this trend may change by decade. To complete this project I group by ‘decade’ and subgroup by whether the ‘explicit’ value is true or false. During this process, I removed all entried that had missing value in the release date, explicit, and popularity columns. Additionally, I exempted the years that dindt contain bit explicit and non explicit content. Within the groups defined before, I averaged and displayed the popularity value to determine if there is a correlation between explicit content and popularity, as well as and changes by decade.

library(data.table)
library(lubridate)  # For date manipulation
library(tidyverse)

spotify_csv <- read_csv("data/spotify_clean.csv")
Rows: 9999 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (11): track_name, artist_name_s, album_name, album_artist_name_s, album...
dbl  (16): disc_number, track_number, track_duration_ms, popularity, danceab...
lgl   (2): explicit, album_genres
dttm  (1): added_at

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
spotify_dt <- fread("data/spotify_clean.csv")

# Convert album_release_date to Date format
system.time(spotify_dt[, release_date := ymd(album_release_date)])
Warning: 1375 failed to parse.
   user  system elapsed 
   0.02    0.00    0.01 
system.time(spotify_clean_dplyr <- spotify_csv %>%
  mutate(release_date = ymd(album_release_date)))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `release_date = ymd(album_release_date)`.
Caused by warning:
!  1375 failed to parse.
   user  system elapsed 
   0.08    0.04    0.12 
# Extract year, month, day, and decade
system.time(spotify_dt[, `:=`(
  release_year = year(release_date),
  release_month = month(release_date),
  release_day = day(release_date),
  decade = floor(year(release_date) / 10) * 10
)])
   user  system elapsed 
      0       0       0 
# Display results
head(spotify_dt)
                                track_name      artist_name_s
                                    <char>             <char>
1: Justified & Ancient - Stand by the Jams            The KLF
2:         I Know You Want Me (Calle Ocho)            Pitbull
3:      From the Bottom of My Broken Heart     Britney Spears
4:        Apeman - 2014 Remastered Version          The Kinks
5:      You Can't Always Get What You Want The Rolling Stones
6:              Don't Stop - 2004 Remaster      Fleetwood Mac
                                                               album_name
                                                                   <char>
1:                                                       Songs Collection
2:                                         Pitbull Starring In Rebelution
3:                         ...Baby One More Time (Digital Deluxe Version)
4: Lola vs. Powerman and the Moneygoround, Pt. One + Percy (Super Deluxe)
5:                                                           Let It Bleed
6:                                                                Rumours
   album_artist_name_s album_release_date disc_number track_number
                <char>             <char>       <int>        <int>
1:             The KLF         1992-08-03           1            3
2:             Pitbull         2009-10-23           1            3
3:      Britney Spears         1999-01-12           1            6
4:           The Kinks         2014-10-20           1           11
5:  The Rolling Stones         1969-12-05           1            9
6:       Fleetwood Mac         1977-02-04           1            4
   track_duration_ms explicit popularity         isrc                 added_by
               <int>   <lgcl>      <int>       <char>                   <char>
1:            216270    FALSE          0 QMARG1760056 spotify:user:bradnumber1
2:            237120    FALSE         64 USJAY0900144 spotify:user:bradnumber1
3:            312533    FALSE         56 USJI19910455 spotify:user:bradnumber1
4:            233400    FALSE         42 GB5KW1499822 spotify:user:bradnumber1
5:            448720    FALSE          0 USA176910100 spotify:user:bradnumber1
6:            193346    FALSE         79 USWB10400049 spotify:user:bradnumber1
              added_at
                <POSc>
1: 2020-03-05 09:20:39
2: 2021-08-08 09:26:31
3: 2021-08-08 09:26:31
4: 2021-08-08 09:26:31
5: 2021-08-08 09:26:31
6: 2022-08-31 00:08:18
                                                                                                             artist_genres
                                                                                                                    <char>
1:                                                                             acid house,ambient house,big beat,hip house
2:                                                                                             dance pop,miami hip hop,pop
3:                                                                                                           dance pop,pop
4: album rock,art rock,british invasion,classic rock,folk rock,glam rock,protopunk,psychedelic rock,rock,singer-songwriter
5:                                                                           album rock,british invasion,classic rock,rock
6:                                                                       album rock,classic rock,rock,soft rock,yacht rock
   danceability energy   key loudness  mode speechiness acousticness
          <num>  <num> <int>    <num> <int>       <num>        <num>
1:        0.617  0.872     8  -12.305     1      0.0480       0.0158
2:        0.825  0.743     2   -5.995     1      0.1490       0.0142
3:        0.677  0.665     7   -5.171     1      0.0305       0.5600
4:        0.683  0.728     9   -8.920     1      0.2590       0.5680
5:        0.319  0.627     0   -9.611     1      0.0687       0.6750
6:        0.671  0.710     9   -7.724     1      0.0356       0.0393
   instrumentalness liveness valence   tempo time_signature album_genres
              <num>    <num>   <num>   <num>          <int>       <lgcl>
1:         1.12e-01   0.4080   0.504 111.458              4           NA
2:         2.12e-05   0.2370   0.800 127.045              4           NA
3:         1.01e-06   0.3380   0.706  74.981              4           NA
4:         5.08e-05   0.0384   0.833  75.311              4           NA
5:         7.29e-05   0.2890   0.497  85.818              4           NA
6:         1.12e-05   0.0387   0.834 118.745              4           NA
                                 label
                                <char>
1:                 Jams Communications
2: Mr.305/Polo Grounds Music/J Records
3:                                Jive
4:                   Sanctuary Records
5:               Universal Music Group
6:                Rhino/Warner Records
                                                                                                   copyrights
                                                                                                       <char>
1:                                                       C 1992 Copyright Control, P 1992 Jams Communications
2:                                        P (P) 2009 RCA/JIVE Label Group, a unit of Sony Music Entertainment
3:                                                                             P (P) 1999 Zomba Recording LLC
4: C © 2014 Sanctuary Records Group Ltd., a BMG Company, P ℗ 2014 Sanctuary Records Group Ltd., a BMG Company
5:                                   C © 2002 ABKCO Music & Records Inc., P ℗ 2002 ABKCO Music & Records Inc.
6:                                                 C © 2004 Warner Records Inc., P ℗ 2004 Warner Records Inc.
         main_genre release_date release_year release_month release_day decade
             <char>       <Date>        <num>         <num>       <int>  <num>
1: Dance/Electronic   1992-08-03         1992             8           3   1990
2:              Pop   2009-10-23         2009            10          23   2000
3:              Pop   1999-01-12         1999             1          12   1990
4:             Rock   2014-10-20         2014            10          20   2010
5:             Rock   1969-12-05         1969            12           5   1960
6:             Rock   1977-02-04         1977             2           4   1970
system.time(spotify_clean_dplyr <- spotify_clean_dplyr %>%
  mutate(release_year = year(release_date),
         release_month = month(release_date),
         release_day = day(release_date),
         decade = floor(year(release_date) / 10) * 10))
   user  system elapsed 
      0       0       0 
# Remove NA values in decade, explicit, and popularity
system.time(spotify_clean <- spotify_dt[!is.na(decade) & !is.na(explicit) & !is.na(popularity)])
   user  system elapsed 
   0.00    0.00    0.02 
# Display results
head(spotify_clean)
                                track_name      artist_name_s
                                    <char>             <char>
1: Justified & Ancient - Stand by the Jams            The KLF
2:         I Know You Want Me (Calle Ocho)            Pitbull
3:      From the Bottom of My Broken Heart     Britney Spears
4:        Apeman - 2014 Remastered Version          The Kinks
5:      You Can't Always Get What You Want The Rolling Stones
6:              Don't Stop - 2004 Remaster      Fleetwood Mac
                                                               album_name
                                                                   <char>
1:                                                       Songs Collection
2:                                         Pitbull Starring In Rebelution
3:                         ...Baby One More Time (Digital Deluxe Version)
4: Lola vs. Powerman and the Moneygoround, Pt. One + Percy (Super Deluxe)
5:                                                           Let It Bleed
6:                                                                Rumours
   album_artist_name_s album_release_date disc_number track_number
                <char>             <char>       <int>        <int>
1:             The KLF         1992-08-03           1            3
2:             Pitbull         2009-10-23           1            3
3:      Britney Spears         1999-01-12           1            6
4:           The Kinks         2014-10-20           1           11
5:  The Rolling Stones         1969-12-05           1            9
6:       Fleetwood Mac         1977-02-04           1            4
   track_duration_ms explicit popularity         isrc                 added_by
               <int>   <lgcl>      <int>       <char>                   <char>
1:            216270    FALSE          0 QMARG1760056 spotify:user:bradnumber1
2:            237120    FALSE         64 USJAY0900144 spotify:user:bradnumber1
3:            312533    FALSE         56 USJI19910455 spotify:user:bradnumber1
4:            233400    FALSE         42 GB5KW1499822 spotify:user:bradnumber1
5:            448720    FALSE          0 USA176910100 spotify:user:bradnumber1
6:            193346    FALSE         79 USWB10400049 spotify:user:bradnumber1
              added_at
                <POSc>
1: 2020-03-05 09:20:39
2: 2021-08-08 09:26:31
3: 2021-08-08 09:26:31
4: 2021-08-08 09:26:31
5: 2021-08-08 09:26:31
6: 2022-08-31 00:08:18
                                                                                                             artist_genres
                                                                                                                    <char>
1:                                                                             acid house,ambient house,big beat,hip house
2:                                                                                             dance pop,miami hip hop,pop
3:                                                                                                           dance pop,pop
4: album rock,art rock,british invasion,classic rock,folk rock,glam rock,protopunk,psychedelic rock,rock,singer-songwriter
5:                                                                           album rock,british invasion,classic rock,rock
6:                                                                       album rock,classic rock,rock,soft rock,yacht rock
   danceability energy   key loudness  mode speechiness acousticness
          <num>  <num> <int>    <num> <int>       <num>        <num>
1:        0.617  0.872     8  -12.305     1      0.0480       0.0158
2:        0.825  0.743     2   -5.995     1      0.1490       0.0142
3:        0.677  0.665     7   -5.171     1      0.0305       0.5600
4:        0.683  0.728     9   -8.920     1      0.2590       0.5680
5:        0.319  0.627     0   -9.611     1      0.0687       0.6750
6:        0.671  0.710     9   -7.724     1      0.0356       0.0393
   instrumentalness liveness valence   tempo time_signature album_genres
              <num>    <num>   <num>   <num>          <int>       <lgcl>
1:         1.12e-01   0.4080   0.504 111.458              4           NA
2:         2.12e-05   0.2370   0.800 127.045              4           NA
3:         1.01e-06   0.3380   0.706  74.981              4           NA
4:         5.08e-05   0.0384   0.833  75.311              4           NA
5:         7.29e-05   0.2890   0.497  85.818              4           NA
6:         1.12e-05   0.0387   0.834 118.745              4           NA
                                 label
                                <char>
1:                 Jams Communications
2: Mr.305/Polo Grounds Music/J Records
3:                                Jive
4:                   Sanctuary Records
5:               Universal Music Group
6:                Rhino/Warner Records
                                                                                                   copyrights
                                                                                                       <char>
1:                                                       C 1992 Copyright Control, P 1992 Jams Communications
2:                                        P (P) 2009 RCA/JIVE Label Group, a unit of Sony Music Entertainment
3:                                                                             P (P) 1999 Zomba Recording LLC
4: C © 2014 Sanctuary Records Group Ltd., a BMG Company, P ℗ 2014 Sanctuary Records Group Ltd., a BMG Company
5:                                   C © 2002 ABKCO Music & Records Inc., P ℗ 2002 ABKCO Music & Records Inc.
6:                                                 C © 2004 Warner Records Inc., P ℗ 2004 Warner Records Inc.
         main_genre release_date release_year release_month release_day decade
             <char>       <Date>        <num>         <num>       <int>  <num>
1: Dance/Electronic   1992-08-03         1992             8           3   1990
2:              Pop   2009-10-23         2009            10          23   2000
3:              Pop   1999-01-12         1999             1          12   1990
4:             Rock   2014-10-20         2014            10          20   2010
5:             Rock   1969-12-05         1969            12           5   1960
6:             Rock   1977-02-04         1977             2           4   1970
# Group by decade and explicit columns and calculate average popularity
system.time(popularity_result <- spotify_clean[, .(
  average_popularity = mean(popularity, na.rm = TRUE)
), by = .(decade, explicit)])
   user  system elapsed 
   0.00    0.00    0.01 
# Remove decades that don't have both explicit and non-explicit songs
system.time(valid_decades <- popularity_result[, .N, by = decade][N > 1, decade])
   user  system elapsed 
      0       0       0 
system.time(popularity_result <- popularity_result[decade %in% valid_decades])
   user  system elapsed 
      0       0       0 
# Display results
popularity_result
    decade explicit average_popularity
     <num>   <lgcl>              <num>
 1:   1990    FALSE           38.83432
 2:   2000    FALSE           34.12555
 3:   2010    FALSE           34.17537
 4:   1970    FALSE           46.40685
 5:   2020    FALSE           48.78423
 6:   1980    FALSE           44.10890
 7:   2010     TRUE           48.27149
 8:   2020     TRUE           58.48649
 9:   2000     TRUE           33.98611
10:   1990     TRUE           42.31111
11:   1970     TRUE           80.66667
12:   1980     TRUE           38.25000
# Grouping by decade and explicit columns
system.time(popularity_result_dplyr <- spotify_clean_dplyr %>%
  group_by(decade, explicit) %>%
  summarise(average_popularity = mean(popularity, na.rm = TRUE),
            .groups = "drop"))
   user  system elapsed 
   0.02    0.00    0.00 
# Removing the years that dont have both explicit and non explicit songs
system.time(popularity_result_dplyr <- popularity_result_dplyr %>%
  group_by(decade) %>%
  filter(any(explicit == TRUE)) %>%  # Keep only decades with at least one TRUE
  ungroup())
   user  system elapsed 
   0.00    0.00    0.01 

AI Mini Project

## First Prompt
"
You are a data scientist using exploring a large data set using 
exploratory data analysis principles. Your task is to create code 
in R using data.table functions to look for trends in popularity 
based on the main_genre, loudness, tempo, and explicit rating.
Attachment: spotify_clean.csv
"
[1] "\nYou are a data scientist using exploring a large data set using \nexploratory data analysis principles. Your task is to create code \nin R using data.table functions to look for trends in popularity \nbased on the main_genre, loudness, tempo, and explicit rating.\nAttachment: spotify_clean.csv\n"
library(data.table)

# Load dataset
dt <- fread("data/spotify_clean.csv")

# Remove rows with missing values in key columns
dt <- dt[!is.na(loudness) & !is.na(tempo) & !is.na(popularity) & !is.na(main_genre)]

# Convert 'explicit' to a factor
dt[, explicit := as.factor(explicit)]

# Average popularity by main_genre
genre_popularity <- dt[, .(avg_popularity = mean(popularity, na.rm = TRUE)), 
                       by = main_genre][order(-avg_popularity)]
print(genre_popularity)
                main_genre avg_popularity
                    <char>          <num>
 1:       Reggae/Caribbean       48.22222
 2:                    Pop       40.27116
 3:                   Rock       37.57563
 4:               R&B/Soul       34.93709
 5:           Folk/Country       34.65934
 6:       Dance/Electronic       32.76378
 7:                Hip Hop       32.50935
 8:                  Other       26.15360
 9:             Jazz/Blues       22.24074
10: Classical/Instrumental       14.75000
# Correlation between loudness, tempo, and popularity
cor_matrix <- dt[, .(cor_loudness_pop = cor(loudness, popularity, use = "complete.obs"),
                     cor_tempo_pop = cor(tempo, popularity, use = "complete.obs"))]
print(cor_matrix)
   cor_loudness_pop cor_tempo_pop
              <num>         <num>
1:       0.03268236   -0.00870543
# Popularity difference based on explicit content
explicit_popularity <- dt[, .(avg_popularity = mean(popularity, na.rm = TRUE)), by = explicit]
print(explicit_popularity)
   explicit avg_popularity
     <fctr>          <num>
1:    FALSE       37.26159
2:     TRUE       44.54028
## Second prompt
"
Remove the explicit quantifier. I would like a breakdown showing 
the ranges of tempo and loudness within each genre and the breakdowns 
of popularity in each subgroup.
"
[1] "\nRemove the explicit quantifier. I would like a breakdown showing \nthe ranges of tempo and loudness within each genre and the breakdowns \nof popularity in each subgroup.\n"
library(data.table)

# Load dataset
dt <- fread("data/spotify_clean.csv")

# Remove rows with missing values in key columns
dt <- dt[!is.na(loudness) & !is.na(tempo) & !is.na(popularity) & !is.na(main_genre)]

# Average popularity by main_genre
genre_popularity <- dt[, .(avg_popularity = mean(popularity, na.rm = TRUE)), 
                       by = main_genre][order(-avg_popularity)]
print(genre_popularity)
                main_genre avg_popularity
                    <char>          <num>
 1:       Reggae/Caribbean       48.22222
 2:                    Pop       40.27116
 3:                   Rock       37.57563
 4:               R&B/Soul       34.93709
 5:           Folk/Country       34.65934
 6:       Dance/Electronic       32.76378
 7:                Hip Hop       32.50935
 8:                  Other       26.15360
 9:             Jazz/Blues       22.24074
10: Classical/Instrumental       14.75000
# Correlation between loudness, tempo, and popularity
cor_matrix <- dt[, .(cor_loudness_pop = cor(loudness, popularity, use = "complete.obs"),
                     cor_tempo_pop = cor(tempo, popularity, use = "complete.obs"))]
print(cor_matrix)
   cor_loudness_pop cor_tempo_pop
              <num>         <num>
1:       0.03268236   -0.00870543
# Breakdown of tempo and loudness within each genre
genre_tempo_loudness <- dt[, .(min_tempo = min(tempo, na.rm = TRUE),
                                max_tempo = max(tempo, na.rm = TRUE),
                                avg_tempo = mean(tempo, na.rm = TRUE),
                                min_loudness = min(loudness, na.rm = TRUE),
                                max_loudness = max(loudness, na.rm = TRUE),
                                avg_loudness = mean(loudness, na.rm = TRUE)),
                            by = main_genre]
print(genre_tempo_loudness)
                main_genre min_tempo max_tempo avg_tempo min_loudness
                    <char>     <num>     <num>     <num>        <num>
 1:       Dance/Electronic    76.997   195.938  125.6409      -20.260
 2:                    Pop    50.937   210.796  121.0671      -23.092
 3:                   Rock    60.197   217.913  123.1042      -25.420
 4:                Hip Hop    69.992   207.966  116.6895      -17.117
 5:                  Other    34.999   207.673  120.7520      -29.368
 6:               R&B/Soul     0.000   196.986  115.7516      -22.304
 7:             Jazz/Blues    69.601   203.871  127.1590      -17.010
 8:           Folk/Country    74.375   203.853  128.2663      -20.508
 9:       Reggae/Caribbean    80.753   173.818  123.8397      -15.675
10: Classical/Instrumental    94.570   158.741  120.4697      -12.649
    max_loudness avg_loudness
           <num>        <num>
 1:       -1.131    -6.678193
 2:        2.769    -6.537436
 3:       -1.101    -8.219026
 4:       -0.776    -6.587467
 5:       -0.358    -8.720965
 6:       -2.383    -8.929705
 7:       -2.551    -9.240056
 8:       -3.745    -8.864626
 9:       -3.980    -8.636481
10:       -4.281    -9.881750
# Breakdown of popularity within each tempo and loudness subgroup
tempo_loudness_popularity <- dt[, .(avg_popularity = mean(popularity, na.rm = TRUE)),
                                by = .(cut(tempo, breaks = 5), cut(loudness, breaks = 5))]
print(tempo_loudness_popularity)
              cut         cut.1 avg_popularity
           <fctr>        <fctr>          <num>
 1:    (87.2,131] (-16.5,-10.1]       35.47418
 2:    (87.2,131] (-10.1,-3.66]       38.32587
 3:   (43.6,87.2] (-10.1,-3.66]       38.44741
 4:     (131,174] (-10.1,-3.66]       37.63743
 5:     (174,218] (-10.1,-3.66]       38.55082
 6:     (131,174]   (-3.66,2.8]       35.61947
 7:    (87.2,131]   (-3.66,2.8]       38.44972
 8:   (43.6,87.2] (-16.5,-10.1]       32.86486
 9:     (174,218] (-16.5,-10.1]       33.30986
10:     (131,174] (-22.9,-16.5]       34.21053
11:     (131,174] (-16.5,-10.1]       37.44963
12:   (43.6,87.2] (-22.9,-16.5]       35.92000
13:    (87.2,131] (-22.9,-16.5]       36.06557
14:   (43.6,87.2]   (-3.66,2.8]       34.62857
15:     (174,218]   (-3.66,2.8]       38.07895
16:    (87.2,131] (-29.4,-22.9]       16.00000
17: (-0.218,43.6] (-16.5,-10.1]        0.00000
18:   (43.6,87.2] (-29.4,-22.9]       48.00000
19: (-0.218,43.6] (-10.1,-3.66]       70.00000
20:     (131,174] (-29.4,-22.9]       30.50000
21:     (174,218] (-22.9,-16.5]       35.50000
22:     (174,218] (-29.4,-22.9]       34.00000
              cut         cut.1 avg_popularity
## Third prompt
"
Finally, can you take all columns into consideration and find the three 
variables with the strongest correlation to popularity.
"
[1] "\nFinally, can you take all columns into consideration and find the three \nvariables with the strongest correlation to popularity.\n"
library(data.table)

# Load dataset
dt <- fread("data/spotify_clean.csv")

# Remove rows with missing values in key columns
dt <- dt[!is.na(loudness) & !is.na(tempo) & !is.na(popularity) & !is.na(main_genre)]

# Average popularity by main_genre
genre_popularity <- dt[, .(avg_popularity = mean(popularity, na.rm = TRUE)), 
                       by = main_genre][order(-avg_popularity)]
print(genre_popularity)
                main_genre avg_popularity
                    <char>          <num>
 1:       Reggae/Caribbean       48.22222
 2:                    Pop       40.27116
 3:                   Rock       37.57563
 4:               R&B/Soul       34.93709
 5:           Folk/Country       34.65934
 6:       Dance/Electronic       32.76378
 7:                Hip Hop       32.50935
 8:                  Other       26.15360
 9:             Jazz/Blues       22.24074
10: Classical/Instrumental       14.75000
# Correlation between loudness, tempo, and popularity
cor_matrix <- dt[, .(cor_loudness_pop = cor(loudness, popularity, use = "complete.obs"),
                     cor_tempo_pop = cor(tempo, popularity, use = "complete.obs"))]
print(cor_matrix)
   cor_loudness_pop cor_tempo_pop
              <num>         <num>
1:       0.03268236   -0.00870543
# Breakdown of tempo and loudness within each genre
genre_tempo_loudness <- dt[, .(min_tempo = min(tempo, na.rm = TRUE),
                                max_tempo = max(tempo, na.rm = TRUE),
                                avg_tempo = mean(tempo, na.rm = TRUE),
                                min_loudness = min(loudness, na.rm = TRUE),
                                max_loudness = max(loudness, na.rm = TRUE),
                                avg_loudness = mean(loudness, na.rm = TRUE)),
                            by = main_genre]
print(genre_tempo_loudness)
                main_genre min_tempo max_tempo avg_tempo min_loudness
                    <char>     <num>     <num>     <num>        <num>
 1:       Dance/Electronic    76.997   195.938  125.6409      -20.260
 2:                    Pop    50.937   210.796  121.0671      -23.092
 3:                   Rock    60.197   217.913  123.1042      -25.420
 4:                Hip Hop    69.992   207.966  116.6895      -17.117
 5:                  Other    34.999   207.673  120.7520      -29.368
 6:               R&B/Soul     0.000   196.986  115.7516      -22.304
 7:             Jazz/Blues    69.601   203.871  127.1590      -17.010
 8:           Folk/Country    74.375   203.853  128.2663      -20.508
 9:       Reggae/Caribbean    80.753   173.818  123.8397      -15.675
10: Classical/Instrumental    94.570   158.741  120.4697      -12.649
    max_loudness avg_loudness
           <num>        <num>
 1:       -1.131    -6.678193
 2:        2.769    -6.537436
 3:       -1.101    -8.219026
 4:       -0.776    -6.587467
 5:       -0.358    -8.720965
 6:       -2.383    -8.929705
 7:       -2.551    -9.240056
 8:       -3.745    -8.864626
 9:       -3.980    -8.636481
10:       -4.281    -9.881750
# Breakdown of popularity within each tempo and loudness subgroup
tempo_loudness_popularity <- dt[, .(avg_popularity = mean(popularity, na.rm = TRUE)),
                                by = .(cut(tempo, breaks = 5), cut(loudness, breaks = 5))]
print(tempo_loudness_popularity)
              cut         cut.1 avg_popularity
           <fctr>        <fctr>          <num>
 1:    (87.2,131] (-16.5,-10.1]       35.47418
 2:    (87.2,131] (-10.1,-3.66]       38.32587
 3:   (43.6,87.2] (-10.1,-3.66]       38.44741
 4:     (131,174] (-10.1,-3.66]       37.63743
 5:     (174,218] (-10.1,-3.66]       38.55082
 6:     (131,174]   (-3.66,2.8]       35.61947
 7:    (87.2,131]   (-3.66,2.8]       38.44972
 8:   (43.6,87.2] (-16.5,-10.1]       32.86486
 9:     (174,218] (-16.5,-10.1]       33.30986
10:     (131,174] (-22.9,-16.5]       34.21053
11:     (131,174] (-16.5,-10.1]       37.44963
12:   (43.6,87.2] (-22.9,-16.5]       35.92000
13:    (87.2,131] (-22.9,-16.5]       36.06557
14:   (43.6,87.2]   (-3.66,2.8]       34.62857
15:     (174,218]   (-3.66,2.8]       38.07895
16:    (87.2,131] (-29.4,-22.9]       16.00000
17: (-0.218,43.6] (-16.5,-10.1]        0.00000
18:   (43.6,87.2] (-29.4,-22.9]       48.00000
19: (-0.218,43.6] (-10.1,-3.66]       70.00000
20:     (131,174] (-29.4,-22.9]       30.50000
21:     (174,218] (-22.9,-16.5]       35.50000
22:     (174,218] (-29.4,-22.9]       34.00000
              cut         cut.1 avg_popularity
# Identify top 3 variables most correlated with popularity
numeric_cols <- names(dt)[sapply(dt, is.numeric)]
correlations <- sapply(numeric_cols, function(col) cor(dt[[col]], dt$popularity, use = "complete.obs"))
top_3_correlations <- sort(correlations, decreasing = TRUE)[2:4] # Exclude self-correlation
print(top_3_correlations)
  danceability time_signature       loudness 
    0.05745467     0.03740262     0.03268236