Understand why data.table is crucial for big data analysis
Learn the syntax and key features of data.table
Practice with real-world Spotify data
Compare performance with traditional methods
data.table is designed for handling large datasets (1GB to 100GB) with:
Lightning-fast operations
Memory efficiency
Concise syntax
Enhanced features for data manipulation
First, let’s install and load our required packages:
```{r} #| warning: false #| message: false
required_packages <- c(“data.table”, “tidyverse”, “microbenchmark”, “pryr”)
for (pkg in required_packages) { if (!requireNamespace(pkg, quietly = TRUE)) { install.packages(pkg) } }
#library(data.table) #library(tidyverse) #library(microbenchmark) # For precise timing #library(pryr) # For memory tracking
#Load libraries lapply(required_packages, library, character.only = TRUE)
Let's read our Spotify data using data.table:
```{r}
#| warning: false
#| message: false
# Read the dataset using fread (much faster than read.csv() or read_csv())
#spotify_dt <- fread("https://docs.google.com/spreadsheets/d/e/2PACX-1vT9iNNoFknWSmgIglkiEXnmXkJplHHeEf9wPn07tvtOV-5eTh-MdYnLS3eyST_Bx2OXpd0GC9dw5gBt/pub?gid=1801845240&single=true&output=csv")
#read in teh data from data folder
spotify_dt <- fread("https://docs.google.com/spreadsheets/d/e/2PACX-1vT9iNNoFknWSmgIglkiEXnmXkJplHHeEf9wPn07tvtOV-5eTh-MdYnLS3eyST_Bx2OXpd0GC9dw5gBt/pub?gid=1801845240&single=true&output=csv")
spotify_csv <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vT9iNNoFknWSmgIglkiEXnmXkJplHHeEf9wPn07tvtOV-5eTh-MdYnLS3eyST_Bx2OXpd0GC9dw5gBt/pub?gid=1801845240&single=true&output=csv")
Lets test it out with the fread function
{r} # Let's measure memory usage mem_before <- pryr::mem_used() system.time(spotify_dt <- data.table::fread("https://docs.google.com/spreadsheets/d/e/2PACX-1vT9iNNoFknWSmgIglkiEXnmXkJplHHeEf9wPn07tvtOV-5eTh-MdYnLS3eyST_Bx2OXpd0GC9dw5gBt/pub?gid=1801845240&single=true&output=csv")) mem_after_fread <- pryr::mem_used() fread_memory <- mem_after_fread - mem_before
Now your turn try it out with the readr function
read_csv
```{r} #comment and write your code below:
When you have an existing data frame (like one from tidyverse or base R like spotify_csv), you can convert it to a data.table using setDT().
```{r}
# Option 1: setDT() - converts in place (memory efficient)
#setDT(spotify_csv) # Converts the existing object to data.table
# Option 2: as.data.table() - creates a copy (uses more memory)
#spotify_dt <- as.data.table(spotify_csv)
setDT() is more memory efficient because it modifies
the existing objectas.data.table() creates a copy, using twice the
memoryfread()
rather than convertingHere is a breakdown for comparison:
```{r} compare_methods <- function(file_path) { # Print initial memory state cat(“Initial memory state:”) print(gc())
# Create container for results results <- list()
# 1. fread method cat(“fread method…”) mem_before <- pryr::mem_used() start_time <- Sys.time() dt1 <- data.table::fread(file_path) end_time <- Sys.time() mem_after <- pryr::mem_used() results\(fread <- list( memory = (mem_after - mem_before) / 1024^2, # Convert to MB time = end_time - start_time, size = format(object.size(dt1), units = "MB") ) cat("fread object size:", results\)fread$size, “”)
# 2. read_csv method cat(“read_csv method…”) mem_before <- pryr::mem_used() start_time <- Sys.time() csv <- readr::read_csv(file_path) end_time <- Sys.time() mem_after <- pryr::mem_used() results\(read_csv <- list( memory = (mem_after - mem_before) / 1024^2, time = end_time - start_time, size = format(object.size(csv), units = "MB") ) cat("read_csv object size:", results\)read_csv$size, “”)
# 3. setDT method cat(“setDT method…”) mem_before <- pryr::mem_used() start_time <- Sys.time() csv_copy <- data.table::copy(csv) data.table::setDT(csv_copy) end_time <- Sys.time() mem_after <- pryr::mem_used() results\(setDT <- list( memory = (mem_after - mem_before) / 1024^2, time = end_time - start_time, size = format(object.size(csv_copy), units = "MB") ) cat("setDT object size:", results\)setDT$size, “”)
# 4. as.data.table method cat(“as.data.table method…”) mem_before <- pryr::mem_used() start_time <- Sys.time() dt2 <- data.table::as.data.table(csv) end_time <- Sys.time() mem_after <- pryr::mem_used() results\(as_data_table <- list( memory = (mem_after - mem_before) / 1024^2, time = end_time - start_time, size = format(object.size(dt2), units = "MB") ) cat("as.data.table object size:", results\)as_data_table$size, “”)
# Final memory state cat(“memory state:”) print(gc())
# Create summary data.frame summary_df <- data.frame( method = names(results), memory_mb = sapply(results, function(x) round(x\(memory, 2)), time_secs = sapply(results, function(x) round(as.numeric(x\)time), 4)), final_size = sapply(results, function(x) x$size) )
return(summary_df) }
results <- compare_methods(“https://docs.google.com/spreadsheets/d/e/2PACX-1vT9iNNoFknWSmgIglkiEXnmXkJplHHeEf9wPn07tvtOV-5eTh-MdYnLS3eyST_Bx2OXpd0GC9dw5gBt/pub?gid=1801845240&single=true&output=csv”) print(“Results:”) print(results)
## 🔍 Investigation Exercise:
Without using any external resources, what can you discover about this dataset using data.table methods?
💡Think about....
- How many observations and variables are there? There are 4 observations and 4 variables.
- What types of data are present? Chr and num
```{r}
# Use these data.table commands to gather clues:
# Clue 1: Basic structure and dimensions
spotify_dt[, .N] # Number of rows
ncol(spotify_dt) # Number of columns
# Clue 2: Column names and types
spotify_dt[, lapply(.SD, class)] # Shows class of each column
#Clure 3: look at data content
spotify_dt[1:5]
# Clue 4: Unique values in categorical columns
spotify_dt[, lapply(.SD, function(x) uniqueN(x)), .SDcols = is.character]
Now let’s explore data.table’s powerful syntax for data manipulation. Think of these operations as your detective’s toolkit:
i (WHERE): Your filter 🔍 (selects specific rows)(like filter() & selec())
j (SELECT): Your magnifying glass 🔬 (focuses on specific columns or computations) (like select() or summarize())
by (GROUP BY): Your filing system 📁 (organizes findings)(like group_by())
When you only specify i, data.table returns a data.table
(like a data frame), not a vector.
```{r}
spotify_dt[1:5]
spotify_dt[popularity > 75]
spotify_dt[popularity > 75 & main_genre == “Rock”]
spotify_dt[key != 10]
spotify_dt[main_genre %in% c(“Rock”, “Pop”)]
## ✅Now you try:
Selecting Rows (i)
```{r}
#comment and write your code below:
AS a reminder
The j argument lets you select columns and perform computations:
```{r} # Select specific columns spotify_dt[, .(track_name, artist_name_s, popularity)]
spotify_dt[, .(avg_popularity = mean(popularity), max_energy = max(energy))]
spotify_dt[, .( track_count = .N, avg_duration = mean(track_duration_ms)/1000, avg_popularity = mean(popularity) )]
spotify_dt[, .(artist_name_s, track_name)]
spotify_dt[, .( song = track_name, artist = artist_name_s, popularity_score = popularity/100 )]
spotify_dt[, .( avg_popularity = mean(popularity), total_tracks = .N, avg_duration = mean(track_duration_ms)/1000 # in seconds )]
## ✅Now you try:
Selecting and computing (i)
```{r}
#comment and write your code below:
```{r} # Group by genre spotify_dt[, .( track_count = .N, avg_popularity = mean(popularity), avg_danceability = mean(danceability) ), by = main_genre]
spotify_dt[, .( track_count = .N, avg_popularity = mean(popularity) ), by = .(main_genre, explicit)]
genre_stats <- spotify_dt[, .( avg_popularity = mean(popularity, na.rm = TRUE), track_count = .N, avg_duration = mean(track_duration_ms)/1000 ), by = main_genre][]
## ✅Now you try:
Grouping (i)
```{r}
#comment and write your code below:
Think of it like a sentence:
i = “Which rows do you want?”
j = “What do you want to do with them?”
by = “How should they be grouped?”
We use the empty brackets at the end to print but you could have also
```{r} # Add duration in seconds spotify_dt[, duration_sec := track_duration_ms/1000][]
spotify_dt[, ‘:=’( popularity_scaled = popularity/100, is_popular = popularity > 75 )]
spotify_dt
## ✅Now you try:
```{r}
#comment and write your code below:
# Create new columns
spotify_dt[, `:=`(
is_popular = popularity > 75,
loudness_energy_ratio = loudness / energy
)]
# Verify new columns
names(spotify_dt)
#from data.table
spotify_dt[, names(.SD)]
Here’s how to handle dates in data.table:
```{r} # Load lubridate library(lubridate)
spotify_dt <- spotify_dt[, :=( release_date =
as.Date(album_release_date), release_year =
year(as.Date(album_release_date)), release_month =
month(as.Date(album_release_date)), release_day =
day(as.Date(album_release_date)) )]
spotify_dt[, .(release_date, release_year, release_month, release_day)][1:5]
class(spotify_dt$album_release_date)
### Ordering Results
Data.table provides efficient ways to sort:
```{r}
# Sort genres by popularity (descending)
genre_stats[order(-avg_popularity)]
# Get top 5 most popular genres
genre_stats[order(-avg_popularity)][1:5]
```{r}
year_genre_stats <- spotify_dt[, .( avg_popularity = mean(popularity), track_count = .N, avg_energy = mean(energy) ), by = .(release_year)]
year_genre_stats[order(release_year, -avg_popularity)]
## 🔍 Practice Exercise:
Using the Spotify dataset, create an analysis that:
1. Filters for songs after 2010
2. Calculates average audio features by genre
3. Identifies the top 3 genres by popularity
```{r}
# solution here
spotify_dt[year(as.Date(album_release_date)) >= 2010,
.(avg_popularity = mean(popularity),
avg_danceability = mean(danceability),
avg_energy = mean(energy),
track_count = .N),
by = main_genre
][order(-avg_popularity)
][1:3]
Data.table provides efficient ways to sort:
```{r} # Sort genres by popularity (descending) genre_stats[order(-avg_popularity)]
year_genre_stats[order(release_year, -avg_popularity)]
genre_stats[order(-avg_popularity)][1:5]
### Chaining Operations
Data.table allows you to chain operations efficiently:
```{r}
# Complex analysis chain
spotify_dt[popularity > 70
][, .(avg_energy = mean(energy),
track_count = .N),
by = main_genre
][order(-avg_energy)]
{r} # Use .SD for multiple column operations spotify_dt[, lapply(.SD, mean), .SDcols = c("danceability", "energy", "valence"), by = main_genre]
# Set key for faster lookups setkey(spotify_dt, main_genre) # Fast subset using key spotify_dt["Rock"]
```{r} # Set key for faster lookups
setkey(spotify_dt, main_genre)
spotify_dt[“Rock”]
2. Avoid copies with `:=`:
```{r}
# More efficient than creating new objects
spotify_dt[, efficiency_ratio := energy/danceability]
```{r} # Efficient multiple column operations audio_features <- c(“danceability”, “energy”, “valence”)
spotify_dt[, lapply(.SD, mean), .SDcols = audio_features, by = main_genre]
Watch out for these common mistakes:
- Forgetting the .() when selecting multiple columns
- Using \$ instead of := for assignments
- Forgetting to add \[\] at the end when you want to see results
**Remember:**
- tidyverse is great for readability and smaller datasets
- data.table shines with big data and performance needs
- Both have their place in your R toolkit
### 🎊🎉 Congratulations you finished week 4 and now on to a mini project analysis using data.table functions.
## 💡 Mini-Project Guidelines:
Create a comprehensive analysis using data.table that:
Demonstrates at least three different data.table operations Includes performance comparisons with equivalent dplyr operations Creates insightful visualizations from the results
Remember to:
- Document your code
- Explain your analysis choices
- Consider performance implications
- Think about scalability
## Want more....
- Explore more advanced data.table features
Prompt for Chat GPT/Claude/Gemini
*You are a data scientist using exploring a large data set using exploratory data analysis principles. Your task is to create code in R using data.table functions to .... {Add what you want it to do}*
Step 1) Load Data and Libraries:
```{r}
# Load required libraries
library(data.table)
library(tidyverse)
library(ggplot2)
library(microbenchmark)
# Load Spotify dataset
data_url <- "https://docs.google.com/spreadsheets/d/e/2PACX-1vT9iNNoFknWSmgIglkiEXnmXkJplHHeEf9wPn07tvtOV-5eTh-MdYnLS3eyST_Bx2OXpd0GC9dw5gBt/pub?gid=1801845240&single=true&output=csv"
spotify_dt <- fread(data_url)
spotify_tbl <- as_tibble(spotify_dt) # For dplyr comparison
Step 2) Data Exploration with data.table:
str(spotify_dt) # Displays the structure of the data.table, including column names and types
numeric_cols <- names(spotify_dt)[sapply(spotify_dt, is.numeric)] # Identify numeric columns spotify_dt[, lapply(.SD, summary), .SDcols = numeric_cols] # Apply summary() to numeric columns
2) Filtering and Selecting:
```{r}
# Filter songs with popularity > 80 and select relevant columns
popular_songs <- spotify_dt[popularity > 80, .(track_name, artist_name_s, popularity, main_genre)]
head(popular_songs)
{r} # Average popularity by genre genre_popularity <- spotify_dt[, .(avg_popularity = mean(popularity)), by = main_genre] genre_popularity[order(-avg_popularity)]
Step 3) Performance Comparison:
```{r} # data.table dt_filter_time <- system.time({ dt_filtered <- spotify_dt[popularity > 80] })
dplyr_filter_time <- system.time({ dplyr_filtered <- spotify_tbl %>% filter(popularity > 80) })
cat(“data.table filtering time:”, dt_filter_time[“elapsed”], “seconds”) cat(“dplyr filtering time:”, dplyr_filter_time[“elapsed”], “seconds”)
2) Grouping and Aggregation:
```{r}
# data.table
dt_group_time <- system.time({
dt_grouped <- spotify_dt[, .(avg_popularity = mean(popularity)), by = main_genre]
})
# dplyr
dplyr_group_time <- system.time({
dplyr_grouped <- spotify_tbl %>% group_by(main_genre) %>% summarise(avg_popularity = mean(popularity))
})
cat("data.table grouping time:", dt_group_time["elapsed"], "seconds\n")
cat("dplyr grouping time:", dplyr_group_time["elapsed"], "seconds\n")
Step 4) Visualizations:
```{r} # Top 10 genres by average popularity top_genres <- genre_popularity[order(-avg_popularity)][1:10]
ggplot(top_genres, aes(x = reorder(main_genre, -avg_popularity), y = avg_popularity)) + geom_bar(stat = “identity”, fill = “steelblue”) + labs(title = “Top 10 Genres by Average Popularity”, x = “Genre”, y = “Average Popularity”) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
2) Popularity vs. Danceability:
```{r}
# Scatter plot
ggplot(spotify_dt, aes(x = danceability, y = popularity)) +
geom_point(alpha = 0.5, color = "darkorange") +
labs(title = "Popularity vs. Danceability", x = "Danceability", y = "Popularity") +
theme_minimal()
Step 5) Advanced data.table Features:
{r} # Calculate mean for all numeric columns by genre numeric_cols <- names(spotify_dt)[sapply(spotify_dt, is.numeric)] genre_summary <- spotify_dt[, lapply(.SD, mean), .SDcols = numeric_cols, by = main_genre] head(genre_summary)
{r} # Filter, group, and sort in one chain result <- spotify_dt[popularity > 70, .(avg_energy = mean(energy)), by = main_genre][order(-avg_energy)] head(result)