# Read the dataset using fread (much faster than read.csv() or read_csv())#spotify_dt <- fread("https://docs.google.com/spreadsheets/d/e/2PACX-1vT9iNNoFknWSmgIglkiEXnmXkJplHHeEf9wPn07tvtOV-5eTh-MdYnLS3eyST_Bx2OXpd0GC9dw5gBt/pub?gid=1801845240&single=true&output=csv")#read in teh data from data folderspotify_dt <-fread("data/spotify_clean.csv")spotify_csv <-read_csv("data/spotify_clean.csv")
When you have an existing data frame (like one from tidyverse or base R like spotify_csv), you can convert it to a data.table using setDT().
# Option 1: setDT() - converts in place (memory efficient)#setDT(spotify_csv) # Converts the existing object to data.table# Option 2: as.data.table() - creates a copy (uses more memory)#spotify_dt <- as.data.table(spotify_csv)
setDT() is more memory efficient because it modifies the existing object
as.data.table() creates a copy, using twice the memory
For big data, it’s best to read directly with fread() rather than converting
The conversion only needs to happen if you start with a regular data frame
Without using any external resources, what can you discover about this dataset using data.table methods?
💡Think about….
How many observations and variables are there?
What types of data are present?
"# Use these data.table commands to gather clues:# Clue 1: Basic structure and dimensionsspotify_dt[, .N] # Number of rowsncol(spotify_dt) # Number of columns# Clue 2: Column names and typesspotify_dt[, lapply(.SD, class)] # Shows class of each column#Clure 3: look at data contentspotify_dt[1:5]# Clue 4: Unique values in categorical columnsspotify_dt[, lapply(.SD, function(x) uniqueN(x)), .SDcols = is.character]"
[1] "\n# Use these data.table commands to gather clues:\n\n# Clue 1: Basic structure and dimensions\nspotify_dt[, .N] # Number of rows\nncol(spotify_dt) # Number of columns\n\n# Clue 2: Column names and types\nspotify_dt[, lapply(.SD, class)] # Shows class of each column\n\n#Clure 3: look at data content\nspotify_dt[1:5]\n\n# Clue 4: Unique values in categorical columns\nspotify_dt[, lapply(.SD, function(x) uniqueN(x)), .SDcols = is.character]\n"
Data.table Operations - data.table Syntax: [ i, j, by ]
Now let’s explore data.table’s powerful syntax for data manipulation. Think of these operations as your detective’s toolkit:
i (WHERE): Your filter 🔍 (selects specific rows)(like filter() & selec())
j (SELECT): Your magnifying glass 🔬 (focuses on specific columns or computations) (like select() or summarize())
by (GROUP BY): Your filing system 📁 (organizes findings)(like group_by())
1.) Basic Operations
Selecting Rows (i)
When you only specify i, data.table returns a data.table (like a data frame), not a vector.
'# Select first 5 rowsspotify_dt[1:5]# Filter by popularityspotify_dt[popularity > 75]# Multiple conditionsspotify_dt[popularity > 75 & main_genre == "Rock"]# Excluding certain valuesspotify_dt[key != 10]# Select tracks by genrespotify_dt[main_genre %in% c("Rock", "Pop")]'
[1] "\n# Select first 5 rows\nspotify_dt[1:5]\n\n# Filter by popularity\nspotify_dt[popularity > 75]\n\n# Multiple conditions\nspotify_dt[popularity > 75 & main_genre == \"Rock\"]\n\n# Excluding certain values\nspotify_dt[key != 10]\n\n# Select tracks by genre\nspotify_dt[main_genre %in% c(\"Rock\", \"Pop\")]\n"
✅Now you try:
Selecting Rows (i)
"# Selecting data with track_duration_ms of > 300000spotify_dt[track_duration_ms > 300000 ]"
[1] "\n# Selecting data with track_duration_ms of > 300000\nspotify_dt[track_duration_ms > 300000 ]\n"
AS a reminder
Comparison Operators
== means “exactly equal to”
!= means “not equal to”
> means “greater than”
< means “less than”
>= means “greater than or equal to”
<= means “less than or equal to”
Logical Operators
& means “AND” (both conditions must be true)
| means “OR” (at least one condition must be true)
! means “NOT” (reverses the condition)
2.) Column Operations (j)
The j argument lets you select columns and perform computations:
"# Select specific columnsspotify_dt[, .(track_name, artist_name_s, popularity)]# Compute summary statisticsspotify_dt[, .(avg_popularity = mean(popularity), max_energy = max(energy))]# Multiple computationsspotify_dt[, .( track_count = .N, avg_duration = mean(track_duration_ms)/1000, avg_popularity = mean(popularity))]# Select specific columnsspotify_dt[, .(artist_name_s, track_name)]# Create new columns in outputspotify_dt[, .( song = track_name, artist = artist_name_s, popularity_score = popularity/100)]# Calculate summary statisticsspotify_dt[, .( avg_popularity = mean(popularity), total_tracks = .N, avg_duration = mean(track_duration_ms)/1000 # in seconds)]"
[1] "\n# Select specific columns\nspotify_dt[, .(track_name, artist_name_s, popularity)]\n\n# Compute summary statistics\nspotify_dt[, .(avg_popularity = mean(popularity),\n max_energy = max(energy))]\n\n# Multiple computations\nspotify_dt[, .(\n track_count = .N,\n avg_duration = mean(track_duration_ms)/1000,\n avg_popularity = mean(popularity)\n)]\n\n# Select specific columns\nspotify_dt[, .(artist_name_s, track_name)]\n\n# Create new columns in output\nspotify_dt[, .(\n song = track_name,\n artist = artist_name_s,\n popularity_score = popularity/100\n)]\n\n# Calculate summary statistics\nspotify_dt[, .(\n avg_popularity = mean(popularity),\n total_tracks = .N,\n avg_duration = mean(track_duration_ms)/1000 # in seconds\n)]\n"
✅Now you try:
Selecting and computing (i)
'# Selecting the popularity and track duration columnsspotify_dt[, .(popularity, track_duration_ms)]'
[1] "\n# Selecting the popularity and track duration columns\nspotify_dt[, .(popularity, track_duration_ms)]\n"
Grouping Operations (by)
'# Group by genrespotify_dt[, .( track_count = .N, avg_popularity = mean(popularity), avg_danceability = mean(danceability)), by = main_genre]# Multiple grouping variablesspotify_dt[, .( track_count = .N, avg_popularity = mean(popularity)), by = .(main_genre, explicit)]# Group by genre for popularity analysisgenre_stats <- spotify_dt[, .( avg_popularity = mean(popularity, na.rm = TRUE), track_count = .N, avg_duration = mean(track_duration_ms)/1000), by = main_genre][]'
[1] "\n# Group by genre\nspotify_dt[, .(\n track_count = .N,\n avg_popularity = mean(popularity),\n avg_danceability = mean(danceability)\n), by = main_genre]\n\n# Multiple grouping variables\nspotify_dt[, .(\n track_count = .N,\n avg_popularity = mean(popularity)\n), by = .(main_genre, explicit)]\n\n# Group by genre for popularity analysis\ngenre_stats <- spotify_dt[, .(\n avg_popularity = mean(popularity, na.rm = TRUE),\n track_count = .N,\n avg_duration = mean(track_duration_ms)/1000\n), by = main_genre][]\n\n"
✅Now you try:
Grouping (i)
'#comment and write your code below:spotify_dt[, .( track_count = .N, avg_popularity = mean(popularity)), by = explicit]'
[1] "\n#comment and write your code below:\nspotify_dt[, .(\n track_count = .N,\n avg_popularity = mean(popularity)\n), by = explicit]\n"
Think of it like a sentence:
i = “Which rows do you want?”
j = “What do you want to do with them?”
by = “How should they be grouped?”
5. Advanced Features
Adding/Updating Columns (:=)
We use the empty brackets at the end to print but you could have also
"#comment and write your code below:# Create new columnsspotify_dt[, `:=`( is_popular = popularity > 75, loudness_energy_ratio = loudness / energy)]# Verify new columnsnames(spotify_dt) #from data.tablespotify_dt[, names(.SD)]"
[1] "\n#comment and write your code below:\n\n# Create new columns\nspotify_dt[, `:=`(\n is_popular = popularity > 75,\n loudness_energy_ratio = loudness / energy\n)]\n\n# Verify new columns\nnames(spotify_dt) \n\n#from data.table\nspotify_dt[, names(.SD)]\n"
Working with Dates
Here’s how to handle dates in data.table:
"# Load lubridatelibrary(lubridate)# Convert and extract date componentsspotify_dt <- spotify_dt[, `:=`( release_date = as.Date(album_release_date), release_year = year(as.Date(album_release_date)), release_month = month(as.Date(album_release_date)), release_day = day(as.Date(album_release_date)))]# View the new date columnsspotify_dt[, .(release_date, release_year, release_month, release_day)][1:5]class(spotify_dt$album_release_date)"
[1] "\n# Load lubridate\nlibrary(lubridate)\n\n# Convert and extract date components\nspotify_dt <- spotify_dt[, `:=`(\n release_date = as.Date(album_release_date),\n release_year = year(as.Date(album_release_date)),\n release_month = month(as.Date(album_release_date)),\n release_day = day(as.Date(album_release_date))\n)]\n\n# View the new date columns\nspotify_dt[, .(release_date, release_year, release_month, release_day)][1:5]\n\nclass(spotify_dt$album_release_date)\n"
Ordering Results
Data.table provides efficient ways to sort:
'# Sort genres by popularity (descending)genre_stats[order(-avg_popularity)]# Get top 5 most popular genresgenre_stats[order(-avg_popularity)][1:5]'
[1] "\n# Sort genres by popularity (descending)\ngenre_stats[order(-avg_popularity)]\n\n# Get top 5 most popular genres\ngenre_stats[order(-avg_popularity)][1:5]\n"
'# Sort genres by popularity (descending)genre_stats[order(-avg_popularity)]# Sort by multiple columnsyear_genre_stats[order(release_year, -avg_popularity)]# Get top 5 most popular genresgenre_stats[order(-avg_popularity)][1:5]'
[1] "\n# Sort genres by popularity (descending)\ngenre_stats[order(-avg_popularity)]\n\n# Sort by multiple columns\nyear_genre_stats[order(release_year, -avg_popularity)]\n\n# Get top 5 most popular genres\ngenre_stats[order(-avg_popularity)][1:5]\n"
Chaining Operations
Data.table allows you to chain operations efficiently:
Forgetting the .() when selecting multiple columns
Using $ instead of := for assignments
Forgetting to add [] at the end when you want to see results
Remember:
tidyverse is great for readability and smaller datasets
data.table shines with big data and performance needs
Both have their place in your R toolkit
🎊🎉 Congratulations you finished week 4 and now on to a mini project analysis using data.table functions.
💡 Mini-Project Guidelines:
Create a comprehensive analysis using data.table that:
Demonstrates at least three different data.table operations Includes performance comparisons with equivalent dplyr operations Creates insightful visualizations from the results
Remember to:
Document your code
Explain your analysis choices
Consider performance implications
Think about scalability
Want more….
Explore more advanced data.table features
Prompt for Chat GPT/Claude/Gemini
You are a data scientist using exploring a large data set using exploratory data analysis principles. Your task is to create code in R using data.table functions to …. {Add what you want it to do}
Mini Project
Project Description: For this project, I wanted to replicate the project done in week 3, just utilizing data.table instead of tidyverse. To reiterate the goal of this project, I would like to look at the popularity of songs by decade, and how it corresponds to whether or not a song is explicit.
I chose to analyze whether there is a trend between whether a song is explicit or not and its popularity, as well as how this trend may change by decade. To complete this project I group by ‘decade’ and subgroup by whether the ‘explicit’ value is true or false. During this process, I removed all entried that had missing value in the release date, explicit, and popularity columns. Additionally, I exempted the years that dindt contain bit explicit and non explicit content. Within the groups defined before, I averaged and displayed the popularity value to determine if there is a correlation between explicit content and popularity, as well as and changes by decade.
library(data.table)library(lubridate) # For date manipulationlibrary(tidyverse)spotify_csv <-read_csv("data/spotify_clean.csv")
Rows: 9999 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (11): track_name, artist_name_s, album_name, album_artist_name_s, album...
dbl (16): disc_number, track_number, track_duration_ms, popularity, danceab...
lgl (2): explicit, album_genres
dttm (1): added_at
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
spotify_dt <-fread("data/spotify_clean.csv")# Convert album_release_date to Date formatsystem.time(spotify_dt[, release_date :=ymd(album_release_date)])
# Group by decade and explicit columns and calculate average popularitysystem.time(popularity_result <- spotify_clean[, .(average_popularity =mean(popularity, na.rm =TRUE)), by = .(decade, explicit)])
user system elapsed
0.00 0.00 0.01
# Remove decades that don't have both explicit and non-explicit songssystem.time(valid_decades <- popularity_result[, .N, by = decade][N >1, decade])
# Grouping by decade and explicit columnssystem.time(popularity_result_dplyr <- spotify_clean_dplyr %>%group_by(decade, explicit) %>%summarise(average_popularity =mean(popularity, na.rm =TRUE),.groups ="drop"))
user system elapsed
0.02 0.00 0.00
# Removing the years that dont have both explicit and non explicit songssystem.time(popularity_result_dplyr <- popularity_result_dplyr %>%group_by(decade) %>%filter(any(explicit ==TRUE)) %>%# Keep only decades with at least one TRUEungroup())
user system elapsed
0.00 0.00 0.01
AI Mini Project
## First Prompt"You are a data scientist using exploring a large data set using exploratory data analysis principles. Your task is to create code in R using data.table functions to look for trends in popularity based on the main_genre, loudness, tempo, and explicit rating.Attachment: spotify_clean.csv"
[1] "\nYou are a data scientist using exploring a large data set using \nexploratory data analysis principles. Your task is to create code \nin R using data.table functions to look for trends in popularity \nbased on the main_genre, loudness, tempo, and explicit rating.\nAttachment: spotify_clean.csv\n"
library(data.table)# Load datasetdt <-fread("data/spotify_clean.csv")# Remove rows with missing values in key columnsdt <- dt[!is.na(loudness) &!is.na(tempo) &!is.na(popularity) &!is.na(main_genre)]# Convert 'explicit' to a factordt[, explicit :=as.factor(explicit)]# Average popularity by main_genregenre_popularity <- dt[, .(avg_popularity =mean(popularity, na.rm =TRUE)), by = main_genre][order(-avg_popularity)]print(genre_popularity)
main_genre avg_popularity
<char> <num>
1: Reggae/Caribbean 48.22222
2: Pop 40.27116
3: Rock 37.57563
4: R&B/Soul 34.93709
5: Folk/Country 34.65934
6: Dance/Electronic 32.76378
7: Hip Hop 32.50935
8: Other 26.15360
9: Jazz/Blues 22.24074
10: Classical/Instrumental 14.75000
# Correlation between loudness, tempo, and popularitycor_matrix <- dt[, .(cor_loudness_pop =cor(loudness, popularity, use ="complete.obs"),cor_tempo_pop =cor(tempo, popularity, use ="complete.obs"))]print(cor_matrix)
## Second prompt"Remove the explicit quantifier. I would like a breakdown showing the ranges of tempo and loudness within each genre and the breakdowns of popularity in each subgroup."
[1] "\nRemove the explicit quantifier. I would like a breakdown showing \nthe ranges of tempo and loudness within each genre and the breakdowns \nof popularity in each subgroup.\n"
library(data.table)# Load datasetdt <-fread("data/spotify_clean.csv")# Remove rows with missing values in key columnsdt <- dt[!is.na(loudness) &!is.na(tempo) &!is.na(popularity) &!is.na(main_genre)]# Average popularity by main_genregenre_popularity <- dt[, .(avg_popularity =mean(popularity, na.rm =TRUE)), by = main_genre][order(-avg_popularity)]print(genre_popularity)
main_genre avg_popularity
<char> <num>
1: Reggae/Caribbean 48.22222
2: Pop 40.27116
3: Rock 37.57563
4: R&B/Soul 34.93709
5: Folk/Country 34.65934
6: Dance/Electronic 32.76378
7: Hip Hop 32.50935
8: Other 26.15360
9: Jazz/Blues 22.24074
10: Classical/Instrumental 14.75000
# Correlation between loudness, tempo, and popularitycor_matrix <- dt[, .(cor_loudness_pop =cor(loudness, popularity, use ="complete.obs"),cor_tempo_pop =cor(tempo, popularity, use ="complete.obs"))]print(cor_matrix)
# Breakdown of popularity within each tempo and loudness subgrouptempo_loudness_popularity <- dt[, .(avg_popularity =mean(popularity, na.rm =TRUE)), by = .(cut(tempo, breaks =5), cut(loudness, breaks =5))]print(tempo_loudness_popularity)
## Third prompt"Finally, can you take all columns into consideration and find the three variables with the strongest correlation to popularity."
[1] "\nFinally, can you take all columns into consideration and find the three \nvariables with the strongest correlation to popularity.\n"
library(data.table)# Load datasetdt <-fread("data/spotify_clean.csv")# Remove rows with missing values in key columnsdt <- dt[!is.na(loudness) &!is.na(tempo) &!is.na(popularity) &!is.na(main_genre)]# Average popularity by main_genregenre_popularity <- dt[, .(avg_popularity =mean(popularity, na.rm =TRUE)), by = main_genre][order(-avg_popularity)]print(genre_popularity)
main_genre avg_popularity
<char> <num>
1: Reggae/Caribbean 48.22222
2: Pop 40.27116
3: Rock 37.57563
4: R&B/Soul 34.93709
5: Folk/Country 34.65934
6: Dance/Electronic 32.76378
7: Hip Hop 32.50935
8: Other 26.15360
9: Jazz/Blues 22.24074
10: Classical/Instrumental 14.75000
# Correlation between loudness, tempo, and popularitycor_matrix <- dt[, .(cor_loudness_pop =cor(loudness, popularity, use ="complete.obs"),cor_tempo_pop =cor(tempo, popularity, use ="complete.obs"))]print(cor_matrix)
# Breakdown of popularity within each tempo and loudness subgrouptempo_loudness_popularity <- dt[, .(avg_popularity =mean(popularity, na.rm =TRUE)), by = .(cut(tempo, breaks =5), cut(loudness, breaks =5))]print(tempo_loudness_popularity)