# Read the dataset using fread (much faster than read.csv() or read_csv())#spotify_dt <- fread("https://docs.google.com/spreadsheets/d/e/2PACX-1vT9iNNoFknWSmgIglkiEXnmXkJplHHeEf9wPn07tvtOV-5eTh-MdYnLS3eyST_Bx2OXpd0GC9dw5gBt/pub?gid=1801845240&single=true&output=csv")#read in teh data from data folderspotify_dt <-fread("data/spotify_clean.csv")spotify_csv <-read_csv("data/spotify_clean.csv")
When you have an existing data frame (like one from tidyverse or base R like spotify_csv), you can convert it to a data.table using setDT().
# Option 1: setDT() - converts in place (memory efficient)#setDT(spotify_csv) # Converts the existing object to data.table# Option 2: as.data.table() - creates a copy (uses more memory)#spotify_dt <- as.data.table(spotify_csv)
setDT() is more memory efficient because it modifies the existing object
as.data.table() creates a copy, using twice the memory
For big data, it’s best to read directly with fread() rather than converting
The conversion only needs to happen if you start with a regular data frame
#comment and write your code below:# Select songs released after 2015recent_songs <- spotify_dt[album_release_date =="2014-10-20"]# Display the first few recent songshead(recent_songs)
| means “OR” (at least one condition must be true)
! means “NOT” (reverses the condition)
2.) Column Operations (j)
The j argument lets you select columns and perform computations:
# Select specific columnsspotify_dt[, .(track_name, artist_name_s, popularity)]
track_name
<char>
1: Justified & Ancient - Stand by the Jams
2: I Know You Want Me (Calle Ocho)
3: From the Bottom of My Broken Heart
4: Apeman - 2014 Remastered Version
5: You Can't Always Get What You Want
---
9995: Kernkraft 400 (A Better Day)
9996: Never Say Never - Radio Edit
9997: Groovejet (If This Ain't Love) [feat. Sophie Ellis-Bextor]
9998: Lay Low
9999: Padam Padam
artist_name_s popularity
<char> <int>
1: The KLF 0
2: Pitbull 64
3: Britney Spears 56
4: The Kinks 42
5: The Rolling Stones 0
---
9995: Topic, A7S 79
9996: Vandalism 17
9997: Spiller, Sophie Ellis-Bextor 62
9998: Tiësto 87
9999: Kylie Minogue 69
# Select specific columnsspotify_dt[, .(artist_name_s, track_name)]
artist_name_s
<char>
1: The KLF
2: Pitbull
3: Britney Spears
4: The Kinks
5: The Rolling Stones
---
9995: Topic, A7S
9996: Vandalism
9997: Spiller, Sophie Ellis-Bextor
9998: Tiësto
9999: Kylie Minogue
track_name
<char>
1: Justified & Ancient - Stand by the Jams
2: I Know You Want Me (Calle Ocho)
3: From the Bottom of My Broken Heart
4: Apeman - 2014 Remastered Version
5: You Can't Always Get What You Want
---
9995: Kernkraft 400 (A Better Day)
9996: Never Say Never - Radio Edit
9997: Groovejet (If This Ain't Love) [feat. Sophie Ellis-Bextor]
9998: Lay Low
9999: Padam Padam
# Create new columns in outputspotify_dt[, .(song = track_name,artist = artist_name_s,popularity_score = popularity/100)]
song
<char>
1: Justified & Ancient - Stand by the Jams
2: I Know You Want Me (Calle Ocho)
3: From the Bottom of My Broken Heart
4: Apeman - 2014 Remastered Version
5: You Can't Always Get What You Want
---
9995: Kernkraft 400 (A Better Day)
9996: Never Say Never - Radio Edit
9997: Groovejet (If This Ain't Love) [feat. Sophie Ellis-Bextor]
9998: Lay Low
9999: Padam Padam
artist popularity_score
<char> <num>
1: The KLF 0.00
2: Pitbull 0.64
3: Britney Spears 0.56
4: The Kinks 0.42
5: The Rolling Stones 0.00
---
9995: Topic, A7S 0.79
9996: Vandalism 0.17
9997: Spiller, Sophie Ellis-Bextor 0.62
9998: Tiësto 0.87
9999: Kylie Minogue 0.69
#comment and write your code below:# Select songs with high energy and compute average danceabilityhigh_energy_stats <- spotify_dt[energy >0.8, .(avg_danceability =mean(danceability, na.rm =TRUE))]# Display the resulthigh_energy_stats
avg_danceability
<num>
1: 0.6077926
Grouping Operations (by)
# Group by genrespotify_dt[, .(track_count = .N,avg_popularity =mean(popularity),avg_danceability =mean(danceability)), by = main_genre]
main_genre track_count avg_popularity avg_danceability
<char> <int> <num> <num>
1: Dance/Electronic 254 32.76378 0.6779331
2: Pop 5694 40.27116 0.6243319
3: Rock 2413 37.57563 0.5480435
4: Hip Hop 214 32.50935 0.7607477
5: Other 946 26.09831 NA
6: R&B/Soul 302 34.93709 0.6557980
7: Jazz/Blues 54 22.24074 0.5241852
8: Folk/Country 91 34.65934 0.5683407
9: Reggae/Caribbean 27 48.22222 0.7564815
10: Classical/Instrumental 4 14.75000 0.5580000
main_genre explicit track_count avg_popularity
<char> <lgcl> <int> <num>
1: Dance/Electronic FALSE 252 32.81746
2: Pop FALSE 5345 39.74275
3: Rock FALSE 2363 37.50994
4: Hip Hop FALSE 132 31.15152
5: Other FALSE 924 25.98377
6: R&B/Soul FALSE 301 35.05316
7: Pop TRUE 349 48.36390
8: Jazz/Blues FALSE 54 22.24074
9: Hip Hop TRUE 82 34.69512
10: Folk/Country FALSE 89 33.40449
11: Reggae/Caribbean FALSE 27 48.22222
12: Rock TRUE 50 40.68000
13: Other TRUE 22 30.90909
14: Classical/Instrumental TRUE 1 0.00000
15: Dance/Electronic TRUE 2 26.00000
16: Classical/Instrumental FALSE 3 19.66667
17: Folk/Country TRUE 2 90.50000
18: R&B/Soul TRUE 1 0.00000
# Group by genre for popularity analysisgenre_stats <- spotify_dt[, .(avg_popularity =mean(popularity, na.rm =TRUE),track_count = .N,avg_duration =mean(track_duration_ms)/1000), by = main_genre][]
✅Now you try:
Grouping (i)
#comment and write your code below:# Group by artist and calculate total popularityartist_popularity <- spotify_dt[, .(total_popularity =sum(popularity, na.rm =TRUE)), by = artist_name_s]# Display the first few resultshead(artist_popularity)
artist_name_s total_popularity
<char> <int>
1: The KLF 155
2: Pitbull 217
3: Britney Spears 1652
4: The Kinks 289
5: The Rolling Stones 1394
6: Fleetwood Mac 916
Think of it like a sentence:
i = “Which rows do you want?”
j = “What do you want to do with them?”
by = “How should they be grouped?”
5. Advanced Features
Adding/Updating Columns (:=)
We use the empty brackets at the end to print but you could have also
# Add duration in secondsspotify_dt[, duration_sec := track_duration_ms/1000][]
#comment and write your code below:# Create new columnsspotify_dt[, `:=`(is_popular = popularity >75,loudness_energy_ratio = loudness / energy)]# Verify new columnsnames(spotify_dt)
# Display the first few rows to check new columnshead(spotify_dt[, .(track_name, is_popular, loudness_energy_ratio)])
track_name is_popular loudness_energy_ratio
<char> <lgcl> <num>
1: Justified & Ancient - Stand by the Jams FALSE -14.111239
2: I Know You Want Me (Calle Ocho) FALSE -8.068641
3: From the Bottom of My Broken Heart FALSE -7.775940
4: Apeman - 2014 Remastered Version FALSE -12.252747
5: You Can't Always Get What You Want FALSE -15.328549
6: Don't Stop - 2004 Remaster TRUE -10.878873
Working with Dates
Here’s how to handle dates in data.table:
# Load lubridatelibrary(lubridate)# Convert and extract date componentsspotify_dt <- spotify_dt[, `:=`(release_date =as.Date(album_release_date),release_year =year(as.Date(album_release_date)),release_month =month(as.Date(album_release_date)),release_day =day(as.Date(album_release_date)))]# View the new date columnsspotify_dt[, .(release_date, release_year, release_month, release_day)][1:5]
Key: <main_genre>
main_genre danceability energy valence
<char> <num> <num> <num>
1: Classical/Instrumental 0.5580000 0.5570000 0.6500000
2: Dance/Electronic 0.6779331 0.8231969 0.5845252
3: Folk/Country 0.5683407 0.5402066 0.5439121
4: Hip Hop 0.7607477 0.7146449 0.6088201
5: Jazz/Blues 0.5241852 0.5895185 0.7212222
6: Other NA NA NA
7: Pop 0.6243319 0.6954419 0.5665183
8: R&B/Soul 0.6557980 0.6104570 0.6947848
9: Reggae/Caribbean 0.7564815 0.6411481 0.7690000
10: Rock 0.5480435 0.6759753 0.5957726
Watch out for these common mistakes:
Forgetting the .() when selecting multiple columns
Using $ instead of := for assignments
Forgetting to add [] at the end when you want to see results
Remember:
tidyverse is great for readability and smaller datasets
data.table shines with big data and performance needs
Both have their place in your R toolkit
🎊🎉 Congratulations you finished week 4 and now on to a mini project analysis using data.table functions.
💡 Mini-Project Guidelines:
Create a comprehensive analysis using data.table that:
Demonstrates at least three different data.table operations Includes performance comparisons with equivalent dplyr operations Creates insightful visualizations from the results
Remember to:
Document your code
Explain your analysis choices
Consider performance implications
Think about scalability
Want more….
Explore more advanced data.table features
Prompt for Chat GPT/Claude/Gemini
You are a data scientist using exploring a large data set using exploratory data analysis principles. Your task is to create code in R using data.table functions to …. {Add what you want it to do}