# Read the dataset using fread (much faster than read.csv() or read_csv())#spotify_dt <- fread("https://docs.google.com/spreadsheets/d/e/2PACX-1vT9iNNoFknWSmgIglkiEXnmXkJplHHeEf9wPn07tvtOV-5eTh-MdYnLS3eyST_Bx2OXpd0GC9dw5gBt/pub?gid=1801845240&single=true&output=csv")#read in teh data from data folderspotify_dt <-fread("week_4/data/spotify_clean.csv")spotify_csv <-read_csv("week_4/data/spotify_clean.csv")
When you have an existing data frame (like one from tidyverse or base R like spotify_csv), you can convert it to a data.table using setDT().
# Option 1: setDT() - converts in place (memory efficient)#setDT(spotify_csv) # Converts the existing object to data.table# Option 2: as.data.table() - creates a copy (uses more memory)#spotify_dt <- as.data.table(spotify_csv)
setDT() is more memory efficient because it modifies the existing object
as.data.table() creates a copy, using twice the memory
For big data, it’s best to read directly with fread() rather than converting
The conversion only needs to happen if you start with a regular data frame
spotify_dt <-fread("week_4/data/spotify_clean.csv")if (nrow(spotify_dt) ==0) {stop("Error: spotify_clean.csv did not load correctly. Check the file path!")}# Then apply your filterpopular_tracks <- spotify_dt[popularity >75]# Clue 1: Basic structure and dimensionsspotify_dt[, .N] # Number of rows
[1] 9999
ncol(spotify_dt) # Number of columns
[1] 30
# Clue 2: Column names and typesspotify_dt[, lapply(.SD, class)] # Shows class of each column
track_name artist_name_s album_name album_artist_name_s album_release_date
<char> <char> <char> <char> <char>
1: character character character character character
2: character character character character character
disc_number track_number track_duration_ms explicit popularity isrc
<char> <char> <char> <char> <char> <char>
1: integer integer integer logical integer character
2: integer integer integer logical integer character
added_by added_at artist_genres danceability energy key loudness
<char> <char> <char> <char> <char> <char> <char>
1: character POSIXct character numeric numeric integer numeric
2: character POSIXt character numeric numeric integer numeric
mode speechiness acousticness instrumentalness liveness valence tempo
<char> <char> <char> <char> <char> <char> <char>
1: integer numeric numeric numeric numeric numeric numeric
2: integer numeric numeric numeric numeric numeric numeric
time_signature album_genres label copyrights main_genre
<char> <char> <char> <char> <char>
1: integer logical character character character
2: integer logical character character character
| means “OR” (at least one condition must be true)
! means “NOT” (reverses the condition)
2.) Column Operations (j)
The j argument lets you select columns and perform computations:
# Select specific columnsspotify_dt[, .(track_name, artist_name_s, popularity)]
track_name
<char>
1: Justified & Ancient - Stand by the Jams
2: I Know You Want Me (Calle Ocho)
3: From the Bottom of My Broken Heart
4: Apeman - 2014 Remastered Version
5: You Can't Always Get What You Want
---
9995: Kernkraft 400 (A Better Day)
9996: Never Say Never - Radio Edit
9997: Groovejet (If This Ain't Love) [feat. Sophie Ellis-Bextor]
9998: Lay Low
9999: Padam Padam
artist_name_s popularity
<char> <int>
1: The KLF 0
2: Pitbull 64
3: Britney Spears 56
4: The Kinks 42
5: The Rolling Stones 0
---
9995: Topic, A7S 79
9996: Vandalism 17
9997: Spiller, Sophie Ellis-Bextor 62
9998: Tiësto 87
9999: Kylie Minogue 69
# Select specific columnsspotify_dt[, .(artist_name_s, track_name)]
artist_name_s
<char>
1: The KLF
2: Pitbull
3: Britney Spears
4: The Kinks
5: The Rolling Stones
---
9995: Topic, A7S
9996: Vandalism
9997: Spiller, Sophie Ellis-Bextor
9998: Tiësto
9999: Kylie Minogue
track_name
<char>
1: Justified & Ancient - Stand by the Jams
2: I Know You Want Me (Calle Ocho)
3: From the Bottom of My Broken Heart
4: Apeman - 2014 Remastered Version
5: You Can't Always Get What You Want
---
9995: Kernkraft 400 (A Better Day)
9996: Never Say Never - Radio Edit
9997: Groovejet (If This Ain't Love) [feat. Sophie Ellis-Bextor]
9998: Lay Low
9999: Padam Padam
# Create new columns in outputspotify_dt[, .(song = track_name,artist = artist_name_s,popularity_score = popularity/100)]
song
<char>
1: Justified & Ancient - Stand by the Jams
2: I Know You Want Me (Calle Ocho)
3: From the Bottom of My Broken Heart
4: Apeman - 2014 Remastered Version
5: You Can't Always Get What You Want
---
9995: Kernkraft 400 (A Better Day)
9996: Never Say Never - Radio Edit
9997: Groovejet (If This Ain't Love) [feat. Sophie Ellis-Bextor]
9998: Lay Low
9999: Padam Padam
artist popularity_score
<char> <num>
1: The KLF 0.00
2: Pitbull 0.64
3: Britney Spears 0.56
4: The Kinks 0.42
5: The Rolling Stones 0.00
---
9995: Topic, A7S 0.79
9996: Vandalism 0.17
9997: Spiller, Sophie Ellis-Bextor 0.62
9998: Tiësto 0.87
9999: Kylie Minogue 0.69
main_genre explicit track_count avg_popularity
<char> <lgcl> <int> <num>
1: Dance/Electronic FALSE 252 32.81746
2: Pop FALSE 5345 39.74275
3: Rock FALSE 2363 37.50994
4: Hip Hop FALSE 132 31.15152
5: Other FALSE 924 25.98377
6: R&B/Soul FALSE 301 35.05316
7: Pop TRUE 349 48.36390
8: Jazz/Blues FALSE 54 22.24074
9: Hip Hop TRUE 82 34.69512
10: Folk/Country FALSE 89 33.40449
11: Reggae/Caribbean FALSE 27 48.22222
12: Rock TRUE 50 40.68000
13: Other TRUE 22 30.90909
14: Classical/Instrumental TRUE 1 0.00000
15: Dance/Electronic TRUE 2 26.00000
16: Classical/Instrumental FALSE 3 19.66667
17: Folk/Country TRUE 2 90.50000
18: R&B/Soul TRUE 1 0.00000
# Group by genre for popularity analysisgenre_stats <- spotify_dt[, .(avg_popularity =mean(popularity, na.rm =TRUE),track_count = .N,avg_duration =mean(track_duration_ms)/1000), by = main_genre][]
✅Now you try:
Grouping (i)
#comment and write your code below:
Think of it like a sentence:
i = “Which rows do you want?”
j = “What do you want to do with them?”
by = “How should they be grouped?”
5. Advanced Features
Adding/Updating Columns (:=)
We use the empty brackets at the end to print but you could have also
# Add duration in secondsspotify_dt[, duration_sec := track_duration_ms/1000][]
#comment and write your code below:# Create new columnsspotify_dt[, `:=`(is_popular = popularity >75,loudness_energy_ratio = loudness / energy)]# Verify new columnsnames(spotify_dt)
# Load lubridatelibrary(lubridate)# Convert and extract date componentsspotify_dt <- spotify_dt[, `:=`(release_date =as.Date(album_release_date),release_year =year(as.Date(album_release_date)),release_month =month(as.Date(album_release_date)),release_day =day(as.Date(album_release_date)))]# View the new date columnsspotify_dt[, .(release_date, release_year, release_month, release_day)][1:5]
Key: <main_genre>
main_genre danceability energy valence
<char> <num> <num> <num>
1: Classical/Instrumental 0.5580000 0.5570000 0.6500000
2: Dance/Electronic 0.6779331 0.8231969 0.5845252
3: Folk/Country 0.5683407 0.5402066 0.5439121
4: Hip Hop 0.7607477 0.7146449 0.6088201
5: Jazz/Blues 0.5241852 0.5895185 0.7212222
6: Other NA NA NA
7: Pop 0.6243319 0.6954419 0.5665183
8: R&B/Soul 0.6557980 0.6104570 0.6947848
9: Reggae/Caribbean 0.7564815 0.6411481 0.7690000
10: Rock 0.5480435 0.6759753 0.5957726
Watch out for these common mistakes:
Forgetting the .() when selecting multiple columns
Using $ instead of := for assignments
Forgetting to add [] at the end when you want to see results
Remember:
tidyverse is great for readability and smaller datasets
data.table shines with big data and performance needs
Both have their place in your R toolkit
🎊🎉 Congratulations you finished week 4 and now on to a mini project analysis using data.table functions.
💡 Mini-Project Guidelines:
Create a comprehensive analysis using data.table that:
Demonstrates at least three different data.table operations Includes performance comparisons with equivalent dplyr operations Creates insightful visualizations from the results
Remember to:
Document your code
Explain your analysis choices
Consider performance implications
Think about scalability
Want more….
Explore more advanced data.table features
Prompt for Chat GPT/Claude/Gemini
You are a data scientist using exploring a large data set using exploratory data analysis principles. Your task is to create code in R using data.table functions to …. {Add what you want it to do}
Analysis: Who are the most popular artists (top 5) in their respective categories - such as Pop, Rock, and Other
Prompt for ChatGPT:You are a data scientist using exploring a large data set using exploratory data analysis principles. Your task is to create code in R using data.table functions to find out which artists have the most danceability.
#load the datasetspotify_dt <-fread("week_4/data/spotify_clean.csv")#compute average danceability by artist, trach count is more than 2artist_dance <- spotify_dt[, .(track_count = .N,avg_danceability =mean(danceability)), by = artist_name_s] [track_count >10]#filter out artist with more than 0.75 danceabilityartist_dance[avg_danceability >=0.80]
Empty data.table (0 rows and 3 cols): artist_name_s,track_count,avg_danceability
#sort the artists in descending order to get moset danceable firstartist_danceability <- artist_dance[order(-avg_danceability)]#display the top 10 artistsprint(head(artist_danceability, 10))
artist_name_s track_count avg_danceability
<char> <int> <num>
1: Eminem 23 0.7915217
2: Prince 15 0.7909333
3: Nicki Minaj 11 0.7731818
4: Michael Jackson 21 0.7664286
5: Black Eyed Peas 28 0.7599286
6: Janet Jackson 14 0.7599286
7: Jennifer Lopez 13 0.7559231
8: Justin Timberlake 13 0.7335385
9: Britney Spears 27 0.7272963
10: Meghan Trainor 12 0.7260000
#find which genre is most danceable by groupingdance_genre_dt <- spotify_dt[, .(track_count = .N,avg_danceability_genre =mean(danceability)), by = main_genre]# create a visualization for genre and average danceabilityggplot(dance_genre_dt, aes(x = main_genre, y = avg_danceability_genre)) +geom_col(fill ="plum") +coord_flip() +labs(title ="Average Danceability By Genre", x ="Genre", y ="Average Danceability") +theme_minimal()
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_col()`).