# Read the dataset using fread (much faster than read.csv() or read_csv())#spotify_dt <- fread("https://docs.google.com/spreadsheets/d/e/2PACX-1vT9iNNoFknWSmgIglkiEXnmXkJplHHeEf9wPn07tvtOV-5eTh-MdYnLS3eyST_Bx2OXpd0GC9dw5gBt/pub?gid=1801845240&single=true&output=csv")#read in teh data from data folderspotify_dt <-fread("spotify_clean.csv")spotify_csv <-read_csv("spotify_clean.csv")
Now your turn try it out with the readr function read_csv
#comment and write your code below:
When you have an existing data frame (like one from tidyverse or base R like spotify_csv), you can convert it to a data.table using setDT().
# Option 1: setDT() - converts in place (memory efficient)#setDT(spotify_csv) # Converts the existing object to data.table# Option 2: as.data.table() - creates a copy (uses more memory)#spotify_dt <- as.data.table(spotify_csv)
setDT() is more memory efficient because it modifies the existing object
as.data.table() creates a copy, using twice the memory
For big data, it’s best to read directly with fread() rather than converting
The conversion only needs to happen if you start with a regular data frame
| means “OR” (at least one condition must be true)
! means “NOT” (reverses the condition)
2.) Column Operations (j)
The j argument lets you select columns and perform computations:
# Select specific columnsspotify_dt[, .(track_name, artist_name_s, popularity)]
track_name
<char>
1: Justified & Ancient - Stand by the Jams
2: I Know You Want Me (Calle Ocho)
3: From the Bottom of My Broken Heart
4: Apeman - 2014 Remastered Version
5: You Can't Always Get What You Want
---
9995: Kernkraft 400 (A Better Day)
9996: Never Say Never - Radio Edit
9997: Groovejet (If This Ain't Love) [feat. Sophie Ellis-Bextor]
9998: Lay Low
9999: Padam Padam
artist_name_s popularity
<char> <int>
1: The KLF 0
2: Pitbull 64
3: Britney Spears 56
4: The Kinks 42
5: The Rolling Stones 0
---
9995: Topic, A7S 79
9996: Vandalism 17
9997: Spiller, Sophie Ellis-Bextor 62
9998: Tiësto 87
9999: Kylie Minogue 69
# Select specific columnsspotify_dt[, .(artist_name_s, track_name)]
artist_name_s
<char>
1: The KLF
2: Pitbull
3: Britney Spears
4: The Kinks
5: The Rolling Stones
---
9995: Topic, A7S
9996: Vandalism
9997: Spiller, Sophie Ellis-Bextor
9998: Tiësto
9999: Kylie Minogue
track_name
<char>
1: Justified & Ancient - Stand by the Jams
2: I Know You Want Me (Calle Ocho)
3: From the Bottom of My Broken Heart
4: Apeman - 2014 Remastered Version
5: You Can't Always Get What You Want
---
9995: Kernkraft 400 (A Better Day)
9996: Never Say Never - Radio Edit
9997: Groovejet (If This Ain't Love) [feat. Sophie Ellis-Bextor]
9998: Lay Low
9999: Padam Padam
# Create new columns in outputspotify_dt[, .(song = track_name,artist = artist_name_s,popularity_score = popularity/100)]
song
<char>
1: Justified & Ancient - Stand by the Jams
2: I Know You Want Me (Calle Ocho)
3: From the Bottom of My Broken Heart
4: Apeman - 2014 Remastered Version
5: You Can't Always Get What You Want
---
9995: Kernkraft 400 (A Better Day)
9996: Never Say Never - Radio Edit
9997: Groovejet (If This Ain't Love) [feat. Sophie Ellis-Bextor]
9998: Lay Low
9999: Padam Padam
artist popularity_score
<char> <num>
1: The KLF 0.00
2: Pitbull 0.64
3: Britney Spears 0.56
4: The Kinks 0.42
5: The Rolling Stones 0.00
---
9995: Topic, A7S 0.79
9996: Vandalism 0.17
9997: Spiller, Sophie Ellis-Bextor 0.62
9998: Tiësto 0.87
9999: Kylie Minogue 0.69
main_genre explicit track_count avg_popularity
<char> <lgcl> <int> <num>
1: Dance/Electronic FALSE 252 32.81746
2: Pop FALSE 5345 39.74275
3: Rock FALSE 2363 37.50994
4: Hip Hop FALSE 132 31.15152
5: Other FALSE 924 25.98377
6: R&B/Soul FALSE 301 35.05316
7: Pop TRUE 349 48.36390
8: Jazz/Blues FALSE 54 22.24074
9: Hip Hop TRUE 82 34.69512
10: Folk/Country FALSE 89 33.40449
11: Reggae/Caribbean FALSE 27 48.22222
12: Rock TRUE 50 40.68000
13: Other TRUE 22 30.90909
14: Classical/Instrumental TRUE 1 0.00000
15: Dance/Electronic TRUE 2 26.00000
16: Classical/Instrumental FALSE 3 19.66667
17: Folk/Country TRUE 2 90.50000
18: R&B/Soul TRUE 1 0.00000
# Group by genre for popularity analysisgenre_stats <- spotify_dt[, .(avg_popularity =mean(popularity, na.rm =TRUE),track_count = .N,avg_duration =mean(track_duration_ms)/1000), by = main_genre][]
✅Now you try:
Grouping (i)
#comment and write your code below:
Think of it like a sentence:
i = “Which rows do you want?”
j = “What do you want to do with them?”
by = “How should they be grouped?”
5. Advanced Features
Adding/Updating Columns (:=)
We use the empty brackets at the end to print but you could have also
# Add duration in secondsspotify_dt[, duration_sec := track_duration_ms/1000][]
#comment and write your code below:# Create new columnsspotify_dt[, `:=`(is_popular = popularity >75,loudness_energy_ratio = loudness / energy)]# Verify new columnsnames(spotify_dt)
# Load lubridatelibrary(lubridate)# Convert and extract date componentsspotify_dt <- spotify_dt[, `:=`(release_date =as.Date(album_release_date),release_year =year(as.Date(album_release_date)),release_month =month(as.Date(album_release_date)),release_day =day(as.Date(album_release_date)))]# View the new date columnsspotify_dt[, .(release_date, release_year, release_month, release_day)][1:5]
Key: <main_genre>
main_genre danceability energy valence
<char> <num> <num> <num>
1: Classical/Instrumental 0.5580000 0.5570000 0.6500000
2: Dance/Electronic 0.6779331 0.8231969 0.5845252
3: Folk/Country 0.5683407 0.5402066 0.5439121
4: Hip Hop 0.7607477 0.7146449 0.6088201
5: Jazz/Blues 0.5241852 0.5895185 0.7212222
6: Other NA NA NA
7: Pop 0.6243319 0.6954419 0.5665183
8: R&B/Soul 0.6557980 0.6104570 0.6947848
9: Reggae/Caribbean 0.7564815 0.6411481 0.7690000
10: Rock 0.5480435 0.6759753 0.5957726
Watch out for these common mistakes:
Forgetting the .() when selecting multiple columns
Using $ instead of := for assignments
Forgetting to add [] at the end when you want to see results
Remember:
tidyverse is great for readability and smaller datasets
data.table shines with big data and performance needs
Both have their place in your R toolkit
🎊🎉 Congratulations you finished week 4 and now on to a mini project analysis using data.table functions.
💡 Mini-Project Guidelines:
Create a comprehensive analysis using data.table that:
Demonstrates at least three different data.table operations Includes performance comparisons with equivalent dplyr operations Creates insightful visualizations from the results
Remember to:
Document your code
Explain your analysis choices
Consider performance implications
Think about scalability
Want more….
Explore more advanced data.table features
Prompt for Chat GPT/Claude/Gemini
You are a data scientist using exploring a large data set using exploratory data analysis principles. Your task is to create code in R using data.table functions to …. {Add what you want it to do}
# Load necessary librarieslibrary(data.table) # Efficient data handlinglibrary(ggplot2) # Visualization# Read the dataset using fread for efficiencyspotify_dt <-fread("spotify_clean.csv")# ---- 1. Filtering & Aggregation ----# Find the top 5 genres by average popularitytop_genres <- spotify_dt[, .(avg_popularity =mean(popularity, na.rm =TRUE), # Calculate average popularitytrack_count = .N # Count number of songs), by = main_genre][order(-avg_popularity)][1:5] # Sort in descending orderprint("Top 5 Genres by Popularity:")
# ---- 2. Creating a New Column with := ----# Add a column scaling popularity from 0 to 1spotify_dt[, popularity_scaled := popularity /100]# View the first 5 rows to confirm the new columnprint("First 5 rows with new popularity_scaled column:")
[1] "First 5 rows with new popularity_scaled column:"
track_name artist_name_s popularity
<char> <char> <int>
1: Justified & Ancient - Stand by the Jams The KLF 0
2: I Know You Want Me (Calle Ocho) Pitbull 64
3: From the Bottom of My Broken Heart Britney Spears 56
4: Apeman - 2014 Remastered Version The Kinks 42
5: You Can't Always Get What You Want The Rolling Stones 0
popularity_scaled
<num>
1: 0.00
2: 0.64
3: 0.56
4: 0.42
5: 0.00
# ---- 3. Grouping & Summarization ----# Group by genre and calculate mean energy and danceabilitygenre_audio_stats <- spotify_dt[, .(avg_energy =mean(energy, na.rm =TRUE),avg_danceability =mean(danceability, na.rm =TRUE),track_count = .N), by = main_genre][order(-avg_energy)] # Sort by energy levelprint("Genre-wise average energy and danceability:")