library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(conflicted)
library(skimr)
library(ggcorrplot)
# Read the data set and print the first 6 rows (the default for head())
data <- read.csv("dataset.csv")
head(data)
## X track_id artists
## 1 0 5SuOikwiRyPMVoIQDJUgSV Gen Hoshino
## 2 1 4qPNDBW1i3p13qLCt0Ki3A Ben Woodward
## 3 2 1iJBSr7s7jYXzM8EGcbK5b Ingrid Michaelson;ZAYN
## 4 3 6lfxq3CG4xtTiEg7opyCyx Kina Grannis
## 5 4 5vjLSffimiIP26QG5WcN2K Chord Overstreet
## 6 5 01MVOl9KtVTNfFiBU9I7dc Tyrone Wells
## album_name
## 1 Comedy
## 2 Ghost (Acoustic)
## 3 To Begin Again
## 4 Crazy Rich Asians (Original Motion Picture Soundtrack)
## 5 Hold On
## 6 Days I Will Remember
## track_name popularity duration_ms explicit danceability
## 1 Comedy 73 230666 False 0.676
## 2 Ghost - Acoustic 55 149610 False 0.420
## 3 To Begin Again 57 210826 False 0.438
## 4 Can't Help Falling In Love 71 201933 False 0.266
## 5 Hold On 82 198853 False 0.618
## 6 Days I Will Remember 58 214240 False 0.688
## energy key loudness mode speechiness acousticness instrumentalness liveness
## 1 0.4610 1 -6.746 0 0.1430 0.0322 1.01e-06 0.3580
## 2 0.1660 1 -17.235 1 0.0763 0.9240 5.56e-06 0.1010
## 3 0.3590 0 -9.734 1 0.0557 0.2100 0.00e+00 0.1170
## 4 0.0596 0 -18.515 1 0.0363 0.9050 7.07e-05 0.1320
## 5 0.4430 2 -9.681 1 0.0526 0.4690 0.00e+00 0.0829
## 6 0.4810 6 -8.807 1 0.1050 0.2890 0.00e+00 0.1890
## valence tempo time_signature track_genre
## 1 0.715 87.917 4 acoustic
## 2 0.267 77.489 4 acoustic
## 3 0.120 76.332 4 acoustic
## 4 0.143 181.740 3 acoustic
## 5 0.167 119.949 4 acoustic
## 6 0.666 98.017 4 acoustic
# Display column names and types
str(data)
## 'data.frame': 114000 obs. of 21 variables:
## $ X : int 0 1 2 3 4 5 6 7 8 9 ...
## $ track_id : chr "5SuOikwiRyPMVoIQDJUgSV" "4qPNDBW1i3p13qLCt0Ki3A" "1iJBSr7s7jYXzM8EGcbK5b" "6lfxq3CG4xtTiEg7opyCyx" ...
## $ artists : chr "Gen Hoshino" "Ben Woodward" "Ingrid Michaelson;ZAYN" "Kina Grannis" ...
## $ album_name : chr "Comedy" "Ghost (Acoustic)" "To Begin Again" "Crazy Rich Asians (Original Motion Picture Soundtrack)" ...
## $ track_name : chr "Comedy" "Ghost - Acoustic" "To Begin Again" "Can't Help Falling In Love" ...
## $ popularity : int 73 55 57 71 82 58 74 80 74 56 ...
## $ duration_ms : int 230666 149610 210826 201933 198853 214240 229400 242946 189613 205594 ...
## $ explicit : chr "False" "False" "False" "False" ...
## $ danceability : num 0.676 0.42 0.438 0.266 0.618 0.688 0.407 0.703 0.625 0.442 ...
## $ energy : num 0.461 0.166 0.359 0.0596 0.443 0.481 0.147 0.444 0.414 0.632 ...
## $ key : int 1 1 0 0 2 6 2 11 0 1 ...
## $ loudness : num -6.75 -17.23 -9.73 -18.52 -9.68 ...
## $ mode : int 0 1 1 1 1 1 1 1 1 1 ...
## $ speechiness : num 0.143 0.0763 0.0557 0.0363 0.0526 0.105 0.0355 0.0417 0.0369 0.0295 ...
## $ acousticness : num 0.0322 0.924 0.21 0.905 0.469 0.289 0.857 0.559 0.294 0.426 ...
## $ instrumentalness: num 1.01e-06 5.56e-06 0.00 7.07e-05 0.00 0.00 2.89e-06 0.00 0.00 4.19e-03 ...
## $ liveness : num 0.358 0.101 0.117 0.132 0.0829 0.189 0.0913 0.0973 0.151 0.0735 ...
## $ valence : num 0.715 0.267 0.12 0.143 0.167 0.666 0.0765 0.712 0.669 0.196 ...
## $ tempo : num 87.9 77.5 76.3 181.7 119.9 ...
## $ time_signature : int 4 4 4 3 4 4 3 4 4 4 ...
## $ track_genre : chr "acoustic" "acoustic" "acoustic" "acoustic" ...
typeof(data)
## [1] "list"
#Verify
class(data)
## [1] "data.frame"
The str() function shows the structure of the data (column names and types), while typeof() and class() provide additional type details of the entire dataset.
conflicted::conflicts_prefer(dplyr::filter)
## [conflicted] Will prefer dplyr::filter over any other package.
# Keep only the explicit tracks (explicit == "True"), then draw a random sample of 9,000 of them.
# NOTE(review): no seed is set in the visible code before sampling, so the sampled rows
# (and every statistic derived from them) will differ between runs - confirm this is intended.
explicit_tracks <- filter(data, explicit == "True")
sample_data <- sample_n(explicit_tracks, 9000)
# The rest of the analysis works on `data`, so point it at the sample
data <- sample_data
nrow(data)
## [1] 9000
# Display first few rows
head(data)
## X track_id artists album_name track_name
## 1 15689 6elDvgK5qSrVcSf8dAkzGf Vluestar I’ll Keep You Safe Reminiscing
## 2 33062 46eK1kembooWblMi7M7yLM Brennan Savage Look at Me Now Look at Me Now
## 3 89167 7guW1bVC4aTIjxCeZpjbsR KAROL G Bájale como 7500 SEJODIOTO
## 4 85563 0KBEpTAJPI8U23yP8wSuEo 1280 Almas Pueblo Alimaña Tu Sonrisa
## 5 29448 77z5bT7KYcOJioVQ7zQyNG Jkyl & Hyde Chemical Corridor Arpf*cks
## 6 11815 6T4nmPgRs3yT7rXb4ONcIR YUNGBLUD YUNGBLUD Don't Go
## popularity duration_ms explicit danceability energy key loudness mode
## 1 55 73557 True 0.701 0.335 1 -11.377 1
## 2 68 150000 True 0.649 0.538 0 -9.793 0
## 3 0 181106 True 0.822 0.736 1 -3.969 1
## 4 37 277280 True 0.588 0.720 4 -9.752 0
## 5 27 219428 True 0.692 0.819 2 -0.896 1
## 6 56 149672 True 0.521 0.916 0 -2.832 0
## speechiness acousticness instrumentalness liveness valence tempo
## 1 0.5980 0.77200 3.62e-06 0.1350 0.2800 84.350
## 2 0.0918 0.04830 4.01e-01 0.0928 0.3550 160.075
## 3 0.1500 0.15700 7.81e-04 0.0748 0.6790 175.930
## 4 0.0307 0.00712 3.22e-04 0.1130 0.7800 109.941
## 5 0.0590 0.00748 8.46e-01 0.1070 0.0665 139.886
## 6 0.0555 0.00594 0.00e+00 0.0781 0.8410 170.012
## time_signature track_genre
## 1 4 chill
## 2 4 emo
## 3 4 reggaeton
## 4 4 punk-rock
## 5 4 dubstep
## 6 4 british
# 1. Numeric Summary for Columns - popularity and danceability
# Summary statistics using summary()
print("Summary statistics for 'popularity' and 'danceability':")
## [1] "Summary statistics for 'popularity' and 'danceability':"
print(summary(data[c("popularity", "danceability")]))
## popularity danceability
## Min. : 0.00 Min. :0.0614
## 1st Qu.:20.00 1st Qu.:0.5220
## Median :38.00 Median :0.6570
## Mean :36.52 Mean :0.6360
## 3rd Qu.:56.00 3rd Qu.:0.7720
## Max. :98.00 Max. :0.9800
# Quartiles for popularity
print("Quartiles for popularity:")
## [1] "Quartiles for popularity:"
print(quantile(data$popularity, probs = seq(0, 1, 0.25)))
## 0% 25% 50% 75% 100%
## 0 20 38 56 98
# Quartiles for danceability
print("Quartiles for danceability:")
## [1] "Quartiles for danceability:"
print(quantile(data$danceability, probs = seq(0, 1, 0.25)))
## 0% 25% 50% 75% 100%
## 0.0614 0.5220 0.6570 0.7720 0.9800
# Standard deviation for both columns
print("Standard deviation of popularity:")
## [1] "Standard deviation of popularity:"
print(sd(data$popularity))
## [1] 24.34067
print("Standard deviation of danceability:")
## [1] "Standard deviation of danceability:"
print(sd(data$danceability))
## [1] 0.1725115
# Numeric summary for two columns: popularity and duration_ms
summary(data$popularity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 20.00 38.00 36.52 56.00 98.00
summary(data$duration_ms)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 31186 162990 194000 205213 232048 4246206
# Summary statistics for popularity and duration_ms in one table.
# quantile() returns three values here (25%, 50%, 75%), so the result has
# three rows. reframe() is used instead of summarise() because returning more
# than one row per group from summarise() is deprecated since dplyr 1.1.0.
# The length-1 statistics (min, max, mean, median) are repeated on each of the
# three quantile rows.
data |>
  reframe(
    popularity_min = min(popularity, na.rm = TRUE),
    popularity_max = max(popularity, na.rm = TRUE),
    popularity_mean = mean(popularity, na.rm = TRUE),
    popularity_median = median(popularity, na.rm = TRUE),
    popularity_quantiles = quantile(popularity, probs = c(0.25, 0.5, 0.75), na.rm = TRUE),
    duration_min = min(duration_ms, na.rm = TRUE),
    duration_max = max(duration_ms, na.rm = TRUE),
    duration_mean = mean(duration_ms, na.rm = TRUE),
    duration_median = median(duration_ms, na.rm = TRUE),
    duration_quantiles = quantile(duration_ms, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
  )
## popularity_min popularity_max popularity_mean popularity_median
## 1 0 98 36.51678 38
## 2 0 98 36.51678 38
## 3 0 98 36.51678 38
## popularity_quantiles duration_min duration_max duration_mean duration_median
## 1 20 31186 4246206 205213.4 194000
## 2 38 31186 4246206 205213.4 194000
## 3 56 31186 4246206 205213.4 194000
## duration_quantiles
## 1 162990
## 2 194000
## 3 232048
# 2. Categorical Summary (Unique Values & Counts)
# Count unique values in categorical columns
print("Number of unique values in 'track_genre' and 'explicit':")
## [1] "Number of unique values in 'track_genre' and 'explicit':"
# vapply() enforces exactly one integer per column; sapply() would silently
# change its return type depending on the input
print(vapply(data[c("track_genre", "explicit")], function(x) length(unique(x)), integer(1)))
## track_genre explicit
## 106 1
# Frequency counts of each category in 'track_genre'
print("Frequency count of track genres:")
## [1] "Frequency count of track genres:"
print(table(data$track_genre))
##
## acoustic afrobeat alt-rock alternative
## 47 18 53 151
## ambient anime black-metal bluegrass
## 5 50 118 4
## blues brazil breakbeat british
## 14 79 44 39
## cantopop chicago-house chill club
## 2 14 162 77
## comedy country dance dancehall
## 596 24 159 278
## death-metal deep-house detroit-techno disco
## 234 23 7 6
## disney drum-and-bass dub dubstep
## 1 22 65 86
## edm electro electronic emo
## 103 139 118 431
## folk forro french funk
## 6 18 198 284
## garage german goth grindcore
## 89 119 61 142
## groove grunge guitar happy
## 94 69 2 115
## hard-rock hardcore hardstyle heavy-metal
## 42 298 88 81
## hip-hop house idm indian
## 290 99 12 21
## indie indie-pop industrial iranian
## 76 100 223 78
## j-dance j-idol j-pop j-rock
## 362 1 10 16
## jazz k-pop kids latin
## 1 47 7 123
## latino malay mandopop metal
## 232 34 7 133
## metalcore minimal-techno mpb new-age
## 267 4 4 4
## opera pagode party piano
## 1 13 39 12
## pop pop-film power-pop progressive-house
## 69 1 28 49
## psych-rock punk punk-rock r-n-b
## 13 98 71 88
## reggae reggaeton rock rock-n-roll
## 155 197 39 2
## rockabilly romance sad salsa
## 5 2 415 3
## samba sertanejo show-tunes singer-songwriter
## 2 6 17 42
## ska songwriter soul spanish
## 21 45 92 33
## swedish synth-pop techno trance
## 92 41 23 21
## trip-hop turkish
## 37 202
# Sorted frequency count of 'track_genre' in decreasing order
print("Sorted count of track genres (most common first):")
## [1] "Sorted count of track genres (most common first):"
print(sort(table(data$track_genre), decreasing = TRUE))
##
## comedy emo sad j-dance
## 596 431 415 362
## hardcore hip-hop funk dancehall
## 298 290 284 278
## metalcore death-metal latino industrial
## 267 234 232 223
## turkish french reggaeton chill
## 202 198 197 162
## dance reggae alternative grindcore
## 159 155 151 142
## electro metal latin german
## 139 133 123 119
## black-metal electronic happy edm
## 118 118 115 103
## indie-pop house punk groove
## 100 99 98 94
## soul swedish garage hardstyle
## 92 92 89 88
## r-n-b dubstep heavy-metal brazil
## 88 86 81 79
## iranian club indie punk-rock
## 78 77 76 71
## grunge pop dub goth
## 69 69 65 61
## alt-rock anime progressive-house acoustic
## 53 50 49 47
## k-pop songwriter breakbeat hard-rock
## 47 45 44 42
## singer-songwriter synth-pop british party
## 42 41 39 39
## rock trip-hop malay spanish
## 39 37 34 33
## power-pop country deep-house techno
## 28 24 23 23
## drum-and-bass indian ska trance
## 22 21 21 21
## afrobeat forro show-tunes j-rock
## 18 18 17 16
## blues chicago-house pagode psych-rock
## 14 14 13 13
## idm piano j-pop detroit-techno
## 12 12 10 7
## kids mandopop disco folk
## 7 7 6 6
## sertanejo ambient rockabilly bluegrass
## 6 5 5 4
## minimal-techno mpb new-age salsa
## 4 4 4 3
## cantopop guitar rock-n-roll romance
## 2 2 2 2
## samba disney j-idol jazz
## 2 1 1 1
## opera pop-film
## 1 1
# Frequency count of explicit (True/False)
print("Frequency count of explicit (True/False):")
## [1] "Frequency count of explicit (True/False):"
print(table(data$explicit))
##
## True
## 9000
# Tabulate the distinct values of a categorical vector and their counts.
#
# Arguments:
#   col    - vector (character, factor, ...) whose values are counted with table().
#   use_na - forwarded to table() as `useNA`. The default "no" leaves missing
#            values out of the result; "ifany" or "always" adds a row for NA.
#
# Returns a data frame with one row per distinct value:
#   Value      - the value, as character
#   Count.Freq - number of occurrences, as integer
unique_values <- function(col, use_na = "no") {
  # Create a frequency table
  freq_table <- table(col, useNA = use_na)
  # Convert the table into a data frame with 'Value' and 'Count.Freq'
  data.frame(
    # names() of an empty table is NULL and data.frame() drops NULL columns;
    # as.character() keeps the Value column present for empty input
    Value = as.character(names(freq_table)),
    Count.Freq = as.integer(freq_table)
  )
}
# Example usage for a categorical column like 'track_genre'
unique_values(data$track_genre)
## Value Count.Freq
## 1 acoustic 47
## 2 afrobeat 18
## 3 alt-rock 53
## 4 alternative 151
## 5 ambient 5
## 6 anime 50
## 7 black-metal 118
## 8 bluegrass 4
## 9 blues 14
## 10 brazil 79
## 11 breakbeat 44
## 12 british 39
## 13 cantopop 2
## 14 chicago-house 14
## 15 chill 162
## 16 club 77
## 17 comedy 596
## 18 country 24
## 19 dance 159
## 20 dancehall 278
## 21 death-metal 234
## 22 deep-house 23
## 23 detroit-techno 7
## 24 disco 6
## 25 disney 1
## 26 drum-and-bass 22
## 27 dub 65
## 28 dubstep 86
## 29 edm 103
## 30 electro 139
## 31 electronic 118
## 32 emo 431
## 33 folk 6
## 34 forro 18
## 35 french 198
## 36 funk 284
## 37 garage 89
## 38 german 119
## 39 goth 61
## 40 grindcore 142
## 41 groove 94
## 42 grunge 69
## 43 guitar 2
## 44 happy 115
## 45 hard-rock 42
## 46 hardcore 298
## 47 hardstyle 88
## 48 heavy-metal 81
## 49 hip-hop 290
## 50 house 99
## 51 idm 12
## 52 indian 21
## 53 indie 76
## 54 indie-pop 100
## 55 industrial 223
## 56 iranian 78
## 57 j-dance 362
## 58 j-idol 1
## 59 j-pop 10
## 60 j-rock 16
## 61 jazz 1
## 62 k-pop 47
## 63 kids 7
## 64 latin 123
## 65 latino 232
## 66 malay 34
## 67 mandopop 7
## 68 metal 133
## 69 metalcore 267
## 70 minimal-techno 4
## 71 mpb 4
## 72 new-age 4
## 73 opera 1
## 74 pagode 13
## 75 party 39
## 76 piano 12
## 77 pop 69
## 78 pop-film 1
## 79 power-pop 28
## 80 progressive-house 49
## 81 psych-rock 13
## 82 punk 98
## 83 punk-rock 71
## 84 r-n-b 88
## 85 reggae 155
## 86 reggaeton 197
## 87 rock 39
## 88 rock-n-roll 2
## 89 rockabilly 5
## 90 romance 2
## 91 sad 415
## 92 salsa 3
## 93 samba 2
## 94 sertanejo 6
## 95 show-tunes 17
## 96 singer-songwriter 42
## 97 ska 21
## 98 songwriter 45
## 99 soul 92
## 100 spanish 33
## 101 swedish 92
## 102 synth-pop 41
## 103 techno 23
## 104 trance 21
## 105 trip-hop 37
## 106 turkish 202
# Example for multiple columns
unique_values(data$explicit)
## Value Count.Freq
## 1 True 9000
unique_values(data$track_genre)
## Value Count.Freq
## 1 acoustic 47
## 2 afrobeat 18
## 3 alt-rock 53
## 4 alternative 151
## 5 ambient 5
## 6 anime 50
## 7 black-metal 118
## 8 bluegrass 4
## 9 blues 14
## 10 brazil 79
## 11 breakbeat 44
## 12 british 39
## 13 cantopop 2
## 14 chicago-house 14
## 15 chill 162
## 16 club 77
## 17 comedy 596
## 18 country 24
## 19 dance 159
## 20 dancehall 278
## 21 death-metal 234
## 22 deep-house 23
## 23 detroit-techno 7
## 24 disco 6
## 25 disney 1
## 26 drum-and-bass 22
## 27 dub 65
## 28 dubstep 86
## 29 edm 103
## 30 electro 139
## 31 electronic 118
## 32 emo 431
## 33 folk 6
## 34 forro 18
## 35 french 198
## 36 funk 284
## 37 garage 89
## 38 german 119
## 39 goth 61
## 40 grindcore 142
## 41 groove 94
## 42 grunge 69
## 43 guitar 2
## 44 happy 115
## 45 hard-rock 42
## 46 hardcore 298
## 47 hardstyle 88
## 48 heavy-metal 81
## 49 hip-hop 290
## 50 house 99
## 51 idm 12
## 52 indian 21
## 53 indie 76
## 54 indie-pop 100
## 55 industrial 223
## 56 iranian 78
## 57 j-dance 362
## 58 j-idol 1
## 59 j-pop 10
## 60 j-rock 16
## 61 jazz 1
## 62 k-pop 47
## 63 kids 7
## 64 latin 123
## 65 latino 232
## 66 malay 34
## 67 mandopop 7
## 68 metal 133
## 69 metalcore 267
## 70 minimal-techno 4
## 71 mpb 4
## 72 new-age 4
## 73 opera 1
## 74 pagode 13
## 75 party 39
## 76 piano 12
## 77 pop 69
## 78 pop-film 1
## 79 power-pop 28
## 80 progressive-house 49
## 81 psych-rock 13
## 82 punk 98
## 83 punk-rock 71
## 84 r-n-b 88
## 85 reggae 155
## 86 reggaeton 197
## 87 rock 39
## 88 rock-n-roll 2
## 89 rockabilly 5
## 90 romance 2
## 91 sad 415
## 92 salsa 3
## 93 samba 2
## 94 sertanejo 6
## 95 show-tunes 17
## 96 singer-songwriter 42
## 97 ska 21
## 98 songwriter 45
## 99 soul 92
## 100 spanish 33
## 101 swedish 92
## 102 synth-pop 41
## 103 techno 23
## 104 trance 21
## 105 trip-hop 37
## 106 turkish 202
Combined Summary: This combines a numeric summary and a categorical summary (unique values) in one concise table.
# 3. Combined Summary Using dplyr
# One-row table: min / max / mean / median / sd for popularity and
# danceability, followed by the number of distinct genres and explicit flags.
# The "{.fn}_{.col}" pattern names the columns min_popularity, max_popularity,
# ..., sd_danceability.
numeric_stats <- list(min = min, max = max, mean = mean, median = median, sd = sd)
summary_data <- data |>
  summarise(
    across(c(popularity, danceability), numeric_stats, .names = "{.fn}_{.col}"),
    unique_genres = length(unique(track_genre)),
    unique_explicit = length(unique(explicit))
  )
print("Combined summary of numeric and categorical data:")
## [1] "Combined summary of numeric and categorical data:"
print(summary_data)
## min_popularity max_popularity mean_popularity median_popularity sd_popularity
## 1 0 98 36.51678 38 24.34067
## min_danceability max_danceability mean_danceability median_danceability
## 1 0.0614 0.98 0.6360495 0.657
## sd_danceability unique_genres unique_explicit
## 1 0.1725115 106 1
print("Detailed summary using skimr:")
## [1] "Detailed summary using skimr:"
print(skim(data))
## ── Data Summary ────────────────────────
## Values
## Name data
## Number of rows 9000
## Number of columns 21
## _______________________
## Column type frequency:
## character 6
## numeric 15
## ________________________
## Group variables None
##
## ── Variable type: character ────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 track_id 0 1 22 22 0 7215 0
## 2 artists 0 1 2 181 0 3720 0
## 3 album_name 0 1 1 138 0 4722 0
## 4 track_name 0 1 1 131 0 6381 0
## 5 explicit 0 1 4 4 0 1 0
## 6 track_genre 0 1 3 17 0 106 0
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0
## 1 X 0 1 51114. 29427. 59
## 2 popularity 0 1 36.5 24.3 0
## 3 duration_ms 0 1 205213. 85355. 31186
## 4 danceability 0 1 0.636 0.173 0.0614
## 5 energy 0 1 0.722 0.187 0.0423
## 6 key 0 1 5.34 3.60 0
## 7 loudness 0 1 -6.46 3.12 -24.8
## 8 mode 0 1 0.578 0.494 0
## 9 speechiness 0 1 0.191 0.208 0.0232
## 10 acousticness 0 1 0.212 0.260 0
## 11 instrumentalness 0 1 0.0515 0.172 0
## 12 liveness 0 1 0.233 0.202 0.0196
## 13 valence 0 1 0.472 0.229 0.0215
## 14 tempo 0 1 122. 30.6 35.4
## 15 time_signature 0 1 3.96 0.406 1
## p25 p50 p75 p100 hist
## 1 28301 4.82e+4 72008. 112983 ▇▇▇▆▅
## 2 20 3.8 e+1 56 98 ▇▇▇▅▁
## 3 162990 1.94e+5 232048 4246206 ▇▁▁▁▁
## 4 0.522 6.57e-1 0.772 0.98 ▁▂▆▇▅
## 5 0.584 7.3 e-1 0.883 1 ▁▂▅▇▇
## 6 2 6 e+0 8 11 ▇▂▃▅▆
## 7 -7.91 -5.90e+0 -4.37 1.82 ▁▁▂▇▂
## 8 0 1 e+0 1 1 ▆▁▁▁▇
## 9 0.0592 1.12e-1 0.245 0.965 ▇▂▁▁▁
## 10 0.00874 9.63e-2 0.328 0.995 ▇▂▁▁▁
## 11 0 1.49e-6 0.000493 0.971 ▇▁▁▁▁
## 12 0.103 1.44e-1 0.312 0.992 ▇▃▁▁▁
## 13 0.299 4.73e-1 0.645 0.989 ▅▇▇▆▂
## 14 96.4 1.20e+2 143. 214. ▁▇▇▅▁
## 15 4 4 e+0 4 5 ▁▁▁▇▁
Does the popularity of songs vary significantly across genres?
How does the duration of a song correlate with its popularity?
Is there any connection between a song’s explicit content and its popularity or genre?
Do explicit songs tend to have higher or lower popularity?
How does the popularity of songs vary based on whether they are explicit or not?
Do songs with longer durations have higher or lower popularity?
Is there a trend in song popularity across different genres?
#Question 1: Does Popularity Vary by Genre?
# Mean popularity per genre; missing popularity values are ignored
tracks_by_genre <- group_by(data, track_genre)
agg_genre <- tracks_by_genre |>
  summarise(
    mean_popularity = mean(popularity, na.rm = TRUE),
    .groups = "drop"
  )
print(agg_genre)
## # A tibble: 106 × 2
## track_genre mean_popularity
## <chr> <dbl>
## 1 acoustic 30.5
## 2 afrobeat 26.6
## 3 alt-rock 40.2
## 4 alternative 23.8
## 5 ambient 57.2
## 6 anime 48.0
## 7 black-metal 24.8
## 8 bluegrass 24.5
## 9 blues 49.3
## 10 brazil 46.6
## # ℹ 96 more rows
This aggregation finds the mean popularity for each genre.
The aggregated table will reveal if some genres are more popular on average. This insight can inform marketing strategies or targeted content creation.
If certain genres consistently have higher popularity scores, it may suggest genre preferences or trends that can be leveraged for recommendations or advertising.
How do the distribution patterns compare for each genre?
# Visualization 1: histogram of popularity scores (30 bins)
ggplot(data) +
  geom_histogram(aes(x = popularity), bins = 30, fill = "blue", alpha = 0.7) +
  ggtitle("Distribution of Popularity") +
  xlab("Popularity") +
  ylab("Count")
### Explanation: The histogram shows the distribution of popularity scores across songs.
From the histogram, we can observe if popularity scores are normally distributed or if they are skewed towards higher or lower values.
Are there significant peaks in the popularity distribution that could indicate genre-specific trends?
# Visualization 2: scatterplot of energy against loudness
ggplot(data = sample_data, mapping = aes(x = energy, y = loudness)) +
  geom_point(color = "green", alpha = 0.5) +
  ggtitle("Energy vs Loudness") +
  xlab("Energy") +
  ylab("Loudness (dB)") +
  theme_minimal()
This scatter plot explores the relationship between a song’s energy and its loudness.
Understanding this relationship helps identify how music production techniques influence perception—high-energy tracks with low loudness could be a unique characteristic of certain genres.
# Visualization 3: density plot of acousticness
ggplot(sample_data) +
  geom_density(aes(x = acousticness), alpha = 0.5, fill = "purple") +
  ggtitle("Density Plot of Acousticness") +
  xlab("Acousticness") +
  ylab("Density") +
  theme_minimal()
This density plot shows the probability distribution of acousticness values in the dataset.
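A peak near 0 indicates that many tracks are not acoustic (electronic or heavily processed), while a peak near 1 indicates many highly acoustic tracks. A bimodal or multimodal distribution suggests there are distinct clusters of acoustic and non-acoustic tracks. A right-skewed distribution (most values near 0 with a long tail toward 1) means most tracks are electronic or heavily processed.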
Acousticness is an important factor in determining whether a song is more organic or digitally produced.
Do explicit tracks tend to have lower acousticness compared to clean tracks? How does acousticness relate to energy and tempo?