library(tidytuesdayR)
## Warning: package 'tidytuesdayR' was built under R version 4.4.3
tt = tt_load("2020-01-21")
## ---- Compiling #TidyTuesday Information for 2020-01-21 ----
## --- There is 1 file available ---
##
##
## ── Downloading files ───────────────────────────────────────────────────────────
##
## 1 of 1: "spotify_songs.csv"
df = tt$spotify_songs
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Ini digunakan untuk melihat secara sekilas mengenai data, seperti beberapa value awal dan tipe datanya
glimpse(df)
## Rows: 32,833
## Columns: 23
## $ track_id <chr> "6f807x0ima9a1j3VPbc7VN", "0r7CVbZTWZgbTCYdfa…
## $ track_name <chr> "I Don't Care (with Justin Bieber) - Loud Lux…
## $ track_artist <chr> "Ed Sheeran", "Maroon 5", "Zara Larsson", "Th…
## $ track_popularity <dbl> 66, 67, 70, 60, 69, 67, 62, 69, 68, 67, 58, 6…
## $ track_album_id <chr> "2oCs0DGTsRO98Gh5ZSl2Cx", "63rPSO264uRjW1X5E6…
## $ track_album_name <chr> "I Don't Care (with Justin Bieber) [Loud Luxu…
## $ track_album_release_date <chr> "2019-06-14", "2019-12-13", "2019-07-05", "20…
## $ playlist_name <chr> "Pop Remix", "Pop Remix", "Pop Remix", "Pop R…
## $ playlist_id <chr> "37i9dQZF1DXcZDD7cfEKhW", "37i9dQZF1DXcZDD7cf…
## $ playlist_genre <chr> "pop", "pop", "pop", "pop", "pop", "pop", "po…
## $ playlist_subgenre <chr> "dance pop", "dance pop", "dance pop", "dance…
## $ danceability <dbl> 0.748, 0.726, 0.675, 0.718, 0.650, 0.675, 0.4…
## $ energy <dbl> 0.916, 0.815, 0.931, 0.930, 0.833, 0.919, 0.8…
## $ key <dbl> 6, 11, 1, 7, 1, 8, 5, 4, 8, 2, 6, 8, 1, 5, 5,…
## $ loudness <dbl> -2.634, -4.969, -3.432, -3.778, -4.672, -5.38…
## $ mode <dbl> 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, …
## $ speechiness <dbl> 0.0583, 0.0373, 0.0742, 0.1020, 0.0359, 0.127…
## $ acousticness <dbl> 0.10200, 0.07240, 0.07940, 0.02870, 0.08030, …
## $ instrumentalness <dbl> 0.00e+00, 4.21e-03, 2.33e-05, 9.43e-06, 0.00e…
## $ liveness <dbl> 0.0653, 0.3570, 0.1100, 0.2040, 0.0833, 0.143…
## $ valence <dbl> 0.518, 0.693, 0.613, 0.277, 0.725, 0.585, 0.1…
## $ tempo <dbl> 122.036, 99.972, 124.008, 121.956, 123.976, 1…
## $ duration_ms <dbl> 194754, 162600, 176616, 169093, 189052, 16304…
summary(df)
## track_id track_name track_artist track_popularity
## Length:32833 Length:32833 Length:32833 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 24.00
## Mode :character Mode :character Mode :character Median : 45.00
## Mean : 42.48
## 3rd Qu.: 62.00
## Max. :100.00
## track_album_id track_album_name track_album_release_date
## Length:32833 Length:32833 Length:32833
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## playlist_name playlist_id playlist_genre playlist_subgenre
## Length:32833 Length:32833 Length:32833 Length:32833
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## danceability energy key loudness
## Min. :0.0000 Min. :0.000175 Min. : 0.000 Min. :-46.448
## 1st Qu.:0.5630 1st Qu.:0.581000 1st Qu.: 2.000 1st Qu.: -8.171
## Median :0.6720 Median :0.721000 Median : 6.000 Median : -6.166
## Mean :0.6548 Mean :0.698619 Mean : 5.374 Mean : -6.720
## 3rd Qu.:0.7610 3rd Qu.:0.840000 3rd Qu.: 9.000 3rd Qu.: -4.645
## Max. :0.9830 Max. :1.000000 Max. :11.000 Max. : 1.275
## mode speechiness acousticness instrumentalness
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000000
## 1st Qu.:0.0000 1st Qu.:0.0410 1st Qu.:0.0151 1st Qu.:0.0000000
## Median :1.0000 Median :0.0625 Median :0.0804 Median :0.0000161
## Mean :0.5657 Mean :0.1071 Mean :0.1753 Mean :0.0847472
## 3rd Qu.:1.0000 3rd Qu.:0.1320 3rd Qu.:0.2550 3rd Qu.:0.0048300
## Max. :1.0000 Max. :0.9180 Max. :0.9940 Max. :0.9940000
## liveness valence tempo duration_ms
## Min. :0.0000 Min. :0.0000 Min. : 0.00 Min. : 4000
## 1st Qu.:0.0927 1st Qu.:0.3310 1st Qu.: 99.96 1st Qu.:187819
## Median :0.1270 Median :0.5120 Median :121.98 Median :216000
## Mean :0.1902 Mean :0.5106 Mean :120.88 Mean :225800
## 3rd Qu.:0.2480 3rd Qu.:0.6930 3rd Qu.:133.92 3rd Qu.:253585
## Max. :0.9960 Max. :0.9910 Max. :239.44 Max. :517810
Menghapus beberapa kolom yang tidak digunakan untuk analisis
df = df %>%
select(-c(track_id, track_album_id, playlist_id))
Set agar output terlihat semua
options(tibble.width = Inf)
df %>%
summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 20
## track_name track_artist track_popularity track_album_name
## <int> <int> <int> <int>
## 1 5 5 0 5
## track_album_release_date playlist_name playlist_genre playlist_subgenre
## <int> <int> <int> <int>
## 1 0 0 0 0
## danceability energy key loudness mode speechiness acousticness
## <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0 0
## instrumentalness liveness valence tempo duration_ms
## <int> <int> <int> <int> <int>
## 1 0 0 0 0 0
df %>%
duplicated %>%
sum
## [1] 0
Ini digunakan untuk melihat apakah ada nilai dengan arti yang sama, namun beda penulisan
df %>%
count(playlist_genre)
## # A tibble: 6 × 2
## playlist_genre n
## <chr> <int>
## 1 edm 6043
## 2 latin 5155
## 3 pop 5507
## 4 r&b 5431
## 5 rap 5746
## 6 rock 4951
df %>%
count(playlist_subgenre)
## # A tibble: 24 × 2
## playlist_subgenre n
## <chr> <int>
## 1 album rock 1065
## 2 big room 1206
## 3 classic rock 1296
## 4 dance pop 1298
## 5 electro house 1511
## 6 electropop 1408
## 7 gangster rap 1458
## 8 hard rock 1485
## 9 hip hop 1322
## 10 hip pop 1256
## # ℹ 14 more rows
Dalam data deskripsi, terdapat beberapa data yang memiliki rentang wajarnya. Kelompok kami menyesuaikan data dengan itu dan menghapus yang tidak sesuai. Untuk selain itu, dianggap masih wajar dan akan tetap digunakan sebagai informasi yang berguna. Berdasarkan summary dari data diatas, ditemukan nilai yang tidak sesuai tepatnya pada kolom Loudness dan Tempo.
ggplot(df, aes(y = loudness)) +
geom_boxplot()+
labs(title = "Boxplot of Loudness")
df %>%
filter(loudness>0) %>%
summarise(count = n())
## # A tibble: 1 × 1
## count
## <int>
## 1 6
df = df %>%
filter(loudness < 0 )
ggplot(df, aes(y = tempo)) +
geom_boxplot()+
labs(title = "Boxplot of Tempo")
df %>%
filter(tempo <= 0) %>%
summarise(count = n())
## # A tibble: 1 × 1
## count
## <int>
## 1 1
df = df %>%
filter(tempo > 0 )
df %>% summarise(across(everything(), class))
## # A tibble: 1 × 20
## track_name track_artist track_popularity track_album_name
## <chr> <chr> <chr> <chr>
## 1 character character numeric character
## track_album_release_date playlist_name playlist_genre playlist_subgenre
## <chr> <chr> <chr> <chr>
## 1 character character character character
## danceability energy key loudness mode speechiness acousticness
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 numeric numeric numeric numeric numeric numeric numeric
## instrumentalness liveness valence tempo duration_ms
## <chr> <chr> <chr> <chr> <chr>
## 1 numeric numeric numeric numeric numeric
Ini mempermudah proses analisis, karena untuk data kategorik sudah dijadikan faktor bukan character sehingga dianggap sebagai sebuah kategori dan untuk date agar data bisa diparse menjadi year maupun month sesuai kebutuhan analisis.
df$playlist_genre = as.factor(df$playlist_genre)
df$playlist_subgenre = as.factor(df$playlist_subgenre)
df$track_album_release_date = as.Date(df$track_album_release_date)
df = df %>%
drop_na()
summary(df)
## track_name track_artist track_popularity track_album_name
## Length:30935 Length:30935 Min. : 0.00 Length:30935
## Class :character Class :character 1st Qu.: 25.00 Class :character
## Mode :character Mode :character Median : 45.00 Mode :character
## Mean : 42.76
## 3rd Qu.: 62.00
## Max. :100.00
##
## track_album_release_date playlist_name playlist_genre
## Min. :1957-01-01 Length:30935 edm :5967
## 1st Qu.:2010-12-03 Class :character latin:4961
## Median :2017-01-27 Mode :character pop :5303
## Mean :2012-09-09 r&b :5094
## 3rd Qu.:2019-05-16 rap :5465
## Max. :2020-01-29 rock :4145
##
## playlist_subgenre danceability energy
## progressive electro house: 1760 Min. :0.0771 Min. :0.000175
## indie poptimism : 1647 1st Qu.:0.5660 1st Qu.:0.582000
## latin hip hop : 1572 Median :0.6740 Median :0.721000
## neo soul : 1547 Mean :0.6573 Mean :0.698825
## southern hip hop : 1512 3rd Qu.:0.7620 3rd Qu.:0.840000
## pop edm : 1506 Max. :0.9830 Max. :1.000000
## (Other) :21391
## key loudness mode speechiness
## Min. : 0.000 Min. :-46.448 Min. :0.0000 Min. :0.0224
## 1st Qu.: 2.000 1st Qu.: -8.073 1st Qu.:0.0000 1st Qu.:0.0415
## Median : 6.000 Median : -6.095 Median :1.0000 Median :0.0636
## Mean : 5.368 Mean : -6.640 Mean :0.5609 Mean :0.1082
## 3rd Qu.: 9.000 3rd Qu.: -4.606 3rd Qu.:1.0000 3rd Qu.:0.1340
## Max. :11.000 Max. : -0.046 Max. :1.0000 Max. :0.9180
##
## acousticness instrumentalness liveness valence
## Min. :1.40e-06 Min. :0.0000000 Min. :0.00936 Min. :0.00001
## 1st Qu.:1.52e-02 1st Qu.:0.0000000 1st Qu.:0.09310 1st Qu.:0.32600
## Median :8.10e-02 Median :0.0000153 Median :0.12700 Median :0.50600
## Mean :1.76e-01 Mean :0.0869723 Mean :0.18997 Mean :0.50507
## 3rd Qu.:2.56e-01 3rd Qu.:0.0050600 3rd Qu.:0.24700 3rd Qu.:0.68700
## Max. :9.94e-01 Max. :0.9940000 Max. :0.99600 Max. :0.99100
##
## tempo duration_ms
## Min. : 35.48 Min. : 29493
## 1st Qu.: 99.97 1st Qu.:186750
## Median :122.00 Median :214400
## Mean :120.94 Mean :223962
## 3rd Qu.:133.52 3rd Qu.:251117
## Max. :239.44 Max. :517810
##
df$year = format(df$track_album_release_date, "%Y")
Kami memutuskan untuk memilih tahun 2019, karena kami merasa bahwa itu saat dimana meningkatnya pengguna Spotify sehingga bisa mendapatkan lebih banyak insight.
df_2019 = df%>%
filter(year == 2019)
Ditemukan bahwa terdapat tren positif yang cukup terlihat pada latin, pop, dan rap Pada rock, r&b, dan edm, tren terlihat cukup flat relasinya
ggplot(df, aes(x = danceability, y = track_popularity)) +
geom_point(alpha= 0.3) +
facet_wrap(~playlist_genre, scales = 'free',labeller = label_wrap_gen(width = 15))+
geom_smooth(method = "loess", se = FALSE, color = "black") +
geom_smooth(method = "lm", se = FALSE, color = "red")+
theme(
strip.text = element_text(face = "bold", size = 12, color = "blue"),
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)
)+
labs(title = "Danceability vs Popularity",
x = "Danceability", y = "Popularity")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Ditemukan bahwa terdapat tren positif yang cukup terlihat pada latin, pop, r&b dan rap Pada rock dan edm, tren terlihat cukup flat relasinya
ggplot(df_2019, aes(x = danceability, y = track_popularity)) +
geom_point(alpha= 0.3) +
facet_wrap(~playlist_genre, scales = 'free',labeller = label_wrap_gen(width = 15))+
geom_smooth(method = "loess", se = FALSE, color = "black") +
geom_smooth(method = "lm", se = FALSE, color = "red")+
theme(
strip.text = element_text(face = "bold", size = 12, color = "blue"),
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)
)+
labs(title = "Danceability vs Popularity (2019)",
x = "Danceability", y = "Popularity")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Group dibagi berdasarkan kuartil data.
df$dance_group <- cut(df$danceability,
breaks = quantile(df$danceability, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low (0.0771 - 0.605)", "Medium (0.606 - 0.732)", "High (0.733 - 0.983"),
include.lowest = TRUE)
df %>%
group_by(dance_group) %>%
summarise(
min_danceability = min(danceability, na.rm = TRUE),
max_danceability = max(danceability, na.rm = TRUE),
.groups = "drop"
)
## # A tibble: 3 × 3
## dance_group min_danceability max_danceability
## <fct> <dbl> <dbl>
## 1 Low (0.0771 - 0.605) 0.0771 0.605
## 2 Medium (0.606 - 0.732) 0.606 0.732
## 3 High (0.733 - 0.983 0.733 0.983
Ditemukan bahwa pada median setiap boxplot, terlihat hampir sama antara group low dengan nilai di sekitar 43-44, sedangkan untuk group high menunjukan nilai di sekitar 50.
ggplot(df, aes(x = dance_group, y = track_popularity, fill = dance_group)) +
geom_boxplot() +
labs(title = "Popularity by Danceability Group",
x = "Danceability Group",
y = "Popularity",
fill = "Dancebility") +
theme(
strip.text = element_text(face = "bold", size = 12, color = "blue"),
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
legend.position = "none"
)
df_2019$dance_group <- cut(df_2019$danceability,
breaks = quantile(df$danceability, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
labels = c("Low (0.0771-0.605)", "Medium (0.606-0.732)", "High (0.733-0.979)"),
include.lowest = TRUE)
df_2019 %>%
group_by(dance_group) %>%
summarise(
min_danceability = min(danceability, na.rm = TRUE),
max_danceability = max(danceability, na.rm = TRUE),
.groups = "drop"
)
## # A tibble: 3 × 3
## dance_group min_danceability max_danceability
## <fct> <dbl> <dbl>
## 1 Low (0.0771-0.605) 0.0771 0.605
## 2 Medium (0.606-0.732) 0.606 0.732
## 3 High (0.733-0.979) 0.733 0.979
Untuk data 2019, ditemukan bahwa pada median setiap boxplot, terlihat hampir sama antara group low dengan nilai di sekitar 44-45, sedangkan untuk group high menunjukan nilai di sekitar 60.
ggplot(df_2019, aes(x = dance_group, y = track_popularity, fill = dance_group)) +
geom_boxplot() +
labs(title = "Popularity by Danceability Group (2019)",
x = "Danceability Group",
y = "Popularity",
fill = "Dancebility") +
theme(
strip.text = element_text(face = "bold", size = 12, color = "blue"),
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
legend.position = "none"
)
Ini menunjukan bahwa terdapat kemungkinan tingkat danceability bisa mempengaruhi popularitas lagu, terutama pada genre tertentu. Ini mengindikasikan bahwa penambahan elemen danceable dapat meningkatkan daya tarik lagu dalam genre Latin, Pop, R&B, dan Rap.
Didapat bahwa terdapat tren negatif yang cukup terlihat genre edm,latin,pop,r&b,dan rap Pada genre rock tren terlihat cukup datar
ggplot(df,aes(x=duration_ms,y=track_popularity)) +
geom_point()+
geom_smooth(method="lm",color="red")+
facet_wrap(~playlist_genre)+
theme_minimal()+
labs(title="Duration vs Popularity",x="Duration (ms)",y="Popularity")
## `geom_smooth()` using formula = 'y ~ x'
## Scatter Plot Hubungan Popularity dengan Duration (2019) Didapat bahwa
terdapat tren positif yang cukup terlihat genre latin dan rap dan tren
negatif pada genre edm, pop, dan r&b.
ggplot(df_2019,aes(x=duration_ms,y=track_popularity)) +
geom_point()+
geom_smooth(method="lm",color="red")+
facet_wrap(~playlist_genre)+
theme_minimal()+
labs(title="Duration vs Popularity (2019)",x="Duration (ms)",y="Popularity")
## `geom_smooth()` using formula = 'y ~ x'
df$duration_group <- cut(df$duration_ms,
breaks=quantile(df$duration_ms,probs=c(0,1/3,2/3,1),na.rm=T),
labels=c("Low(29493 - 196320 ms)","Medium (196348 - 236093 ms)","High (236107 - 517810 ms)"),
include.lowest=T)
df_2019$duration_group <- cut(df_2019$duration_ms,
breaks=quantile(df_2019$duration_ms,probs=c(0,1/3,2/3,1),na.rm=T),
labels=c("Low (54656 - 177838 ms)","Medium(177847 - 208567 ms)","High (208571 - 508545 ms"),
include.lowest=T)
df %>%
group_by(duration_group) %>%
summarise(
min_danceability = min(duration_ms, na.rm = TRUE),
max_danceability = max(duration_ms, na.rm = TRUE),
.groups = "drop"
)
## # A tibble: 3 × 3
## duration_group min_danceability max_danceability
## <fct> <dbl> <dbl>
## 1 "Low(29493 - 196320 ms)" 29493 196320
## 2 "Medium (196348 - 236093\tms)" 196348 236093
## 3 "High (236107 - 517810 ms)" 236107 517810
df_2019 %>%
group_by(duration_group) %>%
summarise(
min_danceability = min(duration_ms, na.rm = TRUE),
max_danceability = max(duration_ms, na.rm = TRUE),
.groups = "drop"
)
## # A tibble: 3 × 3
## duration_group min_danceability max_danceability
## <fct> <dbl> <dbl>
## 1 "Low (54656\t- 177838 ms)" 54656 177838
## 2 "Medium(177847 - 208567 ms)" 177847 208567
## 3 "High (208571 -\t508545 ms" 208571 508545
Pada median setiap boxplot, terlihat hampir sama antara group low dengan group medium dengan nilai sekitar 48-49, sedangkan group high menunjukkan nilai sekitar 46
ggplot(df,aes(x=duration_group,y=track_popularity,fill=duration_group))+
geom_boxplot()+
theme_minimal()+
labs(title="Popularity by Duration Group",
x="Duration Group (ms)",
y="Popularity")
Pada data 2019, terlihat sama untuk median group Low dan Medium sekitar 52-55, sedangkan pada high memiliki nilai median di sekitar 48-49
ggplot(df_2019,aes(x=duration_group,y=track_popularity,fill=duration_group))+
geom_boxplot()+
theme_minimal()+
labs(title="Popularity by Duration Group (2019)",
x="Duration Group (ms)",
y="Popularity")
Ini menunjukan bahwa terdapat kemungkinan bahwa duration bisa
mempengaruhi popularitas, dimana lagu yang populer cenderung memiliki
duration yang lebih rendah khususnya pada genre edm, pop, dan r&b,
sedangkan untuk genre latin dan rap cenderung lebih tinggi untuk
duration yang lebih lama
genre_trend = df %>%
group_by(playlist_genre,year) %>%
summarise(avg_popularity = mean(track_popularity, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'playlist_genre'. You can override using
## the `.groups` argument.
genre_trend_filtered <- genre_trend %>%
filter(year >= 2000)
Ditemukan bahwa rock dan mengalami penurunan yang cukup banyak hingga mencapai nilai di sekitar 40 Pop, r&b, rap, dan latin menunjukan pertumbuhan popularitas dari tahun ke tahun hingga mencapai di sekitar 50
ggplot(genre_trend_filtered, aes(x = year, y = avg_popularity,color = playlist_genre)) +
geom_line(aes(group = playlist_genre), linewidth = 1) +
geom_point(size = 2) +
labs(title = "Genre Popularity Over Time",
x = "Year", y = "Average Popularity") +
theme(
strip.text = element_text(face = "bold", size = 12, color = "blue"),
plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)
)
Ini menunjukan bahwa bagaimana perilaku atau tanggapan audiens dari
tahun ke tahun, menunjukan ketertarikan lebih ke arah genre pop,
r&b, rap, dan latin