IMPORTING DATA

library(tidytuesdayR)
## Warning: package 'tidytuesdayR' was built under R version 4.4.3
tt = tt_load("2020-01-21")
## ---- Compiling #TidyTuesday Information for 2020-01-21 ----
## --- There is 1 file available ---
## 
## 
## ── Downloading files ───────────────────────────────────────────────────────────
## 
##   1 of 1: "spotify_songs.csv"
df = tt$spotify_songs

ENABLE PACKAGES

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Quick Look on the Data

Ini digunakan untuk melihat secara sekilas mengenai data, seperti beberapa value awal dan tipe datanya

glimpse(df)
## Rows: 32,833
## Columns: 23
## $ track_id                 <chr> "6f807x0ima9a1j3VPbc7VN", "0r7CVbZTWZgbTCYdfa…
## $ track_name               <chr> "I Don't Care (with Justin Bieber) - Loud Lux…
## $ track_artist             <chr> "Ed Sheeran", "Maroon 5", "Zara Larsson", "Th…
## $ track_popularity         <dbl> 66, 67, 70, 60, 69, 67, 62, 69, 68, 67, 58, 6…
## $ track_album_id           <chr> "2oCs0DGTsRO98Gh5ZSl2Cx", "63rPSO264uRjW1X5E6…
## $ track_album_name         <chr> "I Don't Care (with Justin Bieber) [Loud Luxu…
## $ track_album_release_date <chr> "2019-06-14", "2019-12-13", "2019-07-05", "20…
## $ playlist_name            <chr> "Pop Remix", "Pop Remix", "Pop Remix", "Pop R…
## $ playlist_id              <chr> "37i9dQZF1DXcZDD7cfEKhW", "37i9dQZF1DXcZDD7cf…
## $ playlist_genre           <chr> "pop", "pop", "pop", "pop", "pop", "pop", "po…
## $ playlist_subgenre        <chr> "dance pop", "dance pop", "dance pop", "dance…
## $ danceability             <dbl> 0.748, 0.726, 0.675, 0.718, 0.650, 0.675, 0.4…
## $ energy                   <dbl> 0.916, 0.815, 0.931, 0.930, 0.833, 0.919, 0.8…
## $ key                      <dbl> 6, 11, 1, 7, 1, 8, 5, 4, 8, 2, 6, 8, 1, 5, 5,…
## $ loudness                 <dbl> -2.634, -4.969, -3.432, -3.778, -4.672, -5.38…
## $ mode                     <dbl> 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, …
## $ speechiness              <dbl> 0.0583, 0.0373, 0.0742, 0.1020, 0.0359, 0.127…
## $ acousticness             <dbl> 0.10200, 0.07240, 0.07940, 0.02870, 0.08030, …
## $ instrumentalness         <dbl> 0.00e+00, 4.21e-03, 2.33e-05, 9.43e-06, 0.00e…
## $ liveness                 <dbl> 0.0653, 0.3570, 0.1100, 0.2040, 0.0833, 0.143…
## $ valence                  <dbl> 0.518, 0.693, 0.613, 0.277, 0.725, 0.585, 0.1…
## $ tempo                    <dbl> 122.036, 99.972, 124.008, 121.956, 123.976, 1…
## $ duration_ms              <dbl> 194754, 162600, 176616, 169093, 189052, 16304…

Summary of Data

summary(df)
##    track_id          track_name        track_artist       track_popularity
##  Length:32833       Length:32833       Length:32833       Min.   :  0.00  
##  Class :character   Class :character   Class :character   1st Qu.: 24.00  
##  Mode  :character   Mode  :character   Mode  :character   Median : 45.00  
##                                                           Mean   : 42.48  
##                                                           3rd Qu.: 62.00  
##                                                           Max.   :100.00  
##  track_album_id     track_album_name   track_album_release_date
##  Length:32833       Length:32833       Length:32833            
##  Class :character   Class :character   Class :character        
##  Mode  :character   Mode  :character   Mode  :character        
##                                                                
##                                                                
##                                                                
##  playlist_name      playlist_id        playlist_genre     playlist_subgenre 
##  Length:32833       Length:32833       Length:32833       Length:32833      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   danceability        energy              key            loudness      
##  Min.   :0.0000   Min.   :0.000175   Min.   : 0.000   Min.   :-46.448  
##  1st Qu.:0.5630   1st Qu.:0.581000   1st Qu.: 2.000   1st Qu.: -8.171  
##  Median :0.6720   Median :0.721000   Median : 6.000   Median : -6.166  
##  Mean   :0.6548   Mean   :0.698619   Mean   : 5.374   Mean   : -6.720  
##  3rd Qu.:0.7610   3rd Qu.:0.840000   3rd Qu.: 9.000   3rd Qu.: -4.645  
##  Max.   :0.9830   Max.   :1.000000   Max.   :11.000   Max.   :  1.275  
##       mode         speechiness      acousticness    instrumentalness   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000000  
##  1st Qu.:0.0000   1st Qu.:0.0410   1st Qu.:0.0151   1st Qu.:0.0000000  
##  Median :1.0000   Median :0.0625   Median :0.0804   Median :0.0000161  
##  Mean   :0.5657   Mean   :0.1071   Mean   :0.1753   Mean   :0.0847472  
##  3rd Qu.:1.0000   3rd Qu.:0.1320   3rd Qu.:0.2550   3rd Qu.:0.0048300  
##  Max.   :1.0000   Max.   :0.9180   Max.   :0.9940   Max.   :0.9940000  
##     liveness         valence           tempo         duration_ms    
##  Min.   :0.0000   Min.   :0.0000   Min.   :  0.00   Min.   :  4000  
##  1st Qu.:0.0927   1st Qu.:0.3310   1st Qu.: 99.96   1st Qu.:187819  
##  Median :0.1270   Median :0.5120   Median :121.98   Median :216000  
##  Mean   :0.1902   Mean   :0.5106   Mean   :120.88   Mean   :225800  
##  3rd Qu.:0.2480   3rd Qu.:0.6930   3rd Qu.:133.92   3rd Qu.:253585  
##  Max.   :0.9960   Max.   :0.9910   Max.   :239.44   Max.   :517810

Removing Irrelevant Columns

Menghapus beberapa kolom yang tidak digunakan untuk analisis

df = df %>% 
  select(-c(track_id, track_album_id, playlist_id))

Set agar output terlihat semua

options(tibble.width = Inf)

Mengecek Nilai Null / NA

df %>%
  summarise(across(everything(), ~ sum(is.na(.))))
## # A tibble: 1 × 20
##   track_name track_artist track_popularity track_album_name
##        <int>        <int>            <int>            <int>
## 1          5            5                0                5
##   track_album_release_date playlist_name playlist_genre playlist_subgenre
##                      <int>         <int>          <int>             <int>
## 1                        0             0              0                 0
##   danceability energy   key loudness  mode speechiness acousticness
##          <int>  <int> <int>    <int> <int>       <int>        <int>
## 1            0      0     0        0     0           0            0
##   instrumentalness liveness valence tempo duration_ms
##              <int>    <int>   <int> <int>       <int>
## 1                0        0       0     0           0

Mengecek baris duplikat

df %>%
  duplicated %>%
  sum
## [1] 0

Mengecek Nilai Unik Tipe Data Kategori

Ini digunakan untuk melihat apakah ada nilai dengan arti yang sama, namun beda penulisan

df %>%
  count(playlist_genre)
## # A tibble: 6 × 2
##   playlist_genre     n
##   <chr>          <int>
## 1 edm             6043
## 2 latin           5155
## 3 pop             5507
## 4 r&b             5431
## 5 rap             5746
## 6 rock            4951
df %>%
  count(playlist_subgenre)
## # A tibble: 24 × 2
##    playlist_subgenre     n
##    <chr>             <int>
##  1 album rock         1065
##  2 big room           1206
##  3 classic rock       1296
##  4 dance pop          1298
##  5 electro house      1511
##  6 electropop         1408
##  7 gangster rap       1458
##  8 hard rock          1485
##  9 hip hop            1322
## 10 hip pop            1256
## # ℹ 14 more rows

Mengecek Outlier dari Data dan Data Tidak Logis

Dalam data deskripsi, terdapat beberapa data yang memiliki rentang wajarnya. Kelompok kami menyesuaikan data dengan itu dan menghapus yang tidak sesuai. Untuk selain itu, dianggap masih wajar dan akan tetap digunakan sebagai informasi yang berguna. Berdasarkan summary dari data diatas, ditemukan nilai yang tidak sesuai tepatnya pada kolom Loudness dan Tempo.

ggplot(df, aes(y = loudness)) +
  geom_boxplot()+
  labs(title = "Boxplot of Loudness")

df %>%
  filter(loudness>0) %>%
  summarise(count = n())
## # A tibble: 1 × 1
##   count
##   <int>
## 1     6
df = df %>%
  filter(loudness < 0 )
ggplot(df, aes(y = tempo)) +
  geom_boxplot()+
  labs(title = "Boxplot of Tempo")

df %>%
  filter(tempo <= 0) %>%
  summarise(count = n())
## # A tibble: 1 × 1
##   count
##   <int>
## 1     1
df = df %>%
  filter(tempo > 0 )

Mengecek Tipe Data

df %>% summarise(across(everything(), class))
## # A tibble: 1 × 20
##   track_name track_artist track_popularity track_album_name
##   <chr>      <chr>        <chr>            <chr>           
## 1 character  character    numeric          character       
##   track_album_release_date playlist_name playlist_genre playlist_subgenre
##   <chr>                    <chr>         <chr>          <chr>            
## 1 character                character     character      character        
##   danceability energy  key     loudness mode    speechiness acousticness
##   <chr>        <chr>   <chr>   <chr>    <chr>   <chr>       <chr>       
## 1 numeric      numeric numeric numeric  numeric numeric     numeric     
##   instrumentalness liveness valence tempo   duration_ms
##   <chr>            <chr>    <chr>   <chr>   <chr>      
## 1 numeric          numeric  numeric numeric numeric

Mengubah Tipe Data

Ini mempermudah proses analisis, karena untuk data kategorik sudah dijadikan faktor bukan character sehingga dianggap sebagai sebuah kategori dan untuk date agar data bisa diparse menjadi year maupun month sesuai kebutuhan analisis.

df$playlist_genre = as.factor(df$playlist_genre)
df$playlist_subgenre = as.factor(df$playlist_subgenre)
df$track_album_release_date = as.Date(df$track_album_release_date)

Drop Nilai NA

df  =  df %>%
  drop_na()

Mengecek Kembali Hasil Summary

summary(df)
##   track_name        track_artist       track_popularity track_album_name  
##  Length:30935       Length:30935       Min.   :  0.00   Length:30935      
##  Class :character   Class :character   1st Qu.: 25.00   Class :character  
##  Mode  :character   Mode  :character   Median : 45.00   Mode  :character  
##                                        Mean   : 42.76                     
##                                        3rd Qu.: 62.00                     
##                                        Max.   :100.00                     
##                                                                           
##  track_album_release_date playlist_name      playlist_genre
##  Min.   :1957-01-01       Length:30935       edm  :5967    
##  1st Qu.:2010-12-03       Class :character   latin:4961    
##  Median :2017-01-27       Mode  :character   pop  :5303    
##  Mean   :2012-09-09                          r&b  :5094    
##  3rd Qu.:2019-05-16                          rap  :5465    
##  Max.   :2020-01-29                          rock :4145    
##                                                            
##                  playlist_subgenre  danceability        energy        
##  progressive electro house: 1760   Min.   :0.0771   Min.   :0.000175  
##  indie poptimism          : 1647   1st Qu.:0.5660   1st Qu.:0.582000  
##  latin hip hop            : 1572   Median :0.6740   Median :0.721000  
##  neo soul                 : 1547   Mean   :0.6573   Mean   :0.698825  
##  southern hip hop         : 1512   3rd Qu.:0.7620   3rd Qu.:0.840000  
##  pop edm                  : 1506   Max.   :0.9830   Max.   :1.000000  
##  (Other)                  :21391                                      
##       key            loudness            mode         speechiness    
##  Min.   : 0.000   Min.   :-46.448   Min.   :0.0000   Min.   :0.0224  
##  1st Qu.: 2.000   1st Qu.: -8.073   1st Qu.:0.0000   1st Qu.:0.0415  
##  Median : 6.000   Median : -6.095   Median :1.0000   Median :0.0636  
##  Mean   : 5.368   Mean   : -6.640   Mean   :0.5609   Mean   :0.1082  
##  3rd Qu.: 9.000   3rd Qu.: -4.606   3rd Qu.:1.0000   3rd Qu.:0.1340  
##  Max.   :11.000   Max.   : -0.046   Max.   :1.0000   Max.   :0.9180  
##                                                                      
##   acousticness      instrumentalness       liveness          valence       
##  Min.   :1.40e-06   Min.   :0.0000000   Min.   :0.00936   Min.   :0.00001  
##  1st Qu.:1.52e-02   1st Qu.:0.0000000   1st Qu.:0.09310   1st Qu.:0.32600  
##  Median :8.10e-02   Median :0.0000153   Median :0.12700   Median :0.50600  
##  Mean   :1.76e-01   Mean   :0.0869723   Mean   :0.18997   Mean   :0.50507  
##  3rd Qu.:2.56e-01   3rd Qu.:0.0050600   3rd Qu.:0.24700   3rd Qu.:0.68700  
##  Max.   :9.94e-01   Max.   :0.9940000   Max.   :0.99600   Max.   :0.99100  
##                                                                            
##      tempo         duration_ms    
##  Min.   : 35.48   Min.   : 29493  
##  1st Qu.: 99.97   1st Qu.:186750  
##  Median :122.00   Median :214400  
##  Mean   :120.94   Mean   :223962  
##  3rd Qu.:133.52   3rd Qu.:251117  
##  Max.   :239.44   Max.   :517810  
## 

Mengambil Data Year untuk Analisa

df$year = format(df$track_album_release_date, "%Y")

Membuat Dataframe untuk Analisa tepatnya untuk tahun 2019

Kami memutuskan untuk memilih tahun 2019, karena kami merasa bahwa itu saat dimana meningkatnya pengguna Spotify sehingga bisa mendapatkan lebih banyak insight.

df_2019  = df%>%
  filter(year == 2019)

Scatter Plot Hubungan Antara Danceability dengan Track Popularity untuK Setiap Genre

Ditemukan bahwa terdapat tren positif yang cukup terlihat pada latin, pop, dan rap Pada rock, r&b, dan edm, tren terlihat cukup flat relasinya

ggplot(df, aes(x = danceability, y = track_popularity)) +
  geom_point(alpha= 0.3) +
  facet_wrap(~playlist_genre, scales = 'free',labeller = label_wrap_gen(width = 15))+
  geom_smooth(method = "loess", se = FALSE, color = "black") +
  geom_smooth(method = "lm", se = FALSE, color = "red")+
  theme(
    strip.text = element_text(face = "bold", size = 12, color = "blue"),
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )+
  labs(title = "Danceability vs Popularity",
       x = "Danceability", y = "Popularity")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Scatter Plot Hubungan Antara Danceability dengan Track Popularity untuK Setiap Genre (2019)

Ditemukan bahwa terdapat tren positif yang cukup terlihat pada latin, pop, r&b dan rap Pada rock dan edm, tren terlihat cukup flat relasinya

ggplot(df_2019, aes(x = danceability, y = track_popularity)) +
  geom_point(alpha= 0.3) +
  facet_wrap(~playlist_genre, scales = 'free',labeller = label_wrap_gen(width = 15))+
  geom_smooth(method = "loess", se = FALSE, color = "black") +
  geom_smooth(method = "lm", se = FALSE, color = "red")+
  theme(
    strip.text = element_text(face = "bold", size = 12, color = "blue"),
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )+
  labs(title = "Danceability vs Popularity (2019)",
       x = "Danceability", y = "Popularity")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Boxplot Popularity per Danceability Group

Group dibagi berdasarkan kuartil data.

df$dance_group <- cut(df$danceability,
                      breaks = quantile(df$danceability, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
                      labels = c("Low (0.0771 - 0.605)", "Medium (0.606 - 0.732)", "High (0.733 - 0.983"),
                      include.lowest = TRUE)
df %>%
  group_by(dance_group) %>%
  summarise(
    min_danceability = min(danceability, na.rm = TRUE),
    max_danceability = max(danceability, na.rm = TRUE),
    .groups = "drop"
  )
## # A tibble: 3 × 3
##   dance_group            min_danceability max_danceability
##   <fct>                             <dbl>            <dbl>
## 1 Low (0.0771 - 0.605)             0.0771            0.605
## 2 Medium (0.606 - 0.732)           0.606             0.732
## 3 High (0.733 - 0.983              0.733             0.983

Ditemukan bahwa pada median setiap boxplot, terlihat hampir sama antara group low dengan nilai di sekitar 43-44, sedangkan untuk group high menunjukan nilai di sekitar 50.

ggplot(df, aes(x = dance_group, y = track_popularity, fill = dance_group)) +
  geom_boxplot() +
  labs(title = "Popularity by Danceability Group",
       x = "Danceability Group",
       y = "Popularity",
       fill = "Dancebility") +
  theme(
    strip.text = element_text(face = "bold", size = 12, color = "blue"),
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.position = "none"
  )

Popularity by Danceability Group (2019)

df_2019$dance_group <- cut(df_2019$danceability,
                      breaks = quantile(df$danceability, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE),
                      labels = c("Low (0.0771-0.605)", "Medium (0.606-0.732)", "High (0.733-0.979)"),
                      include.lowest = TRUE)
df_2019 %>%
  group_by(dance_group) %>%
  summarise(
    min_danceability = min(danceability, na.rm = TRUE),
    max_danceability = max(danceability, na.rm = TRUE),
    .groups = "drop"
  )
## # A tibble: 3 × 3
##   dance_group          min_danceability max_danceability
##   <fct>                           <dbl>            <dbl>
## 1 Low (0.0771-0.605)             0.0771            0.605
## 2 Medium (0.606-0.732)           0.606             0.732
## 3 High (0.733-0.979)             0.733             0.979

Untuk data 2019, ditemukan bahwa pada median setiap boxplot, terlihat hampir sama antara group low dengan nilai di sekitar 44-45, sedangkan untuk group high menunjukan nilai di sekitar 60.

ggplot(df_2019, aes(x = dance_group, y = track_popularity, fill = dance_group)) +
  geom_boxplot() +
  labs(title = "Popularity by Danceability Group (2019)",
       x = "Danceability Group",
       y = "Popularity",
       fill = "Dancebility") +
  theme(
    strip.text = element_text(face = "bold", size = 12, color = "blue"),
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.position = "none"
  )

Ini menunjukan bahwa terdapat kemungkinan tingkat danceability bisa mempengaruhi popularitas lagu, terutama pada genre tertentu. Ini mengindikasikan bahwa penambahan elemen danceable dapat meningkatkan daya tarik lagu dalam genre Latin, Pop, R&B, dan Rap.

Scatter Plot Hubungan Popularity dengan Duration

Didapat bahwa terdapat tren negatif yang cukup terlihat genre edm,latin,pop,r&b,dan rap Pada genre rock tren terlihat cukup datar

ggplot(df,aes(x=duration_ms,y=track_popularity)) +
  geom_point()+
  geom_smooth(method="lm",color="red")+
  facet_wrap(~playlist_genre)+
  theme_minimal()+
  labs(title="Duration vs Popularity",x="Duration (ms)",y="Popularity")
## `geom_smooth()` using formula = 'y ~ x'

## Scatter Plot Hubungan Popularity dengan Duration (2019) Didapat bahwa terdapat tren positif yang cukup terlihat genre latin dan rap dan tren negatif pada genre edm, pop, dan r&b.

ggplot(df_2019,aes(x=duration_ms,y=track_popularity)) +
  geom_point()+
  geom_smooth(method="lm",color="red")+
  facet_wrap(~playlist_genre)+
  theme_minimal()+
  labs(title="Duration vs Popularity (2019)",x="Duration (ms)",y="Popularity")
## `geom_smooth()` using formula = 'y ~ x'

df$duration_group <- cut(df$duration_ms,
                         breaks=quantile(df$duration_ms,probs=c(0,1/3,2/3,1),na.rm=T),
                         labels=c("Low(29493 - 196320 ms)","Medium (196348 - 236093 ms)","High (236107 - 517810 ms)"),
                         include.lowest=T)
df_2019$duration_group <- cut(df_2019$duration_ms,
                         breaks=quantile(df_2019$duration_ms,probs=c(0,1/3,2/3,1),na.rm=T),
                         labels=c("Low (54656   - 177838 ms)","Medium(177847 - 208567 ms)","High (208571 -  508545 ms"),
                         include.lowest=T)
df %>%
  group_by(duration_group) %>%
  summarise(
    min_danceability = min(duration_ms, na.rm = TRUE),
    max_danceability = max(duration_ms, na.rm = TRUE),
    .groups = "drop"
  )
## # A tibble: 3 × 3
##   duration_group                 min_danceability max_danceability
##   <fct>                                     <dbl>            <dbl>
## 1 "Low(29493 - 196320 ms)"                  29493           196320
## 2 "Medium (196348 - 236093\tms)"           196348           236093
## 3 "High (236107 - 517810 ms)"              236107           517810
df_2019 %>%
  group_by(duration_group) %>%
  summarise(
    min_danceability = min(duration_ms, na.rm = TRUE),
    max_danceability = max(duration_ms, na.rm = TRUE),
    .groups = "drop"
  )
## # A tibble: 3 × 3
##   duration_group               min_danceability max_danceability
##   <fct>                                   <dbl>            <dbl>
## 1 "Low (54656\t- 177838 ms)"              54656           177838
## 2 "Medium(177847 - 208567 ms)"           177847           208567
## 3 "High (208571 -\t508545 ms"            208571           508545

Boxplot Popularity dengan Duration Group

Pada median setiap boxplot, terlihat hampir sama antara group low dengan group medium dengan nilai sekitar 48-49, sedangkan group high menunjukkan nilai sekitar 46

ggplot(df,aes(x=duration_group,y=track_popularity,fill=duration_group))+
  geom_boxplot()+
  theme_minimal()+
  labs(title="Popularity by Duration Group",
       x="Duration Group (ms)",
       y="Popularity")

Boxplot Popularity dengan Duration Group (2019)

Pada data 2019, terlihat sama untuk median group Low dan Medium sekitar 52-55, sedangkan pada high memiliki nilai median di sekitar 48-49

ggplot(df_2019,aes(x=duration_group,y=track_popularity,fill=duration_group))+
  geom_boxplot()+
  theme_minimal()+
  labs(title="Popularity by Duration Group (2019)",
       x="Duration Group (ms)",
       y="Popularity")

Ini menunjukan bahwa terdapat kemungkinan bahwa duration bisa mempengaruhi popularitas, dimana lagu yang populer cenderung memiliki duration yang lebih rendah khususnya pada genre edm, pop, dan r&b, sedangkan untuk genre latin dan rap cenderung lebih tinggi untuk duration yang lebih lama

Trend Line Popularity Each Year (From 2000)

genre_trend = df %>%
  group_by(playlist_genre,year) %>%
  summarise(avg_popularity = mean(track_popularity, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'playlist_genre'. You can override using
## the `.groups` argument.
genre_trend_filtered <- genre_trend %>%
  filter(year >= 2000)

Ditemukan bahwa rock dan mengalami penurunan yang cukup banyak hingga mencapai nilai di sekitar 40 Pop, r&b, rap, dan latin menunjukan pertumbuhan popularitas dari tahun ke tahun hingga mencapai di sekitar 50

ggplot(genre_trend_filtered, aes(x = year, y = avg_popularity,color = playlist_genre)) +
  geom_line(aes(group = playlist_genre), linewidth = 1) +
  geom_point(size = 2) +
  labs(title = "Genre Popularity Over Time",
       x = "Year", y = "Average Popularity") +
  theme(
    strip.text = element_text(face = "bold", size = 12, color = "blue"),
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )

Ini menunjukan bahwa bagaimana perilaku atau tanggapan audiens dari tahun ke tahun, menunjukan ketertarikan lebih ke arah genre pop, r&b, rap, dan latin