Top 50 Spotify Songs - 2019

Data preparation and overview of the whole dataset

library(readr)
library(magrittr)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:GGally':
## 
##     nasa
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
## 
##     subplot
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Load the Top 50 data set. The CSV is Latin-1 encoded: read with readr's
# default UTF-8, accented track names come through as raw bytes such as
# "Se\xf1or...", so the encoding is declared explicitly.
top50 <- read_csv(
  "Downloads/top50.csv",
  locale = locale(encoding = "latin1")
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   Track.Name = col_character(),
##   Artist.Name = col_character(),
##   Genre = col_character(),
##   Beats.Per.Minute = col_double(),
##   Energy = col_double(),
##   Danceability = col_double(),
##   Loudness..dB.. = col_double(),
##   Liveness = col_double(),
##   Valence. = col_double(),
##   Length. = col_double(),
##   Acousticness.. = col_double(),
##   Speechiness. = col_double(),
##   Popularity = col_double()
## )
# Preview the first rows of the data to check the column types.
top50 %>% head()
## # A tibble: 6 x 14
##      X1 Track.Name Artist.Name Genre Beats.Per.Minute Energy Danceability
##   <dbl> <chr>      <chr>       <chr>            <dbl>  <dbl>        <dbl>
## 1     1 "Se\xf1or… Shawn Mend… cana…              117     55           76
## 2     2 China      Anuel AA    regg…              105     81           79
## 3     3 boyfriend… Ariana Gra… danc…              190     80           40
## 4     4 Beautiful… Ed Sheeran  pop                 93     65           64
## 5     5 Goodbyes … Post Malone dfw …              150     65           58
## 6     6 I Don't C… Ed Sheeran  pop                102     68           80
## # … with 7 more variables: Loudness..dB.. <dbl>, Liveness <dbl>,
## #   Valence. <dbl>, Length. <dbl>, Acousticness.. <dbl>,
## #   Speechiness. <dbl>, Popularity <dbl>
# Pairwise plot matrix of the numeric columns (positions 5 to 14,
# Beats.Per.Minute through Popularity), with axis labels drawn inside the panels.
top50 %>%
  ggpairs(columns = 5:14, axisLabels = "internal")

# Number of Top 50 tracks per artist, most frequent artist first.
artist_count <- top50 %>%
  group_by(Artist.Name) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
# Keep the 20 artists with the most tracks.
artist_top20 <- artist_count %>% slice(1:20)

# Make the artist name a factor whose levels follow descending count, so the
# bars are drawn from most to least frequent rather than alphabetically.
artist_top20$Artist.Name <- artist_top20$Artist.Name %>%
  factor(levels = artist_top20$Artist.Name[order(-artist_top20$count)])
# Columns are referenced by bare name inside aes(); `data$column` bypasses
# ggplot2's data masking and breaks as soon as the plot is faceted or reused
# with different data.
a1 <- ggplot(artist_top20, aes(x = Artist.Name, y = count))
a1 +
  geom_col(fill = "dodgerblue3") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top20 artist in Spotify",
    x = "Artist name",
    y = "The count of artist"
  )

# Number of Top 50 tracks per genre, most frequent genre first.
genre_count <- top50 %>%
  group_by(Genre) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
# Keep the 20 genres with the most tracks.
genre_top20 <- genre_count %>% slice(1:20)

# Make the genre a factor whose levels follow descending count, so the bars
# are drawn from most to least frequent rather than alphabetically.
genre_top20$Genre <- genre_top20$Genre %>%
  factor(levels = genre_top20$Genre[order(-genre_top20$count)])
# Columns are referenced by bare name inside aes(); `data$column` bypasses
# ggplot2's data masking.
g1 <- ggplot(genre_top20, aes(x = Genre, y = count))
g1 +
  geom_col(fill = "dodgerblue3") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top20 genre in Spotify",
    x = "Genre",
    y = "The count of genre"
  )

Exploring the factors behind Top 50 music

As the Top 20 genres chart shows, most of the music seems to be high-tempo or high-energy (e.g. dance pop and Canadian hip hop). Let us check whether that is the case.

# Beat
# Cut tempo into three quantile groups with Hmisc::cut2 (g = 3).
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
top50$beat_band <- top50$Beats.Per.Minute %>% cut2(g = 3, minmax = FALSE)
top50$beat_band %>% summary()
## [ 85,100) [100,136) [136,190] 
##        19        16        15
# Replace the interval labels with readable names, in level order. The middle
# label no longer carries a stray leading space.
top50$beat_band <- top50$beat_band %>%
  factor(labels = c("Low beat", "Medium", "High beat"))

# Popularity by beat band. Columns are referenced by bare name inside aes()
# rather than via `top50$`.
g3 <- ggplot(data = top50, aes(x = beat_band, y = Popularity))
g3 +
  # One fill colour per band; the vector length must match the three bands.
  geom_boxplot(fill = c("#FFD700", "#FFA500", "#FF7F00")) +
  labs(x = "The level of beat", y = "Popularity")

# Energy
# Cut energy into three quantile groups with Hmisc::cut2 (g = 3).
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
top50$energy_band <- top50$Energy %>% cut2(g = 3, minmax = FALSE)
top50$energy_band %>% summary()
## [32,62) [62,72) [72,88] 
##      17      17      16
# Replace the interval labels with readable names, in level order. The middle
# label no longer carries a stray leading space.
top50$energy_band <- top50$energy_band %>%
  factor(labels = c("Less energy", "Medium", "More energy"))

# Popularity by energy band. Columns are referenced by bare name inside aes()
# rather than via `top50$`.
g4 <- ggplot(data = top50, aes(x = energy_band, y = Popularity))
g4 +
  # One fill colour per band; the vector length must match the three bands.
  geom_boxplot(fill = c("#FFD700", "#FFA500", "#FF7F00")) +
  labs(x = "The level of energy", y = "Popularity")

# Danceability
# Cut danceability into three quantile groups with Hmisc::cut2 (g = 3).
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
top50$Danceability_band <- top50$Danceability %>% cut2(g = 3, minmax = FALSE)
top50$Danceability_band %>% summary()
## [29,70) [70,78) [78,90] 
##      18      16      16
# Replace the interval labels with readable names, in level order. The middle
# label no longer carries a stray leading space.
top50$Danceability_band <- top50$Danceability_band %>%
  factor(labels = c("Less danceable", "Medium", "More danceable"))

# Popularity by danceability band. Columns are referenced by bare name inside
# aes() rather than via `top50$`.
g5 <- ggplot(data = top50, aes(x = Danceability_band, y = Popularity))
g5 +
  # One fill colour per band; the vector length must match the three bands.
  geom_boxplot(fill = c("#FFD700", "#FFA500", "#FF7F00")) +
  labs(x = "The level of danceability", y = "Popularity")

Multivariate data

# Speechiness against tempo, points coloured by popularity, one panel per
# beat band (beat_band must already exist on top50).
m1 <- top50 %>%
  ggplot(aes(x = Beats.Per.Minute, y = Speechiness., colour = Popularity))
m1 +
  geom_point() +
  # Popularity is mapped to `colour`, so the legend title has to be set on the
  # colour scale; a fill scale has no effect on this plot.
  scale_colour_continuous(name = "Popularity") +
  labs(x = "Beat per minute", y = "Speechiness") +
  # Linear-model trend line fitted within each panel.
  geom_smooth(method = "lm") +
  facet_grid(. ~ beat_band)

# Loudness against energy, points coloured by popularity.
m2 <- top50 %>%
  ggplot(aes(x = Energy, y = Loudness..dB.., colour = Popularity))
m2 +
  geom_point() +
  # Popularity is mapped to `colour`, so the legend title has to be set on the
  # colour scale; a fill scale has no effect on this plot.
  scale_colour_continuous(name = "Popularity") +
  labs(x = "Energy", y = "Loudness") +
  # Linear-model trend line.
  geom_smooth(method = "lm")

Interactive data visualisation

# Interactive scatter of speechiness against tempo. Marker colour shows the
# energy band and marker size shows popularity (energy_band must already exist
# on top50). Formulas (`~column`) are evaluated in the piped data, so the data
# frame is not referenced a second time via `top50$`.
p1 <- top50 %>%
  plot_ly(
    x = ~Beats.Per.Minute, y = ~Speechiness.,
    type = "scatter", mode = "markers",
    color = ~energy_band, size = ~Popularity
  ) %>%
  # Axis titles now match the plotted variables: previously the x axis (tempo)
  # was titled "Loudness..dB" and the y axis (speechiness) "Beats.Per.Minute".
  layout(
    xaxis = list(zeroline = FALSE, title = "Beats.Per.Minute"),
    yaxis = list(zeroline = FALSE, title = "Speechiness")
  )

p1
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Reference

Dataset sourced from: https://www.kaggle.com/leonardopena/top50spotify2019 (50 songs, 13 variables). The data were extracted from: http://organizeyourmusic.playlistmachinery.com/