Preparation and overview of the whole dataset
library(readr)
library(magrittr)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
##
## subplot
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the Spotify Top 50 table. The file is Latin-1 encoded, so the encoding
# is declared explicitly; with readr's UTF-8 default, accented track names
# come through as raw bytes (e.g. "Se\xf1or" instead of "Señor").
# The path is relative to the current working directory.
top50 <- read_csv(
  "Downloads/top50.csv",
  locale = locale(encoding = "latin1")
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## Track.Name = col_character(),
## Artist.Name = col_character(),
## Genre = col_character(),
## Beats.Per.Minute = col_double(),
## Energy = col_double(),
## Danceability = col_double(),
## Loudness..dB.. = col_double(),
## Liveness = col_double(),
## Valence. = col_double(),
## Length. = col_double(),
## Acousticness.. = col_double(),
## Speechiness. = col_double(),
## Popularity = col_double()
## )
# Preview the first rows to check the parsed columns.
top50 %>% head()
## # A tibble: 6 x 14
## X1 Track.Name Artist.Name Genre Beats.Per.Minute Energy Danceability
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 "Se\xf1or… Shawn Mend… cana… 117 55 76
## 2 2 China Anuel AA regg… 105 81 79
## 3 3 boyfriend… Ariana Gra… danc… 190 80 40
## 4 4 Beautiful… Ed Sheeran pop 93 65 64
## 5 5 Goodbyes … Post Malone dfw … 150 65 58
## 6 6 I Don't C… Ed Sheeran pop 102 68 80
## # … with 7 more variables: Loudness..dB.. <dbl>, Liveness <dbl>,
## # Valence. <dbl>, Length. <dbl>, Acousticness.. <dbl>,
## # Speechiness. <dbl>, Popularity <dbl>
# Pairwise overview (scatterplots, densities, correlations) of columns 5-14,
# with the variable names drawn inside the diagonal panels.
top50 %>%
  ggpairs(columns = 5:14, axisLabels = "internal")
# Count how many tracks each artist has in the table, most frequent first.
artist_count <- top50 %>%
  group_by(Artist.Name) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
artist_top20 <- artist_count %>% slice(1:20)
# Fix the factor level order by descending count so the bars are drawn from
# most to fewest tracks rather than alphabetically.
artist_top20$Artist.Name <- artist_top20$Artist.Name %>%
  factor(levels = artist_top20$Artist.Name[order(-artist_top20$count)])
# Columns are referenced by bare name inside aes(): `data$col` bypasses the
# plot's data and is discouraged by ggplot2.
a1 <- ggplot(artist_top20, aes(x = Artist.Name, y = count))
a1 +
  geom_bar(stat = "identity", fill = "dodgerblue3") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top20 artist in Spotify",
    x = "Artist name",
    y = "The count of artist"
  )
# Count how many tracks fall into each genre, most frequent first.
genre_count <- top50 %>%
  group_by(Genre) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
genre_top20 <- genre_count %>% slice(1:20)
# Fix the factor level order by descending count so the bars are drawn from
# most to fewest tracks rather than alphabetically.
genre_top20$Genre <- genre_top20$Genre %>%
  factor(levels = genre_top20$Genre[order(-genre_top20$count)])
# Columns are referenced by bare name inside aes(): `data$col` bypasses the
# plot's data and is discouraged by ggplot2.
g1 <- ggplot(genre_top20, aes(x = Genre, y = count))
g1 +
  geom_bar(stat = "identity", fill = "dodgerblue3") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top20 genre in Spotify",
    x = "Genre",
    y = "The count of genre"
  )
As the top-20 genres chart shows, most of the music appears to be high-beat or high-energy (e.g., dance pop and Canadian hip hop). Let's check whether that is actually the case.
# Beat: split tempo into three quantile groups and compare popularity.
top50$beat_band <- top50$Beats.Per.Minute %>% cut2(g = 3, minmax = FALSE)
top50$beat_band %>% summary()
## [ 85,100) [100,136) [136,190]
## 19 16 15
# Relabel the three interval levels (lowest to highest) with readable names.
# NOTE(review): " Medium" carries a leading space; kept unchanged in case
# other code matches on the exact label.
top50$beat_band <- top50$beat_band %>%
  factor(labels = c("Low beat", " Medium", "High beat"))
# Popularity is referenced by bare name inside aes(): `top50$Popularity`
# bypasses the plot's data and is discouraged by ggplot2.
g3 <- ggplot(data = top50, aes(x = beat_band, y = Popularity))
g3 +
  geom_boxplot(fill = c("#FFD700", "#FFA500", "#FF7F00")) +
  labs(x = "The level of beat", y = "Popularity")
# Energy: split energy into three quantile groups and compare popularity.
top50$energy_band <- top50$Energy %>% cut2(g = 3, minmax = FALSE)
top50$energy_band %>% summary()
## [32,62) [62,72) [72,88]
## 17 17 16
# Relabel the three interval levels (lowest to highest) with readable names.
# NOTE(review): " Medium" carries a leading space; kept unchanged in case
# other code matches on the exact label.
top50$energy_band <- top50$energy_band %>%
  factor(labels = c("Less energy", " Medium", "More energy"))
# Popularity is referenced by bare name inside aes(): `top50$Popularity`
# bypasses the plot's data and is discouraged by ggplot2.
g4 <- ggplot(data = top50, aes(x = energy_band, y = Popularity))
g4 +
  geom_boxplot(fill = c("#FFD700", "#FFA500", "#FF7F00")) +
  labs(x = "The level of energy", y = "Popularity")
# Danceability: split danceability into three quantile groups and compare
# popularity.
top50$Danceability_band <- top50$Danceability %>% cut2(g = 3, minmax = FALSE)
top50$Danceability_band %>% summary()
## [29,70) [70,78) [78,90]
## 18 16 16
# Relabel the three interval levels (lowest to highest) with readable names.
# NOTE(review): " Medium" carries a leading space; kept unchanged in case
# other code matches on the exact label.
top50$Danceability_band <- top50$Danceability_band %>%
  factor(labels = c("Less danceable", " Medium", "More danceable"))
# Popularity is referenced by bare name inside aes(): `top50$Popularity`
# bypasses the plot's data and is discouraged by ggplot2.
g5 <- ggplot(data = top50, aes(x = Danceability_band, y = Popularity))
g5 +
  geom_boxplot(fill = c("#FFD700", "#FFA500", "#FF7F00")) +
  labs(x = "The level of danceability", y = "Popularity")
# Tempo vs. speechiness, coloured by popularity, with one panel and one
# linear fit per beat band.
m1 <- top50 %>%
  ggplot(aes(x = Beats.Per.Minute, y = Speechiness., colour = Popularity))
m1 +
  geom_point() +
  # Popularity is mapped to colour, so the legend title is set on the colour
  # scale; the previous fill scale had no mapped aesthetic to act on.
  scale_colour_continuous(name = "Popularity") +
  labs(x = "Beat per minute", y = "Speechiness") +
  geom_smooth(method = "lm") +
  facet_grid(. ~ beat_band)
# Energy vs. loudness, coloured by popularity, with a single linear fit.
m2 <- top50 %>%
  ggplot(aes(x = Energy, y = Loudness..dB.., colour = Popularity))
m2 +
  geom_point() +
  # Popularity is mapped to colour, so the legend title is set on the colour
  # scale; the previous fill scale had no mapped aesthetic to act on.
  scale_colour_continuous(name = "Popularity") +
  labs(x = "Energy", y = "Loudness") +
  geom_smooth(method = "lm")
# Interactive tempo vs. speechiness scatter: marker colour shows the energy
# band and marker size shows popularity.
p1 <- top50 %>%
  plot_ly(
    x = ~Beats.Per.Minute, y = ~Speechiness.,
    type = "scatter", mode = "markers",
    color = ~energy_band, size = ~Popularity
  ) %>%
  # Axis titles name the variables actually plotted; previously the x axis
  # (tempo) was titled "Loudness..dB" and the y axis (speechiness) was titled
  # "Beats.Per.Minute".
  layout(
    xaxis = list(zeroline = FALSE, title = "Beats.Per.Minute"),
    yaxis = list(zeroline = FALSE, title = "Speechiness")
  )
p1
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
Dataset sourced from: https://www.kaggle.com/leonardopena/top50spotify2019 (50 songs, 13 variables). The data were extracted from: http://organizeyourmusic.playlistmachinery.com/