library(readxl)
# Read the Spotify workbook keeping only the numeric columns: all 23 sheet
# columns start as "skip" and the listed positions are switched to "numeric".
spotify_dataset_TOP_200_ <- read_excel(
  "C:/Users/glupinacci/OneDrive - BUSINESS INTEGRATION PARTNERS SPA/Desktop/R/DATASETS/spotify_dataset TOP 200 .xlsx",
  col_types = replace(rep("skip", 23), c(1, 3, 6, 8, 13:22), "numeric")
)
# Shorter working alias, followed by a preview of the first rows.
chart <- spotify_dataset_TOP_200_
head(chart)
## # A tibble: 6 x 14
## Index `Number of Time~ Streams `Artist Followe~ Popularity Danceability Energy
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 8 4.86e7 3377762 100 0.714 0.8
## 2 2 3 4.72e7 2230022 99 0.591 0.764
## 3 3 11 4.02e7 6266514 99 0.563 0.664
## 4 4 5 3.78e7 83293380 98 0.808 0.897
## 5 5 1 3.39e7 5473565 96 0.736 0.704
## 6 6 18 3.01e7 5473565 97 0.61 0.508
## # ... with 7 more variables: Loudness <dbl>, Speechiness <dbl>,
## # Acousticness <dbl>, Liveness <dbl>, Tempo <dbl>, Duration (ms) <dbl>,
## # Valence <dbl>
# Report the number of rows and columns of the numeric chart table.
c(nrow(chart), ncol(chart))
## [1] 1556 14
##Loading needed packages
library(rpart)
library(rpart.plot)
library(corrplot)
## corrplot 0.90 loaded
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##Multiple regression
# Multiple linear regression of Popularity on every other column of `chart`.
# The response is named directly in the formula so that it is resolved through
# `data = chart` rather than by reaching back into the data frame with `$`;
# `.` still expands to all remaining columns.
model <- lm(Popularity ~ ., data = chart)
summary(model)
##
## Call:
## lm(formula = Popularity ~ ., data = chart)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86.916 -3.215 2.780 7.510 25.304
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.873e+01 5.171e+00 15.224 < 2e-16 ***
## Index -1.048e-02 9.428e-04 -11.119 < 2e-16 ***
## `Number of Times Charted` 1.154e-01 2.472e-02 4.666 3.34e-06 ***
## Streams 2.652e-07 1.159e-07 2.287 0.0223 *
## `Artist Followers` 1.141e-07 2.280e-08 5.007 6.18e-07 ***
## Danceability 5.984e+00 3.141e+00 1.905 0.0569 .
## Energy -1.215e+00 3.794e+00 -0.320 0.7488
## Loudness 9.743e-01 2.232e-01 4.364 1.36e-05 ***
## Speechiness 2.161e+00 3.526e+00 0.613 0.5400
## Acousticness -2.596e+00 1.893e+00 -1.371 0.1704
## Liveness -1.724e+00 2.616e+00 -0.659 0.5100
## Tempo -1.375e-02 1.268e-02 -1.084 0.2784
## `Duration (ms)` 1.310e-05 8.000e-06 1.638 0.1017
## Valence -4.759e+00 1.916e+00 -2.483 0.0131 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.4 on 1531 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.1792, Adjusted R-squared: 0.1723
## F-statistic: 25.72 on 13 and 1531 DF, p-value: < 2.2e-16
# Side-by-side box plots of four audio features. Bare vectors would be
# labelled 1-4 on the axis, so each box is named after its column.
boxplot(
  chart$Energy, chart$Acousticness, chart$Speechiness, chart$Liveness,
  names = c("Energy", "Acousticness", "Speechiness", "Liveness")
)
##Categorised Data analysis
# Re-read the workbook for the categorical analysis, keeping a mix of numeric
# and text columns and skipping the rest (types listed five per row, in sheet
# column order).
spotify_dataset_TOP_200_CAT <- read_excel(
  "C:/Users/glupinacci/OneDrive - BUSINESS INTEGRATION PARTNERS SPA/Desktop/R/DATASETS/spotify_dataset TOP 200 .xlsx",
  col_types = c(
    "numeric", "skip", "numeric", "skip", "skip",
    "numeric", "skip", "skip", "skip", "text",
    "skip", "skip", "numeric", "skip", "skip",
    "skip", "skip", "skip", "skip", "numeric",
    "skip", "skip", "text"
  )
)
# Shorter working alias, then display the table.
Catchart <- spotify_dataset_TOP_200_CAT
print(Catchart)
## # A tibble: 1,556 x 7
## Index `Number of Times Charted` Streams Genre Popularity Tempo Chord
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr>
## 1 1 8 48633449 ['indie rock~ 100 134. B
## 2 2 3 47248719 ['australian~ 99 170. C#/Db
## 3 3 11 40162559 ['pop'] 99 167. A
## 4 4 5 37799456 ['pop', 'uk ~ 98 126. B
## 5 5 1 33948454 ['lgbtq+ hip~ 96 150. D#/Eb
## 6 6 18 30071134 ['lgbtq+ hip~ 97 179. G#/Ab
## 7 7 16 29356736 ['dance pop'~ 94 111. G#/Ab
## 8 8 10 26951613 ['puerto ric~ 95 128. D#/Eb
## 9 9 8 25030128 ['latin', 'r~ 96 180. C#/Db
## 10 10 10 24551591 ['indie rock~ 95 133. C#/Db
## # ... with 1,546 more rows
##Categorizing success by number of times charted
# Inspect the spread of the chart counts before binning them.
range(Catchart$`Number of Times Charted`)
## [1] 1 142
# Bin the number of times charted into six ordered success categories:
#   1: (-Inf, 20]  2: (20, 40]  3: (40, 60]
#   4: (60, 80]    5: (80, 100] 6: (100, Inf)
# findInterval() with left-open intervals applies the same strict ">"
# thresholds in a single vectorised call. A missing count now stays NA instead
# of silently being counted in category 1. Adding 1 shifts the 0-5 interval
# index onto the 1-6 category codes and keeps the result numeric.
succcescat <- findInterval(
  Catchart[["Number of Times Charted"]],
  c(20, 40, 60, 80, 100),
  left.open = TRUE
) + 1
# Count how many songs fall in each success category.
table(succcescat)
## succcescat
## 1 2 3 4 5 6
## 1293 175 43 21 23 1
# The categories are discrete codes 1-6. With default breaks hist() bins them
# as if they were continuous values, so the breaks are centred on each code to
# get exactly one bar per category.
hist(succcescat, breaks = seq(0.5, 6.5, by = 1))
Top two categories:
6 –> charted more than 100 times; 5 –> charted more than 80 times
| Song Name | Streams | Artist |
|---|---|---|
| Blinding Lights | 15,011,809 | The Weeknd |
| Watermelon Sugar | 11,996,689 | Harry Styles |
| Don’t Start Now | 8,821,971 | Dua Lipa |
| Someone You Loved | 8,490,162 | Lewis Capaldi |
| Dance Monkey | 7,687,058 | Tones And I |
| Circles | 7,156,162 | Post Malone |
| Before You Go | 6,813,800 | Lewis Capaldi |
| Believer | 6,699,895 | Imagine Dragons |
| Roses - Imanbek Remix | 6,653,690 | SAINt JHN |
| lovely (with Khalid) | 6,569,547 | Billie Eilish |
| Sunflower - Spider-Man: Into the Spider-Verse | 6,546,948 | Post Malone, Swae Lee |
| Señorita | 6,317,487 | Shawn Mendes, Camila Cabello |
| Perfect | 6,278,765 | Ed Sheeran |
| Memories | 5,928,329 | Maroon 5 |
| Bohemian Rhapsody - Remastered 2011 | 5,756,583 | Queen |
| goosebumps | 5,691,540 | Travis Scott |
| Shallow | 5,672,480 | Lady Gaga, Bradley Cooper |
| Lucid Dreams | 5,477,563 | Juice WRLD |
| bad guy | 5,436,286 | Billie Eilish |
| Say You Won’t Let Go | 5,375,426 | James Arthur |
| Adore You | 5,166,847 | Harry Styles |
| 7 rings | 5,002,227 | Ariana Grande |
| Ride It | 5,235,088 | Regard |
| Falling | 5,294,368 | Harry Styles |
# Cross-tabulate the success categories (rows) against the Chord column.
table(succcescat, Catchart[["Chord"]])
##
## succcescat A A#/Bb B C C#/Db D D#/Eb E F F#/Gb G G#/Ab
## 1 102 98 113 128 179 108 35 92 109 95 120 104
## 2 11 15 21 17 23 13 4 15 16 14 9 16
## 3 4 5 4 5 6 0 0 2 6 7 2 2
## 4 0 2 1 2 2 3 0 1 0 3 2 5
## 5 1 2 2 3 4 1 1 1 0 2 3 3
## 6 0 0 0 0 0 0 0 1 0 0 0 0
Correspondence analysis (CA)
# library() stops with an error when the package is missing, whereas require()
# only returns FALSE and lets the script fail later at the first ca() call.
library(easyCODA)
## Loading required package: easyCODA
## Loading required package: ca
## Loading required package: vegan
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
## Loading required package: ellipse
##
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
##
## pairs
# Contingency table of success category (rows) by Chord (columns).
CHORD4SUCCESS <- table(succcescat, Catchart[["Chord"]])
# Correspondence analysis of that table, drawn as a symmetric map of the first
# two dimensions.
my.ca <- ca(CHORD4SUCCESS)
PLOT.CA(
  my.ca,
  map = "symmetric",
  rescale = 4,
  dim = c(1, 2),
  axes.inv = c(1, 1),
  main = "CHORDS FOR SUCCESS",
  cols = c("blue", "red"),
  colarrows = "pink",
  cexs = c(0.8, 0.8),
  fonts = c(2, 4)
)
# Chi-squared test of independence on the same table.
# NOTE(review): the recorded run warns that the chi-squared approximation may
# be incorrect; the table contains many zero and very small counts, so
# consider merging the sparse categories before relying on the p-value.
chisq.test(CHORD4SUCCESS)
## Warning in chisq.test(CHORD4SUCCESS): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: CHORD4SUCCESS
## X-squared = 50.746, df = 55, p-value = 0.6378
##Insights
library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.3 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the complete workbook with no column-type specification (read_excel()
# picks the types itself), overwriting the earlier numeric-only table.
spotify_dataset_TOP_200_ <- read_excel(
  path = "C:/Users/glupinacci/OneDrive - BUSINESS INTEGRATION PARTNERS SPA/Desktop/R/DATASETS/spotify_dataset TOP 200 .xlsx"
)
# Working alias and a preview of the first six rows.
TOPchart <- spotify_dataset_TOP_200_
head(TOPchart, n = 6)
## # A tibble: 6 x 23
## Index `Highest Chartin~ `Number of Times~ `Week of Highes~ `Song Name` Streams
## <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1 1 8 2021-07-23--202~ Beggin' 4.86e7
## 2 2 2 3 2021-07-23--202~ STAY (with~ 4.72e7
## 3 3 1 11 2021-06-25--202~ good 4 u 4.02e7
## 4 4 3 5 2021-07-02--202~ Bad Habits 3.78e7
## 5 5 5 1 2021-07-23--202~ INDUSTRY B~ 3.39e7
## 6 6 1 18 2021-05-07--202~ MONTERO (C~ 3.01e7
## # ... with 17 more variables: Artist <chr>, Artist Followers <dbl>,
## # Song ID <chr>, Genre <chr>, Release Date <chr>, Weeks Charted <chr>,
## # Popularity <dbl>, Danceability <dbl>, Energy <dbl>, Loudness <dbl>,
## # Speechiness <dbl>, Acousticness <dbl>, Liveness <dbl>, Tempo <dbl>,
## # Duration (ms) <dbl>, Valence <dbl>, Chord <chr>
library(dplyr)
# Keep the rows whose Index lies between 1 and 200, then display them.
Top200 <- filter(spotify_dataset_TOP_200_, Index %in% seq_len(200))
print(Top200)
## # A tibble: 200 x 23
## Index `Highest Chartin~ `Number of Time~ `Week of Highes~ `Song Name` Streams
## <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1 1 8 2021-07-23--202~ Beggin' 4.86e7
## 2 2 2 3 2021-07-23--202~ STAY (with~ 4.72e7
## 3 3 1 11 2021-06-25--202~ good 4 u 4.02e7
## 4 4 3 5 2021-07-02--202~ Bad Habits 3.78e7
## 5 5 5 1 2021-07-23--202~ INDUSTRY B~ 3.39e7
## 6 6 1 18 2021-05-07--202~ MONTERO (C~ 3.01e7
## 7 7 3 16 2021-05-14--202~ Kiss Me Mo~ 2.94e7
## 8 8 2 10 2021-06-18--202~ Todo De Ti 2.70e7
## 9 9 3 8 2021-06-18--202~ Yonaguni 2.50e7
## 10 10 8 10 2021-07-02--202~ I WANNA BE~ 2.46e7
## # ... with 190 more rows, and 17 more variables: Artist <chr>,
## # Artist Followers <dbl>, Song ID <chr>, Genre <chr>, Release Date <chr>,
## # Weeks Charted <chr>, Popularity <dbl>, Danceability <dbl>, Energy <dbl>,
## # Loudness <dbl>, Speechiness <dbl>, Acousticness <dbl>, Liveness <dbl>,
## # Tempo <dbl>, Duration (ms) <dbl>, Valence <dbl>, Chord <chr>
##Most recurrent Artist ##Most recurrent Genre ##Most recurrent key
# Pull the three columns of interest out of Top200 as plain vectors.
Artist <- Top200[["Artist"]]
Genre <- Top200[["Genre"]]
Key <- Top200[["Chord"]]
#' Most frequent values in a column of comma-separated entries
#'
#' Each element of `x` may hold several values separated by ", ". Elements
#' written as a bracketed, quoted list (e.g. "['pop', 'dance pop']") are
#' unwrapped first, so that "'pop'", "['pop'" and "'pop']" are all counted as
#' the single value "pop" and an empty list "[]" contributes nothing. Plain
#' strings without brackets or surrounding quotes are counted unchanged.
#'
#' @param x A vector coercible to character.
#' @param n Number of top values to return.
#' @return A one-dimensional table with the `n` most frequent values, sorted
#'   by increasing count (the most frequent value is last).
freqfunc <- function(x, n) {
  entries <- as.character(x)
  # Drop the enclosing "[...]" of list-formatted entries.
  entries <- sub("^\\[(.*)\\]$", "\\1", entries)
  tokens <- unlist(strsplit(entries, ", ", fixed = TRUE))
  # Remove a matching pair of single or double quotes around a token.
  tokens <- sub("^(['\"])(.*)\\1$", "\\2", tokens, perl = TRUE)
  # Missing and empty tokens carry no value to count.
  tokens <- tokens[!is.na(tokens) & nzchar(tokens)]
  tail(sort(table(tokens)), n)
}
# Five most frequent artists in the Top 200.
freqfunc(Artist, 5)
##
## Pop Smoke Bad Bunny Doja Cat The Weeknd Olivia Rodrigo
## 5 6 6 6 10
# Five most frequent genres in the Top 200.
# NOTE(review): the labels recorded below still carry list punctuation
# ("[", "]", quotes), and "[]" appears as a value of its own.
freqfunc(Genre, 5)
##
## 'reggaeton' 'trap latino'] [] ['dance pop' 'pop'
## 18 19 19 32 35
# Five most frequent values of the Chord column in the Top 200.
freqfunc(Key, 5)
##
## C B F#/Gb G#/Ab C#/Db
## 17 19 22 22 33