library(readxl)
# Import only the numeric columns of the Spotify Top 200 workbook.
# `col_types` is positional over the 23 sheet columns: "numeric" keeps the
# column as a double and "skip" drops it, leaving 14 columns.
# Column names in the trailing comments follow the printed output of the
# full import of the same workbook; names shown with "~" are truncated there.
spotify_dataset_TOP_200_ <- read_excel(
  "C:/Users/glupinacci/OneDrive - BUSINESS INTEGRATION PARTNERS SPA/Desktop/R/DATASETS/spotify_dataset TOP 200 .xlsx",
  col_types = c(
    "numeric", # Index
    "skip",    # Highest Chartin~
    "numeric", # Number of Times Charted
    "skip",    # Week of Highes~
    "skip",    # Song Name
    "numeric", # Streams
    "skip",    # Artist
    "numeric", # Artist Followers
    "skip",    # Song ID
    "skip",    # Genre
    "skip",    # Release Date
    "skip",    # Weeks Charted
    "numeric", # Popularity
    "numeric", # Danceability
    "numeric", # Energy
    "numeric", # Loudness
    "numeric", # Speechiness
    "numeric", # Acousticness
    "numeric", # Liveness
    "numeric", # Tempo
    "numeric", # Duration (ms)
    "numeric", # Valence
    "skip"     # Chord
  )
)

# Shorter working alias for the numeric subset, followed by a preview.
chart <- spotify_dataset_TOP_200_
head(chart)
## # A tibble: 6 x 14
##   Index `Number of Time~ Streams `Artist Followe~ Popularity Danceability Energy
##   <dbl>            <dbl>   <dbl>            <dbl>      <dbl>        <dbl>  <dbl>
## 1     1                8  4.86e7          3377762        100        0.714  0.8  
## 2     2                3  4.72e7          2230022         99        0.591  0.764
## 3     3               11  4.02e7          6266514         99        0.563  0.664
## 4     4                5  3.78e7         83293380         98        0.808  0.897
## 5     5                1  3.39e7          5473565         96        0.736  0.704
## 6     6               18  3.01e7          5473565         97        0.61   0.508
## # ... with 7 more variables: Loudness <dbl>, Speechiness <dbl>,
## #   Acousticness <dbl>, Liveness <dbl>, Tempo <dbl>, Duration (ms) <dbl>,
## #   Valence <dbl>
# Number of rows and columns in the numeric subset.
dim(chart)
## [1] 1556   14

## Loading required packages

library(rpart)
library(rpart.plot)
library(corrplot)
## corrplot 0.90 loaded
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

##Multiple regression

# Multiple linear regression of Popularity on every other column of `chart`.
# The response is named by column (not `chart$Popularity`) so that the whole
# formula is evaluated inside `data`; this keeps the model reusable with
# `update()`, `predict(newdata = ...)` or a differently filtered data frame.
# The `.` still expands to the same 13 predictors, so the fit is unchanged.
model <- lm(Popularity ~ ., data = chart)
summary(model)
## 
## Call:
## lm(formula = Popularity ~ ., data = chart)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -86.916  -3.215   2.780   7.510  25.304 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                7.873e+01  5.171e+00  15.224  < 2e-16 ***
## Index                     -1.048e-02  9.428e-04 -11.119  < 2e-16 ***
## `Number of Times Charted`  1.154e-01  2.472e-02   4.666 3.34e-06 ***
## Streams                    2.652e-07  1.159e-07   2.287   0.0223 *  
## `Artist Followers`         1.141e-07  2.280e-08   5.007 6.18e-07 ***
## Danceability               5.984e+00  3.141e+00   1.905   0.0569 .  
## Energy                    -1.215e+00  3.794e+00  -0.320   0.7488    
## Loudness                   9.743e-01  2.232e-01   4.364 1.36e-05 ***
## Speechiness                2.161e+00  3.526e+00   0.613   0.5400    
## Acousticness              -2.596e+00  1.893e+00  -1.371   0.1704    
## Liveness                  -1.724e+00  2.616e+00  -0.659   0.5100    
## Tempo                     -1.375e-02  1.268e-02  -1.084   0.2784    
## `Duration (ms)`            1.310e-05  8.000e-06   1.638   0.1017    
## Valence                   -4.759e+00  1.916e+00  -2.483   0.0131 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.4 on 1531 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.1792, Adjusted R-squared:  0.1723 
## F-statistic: 25.72 on 13 and 1531 DF,  p-value: < 2.2e-16
# Side-by-side boxplots of four audio features. `names` labels each box;
# without it the boxes are only numbered 1-4 and cannot be told apart.
boxplot(
  chart$Energy, chart$Acousticness, chart$Speechiness, chart$Liveness,
  names = c("Energy", "Acousticness", "Speechiness", "Liveness")
)

##Categorised Data analysis

# Second import of the same workbook, this time keeping the two categorical
# columns (Genre, Chord) as text next to a handful of numeric ones.
# `col_types` is positional over the 23 sheet columns; "skip" drops a column.
# Column names in the trailing comments follow the printed output of the
# full import of the same workbook; names shown with "~" are truncated there.
spotify_dataset_TOP_200_CAT <- read_excel(
  "C:/Users/glupinacci/OneDrive - BUSINESS INTEGRATION PARTNERS SPA/Desktop/R/DATASETS/spotify_dataset TOP 200 .xlsx",
  col_types = c(
    "numeric", # Index
    "skip",    # Highest Chartin~
    "numeric", # Number of Times Charted
    "skip",    # Week of Highes~
    "skip",    # Song Name
    "numeric", # Streams
    "skip",    # Artist
    "skip",    # Artist Followers
    "skip",    # Song ID
    "text",    # Genre
    "skip",    # Release Date
    "skip",    # Weeks Charted
    "numeric", # Popularity
    "skip",    # Danceability
    "skip",    # Energy
    "skip",    # Loudness
    "skip",    # Speechiness
    "skip",    # Acousticness
    "skip",    # Liveness
    "numeric", # Tempo
    "skip",    # Duration (ms)
    "skip",    # Valence
    "text"     # Chord
  )
)

# Working alias for the categorical subset; printing it shows the first rows.
Catchart <- spotify_dataset_TOP_200_CAT
Catchart
## # A tibble: 1,556 x 7
##    Index `Number of Times Charted`  Streams Genre         Popularity Tempo Chord
##    <dbl>                     <dbl>    <dbl> <chr>              <dbl> <dbl> <chr>
##  1     1                         8 48633449 ['indie rock~        100  134. B    
##  2     2                         3 47248719 ['australian~         99  170. C#/Db
##  3     3                        11 40162559 ['pop']               99  167. A    
##  4     4                         5 37799456 ['pop', 'uk ~         98  126. B    
##  5     5                         1 33948454 ['lgbtq+ hip~         96  150. D#/Eb
##  6     6                        18 30071134 ['lgbtq+ hip~         97  179. G#/Ab
##  7     7                        16 29356736 ['dance pop'~         94  111. G#/Ab
##  8     8                        10 26951613 ['puerto ric~         95  128. D#/Eb
##  9     9                         8 25030128 ['latin', 'r~         96  180. C#/Db
## 10    10                        10 24551591 ['indie rock~         95  133. C#/Db
## # ... with 1,546 more rows

## Categorizing success by number of times charted

# Check the spread of chart appearances before choosing the cut-offs.
range(Catchart$`Number of Times Charted`)
## [1]   1 142
# Success category per song: starts at 1 and moves up one level for every
# cut-off strictly exceeded (> 20 -> 2, > 40 -> 3, > 60 -> 4, > 80 -> 5,
# > 100 -> 6). Cut-offs are applied in ascending order, so the highest
# exceeded one wins.
times_charted <- Catchart[["Number of Times Charted"]]
success_cutoffs <- c(20, 40, 60, 80, 100)
succcescat <- rep(1, nrow(Catchart))
for (level in seq_along(success_cutoffs)) {
  succcescat[times_charted > success_cutoffs[level]] <- level + 1
}
# Number of songs in each category, then the same distribution as a plot.
table(succcescat)
## succcescat
##    1    2    3    4    5    6 
## 1293  175   43   21   23    1
hist(succcescat)

Top two categories:

6 → charted more than 100 times; 5 → charted more than 80 times

| Song Name | Streams | Artist |
|---|---|---|
| Blinding Lights | 15,011,809 | The Weeknd |
| Watermelon Sugar | 11,996,689 | Harry Styles |
| Don’t Start Now | 8,821,971 | Dua Lipa |
| Someone You Loved | 8,490,162 | Lewis Capaldi |
| Dance Monkey | 7,687,058 | Tones And I |
| Circles | 7,156,162 | Post Malone |
| Before You Go | 6,813,800 | Lewis Capaldi |
| Believer | 6,699,895 | Imagine Dragons |
| Roses - Imanbek Remix | 6,653,690 | SAINt JHN |
| lovely (with Khalid) | 6,569,547 | Billie Eilish |
| Sunflower - Spider-Man: Into the Spider-Verse | 6,546,948 | Post Malone, Swae Lee |
| Señorita | 6,317,487 | Shawn Mendes, Camila Cabello |
| Perfect | 6,278,765 | Ed Sheeran |
| Memories | 5,928,329 | Maroon 5 |
| Bohemian Rhapsody - Remastered 2011 | 5,756,583 | Queen |
| goosebumps | 5,691,540 | Travis Scott |
| Shallow | 5,672,480 | Lady Gaga, Bradley Cooper |
| Lucid Dreams | 5,477,563 | Juice WRLD |
| bad guy | 5,436,286 | Billie Eilish |
| Say You Won’t Let Go | 5,375,426 | James Arthur |
| Adore You | 5,166,847 | Harry Styles |
| 7 rings | 5,002,227 | Ariana Grande |
| Ride It | 5,235,088 | Regard |
| Falling | 5,294,368 | Harry Styles |

# Cross-tabulate the success category (rows) against the song's chord (columns).
table(succcescat,Catchart$Chord)
##           
## succcescat   A A#/Bb   B   C C#/Db   D D#/Eb   E   F F#/Gb   G G#/Ab
##          1 102    98 113 128   179 108    35  92 109    95 120   104
##          2  11    15  21  17    23  13     4  15  16    14   9    16
##          3   4     5   4   5     6   0     0   2   6     7   2     2
##          4   0     2   1   2     2   3     0   1   0     3   2     5
##          5   1     2   2   3     4   1     1   1   0     2   3     3
##          6   0     0   0   0     0   0     0   1   0     0   0     0

Correspondence analysis (CA)

# Attach easyCODA with library(): it stops immediately if the package is not
# installed, whereas require() only returns FALSE and lets the script fail
# later at the first call that needs the package.
library(easyCODA)
## Loading required package: easyCODA
## Loading required package: ca
## Loading required package: vegan
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.5-7
## Loading required package: ellipse
## 
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
## 
##     pairs
# Contingency table: success category (rows) by chord (columns).
CHORD4SUCCESS <- table(succcescat, Catchart$Chord)

# Correspondence analysis of the contingency table, drawn as a symmetric map
# on dimensions 1 and 2.
my.ca <- ca(CHORD4SUCCESS)
PLOT.CA(
  my.ca,
  map = "symmetric",
  rescale = 4,
  dim = c(1, 2),
  axes.inv = c(1, 1),
  main = "CHORDS FOR SUCCESS",
  cols = c("blue", "red"),
  colarrows = "pink",
  cexs = c(0.8, 0.8),
  fonts = c(2, 4)
)

# Pearson chi-squared test of independence between success category and chord.
# NOTE(review): the recorded run warns "Chi-squared approximation may be
# incorrect"; several cells of the table hold very small counts, so consider
# `simulate.p.value = TRUE` before relying on the p-value.
chisq.test(CHORD4SUCCESS)
## Warning in chisq.test(CHORD4SUCCESS): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  CHORD4SUCCESS
## X-squared = 50.746, df = 55, p-value = 0.6378

##Insights

library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.3     v purrr   0.3.4
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Re-import the workbook with every column and guessed column types.
# This overwrites `spotify_dataset_TOP_200_`, which previously held only the
# numeric subset; `chart` keeps pointing at that earlier subset.
spotify_dataset_TOP_200_ <- read_excel(
  path = "C:/Users/glupinacci/OneDrive - BUSINESS INTEGRATION PARTNERS SPA/Desktop/R/DATASETS/spotify_dataset TOP 200 .xlsx"
)

# Alias for the full table, followed by a preview of its first rows.
TOPchart <- spotify_dataset_TOP_200_
head(TOPchart)
## # A tibble: 6 x 23
##   Index `Highest Chartin~ `Number of Times~ `Week of Highes~ `Song Name` Streams
##   <dbl>             <dbl>             <dbl> <chr>            <chr>         <dbl>
## 1     1                 1                 8 2021-07-23--202~ Beggin'      4.86e7
## 2     2                 2                 3 2021-07-23--202~ STAY (with~  4.72e7
## 3     3                 1                11 2021-06-25--202~ good 4 u     4.02e7
## 4     4                 3                 5 2021-07-02--202~ Bad Habits   3.78e7
## 5     5                 5                 1 2021-07-23--202~ INDUSTRY B~  3.39e7
## 6     6                 1                18 2021-05-07--202~ MONTERO (C~  3.01e7
## # ... with 17 more variables: Artist <chr>, Artist Followers <dbl>,
## #   Song ID <chr>, Genre <chr>, Release Date <chr>, Weeks Charted <chr>,
## #   Popularity <dbl>, Danceability <dbl>, Energy <dbl>, Loudness <dbl>,
## #   Speechiness <dbl>, Acousticness <dbl>, Liveness <dbl>, Tempo <dbl>,
## #   Duration (ms) <dbl>, Valence <dbl>, Chord <chr>
library(dplyr)
# Keep only the rows whose Index is between 1 and 200, then print the result.
Top200 <- filter(spotify_dataset_TOP_200_, Index %in% seq_len(200))
Top200
## # A tibble: 200 x 23
##    Index `Highest Chartin~ `Number of Time~ `Week of Highes~ `Song Name` Streams
##    <dbl>             <dbl>            <dbl> <chr>            <chr>         <dbl>
##  1     1                 1                8 2021-07-23--202~ Beggin'      4.86e7
##  2     2                 2                3 2021-07-23--202~ STAY (with~  4.72e7
##  3     3                 1               11 2021-06-25--202~ good 4 u     4.02e7
##  4     4                 3                5 2021-07-02--202~ Bad Habits   3.78e7
##  5     5                 5                1 2021-07-23--202~ INDUSTRY B~  3.39e7
##  6     6                 1               18 2021-05-07--202~ MONTERO (C~  3.01e7
##  7     7                 3               16 2021-05-14--202~ Kiss Me Mo~  2.94e7
##  8     8                 2               10 2021-06-18--202~ Todo De Ti   2.70e7
##  9     9                 3                8 2021-06-18--202~ Yonaguni     2.50e7
## 10    10                 8               10 2021-07-02--202~ I WANNA BE~  2.46e7
## # ... with 190 more rows, and 17 more variables: Artist <chr>,
## #   Artist Followers <dbl>, Song ID <chr>, Genre <chr>, Release Date <chr>,
## #   Weeks Charted <chr>, Popularity <dbl>, Danceability <dbl>, Energy <dbl>,
## #   Loudness <dbl>, Speechiness <dbl>, Acousticness <dbl>, Liveness <dbl>,
## #   Tempo <dbl>, Duration (ms) <dbl>, Valence <dbl>, Chord <chr>

## Most recurrent artist, genre and key

# Column shortcuts for the Top 200 subset.
# NOTE(review): none of these three variables is used in the visible part of
# the script (the frequency calls below read the Top200 columns directly);
# confirm they are needed further down before keeping them.
Artist <- Top200$Artist
Genre <- Top200$Genre
Key <- Top200$Chord

# Return the `n` most frequent entries of a column.
#
# Each element of `x` may hold several entries separated by ", " (e.g.
# "Post Malone, Swae Lee") or a list-style string (e.g. "['dance pop', 'pop']").
# Elements are split into single entries and list punctuation is removed
# before counting, so "['pop'", "'pop']" and "['pop']" are all counted as
# "pop" instead of three different values. Empty entries (from "[]") and
# missing values are not counted.
#
# x: vector coercible to character.
# n: number of entries to return.
# Returns a one-dimensional table with the `n` highest counts, sorted in
# ascending order of frequency (most frequent entry last).
freqfunc <- function(x, n) {
  tokens <- unlist(strsplit(as.character(x), ", ", fixed = TRUE))
  # Strip leading/trailing brackets, quotes and whitespace only, so that
  # apostrophes inside a name are left untouched.
  tokens <- gsub("^[\\[\\s'\"]+|[\\]\\s'\"]+$", "", tokens, perl = TRUE)
  tokens <- tokens[!is.na(tokens) & nzchar(tokens)]
  # dnn = NULL keeps the printed table free of a variable-name header.
  tail(sort(table(tokens, dnn = NULL)), n)
}

# Five most frequent artists in the Top 200 subset (most frequent last).
freqfunc(Top200$Artist, 5) 
## 
##      Pop Smoke      Bad Bunny       Doja Cat     The Weeknd Olivia Rodrigo 
##              5              6              6              6             10
# Five most frequent genre entries in the Top 200 subset.
freqfunc(Top200$Genre, 5)
## 
##    'reggaeton' 'trap latino']             []   ['dance pop'          'pop' 
##             18             19             19             32             35
# Five most frequent chords (keys) in the Top 200 subset.
freqfunc(Top200$Chord, 5)
## 
##     C     B F#/Gb G#/Ab C#/Db 
##    17    19    22    22    33