FinalGroupProject_BCE

Importing and Renaming the Data Set

remove(list=ls())

Most.Streamed.Spotify.Songs.2024 <- read.csv("~/Desktop/BCE Coding/Most Streamed Spotify Songs 2024.csv", comment.char="#")

spotify_songs <- Most.Streamed.Spotify.Songs.2024

Cleaning the Data

library(visdat)
library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
library(stargazer)
library(psych)
#install.packages("nanair")
library(naniar)
library(ggplot2)

Attaching package: 'ggplot2'
The following objects are masked from 'package:psych':

    %+%, alpha
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# install.packages("Hmisc")
library("Hmisc")

Attaching package: 'Hmisc'
The following objects are masked from 'package:dplyr':

    src, summarize
The following object is masked from 'package:psych':

    describe
The following objects are masked from 'package:base':

    format.pval, units
vis_dat(spotify_songs)

vis_miss(spotify_songs)

missing_values_count <- sapply(spotify_songs, function(x) sum(is.na(x)))
print(missing_values_count)
                     Track                 Album.Name 
                         0                          0 
                    Artist               Release.Date 
                         0                          0 
                      ISRC              All.Time.Rank 
                         0                          0 
               Track.Score            Spotify.Streams 
                        14                          0 
    Spotify.Playlist.Count     Spotify.Playlist.Reach 
                         0                          0 
        Spotify.Popularity              YouTube.Views 
                       816                          0 
             YouTube.Likes               TikTok.Posts 
                         0                          0 
              TikTok.Likes               TikTok.Views 
                         0                          0 
    YouTube.Playlist.Reach Apple.Music.Playlist.Count 
                         0                        572 
             AirPlay.Spins             SiriusXM.Spins 
                         0                          0 
     Deezer.Playlist.Count      Deezer.Playlist.Reach 
                       931                          0 
     Amazon.Playlist.Count            Pandora.Streams 
                      1064                          0 
    Pandora.Track.Stations         Soundcloud.Streams 
                         0                          0 
             Shazam.Counts           TIDAL.Popularity 
                         0                       4599 
            Explicit.Track 
                        14 
for(i in colnames(spotify_songs)){
  spotify_songs[, i][is.na(spotify_songs[, i])] <- mean(spotify_songs[, i], na.rm = TRUE)
}
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
Warning in mean.default(spotify_songs[, i], na.rm = TRUE): argument is not
numeric or logical: returning NA
vis_miss(spotify_songs)

df_clean <- dplyr::select(.data = spotify_songs, -TIDAL.Popularity)

vis_miss(df_clean)

stargazer(df_clean,
          type = "text",
          title = "Summary statistics",
          digits = 2,
          omit.summary.stat = "n",
          notes = "n = 4599.")

Summary statistics
======================================================
Statistic                  Mean  St. Dev.  Min   Max  
------------------------------------------------------
Track.Score                41.84  38.52   19.40 725.40
Spotify.Popularity         63.49  14.70   1.00  96.00 
Apple.Music.Playlist.Count 54.49  66.72   1.00  859.00
Deezer.Playlist.Count      32.19  48.21   1.00  632.00
Amazon.Playlist.Count      25.29  22.70   1.00  210.00
Explicit.Track             0.36    0.48   0.00   1.00 
------------------------------------------------------
n = 4599.                                             

Creating a Subset

df_subset <- sample_n(tbl = df_clean,
                      size = 500)

Ggplots

ggplot(data = df_clean, 
       mapping = aes(x = TikTok.Posts, y = Spotify.Popularity)) + geom_point(colour = "deeppink") + ggtitle("Correlation Between TikTok Posts and Spotify Popularity")

ggplot(data = df_subset, 
       mapping = aes(x = TikTok.Posts, y = Spotify.Popularity)) + geom_point(colour = "deeppink") + ggtitle("Correlation Between TikTok Posts and Spotify Popularity")

ggplot(data = df_clean, 
       mapping = aes(x = YouTube.Views, y = YouTube.Likes)) + geom_point(colour = "deeppink") + ggtitle("Correlation Between Youtube Views and Youtube Likes")

ggplot(data = df_subset, 
       mapping = aes(x = YouTube.Views, y = YouTube.Likes)) + geom_point(colour = "deeppink") + ggtitle("Correlation Between Youtube Views and Youtube Likes")

?gsub

char_vector <- c("Spotify.Streams", "YouTube.Likes", "YouTube.Views", "TikTok.Posts", "TikTok.Views", "TikTok.Posts")

clean_subset <- gsub(",", "", char_vector)

is.numeric(df_subset)
[1] FALSE
names(df_subset)
 [1] "Track"                      "Album.Name"                
 [3] "Artist"                     "Release.Date"              
 [5] "ISRC"                       "All.Time.Rank"             
 [7] "Track.Score"                "Spotify.Streams"           
 [9] "Spotify.Playlist.Count"     "Spotify.Playlist.Reach"    
[11] "Spotify.Popularity"         "YouTube.Views"             
[13] "YouTube.Likes"              "TikTok.Posts"              
[15] "TikTok.Likes"               "TikTok.Views"              
[17] "YouTube.Playlist.Reach"     "Apple.Music.Playlist.Count"
[19] "AirPlay.Spins"              "SiriusXM.Spins"            
[21] "Deezer.Playlist.Count"      "Deezer.Playlist.Reach"     
[23] "Amazon.Playlist.Count"      "Pandora.Streams"           
[25] "Pandora.Track.Stations"     "Soundcloud.Streams"        
[27] "Shazam.Counts"              "Explicit.Track"            
?numeric
as.numeric(df_subset$Spotify.Streams)
Warning: NAs introduced by coercion
  [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
 [26] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
 [51] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
 [76] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[101] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[126] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[151] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[176] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[201] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[226] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[251] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[276] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[301] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[326] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[351] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[376] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[401] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[426] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[451] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[476] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA

Correlation Matrix

# computing p values of the data loaded
#p_values <- rcorr(as.matrix(data))
#print(p_values)