# FinalGroupProject_BCE
#
# Load the raw "Most Streamed Spotify Songs 2024" CSV and give the data frame
# a shorter working name.

# NOTE(review): this clears every object in the calling environment. It is kept
# because the original script relies on starting from an empty workspace, but
# restarting the R session is the safer way to get a clean state.
remove(list = ls())

# comment.char = "" is read.csv()'s documented default. With "#" the reader
# treats everything after a `#` as a comment, so a row whose text contains `#`
# is cut short (or skipped when the line starts with `#`).
# NOTE(review): this may be where the 14 NA values recorded for Track.Score and
# Explicit.Track come from -- confirm on the next run.
Most.Streamed.Spotify.Songs.2024 <- read.csv(
  "~/Desktop/BCE Coding/Most Streamed Spotify Songs 2024.csv",
  comment.char = ""
)

# The original line referred to `Most.Streamed.Spotify.Songs.2024FinalGroupProject_BCE`,
# an object that is never created (the document title had been fused onto the
# name); the data frame read above is the intended source.
spotify_songs <- Most.Streamed.Spotify.Songs.2024
Importing and Renaming the Data Set
Cleaning the Data
library(visdat)
library(stargazer)
Please cite as:
Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(stargazer)
library(psych)
#install.packages("naniar")
library(naniar)
library(ggplot2)
Attaching package: 'ggplot2'
The following objects are masked from 'package:psych':
%+%, alpha
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# install.packages("Hmisc")
library("Hmisc")
Attaching package: 'Hmisc'
The following objects are masked from 'package:dplyr':
src, summarize
The following object is masked from 'package:psych':
describe
The following objects are masked from 'package:base':
format.pval, units
# Overview of the raw data: column types, missingness pattern, and the number
# of NA values in each column.
vis_dat(spotify_songs)
vis_miss(spotify_songs)

# vapply() fixes the result to exactly one integer per column, whereas
# sapply() can silently change its return type (e.g. on an empty data frame).
missing_values_count <- vapply(
  spotify_songs,
  function(x) sum(is.na(x)),
  integer(1)
)
print(missing_values_count)
# Recorded output of the original run:
#> Track Album.Name
#> 0 0
#> Artist Release.Date
#> 0 0
#> ISRC All.Time.Rank
#> 0 0
#> Track.Score Spotify.Streams
#> 14 0
#> Spotify.Playlist.Count Spotify.Playlist.Reach
#> 0 0
#> Spotify.Popularity YouTube.Views
#> 816 0
#> YouTube.Likes TikTok.Posts
#> 0 0
#> TikTok.Likes TikTok.Views
#> 0 0
#> YouTube.Playlist.Reach Apple.Music.Playlist.Count
#> 0 572
#> AirPlay.Spins SiriusXM.Spins
#> 0 0
#> Deezer.Playlist.Count Deezer.Playlist.Reach
#> 931 0
#> Amazon.Playlist.Count Pandora.Streams
#> 1064 0
#> Pandora.Track.Stations Soundcloud.Streams
#> 0 0
#> Shazam.Counts TIDAL.Popularity
#> 0 4599
#> Explicit.Track
#> 14
# Replace missing values with the column mean, for numeric columns only.
#
# The original loop called mean() on every column. For each non-numeric column
# that produced the warning
#   "argument is not numeric or logical: returning NA"
# (recorded 22 times in the original run) and then assigned NA back into the
# column, which changes nothing. Restricting the loop to numeric columns gives
# the same fill for those columns without the warnings; all other columns are
# left untouched.
#
# NOTE(review): the recorded summary shows Explicit.Track ranging from 0 to 1,
# so a mean fill puts fractional values into that column -- confirm this is
# intended.
numeric_cols <- names(spotify_songs)[
  vapply(spotify_songs, is.numeric, logical(1))
]

for (col in numeric_cols) {
  is_missing <- is.na(spotify_songs[[col]])
  if (any(is_missing)) {
    spotify_songs[[col]][is_missing] <- mean(spotify_songs[[col]], na.rm = TRUE)
  }
}
# Check what is still missing after imputation.
vis_miss(spotify_songs)

# Drop TIDAL.Popularity: its recorded NA count (4599) equals the reported
# number of rows, so the column carries no information.
df_clean <- dplyr::select(.data = spotify_songs, -TIDAL.Popularity)
vis_miss(df_clean)

# Summary table of the numeric columns. The sample size in the note is taken
# from the data instead of being hard-coded, so it cannot drift out of date.
stargazer(df_clean,
          type = "text",
          title = "Summary statistics",
          digits = 2,
          omit.summary.stat = "n",
          notes = paste0("n = ", nrow(df_clean), "."))
# Recorded output of the original run:
#> Summary statistics
#> ======================================================
#> Statistic Mean St. Dev. Min Max
#> ------------------------------------------------------
#> Track.Score 41.84 38.52 19.40 725.40
#> Spotify.Popularity 63.49 14.70 1.00 96.00
#> Apple.Music.Playlist.Count 54.49 66.72 1.00 859.00
#> Deezer.Playlist.Count 32.19 48.21 1.00 632.00
#> Amazon.Playlist.Count 25.29 22.70 1.00 210.00
#> Explicit.Track 0.36 0.48 0.00 1.00
#> ------------------------------------------------------
#> n = 4599.
Creating a Subset
# Draw 500 rows at random from the cleaned data. The seed is fixed so that the
# same subset (and therefore the same plots) is produced on every run.
set.seed(2024)
df_subset <- sample_n(tbl = df_clean,
                      size = 500)

# Ggplots
# Scatter plots on the full data (df_clean) and on the random subset
# (df_subset).
#
# TikTok.Posts, YouTube.Views and YouTube.Likes do not appear in the recorded
# numeric summary table, i.e. they were not read as numbers, so plotting them
# directly puts every distinct string on a discrete axis.
# NOTE(review): presumably they hold comma-formatted counts like
# Spotify.Streams -- confirm against the CSV.
# parse_count() strips the separators and converts to numeric; values that
# cannot be parsed become NA and are dropped by ggplot with a warning.
parse_count <- function(x) {
  as.numeric(gsub(",", "", x, fixed = TRUE))
}

ggplot(data = df_clean,
       mapping = aes(x = parse_count(TikTok.Posts), y = Spotify.Popularity)) +
  geom_point(colour = "deeppink") +
  labs(x = "TikTok.Posts") +
  ggtitle("Correlation Between TikTok Posts and Spotify Popularity")

ggplot(data = df_subset,
       mapping = aes(x = parse_count(TikTok.Posts), y = Spotify.Popularity)) +
  geom_point(colour = "deeppink") +
  labs(x = "TikTok.Posts") +
  ggtitle("Correlation Between TikTok Posts and Spotify Popularity")

ggplot(data = df_clean,
       mapping = aes(x = parse_count(YouTube.Views),
                     y = parse_count(YouTube.Likes))) +
  geom_point(colour = "deeppink") +
  labs(x = "YouTube.Views", y = "YouTube.Likes") +
  ggtitle("Correlation Between Youtube Views and Youtube Likes")

ggplot(data = df_subset,
       mapping = aes(x = parse_count(YouTube.Views),
                     y = parse_count(YouTube.Likes))) +
  geom_point(colour = "deeppink") +
  labs(x = "YouTube.Views", y = "YouTube.Likes") +
  ggtitle("Correlation Between Youtube Views and Youtube Likes")

?gsub
# Convert the comma-formatted count columns of the subset to numeric.
#
# NOTE(review): "TikTok.Posts" is listed twice below; "TikTok.Likes" may have
# been intended for one of them -- confirm. unique() is used when indexing
# because assigning to a data frame with duplicated column names is an error.
char_vector <- c("Spotify.Streams", "YouTube.Likes", "YouTube.Views", "TikTok.Posts", "TikTok.Views", "TikTok.Posts")
count_cols <- unique(char_vector)

# The original ran gsub() over `char_vector` itself, i.e. over the column
# *names*, which contain no commas, so nothing in the data was changed.
# Here the commas are stripped from the column *values*, which are then
# converted to numeric. clean_subset is a copy of df_subset with those columns
# converted; df_subset itself is left as it was.
clean_subset <- df_subset
clean_subset[count_cols] <- lapply(
  df_subset[count_cols],
  function(x) as.numeric(gsub(",", "", x, fixed = TRUE))
)

# is.numeric() on a whole data frame is always FALSE (the original call
# printed `[1] FALSE`), so check the converted columns one by one instead.
vapply(clean_subset[count_cols], is.numeric, logical(1))

names(df_subset)
# Recorded output of the original run:
#> [1] "Track" "Album.Name"
#> [3] "Artist" "Release.Date"
#> [5] "ISRC" "All.Time.Rank"
#> [7] "Track.Score" "Spotify.Streams"
#> [9] "Spotify.Playlist.Count" "Spotify.Playlist.Reach"
#> [11] "Spotify.Popularity" "YouTube.Views"
#> [13] "YouTube.Likes" "TikTok.Posts"
#> [15] "TikTok.Likes" "TikTok.Views"
#> [17] "YouTube.Playlist.Reach" "Apple.Music.Playlist.Count"
#> [19] "AirPlay.Spins" "SiriusXM.Spins"
#> [21] "Deezer.Playlist.Count" "Deezer.Playlist.Reach"
#> [23] "Amazon.Playlist.Count" "Pandora.Streams"
#> [25] "Pandora.Track.Stations" "Soundcloud.Streams"
#> [27] "Shazam.Counts" "Explicit.Track"

?numeric

# The original call, as.numeric(df_subset$Spotify.Streams), emitted
# "Warning: NAs introduced by coercion" and returned NA for all 500 rows,
# because the strings could not be parsed as numbers. Removing the commas
# first lets the values parse.
as.numeric(gsub(",", "", df_subset$Spotify.Streams, fixed = TRUE))
Correlation Matrix
# computing p values of the data loaded
#p_values <- rcorr(as.matrix(data))
#print(p_values)