# Load the "Most Streamed Spotify Songs 2024" export into `spotify_songs`.
# The script no longer starts with remove(list = ls()): wiping the caller's
# global environment is a destructive side effect and does not make the
# analysis more reproducible.
# Earlier import, kept for reference:
#spotify_songs <- read.csv("~/Desktop/BCE Coding/Most Streamed Spotify Songs 2024.csv", comment.char="#")
# NOTE(review): na.strings expects a character vector; FALSE is presumably
# coerced to "FALSE", so the text "NA" is no longer read as missing while
# blank numeric fields still are -- confirm this is intended.
# NOTE(review): comment.char = "#" makes read.csv discard the rest of a line
# after an unquoted "#", so titles containing "#" may be truncated or dropped
# -- verify the row count against the raw file.
spotify_songs <- read.csv(
  "~/Desktop/BCE Coding/Most Streamed Spotify Songs 2024.csv",
  comment.char = "#",
  na.strings = FALSE
)
FinalGroupProject_BCE
Importing and Renaming the Data Set
Cleaning the Data
library(visdat)
library(stargazer)
Please cite as:
Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(stargazer)
library(psych)
#install.packages("naniar")
library(naniar)
library(ggplot2)
Attaching package: 'ggplot2'
The following objects are masked from 'package:psych':
%+%, alpha
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# install.packages("Hmisc")
library("Hmisc")
Attaching package: 'Hmisc'
The following objects are masked from 'package:dplyr':
src, summarize
The following object is masked from 'package:psych':
describe
The following objects are masked from 'package:base':
format.pval, units
library(MASS)
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
# Visualise column types and missingness of the raw import.
vis_dat(spotify_songs)
vis_miss(spotify_songs)
# Count the NA values in each column; vapply() guarantees an integer vector
# named by column, whatever the input looks like.
missing_values_count <- vapply(
  spotify_songs,
  function(x) sum(is.na(x)),
  integer(1)
)
print(missing_values_count)
Track Album.Name
0 0
Artist Release.Date
0 0
ISRC All.Time.Rank
0 0
Track.Score Spotify.Streams
14 0
Spotify.Playlist.Count Spotify.Playlist.Reach
0 0
Spotify.Popularity YouTube.Views
816 0
YouTube.Likes TikTok.Posts
0 0
TikTok.Likes TikTok.Views
0 0
YouTube.Playlist.Reach Apple.Music.Playlist.Count
0 572
AirPlay.Spins SiriusXM.Spins
0 0
Deezer.Playlist.Count Deezer.Playlist.Reach
931 0
Amazon.Playlist.Count Pandora.Streams
1064 0
Pandora.Track.Stations Soundcloud.Streams
0 0
Shazam.Counts TIDAL.Popularity
0 4599
Explicit.Track
14
# TIDAL.Popularity is NA in every row (see the counts printed above), so it
# carries no information; assigning NULL removes the column.
spotify_songs$TIDAL.Popularity <- NULL
Loop to clean variables
library("dplyr")
# Inspect the column types that remain after the drop.
str(spotify_songs)
'data.frame': 4599 obs. of 28 variables:
$ Track : chr "MILLION DOLLAR BABY" "Not Like Us" "i like the way you kiss me" "Flowers" ...
$ Album.Name : chr "Million Dollar Baby - Single" "Not Like Us" "I like the way you kiss me" "Flowers - Single" ...
$ Artist : chr "Tommy Richman" "Kendrick Lamar" "Artemas" "Miley Cyrus" ...
$ Release.Date : chr "4/26/2024" "5/4/2024" "3/19/2024" "1/12/2023" ...
$ ISRC : chr "QM24S2402528" "USUG12400910" "QZJ842400387" "USSM12209777" ...
$ All.Time.Rank : chr "1" "2" "3" "4" ...
$ Track.Score : num 725 546 538 445 423 ...
$ Spotify.Streams : chr "390,470,936" "323,703,884" "601,309,283" "2,031,280,633" ...
$ Spotify.Playlist.Count : chr "30,716" "28,113" "54,331" "269,802" ...
$ Spotify.Playlist.Reach : chr "196,631,588" "174,597,137" "211,607,669" "136,569,078" ...
$ Spotify.Popularity : int 92 92 92 85 88 83 86 92 NA 86 ...
$ YouTube.Views : chr "84,274,754" "116,347,040" "122,599,116" "1,096,100,899" ...
$ YouTube.Likes : chr "1,713,126" "3,486,739" "2,228,730" "10,629,796" ...
$ TikTok.Posts : chr "5,767,700" "674,700" "3,025,400" "7,189,811" ...
$ TikTok.Likes : chr "651,565,900" "35,223,547" "275,154,237" "1,078,757,968" ...
$ TikTok.Views : chr "5,332,281,936" "208,339,025" "3,369,120,610" "14,603,725,994" ...
$ YouTube.Playlist.Reach : chr "150,597,040" "156,380,351" "373,784,955" "3,351,188,582" ...
$ Apple.Music.Playlist.Count: int 210 188 190 394 182 138 280 160 NA 191 ...
$ AirPlay.Spins : chr "40,975" "40,778" "74,333" "1,474,799" ...
$ SiriusXM.Spins : chr "684" "3" "536" "2,182" ...
$ Deezer.Playlist.Count : int 62 67 136 264 82 86 168 87 NA 78 ...
$ Deezer.Playlist.Reach : chr "17,598,718" "10,422,430" "36,321,847" "24,684,248" ...
$ Amazon.Playlist.Count : int 114 111 172 210 105 152 154 53 NA 92 ...
$ Pandora.Streams : chr "18,004,655" "7,780,028" "5,022,621" "190,260,277" ...
$ Pandora.Track.Stations : chr "22,931" "28,444" "5,639" "203,384" ...
$ Soundcloud.Streams : chr "4,818,457" "6,623,075" "7,208,651" "" ...
$ Shazam.Counts : chr "2,669,262" "1,118,279" "5,285,340" "11,822,942" ...
$ Explicit.Track : int 0 1 0 0 1 1 0 1 1 1 ...
# Work on a copy so the raw import stays untouched.
df_clean <- spotify_songs

# Columns that were read as text because their values use thousands
# separators (e.g. "390,470,936"). Each column is listed exactly once.
string_vars <- c(
  "All.Time.Rank",
  "Spotify.Playlist.Count",
  "Spotify.Playlist.Reach",
  "Shazam.Counts",
  "YouTube.Views",
  "YouTube.Likes",
  "TikTok.Posts",
  "TikTok.Likes",
  "TikTok.Views",
  "YouTube.Playlist.Reach",
  "Spotify.Streams",
  "AirPlay.Spins",
  "SiriusXM.Spins",
  "Pandora.Streams",
  "Pandora.Track.Stations",
  "Soundcloud.Streams",
  "Deezer.Playlist.Reach"
)
# string_vars <- names(df_clean)

# Strip the separators and convert to numeric. Empty strings become NA and
# are imputed later in the script.
for (i in string_vars) {
  print(i)
  df_clean[[i]] <- as.numeric(gsub(",", "", df_clean[[i]], fixed = TRUE))
}
[1] "All.Time.Rank"
[1] "Spotify.Playlist.Count"
[1] "Spotify.Playlist.Reach"
[1] "Shazam.Counts"
[1] "YouTube.Views"
[1] "YouTube.Likes"
[1] "TikTok.Posts"
[1] "TikTok.Likes"
[1] "TikTok.Views"
[1] "YouTube.Playlist.Reach"
[1] "Spotify.Streams"
[1] "AirPlay.Spins"
[1] "SiriusXM.Spins"
[1] "Pandora.Streams"
[1] "Pandora.Track.Stations"
[1] "Soundcloud.Streams"
[1] "Deezer.Playlist.Reach"
vis_miss(df_clean)
Re-Cleaning Data
# Replace missing values with the column median. Only numeric columns are
# touched: a median of text columns (track, album, artist, ...) has no
# meaning and must never be written into them.
for (col in colnames(df_clean)) {
  values <- df_clean[[col]]
  if (!is.numeric(values)) {
    next
  }
  values[is.na(values)] <- median(values, na.rm = TRUE)
  df_clean[[col]] <- values
}
# Re-plot missingness to confirm the numeric columns are now complete.
vis_miss(df_clean)
Summary Stats
# Text table of summary statistics for df_clean. The per-variable N column
# is omitted; the note reports the actual row count instead of a
# hard-coded figure.
stargazer(
  df_clean,
  type = "text",
  title = "Summary statistics",
  digits = 2,
  omit.summary.stat = "n",
  notes = paste0("n = ", nrow(df_clean), ".")
)
Summary statistics
====================================================================================
Statistic Mean St. Dev. Min Max
------------------------------------------------------------------------------------
All.Time.Rank 2,291.24 1,321.00 1 4,998
Track.Score 41.80 38.52 19.40 725.40
Spotify.Streams 441,758,666.00 532,031,630.00 1,071.00 4,281,468,720.00
Spotify.Playlist.Count 58,896.01 70,554.65 1 590,392
Spotify.Playlist.Reach 23,165,195.00 29,451,055.00 1 262,343,414
Spotify.Popularity 64.11 14.76 1 96
YouTube.Views 385,204,459.00 680,960,386.00 913 16,322,756,555
YouTube.Likes 2,811,887.00 4,453,202.00 25 62,311,179
TikTok.Posts 749,253.00 2,134,003.00 1 42,900,000
TikTok.Likes 94,129,948.00 489,048,075.00 3 23,474,223,833
TikTok.Views 968,241,415.00 5,230,165,630.00 19 233,232,311,463
YouTube.Playlist.Reach 286,352,133.00 590,352,367.00 1 7,289,707,052
Apple.Music.Playlist.Count 51.19 67.29 1 859
AirPlay.Spins 49,153.18 119,356.00 1 1,777,811
SiriusXM.Spins 176.96 414.49 1.00 7,098.00
Deezer.Playlist.Count 28.71 48.70 1 632
Deezer.Playlist.Reach 1,076,784.00 3,202,914.00 1 48,197,850
Amazon.Playlist.Count 23.38 22.97 1 210
Pandora.Streams 67,992,567.00 149,554,570.00 2 1,463,624,043
Pandora.Track.Stations 65,763.56 224,896.40 1 3,780,513
Soundcloud.Streams 6,454,278.00 17,636,382.00 18.00 319,835,943.00
Shazam.Counts 2,311,550.00 5,669,531.00 1 219,794,543
Explicit.Track 0.36 0.48 0 1
------------------------------------------------------------------------------------
n = 4599.
Ggplots
# Scatter plot: TikTok posts (x) against Spotify popularity (y).
ggplot(
  data = df_clean,
  mapping = aes(x = TikTok.Posts, y = Spotify.Popularity)
) +
  geom_point(colour = "deeppink") +
  ggtitle("Correlation Between TikTok Posts and Spotify Popularity")

# Scatter plot: YouTube views (x) against YouTube likes (y).
ggplot(
  data = df_clean,
  mapping = aes(x = YouTube.Views, y = YouTube.Likes)
) +
  geom_point(colour = "deeppink") +
  ggtitle("Correlation Between Youtube Views and Youtube Likes")
Correlation Matrix
library(ggplot2)
# Reshape to long format (one row per song/variable pair). melt() picks the
# id variables itself and reports them in the message below.
df_melted <- reshape2::melt(df_clean)
Using Track, Album.Name, Artist, Release.Date, ISRC as id variables
# One histogram per melted variable, each with its own x scale.
ggplot(df_melted, aes(x = value)) +
  geom_histogram() +
  facet_wrap(~variable, scales = "free_x")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Histograms
# Histogram of Spotify stream counts.
ggplot(
  data = df_clean, # cleaned, imputed data
  mapping = aes(x = Spotify.Streams) # variable being binned
) +
  geom_histogram() +
  ggtitle("Histogram of Spotify Streams")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Spotify Regression
# Histogram of the Spotify popularity score. The title names the variable
# that is actually plotted.
ggplot(
  data = df_clean, # cleaned, imputed data
  mapping = aes(x = Spotify.Popularity) # variable being binned
) +
  geom_histogram() +
  ggtitle("Histogram of Spotify Popularity")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#computing p values of the data loaded
#p_values <- rcorr(as.matrix(df_clean))
#print(p_values)
# Assuming df_clean is your original dataframe
# Modelling table: keep rows that have a rank, derive the release year from
# the last four characters of Release.Date, and drop the identifier and raw
# date columns. release_year stays a character column.
df_subset <- df_clean |>
  dplyr::filter(!is.na(All.Time.Rank)) |>
  dplyr::mutate(
    release_year = substring(Release.Date, first = nchar(Release.Date) - 3)
  ) |>
  dplyr::select(-ISRC, -Track, -Release.Date)
# "Kitchen sink" OLS: regress Spotify.Streams on every other column of
# df_subset (the `.`), minus the terms subtracted in the formula.
# na.omit() drops any row that still contains an NA before fitting.
kitchen_sink <-
lm(data = na.omit(df_subset),
formula = Spotify.Streams ~ . - Album.Name - Artist - SiriusXM.Spins - Soundcloud.Streams - Track.Score
)
# Print the coefficient table and fit statistics.
summary(kitchen_sink)
Call:
lm(formula = Spotify.Streams ~ . - Album.Name - Artist - SiriusXM.Spins -
Soundcloud.Streams - Track.Score, data = na.omit(df_subset))
Residuals:
Min 1Q Median 3Q Max
-894145352 -110967205 -38796475 40396981 4049289159
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.062e+06 7.638e+07 0.053 0.957596
All.Time.Rank -2.172e+04 3.982e+03 -5.455 5.16e-08 ***
Spotify.Playlist.Count 3.167e+03 1.177e+02 26.907 < 2e-16 ***
Spotify.Playlist.Reach 1.166e+00 2.717e-01 4.290 1.82e-05 ***
Spotify.Popularity 1.566e+06 3.156e+05 4.961 7.26e-07 ***
YouTube.Views -2.758e-02 1.199e-02 -2.301 0.021450 *
YouTube.Likes 1.961e+01 1.794e+00 10.933 < 2e-16 ***
TikTok.Posts -4.267e+00 2.481e+00 -1.720 0.085498 .
TikTok.Likes 1.599e-01 7.258e-02 2.203 0.027627 *
TikTok.Views -1.512e-02 6.956e-03 -2.174 0.029792 *
YouTube.Playlist.Reach 7.857e-03 8.156e-03 0.963 0.335418
Apple.Music.Playlist.Count 1.297e+06 1.307e+05 9.925 < 2e-16 ***
AirPlay.Spins 1.681e+02 4.661e+01 3.606 0.000315 ***
Deezer.Playlist.Count 6.935e+05 1.429e+05 4.852 1.26e-06 ***
Deezer.Playlist.Reach -6.455e+00 1.635e+00 -3.948 8.01e-05 ***
Amazon.Playlist.Count -5.198e+05 2.876e+05 -1.807 0.070751 .
Pandora.Streams 4.459e-01 4.441e-02 10.041 < 2e-16 ***
Pandora.Track.Stations -1.025e+02 2.655e+01 -3.860 0.000115 ***
Shazam.Counts -3.294e-01 8.353e-01 -0.394 0.693362
Explicit.Track -2.728e+07 8.933e+06 -3.053 0.002276 **
release_year1987 -6.946e+07 2.854e+08 -0.243 0.807715
release_year1991 -2.917e+08 2.850e+08 -1.024 0.306066
release_year1994 9.698e+08 2.981e+08 3.253 0.001150 **
release_year1998 -9.508e+08 2.835e+08 -3.354 0.000802 ***
release_year1999 -1.769e+08 2.862e+08 -0.618 0.536458
release_year2000 1.315e+08 2.101e+08 0.626 0.531264
release_year2001 -4.657e+08 2.834e+08 -1.643 0.100424
release_year2002 2.094e+08 1.458e+08 1.436 0.151112
release_year2003 1.451e+08 1.751e+08 0.829 0.407216
release_year2004 -1.002e+08 1.455e+08 -0.689 0.490851
release_year2005 8.186e+07 1.507e+08 0.543 0.586953
release_year2006 -7.846e+07 1.753e+08 -0.448 0.654490
release_year2007 -1.263e+07 1.437e+08 -0.088 0.929975
release_year2008 -7.505e+07 1.125e+08 -0.667 0.504601
release_year2009 5.086e+07 1.036e+08 0.491 0.623500
release_year2010 -2.063e+07 9.327e+07 -0.221 0.824971
release_year2011 3.421e+07 8.510e+07 0.402 0.687691
release_year2012 1.857e+08 8.585e+07 2.163 0.030560 *
release_year2013 1.662e+08 8.339e+07 1.993 0.046322 *
release_year2014 1.376e+08 8.130e+07 1.692 0.090666 .
release_year2015 1.790e+08 7.940e+07 2.255 0.024194 *
release_year2016 1.865e+08 7.786e+07 2.395 0.016669 *
release_year2017 1.888e+08 7.595e+07 2.485 0.012982 *
release_year2018 1.615e+08 7.533e+07 2.144 0.032102 *
release_year2019 5.227e+07 7.482e+07 0.699 0.484821
release_year2020 -1.369e+07 7.447e+07 -0.184 0.854194
release_year2021 1.437e+07 7.412e+07 0.194 0.846291
release_year2022 2.684e+07 7.350e+07 0.365 0.715005
release_year2023 2.079e+07 7.318e+07 0.284 0.776379
release_year2024 -2.848e+07 7.359e+07 -0.387 0.698786
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 271600000 on 4549 degrees of freedom
Multiple R-squared: 0.7422, Adjusted R-squared: 0.7394
F-statistic: 267.2 on 49 and 4549 DF, p-value: < 2.2e-16
library(MASS)
# Backward stepwise selection by AIC, starting from the kitchen-sink model.
# stepAIC() prints a trace of every step, shown below.
AIC_model <- stepAIC(object = kitchen_sink, direction = "backward")
Start: AIC=178673.6
Spotify.Streams ~ (Album.Name + Artist + All.Time.Rank + Track.Score +
Spotify.Playlist.Count + Spotify.Playlist.Reach + Spotify.Popularity +
YouTube.Views + YouTube.Likes + TikTok.Posts + TikTok.Likes +
TikTok.Views + YouTube.Playlist.Reach + Apple.Music.Playlist.Count +
AirPlay.Spins + SiriusXM.Spins + Deezer.Playlist.Count +
Deezer.Playlist.Reach + Amazon.Playlist.Count + Pandora.Streams +
Pandora.Track.Stations + Soundcloud.Streams + Shazam.Counts +
Explicit.Track + release_year) - Album.Name - Artist - SiriusXM.Spins -
Soundcloud.Streams - Track.Score
Df Sum of Sq RSS AIC
- Shazam.Counts 1 1.1470e+16 3.3559e+20 178672
- YouTube.Playlist.Reach 1 6.8462e+16 3.3564e+20 178673
<none> 3.3557e+20 178674
- TikTok.Posts 1 2.1824e+17 3.3579e+20 178675
- Amazon.Playlist.Count 1 2.4101e+17 3.3582e+20 178675
- TikTok.Views 1 3.4850e+17 3.3592e+20 178676
- TikTok.Likes 1 3.5810e+17 3.3593e+20 178676
- YouTube.Views 1 3.9050e+17 3.3597e+20 178677
- Explicit.Track 1 6.8773e+17 3.3626e+20 178681
- AirPlay.Spins 1 9.5901e+17 3.3653e+20 178685
- Pandora.Track.Stations 1 1.0991e+18 3.3667e+20 178687
- Deezer.Playlist.Reach 1 1.1496e+18 3.3672e+20 178687
- Spotify.Playlist.Reach 1 1.3578e+18 3.3693e+20 178690
- Deezer.Playlist.Count 1 1.7367e+18 3.3731e+20 178695
- Spotify.Popularity 1 1.8157e+18 3.3739e+20 178696
- All.Time.Rank 1 2.1950e+18 3.3777e+20 178702
- Apple.Music.Playlist.Count 1 7.2668e+18 3.4284e+20 178770
- Pandora.Streams 1 7.4378e+18 3.4301e+20 178772
- YouTube.Likes 1 8.8169e+18 3.4439e+20 178791
- release_year 30 1.5902e+19 3.5148e+20 178827
- Spotify.Playlist.Count 1 5.3408e+19 3.8898e+20 179351
Step: AIC=178671.7
Spotify.Streams ~ All.Time.Rank + Spotify.Playlist.Count + Spotify.Playlist.Reach +
Spotify.Popularity + YouTube.Views + YouTube.Likes + TikTok.Posts +
TikTok.Likes + TikTok.Views + YouTube.Playlist.Reach + Apple.Music.Playlist.Count +
AirPlay.Spins + Deezer.Playlist.Count + Deezer.Playlist.Reach +
Amazon.Playlist.Count + Pandora.Streams + Pandora.Track.Stations +
Explicit.Track + release_year
Df Sum of Sq RSS AIC
- YouTube.Playlist.Reach 1 6.8670e+16 3.3566e+20 178671
<none> 3.3559e+20 178672
- TikTok.Posts 1 2.1694e+17 3.3580e+20 178673
- Amazon.Playlist.Count 1 2.3862e+17 3.3583e+20 178673
- TikTok.Views 1 3.5529e+17 3.3594e+20 178675
- TikTok.Likes 1 3.6517e+17 3.3595e+20 178675
- YouTube.Views 1 3.8413e+17 3.3597e+20 178675
- Explicit.Track 1 6.8064e+17 3.3627e+20 178679
- AirPlay.Spins 1 9.5811e+17 3.3654e+20 178683
- Pandora.Track.Stations 1 1.0964e+18 3.3668e+20 178685
- Deezer.Playlist.Reach 1 1.1499e+18 3.3674e+20 178685
- Spotify.Playlist.Reach 1 1.3740e+18 3.3696e+20 178689
- Deezer.Playlist.Count 1 1.7255e+18 3.3731e+20 178693
- Spotify.Popularity 1 1.8120e+18 3.3740e+20 178694
- All.Time.Rank 1 2.1836e+18 3.3777e+20 178700
- Apple.Music.Playlist.Count 1 7.3222e+18 3.4291e+20 178769
- Pandora.Streams 1 7.4434e+18 3.4303e+20 178771
- YouTube.Likes 1 8.8055e+18 3.4439e+20 178789
- release_year 30 1.5979e+19 3.5157e+20 178826
- Spotify.Playlist.Count 1 5.3636e+19 3.8922e+20 179352
Step: AIC=178670.7
Spotify.Streams ~ All.Time.Rank + Spotify.Playlist.Count + Spotify.Playlist.Reach +
Spotify.Popularity + YouTube.Views + YouTube.Likes + TikTok.Posts +
TikTok.Likes + TikTok.Views + Apple.Music.Playlist.Count +
AirPlay.Spins + Deezer.Playlist.Count + Deezer.Playlist.Reach +
Amazon.Playlist.Count + Pandora.Streams + Pandora.Track.Stations +
Explicit.Track + release_year
Df Sum of Sq RSS AIC
<none> 3.3566e+20 178671
- TikTok.Posts 1 2.2155e+17 3.3588e+20 178672
- Amazon.Playlist.Count 1 2.3621e+17 3.3589e+20 178672
- TikTok.Views 1 3.6207e+17 3.3602e+20 178674
- TikTok.Likes 1 3.7275e+17 3.3603e+20 178674
- YouTube.Views 1 3.8254e+17 3.3604e+20 178674
- Explicit.Track 1 6.8607e+17 3.3634e+20 178678
- Deezer.Playlist.Reach 1 1.1029e+18 3.3676e+20 178684
- Pandora.Track.Stations 1 1.1298e+18 3.3678e+20 178684
- AirPlay.Spins 1 1.2314e+18 3.3689e+20 178686
- Spotify.Playlist.Reach 1 1.3829e+18 3.3704e+20 178688
- Deezer.Playlist.Count 1 1.7552e+18 3.3741e+20 178693
- Spotify.Popularity 1 1.8202e+18 3.3748e+20 178694
- All.Time.Rank 1 2.2437e+18 3.3790e+20 178699
- Apple.Music.Playlist.Count 1 7.3122e+18 3.4297e+20 178768
- Pandora.Streams 1 7.4529e+18 3.4311e+20 178770
- YouTube.Likes 1 9.2458e+18 3.4490e+20 178794
- release_year 30 1.5975e+19 3.5163e+20 178824
- Spotify.Playlist.Count 1 5.3576e+19 3.8923e+20 179350
# Side-by-side text table of the full and AIC-selected models. Terms
# matching "release_year" are left out of the table.
stargazer(
  kitchen_sink,
  AIC_model,
  type = "text",
  omit = "release_year"
)
==================================================================================
Dependent variable:
-------------------------------------------------------
Spotify.Streams
(1) (2)
----------------------------------------------------------------------------------
All.Time.Rank -21,718.560*** -21,843.580***
(3,981.553) (3,960.362)
Spotify.Playlist.Count 3,167.159*** 3,160.753***
(117.708) (117.273)
Spotify.Playlist.Reach 1.166*** 1.175***
(0.272) (0.271)
Spotify.Popularity 1,565,632.000*** 1,567,325.000***
(315,577.400) (315,495.800)
YouTube.Views -0.028** -0.027**
(0.012) (0.012)
YouTube.Likes 19.611*** 19.843***
(1.794) (1.772)
TikTok.Posts -4.267* -4.298*
(2.481) (2.480)
TikTok.Likes 0.160** 0.163**
(0.073) (0.072)
TikTok.Views -0.015** -0.015**
(0.007) (0.007)
YouTube.Playlist.Reach 0.008
(0.008)
Apple.Music.Playlist.Count 1,296,961.000*** 1,289,090.000***
(130,675.200) (129,465.400)
AirPlay.Spins 168.069*** 181.562***
(46.613) (44.435)
Deezer.Playlist.Count 693,503.100*** 695,002.700***
(142,930.600) (142,466.900)
Deezer.Playlist.Reach -6.455*** -6.286***
(1.635) (1.626)
Amazon.Playlist.Count -519,752.400* -514,376.000*
(287,553.600) (287,428.100)
Pandora.Streams 0.446*** 0.446***
(0.044) (0.044)
Pandora.Track.Stations -102.494*** -103.751***
(26.553) (26.509)
Shazam.Counts -0.329
(0.835)
Explicit.Track -27,276,327.000*** -27,208,007.000***
(8,933,361.000) (8,920,871.000)
Constant 4,061,609.000 4,453,666.000
(76,383,717.000) (76,371,655.000)
----------------------------------------------------------------------------------
Observations 4,599 4,599
R2 0.742 0.742
Adjusted R2 0.739 0.739
Residual Std. Error 271,604,390.000 (df = 4549) 271,577,126.000 (df = 4551)
F Statistic 267.223*** (df = 49; 4549) 278.627*** (df = 47; 4551)
==================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
Predictions
# Simple OLS of Spotify streams on TikTok views. The columns are named in
# the formula and supplied through `data`, so predict(reg1, newdata = ...)
# evaluates the model on the rows it is given instead of silently reusing
# the vectors captured at fit time.
reg1 <- lm(formula = Spotify.Streams ~ TikTok.Views, data = df_clean)

# Residual checks: the sum of residuals (close to zero for OLS with an
# intercept), the residual sum of squares, and the squared residual sum.
round(sum(reg1$residuals), 15)
[1] 0.0044052
sum(reg1$residuals^2)
[1] 1.299444e+21
sum(reg1$residuals)^2
[1] 1.940579e-05

# Fitted values plotted against the observed stream counts.
ggplot2::ggplot(
  mapping = aes(x = df_clean$Spotify.Streams, y = reg1$fitted.values)
) +
  geom_point() +
  geom_line()
summary(reg1)
Call:
lm(formula = Spotify.Streams ~ TikTok.Views, data = df_clean)
Residuals:
Min 1Q Median 3Q Max
-1.369e+09 -3.662e+08 -1.992e+08 1.682e+08 3.829e+09
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.378e+08 7.973e+06 54.915 < 2e-16 ***
TikTok.Views 4.042e-03 1.499e-03 2.696 0.00704 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 531700000 on 4597 degrees of freedom
Multiple R-squared: 0.001579, Adjusted R-squared: 0.001362
F-statistic: 7.27 on 1 and 4597 DF, p-value: 0.007037
# Reproduce the simple-regression coefficients by hand:
#   slope     = cov(x, y) / var(x)
#   intercept = mean(y) - slope * mean(x)
var(x = df_clean$TikTok.Views)
[1] 2.735463e+19
cov(y = df_clean$Spotify.Streams, x = df_clean$TikTok.Views)
[1] 1.105716e+17
beta1 <- cov(y = df_clean$Spotify.Streams, x = df_clean$TikTok.Views) /
  var(x = df_clean$TikTok.Views)
beta1
[1] 0.004042152
x_bar <- mean(x = df_clean$TikTok.Views)
y_bar <- mean(x = df_clean$Spotify.Streams)
beta0 <- y_bar - beta1 * x_bar
beta0
[1] 437844887
library(stargazer)
# Text regression table for reg1. The interactive `?stargazer` help lookup
# was removed: it opens a help page and contributes nothing to the analysis.
stargazer(reg1, type = "text")
===============================================
Dependent variable:
---------------------------
Spotify.Streams
-----------------------------------------------
TikTok.Views 0.004***
(0.001)
Constant 437,844,887.000***
(7,973,128.000)
-----------------------------------------------
Observations 4,599
R2 0.002
Adjusted R2 0.001
Residual Std. Error 531,669,245.000 (df = 4597)
F Statistic 7.270*** (df = 1; 4597)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
# Base-R diagnostic plots for the linear fit.
plot(reg1)
# Same regression with a square-root transform of TikTok views. Columns are
# named in the formula and supplied through `data` so the fitted model can
# be used with predict(newdata = ...).
reg2 <- lm(formula = Spotify.Streams ~ sqrt(TikTok.Views), data = df_clean)
plot(reg2)
# Store reg1's predicted stream count for every row of df_clean.
df_clean$predictions <- predict(object = reg1, newdata = df_clean)
summary(df_clean$predictions)
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.378e+08 4.382e+08 4.389e+08 4.418e+08 4.404e+08 1.381e+09
# Flag rows whose predicted value exceeds 0.5 (1) or not (0).
# NOTE(review): the predictions are stream counts (the summary above shows a
# minimum of about 4.4e+08), so this test is TRUE for every row and the flag
# is always 1 -- this looks carried over from a binary-outcome example;
# confirm the intended threshold.
df_clean$predicted_Spotify.Streams <- ifelse(test = df_clean$predictions > .5, yes = 1, no = 0)
# Cross-tabulate observed stream counts against the flag; head() keeps the
# first rows of the table.
head(table(df_clean$Spotify.Streams, df_clean$predicted_Spotify.Streams))
1
1071 1
1186 1
1224 1
1332 1
1384 1
1537 1