FinalGroupProject_BCE

Importing and Renaming the Data Set

remove(list=ls())

#spotify_songs  <- read.csv("~/Desktop/BCE Coding/Most Streamed Spotify Songs 2024.csv", comment.char="#")

spotify_songs  <- read.csv("~/Desktop/BCE Coding/Most Streamed Spotify Songs 2024.csv", comment.char="#", na.strings = FALSE)

Cleaning the Data

library(visdat)
library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
library(stargazer)
library(psych)
#install.packages("nanair")
library(naniar)
library(ggplot2)

Attaching package: 'ggplot2'
The following objects are masked from 'package:psych':

    %+%, alpha
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# install.packages("Hmisc")
library("Hmisc")

Attaching package: 'Hmisc'
The following objects are masked from 'package:dplyr':

    src, summarize
The following object is masked from 'package:psych':

    describe
The following objects are masked from 'package:base':

    format.pval, units
library(MASS)

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
vis_dat(spotify_songs)

vis_miss(spotify_songs)

missing_values_count <- sapply(spotify_songs, function(x) sum(is.na(x)))
print(missing_values_count)
                     Track                 Album.Name 
                         0                          0 
                    Artist               Release.Date 
                         0                          0 
                      ISRC              All.Time.Rank 
                         0                          0 
               Track.Score            Spotify.Streams 
                        14                          0 
    Spotify.Playlist.Count     Spotify.Playlist.Reach 
                         0                          0 
        Spotify.Popularity              YouTube.Views 
                       816                          0 
             YouTube.Likes               TikTok.Posts 
                         0                          0 
              TikTok.Likes               TikTok.Views 
                         0                          0 
    YouTube.Playlist.Reach Apple.Music.Playlist.Count 
                         0                        572 
             AirPlay.Spins             SiriusXM.Spins 
                         0                          0 
     Deezer.Playlist.Count      Deezer.Playlist.Reach 
                       931                          0 
     Amazon.Playlist.Count            Pandora.Streams 
                      1064                          0 
    Pandora.Track.Stations         Soundcloud.Streams 
                         0                          0 
             Shazam.Counts           TIDAL.Popularity 
                         0                       4599 
            Explicit.Track 
                        14 
spotify_songs$TIDAL.Popularity <- NULL

Loop to clean variables

library("dplyr")
str(spotify_songs)
'data.frame':   4599 obs. of  28 variables:
 $ Track                     : chr  "MILLION DOLLAR BABY" "Not Like Us" "i like the way you kiss me" "Flowers" ...
 $ Album.Name                : chr  "Million Dollar Baby - Single" "Not Like Us" "I like the way you kiss me" "Flowers - Single" ...
 $ Artist                    : chr  "Tommy Richman" "Kendrick Lamar" "Artemas" "Miley Cyrus" ...
 $ Release.Date              : chr  "4/26/2024" "5/4/2024" "3/19/2024" "1/12/2023" ...
 $ ISRC                      : chr  "QM24S2402528" "USUG12400910" "QZJ842400387" "USSM12209777" ...
 $ All.Time.Rank             : chr  "1" "2" "3" "4" ...
 $ Track.Score               : num  725 546 538 445 423 ...
 $ Spotify.Streams           : chr  "390,470,936" "323,703,884" "601,309,283" "2,031,280,633" ...
 $ Spotify.Playlist.Count    : chr  "30,716" "28,113" "54,331" "269,802" ...
 $ Spotify.Playlist.Reach    : chr  "196,631,588" "174,597,137" "211,607,669" "136,569,078" ...
 $ Spotify.Popularity        : int  92 92 92 85 88 83 86 92 NA 86 ...
 $ YouTube.Views             : chr  "84,274,754" "116,347,040" "122,599,116" "1,096,100,899" ...
 $ YouTube.Likes             : chr  "1,713,126" "3,486,739" "2,228,730" "10,629,796" ...
 $ TikTok.Posts              : chr  "5,767,700" "674,700" "3,025,400" "7,189,811" ...
 $ TikTok.Likes              : chr  "651,565,900" "35,223,547" "275,154,237" "1,078,757,968" ...
 $ TikTok.Views              : chr  "5,332,281,936" "208,339,025" "3,369,120,610" "14,603,725,994" ...
 $ YouTube.Playlist.Reach    : chr  "150,597,040" "156,380,351" "373,784,955" "3,351,188,582" ...
 $ Apple.Music.Playlist.Count: int  210 188 190 394 182 138 280 160 NA 191 ...
 $ AirPlay.Spins             : chr  "40,975" "40,778" "74,333" "1,474,799" ...
 $ SiriusXM.Spins            : chr  "684" "3" "536" "2,182" ...
 $ Deezer.Playlist.Count     : int  62 67 136 264 82 86 168 87 NA 78 ...
 $ Deezer.Playlist.Reach     : chr  "17,598,718" "10,422,430" "36,321,847" "24,684,248" ...
 $ Amazon.Playlist.Count     : int  114 111 172 210 105 152 154 53 NA 92 ...
 $ Pandora.Streams           : chr  "18,004,655" "7,780,028" "5,022,621" "190,260,277" ...
 $ Pandora.Track.Stations    : chr  "22,931" "28,444" "5,639" "203,384" ...
 $ Soundcloud.Streams        : chr  "4,818,457" "6,623,075" "7,208,651" "" ...
 $ Shazam.Counts             : chr  "2,669,262" "1,118,279" "5,285,340" "11,822,942" ...
 $ Explicit.Track            : int  0 1 0 0 1 1 0 1 1 1 ...
df_clean <- spotify_songs

string_vars <- c("All.Time.Rank",
  "Spotify.Playlist.Count",
                 "Spotify.Playlist.Reach", 
                 "Shazam.Counts", 
                 "YouTube.Views", 
                 "YouTube.Likes", 
                 "TikTok.Posts", 
                 "TikTok.Likes",
                 "TikTok.Views",
                 "YouTube.Playlist.Reach", "Spotify.Streams",
                 "AirPlay.Spins", "SiriusXM.Spins",  "Pandora.Streams", "Pandora.Track.Stations", "Soundcloud.Streams", "Pandora.Streams", "Deezer.Playlist.Reach", "AirPlay.Spins", "Pandora.Track.Stations"
                 )

# string_vars <- names(df_clean)

for (i in string_vars){
  print(i)
  df_clean[[i]] <- gsub(pattern = ",",  replacement = "",  x = df_clean[[i]] )
  df_clean[[i]] <- as.numeric(df_clean[[i]])
}
[1] "All.Time.Rank"
[1] "Spotify.Playlist.Count"
[1] "Spotify.Playlist.Reach"
[1] "Shazam.Counts"
[1] "YouTube.Views"
[1] "YouTube.Likes"
[1] "TikTok.Posts"
[1] "TikTok.Likes"
[1] "TikTok.Views"
[1] "YouTube.Playlist.Reach"
[1] "Spotify.Streams"
[1] "AirPlay.Spins"
[1] "SiriusXM.Spins"
[1] "Pandora.Streams"
[1] "Pandora.Track.Stations"
[1] "Soundcloud.Streams"
[1] "Pandora.Streams"
[1] "Deezer.Playlist.Reach"
[1] "AirPlay.Spins"
[1] "Pandora.Track.Stations"
vis_miss(df_clean)

Re-Cleaning Data

for(i in colnames(df_clean)){
  df_clean[,i][is.na(df_clean[,i])] <- median(df_clean[,i], na.rm = TRUE)
}

vis_miss(df_clean)

Summary Stats

stargazer(df_clean,
          type = "text",
          title = "Summary statistics",
          digits = 2,
          omit.summary.stat = "n",
          notes = "n = 4599.")

Summary statistics
====================================================================================
Statistic                       Mean          St. Dev.       Min          Max       
------------------------------------------------------------------------------------
All.Time.Rank                 2,291.24        1,321.00        1          4,998      
Track.Score                    41.80           38.52        19.40        725.40     
Spotify.Streams            441,758,666.00  532,031,630.00  1,071.00 4,281,468,720.00
Spotify.Playlist.Count       58,896.01       70,554.65        1         590,392     
Spotify.Playlist.Reach     23,165,195.00   29,451,055.00      1       262,343,414   
Spotify.Popularity             64.11           14.76          1            96       
YouTube.Views              385,204,459.00  680,960,386.00    913     16,322,756,555 
YouTube.Likes               2,811,887.00    4,453,202.00      25       62,311,179   
TikTok.Posts                 749,253.00     2,134,003.00      1        42,900,000   
TikTok.Likes               94,129,948.00   489,048,075.00     3      23,474,223,833 
TikTok.Views               968,241,415.00 5,230,165,630.00    19    233,232,311,463 
YouTube.Playlist.Reach     286,352,133.00  590,352,367.00     1      7,289,707,052  
Apple.Music.Playlist.Count     51.19           67.29          1           859       
AirPlay.Spins                49,153.18       119,356.00       1        1,777,811    
SiriusXM.Spins                 176.96          414.49        1.00       7,098.00    
Deezer.Playlist.Count          28.71           48.70          1           632       
Deezer.Playlist.Reach       1,076,784.00    3,202,914.00      1        48,197,850   
Amazon.Playlist.Count          23.38           22.97          1           210       
Pandora.Streams            67,992,567.00   149,554,570.00     2      1,463,624,043  
Pandora.Track.Stations       65,763.56       224,896.40       1        3,780,513    
Soundcloud.Streams          6,454,278.00   17,636,382.00    18.00    319,835,943.00 
Shazam.Counts               2,311,550.00    5,669,531.00      1       219,794,543   
Explicit.Track                  0.36            0.48          0            1        
------------------------------------------------------------------------------------
n = 4599.                                                                           

Ggplots

ggplot(data = df_clean, 
       mapping = aes(x = TikTok.Posts, y = Spotify.Popularity)) + geom_point(colour = "deeppink") + ggtitle("Correlation Between TikTok Posts and Spotify Popularity")

ggplot(data = df_clean, 
       mapping = aes(x = YouTube.Views, y = YouTube.Likes)) + geom_point(colour = "deeppink") + ggtitle("Correlation Between Youtube Views and Youtube Likes")

Correlation Matrix

library(ggplot2)

df_melted <- reshape2::melt(df_clean)
Using Track, Album.Name, Artist, Release.Date, ISRC as id variables
ggplot(df_melted, aes(x = value)) +
  geom_histogram() +
  facet_wrap(~variable, scales = "free_x")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Histograms

ggplot(data = df_clean, # inputs clean data
       mapping = # Inserts data for the homeruns by batters
         aes(x = Spotify.Streams)) + geom_histogram() + ggtitle("Histogram of Spotify Streams")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Spotify Regression

ggplot(data = df_clean, # inputs clean data
       mapping = # Inserts data for the homeruns by batters
         aes(x = Spotify.Popularity)) + geom_histogram() + ggtitle("Histogram of Spotify Streams")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#computing p values of the data loaded
#p_values <- rcorr(as.matrix(df_clean))
#print(p_values)
# Assuming df_clean is your original dataframe
df_subset <- df_clean |>
  dplyr::select(-ISRC, -Track) |>
  filter(!is.na(All.Time.Rank)) |>
  mutate(release_year = substr(Release.Date, 
                               start = nchar(Release.Date) - 3, 
                               stop = nchar(Release.Date)
                               )
         ) |>
 dplyr::select(-Release.Date) 


kitchen_sink <- 
lm(data = na.omit(df_subset),
   formula = Spotify.Streams ~ . - Album.Name - Artist - SiriusXM.Spins - Soundcloud.Streams - Track.Score    
   )

summary(kitchen_sink)

Call:
lm(formula = Spotify.Streams ~ . - Album.Name - Artist - SiriusXM.Spins - 
    Soundcloud.Streams - Track.Score, data = na.omit(df_subset))

Residuals:
       Min         1Q     Median         3Q        Max 
-894145352 -110967205  -38796475   40396981 4049289159 

Coefficients:
                             Estimate Std. Error t value Pr(>|t|)    
(Intercept)                 4.062e+06  7.638e+07   0.053 0.957596    
All.Time.Rank              -2.172e+04  3.982e+03  -5.455 5.16e-08 ***
Spotify.Playlist.Count      3.167e+03  1.177e+02  26.907  < 2e-16 ***
Spotify.Playlist.Reach      1.166e+00  2.717e-01   4.290 1.82e-05 ***
Spotify.Popularity          1.566e+06  3.156e+05   4.961 7.26e-07 ***
YouTube.Views              -2.758e-02  1.199e-02  -2.301 0.021450 *  
YouTube.Likes               1.961e+01  1.794e+00  10.933  < 2e-16 ***
TikTok.Posts               -4.267e+00  2.481e+00  -1.720 0.085498 .  
TikTok.Likes                1.599e-01  7.258e-02   2.203 0.027627 *  
TikTok.Views               -1.512e-02  6.956e-03  -2.174 0.029792 *  
YouTube.Playlist.Reach      7.857e-03  8.156e-03   0.963 0.335418    
Apple.Music.Playlist.Count  1.297e+06  1.307e+05   9.925  < 2e-16 ***
AirPlay.Spins               1.681e+02  4.661e+01   3.606 0.000315 ***
Deezer.Playlist.Count       6.935e+05  1.429e+05   4.852 1.26e-06 ***
Deezer.Playlist.Reach      -6.455e+00  1.635e+00  -3.948 8.01e-05 ***
Amazon.Playlist.Count      -5.198e+05  2.876e+05  -1.807 0.070751 .  
Pandora.Streams             4.459e-01  4.441e-02  10.041  < 2e-16 ***
Pandora.Track.Stations     -1.025e+02  2.655e+01  -3.860 0.000115 ***
Shazam.Counts              -3.294e-01  8.353e-01  -0.394 0.693362    
Explicit.Track             -2.728e+07  8.933e+06  -3.053 0.002276 ** 
release_year1987           -6.946e+07  2.854e+08  -0.243 0.807715    
release_year1991           -2.917e+08  2.850e+08  -1.024 0.306066    
release_year1994            9.698e+08  2.981e+08   3.253 0.001150 ** 
release_year1998           -9.508e+08  2.835e+08  -3.354 0.000802 ***
release_year1999           -1.769e+08  2.862e+08  -0.618 0.536458    
release_year2000            1.315e+08  2.101e+08   0.626 0.531264    
release_year2001           -4.657e+08  2.834e+08  -1.643 0.100424    
release_year2002            2.094e+08  1.458e+08   1.436 0.151112    
release_year2003            1.451e+08  1.751e+08   0.829 0.407216    
release_year2004           -1.002e+08  1.455e+08  -0.689 0.490851    
release_year2005            8.186e+07  1.507e+08   0.543 0.586953    
release_year2006           -7.846e+07  1.753e+08  -0.448 0.654490    
release_year2007           -1.263e+07  1.437e+08  -0.088 0.929975    
release_year2008           -7.505e+07  1.125e+08  -0.667 0.504601    
release_year2009            5.086e+07  1.036e+08   0.491 0.623500    
release_year2010           -2.063e+07  9.327e+07  -0.221 0.824971    
release_year2011            3.421e+07  8.510e+07   0.402 0.687691    
release_year2012            1.857e+08  8.585e+07   2.163 0.030560 *  
release_year2013            1.662e+08  8.339e+07   1.993 0.046322 *  
release_year2014            1.376e+08  8.130e+07   1.692 0.090666 .  
release_year2015            1.790e+08  7.940e+07   2.255 0.024194 *  
release_year2016            1.865e+08  7.786e+07   2.395 0.016669 *  
release_year2017            1.888e+08  7.595e+07   2.485 0.012982 *  
release_year2018            1.615e+08  7.533e+07   2.144 0.032102 *  
release_year2019            5.227e+07  7.482e+07   0.699 0.484821    
release_year2020           -1.369e+07  7.447e+07  -0.184 0.854194    
release_year2021            1.437e+07  7.412e+07   0.194 0.846291    
release_year2022            2.684e+07  7.350e+07   0.365 0.715005    
release_year2023            2.079e+07  7.318e+07   0.284 0.776379    
release_year2024           -2.848e+07  7.359e+07  -0.387 0.698786    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 271600000 on 4549 degrees of freedom
Multiple R-squared:  0.7422,    Adjusted R-squared:  0.7394 
F-statistic: 267.2 on 49 and 4549 DF,  p-value: < 2.2e-16
library(MASS)
AIC_model <-
stepAIC(object = kitchen_sink, direction = "backward")
Start:  AIC=178673.6
Spotify.Streams ~ (Album.Name + Artist + All.Time.Rank + Track.Score + 
    Spotify.Playlist.Count + Spotify.Playlist.Reach + Spotify.Popularity + 
    YouTube.Views + YouTube.Likes + TikTok.Posts + TikTok.Likes + 
    TikTok.Views + YouTube.Playlist.Reach + Apple.Music.Playlist.Count + 
    AirPlay.Spins + SiriusXM.Spins + Deezer.Playlist.Count + 
    Deezer.Playlist.Reach + Amazon.Playlist.Count + Pandora.Streams + 
    Pandora.Track.Stations + Soundcloud.Streams + Shazam.Counts + 
    Explicit.Track + release_year) - Album.Name - Artist - SiriusXM.Spins - 
    Soundcloud.Streams - Track.Score

                             Df  Sum of Sq        RSS    AIC
- Shazam.Counts               1 1.1470e+16 3.3559e+20 178672
- YouTube.Playlist.Reach      1 6.8462e+16 3.3564e+20 178673
<none>                                     3.3557e+20 178674
- TikTok.Posts                1 2.1824e+17 3.3579e+20 178675
- Amazon.Playlist.Count       1 2.4101e+17 3.3582e+20 178675
- TikTok.Views                1 3.4850e+17 3.3592e+20 178676
- TikTok.Likes                1 3.5810e+17 3.3593e+20 178676
- YouTube.Views               1 3.9050e+17 3.3597e+20 178677
- Explicit.Track              1 6.8773e+17 3.3626e+20 178681
- AirPlay.Spins               1 9.5901e+17 3.3653e+20 178685
- Pandora.Track.Stations      1 1.0991e+18 3.3667e+20 178687
- Deezer.Playlist.Reach       1 1.1496e+18 3.3672e+20 178687
- Spotify.Playlist.Reach      1 1.3578e+18 3.3693e+20 178690
- Deezer.Playlist.Count       1 1.7367e+18 3.3731e+20 178695
- Spotify.Popularity          1 1.8157e+18 3.3739e+20 178696
- All.Time.Rank               1 2.1950e+18 3.3777e+20 178702
- Apple.Music.Playlist.Count  1 7.2668e+18 3.4284e+20 178770
- Pandora.Streams             1 7.4378e+18 3.4301e+20 178772
- YouTube.Likes               1 8.8169e+18 3.4439e+20 178791
- release_year               30 1.5902e+19 3.5148e+20 178827
- Spotify.Playlist.Count      1 5.3408e+19 3.8898e+20 179351

Step:  AIC=178671.7
Spotify.Streams ~ All.Time.Rank + Spotify.Playlist.Count + Spotify.Playlist.Reach + 
    Spotify.Popularity + YouTube.Views + YouTube.Likes + TikTok.Posts + 
    TikTok.Likes + TikTok.Views + YouTube.Playlist.Reach + Apple.Music.Playlist.Count + 
    AirPlay.Spins + Deezer.Playlist.Count + Deezer.Playlist.Reach + 
    Amazon.Playlist.Count + Pandora.Streams + Pandora.Track.Stations + 
    Explicit.Track + release_year

                             Df  Sum of Sq        RSS    AIC
- YouTube.Playlist.Reach      1 6.8670e+16 3.3566e+20 178671
<none>                                     3.3559e+20 178672
- TikTok.Posts                1 2.1694e+17 3.3580e+20 178673
- Amazon.Playlist.Count       1 2.3862e+17 3.3583e+20 178673
- TikTok.Views                1 3.5529e+17 3.3594e+20 178675
- TikTok.Likes                1 3.6517e+17 3.3595e+20 178675
- YouTube.Views               1 3.8413e+17 3.3597e+20 178675
- Explicit.Track              1 6.8064e+17 3.3627e+20 178679
- AirPlay.Spins               1 9.5811e+17 3.3654e+20 178683
- Pandora.Track.Stations      1 1.0964e+18 3.3668e+20 178685
- Deezer.Playlist.Reach       1 1.1499e+18 3.3674e+20 178685
- Spotify.Playlist.Reach      1 1.3740e+18 3.3696e+20 178689
- Deezer.Playlist.Count       1 1.7255e+18 3.3731e+20 178693
- Spotify.Popularity          1 1.8120e+18 3.3740e+20 178694
- All.Time.Rank               1 2.1836e+18 3.3777e+20 178700
- Apple.Music.Playlist.Count  1 7.3222e+18 3.4291e+20 178769
- Pandora.Streams             1 7.4434e+18 3.4303e+20 178771
- YouTube.Likes               1 8.8055e+18 3.4439e+20 178789
- release_year               30 1.5979e+19 3.5157e+20 178826
- Spotify.Playlist.Count      1 5.3636e+19 3.8922e+20 179352

Step:  AIC=178670.7
Spotify.Streams ~ All.Time.Rank + Spotify.Playlist.Count + Spotify.Playlist.Reach + 
    Spotify.Popularity + YouTube.Views + YouTube.Likes + TikTok.Posts + 
    TikTok.Likes + TikTok.Views + Apple.Music.Playlist.Count + 
    AirPlay.Spins + Deezer.Playlist.Count + Deezer.Playlist.Reach + 
    Amazon.Playlist.Count + Pandora.Streams + Pandora.Track.Stations + 
    Explicit.Track + release_year

                             Df  Sum of Sq        RSS    AIC
<none>                                     3.3566e+20 178671
- TikTok.Posts                1 2.2155e+17 3.3588e+20 178672
- Amazon.Playlist.Count       1 2.3621e+17 3.3589e+20 178672
- TikTok.Views                1 3.6207e+17 3.3602e+20 178674
- TikTok.Likes                1 3.7275e+17 3.3603e+20 178674
- YouTube.Views               1 3.8254e+17 3.3604e+20 178674
- Explicit.Track              1 6.8607e+17 3.3634e+20 178678
- Deezer.Playlist.Reach       1 1.1029e+18 3.3676e+20 178684
- Pandora.Track.Stations      1 1.1298e+18 3.3678e+20 178684
- AirPlay.Spins               1 1.2314e+18 3.3689e+20 178686
- Spotify.Playlist.Reach      1 1.3829e+18 3.3704e+20 178688
- Deezer.Playlist.Count       1 1.7552e+18 3.3741e+20 178693
- Spotify.Popularity          1 1.8202e+18 3.3748e+20 178694
- All.Time.Rank               1 2.2437e+18 3.3790e+20 178699
- Apple.Music.Playlist.Count  1 7.3122e+18 3.4297e+20 178768
- Pandora.Streams             1 7.4529e+18 3.4311e+20 178770
- YouTube.Likes               1 9.2458e+18 3.4490e+20 178794
- release_year               30 1.5975e+19 3.5163e+20 178824
- Spotify.Playlist.Count      1 5.3576e+19 3.8923e+20 179350
stargazer(kitchen_sink, AIC_model, type="text", omit = "release_year")

==================================================================================
                                             Dependent variable:                  
                           -------------------------------------------------------
                                               Spotify.Streams                    
                                       (1)                         (2)            
----------------------------------------------------------------------------------
All.Time.Rank                    -21,718.560***              -21,843.580***       
                                   (3,981.553)                 (3,960.362)        
                                                                                  
Spotify.Playlist.Count            3,167.159***                3,160.753***        
                                    (117.708)                   (117.273)         
                                                                                  
Spotify.Playlist.Reach              1.166***                    1.175***          
                                     (0.272)                     (0.271)          
                                                                                  
Spotify.Popularity              1,565,632.000***            1,567,325.000***      
                                  (315,577.400)               (315,495.800)       
                                                                                  
YouTube.Views                       -0.028**                    -0.027**          
                                     (0.012)                     (0.012)          
                                                                                  
YouTube.Likes                       19.611***                   19.843***         
                                     (1.794)                     (1.772)          
                                                                                  
TikTok.Posts                         -4.267*                     -4.298*          
                                     (2.481)                     (2.480)          
                                                                                  
TikTok.Likes                         0.160**                     0.163**          
                                     (0.073)                     (0.072)          
                                                                                  
TikTok.Views                        -0.015**                    -0.015**          
                                     (0.007)                     (0.007)          
                                                                                  
YouTube.Playlist.Reach                0.008                                       
                                     (0.008)                                      
                                                                                  
Apple.Music.Playlist.Count      1,296,961.000***            1,289,090.000***      
                                  (130,675.200)               (129,465.400)       
                                                                                  
AirPlay.Spins                      168.069***                  181.562***         
                                    (46.613)                    (44.435)          
                                                                                  
Deezer.Playlist.Count            693,503.100***              695,002.700***       
                                  (142,930.600)               (142,466.900)       
                                                                                  
Deezer.Playlist.Reach               -6.455***                   -6.286***         
                                     (1.635)                     (1.626)          
                                                                                  
Amazon.Playlist.Count             -519,752.400*               -514,376.000*       
                                  (287,553.600)               (287,428.100)       
                                                                                  
Pandora.Streams                     0.446***                    0.446***          
                                     (0.044)                     (0.044)          
                                                                                  
Pandora.Track.Stations             -102.494***                 -103.751***        
                                    (26.553)                    (26.509)          
                                                                                  
Shazam.Counts                        -0.329                                       
                                     (0.835)                                      
                                                                                  
Explicit.Track                 -27,276,327.000***          -27,208,007.000***     
                                 (8,933,361.000)             (8,920,871.000)      
                                                                                  
Constant                          4,061,609.000               4,453,666.000       
                                (76,383,717.000)            (76,371,655.000)      
                                                                                  
----------------------------------------------------------------------------------
Observations                          4,599                       4,599           
R2                                    0.742                       0.742           
Adjusted R2                           0.739                       0.739           
Residual Std. Error        271,604,390.000 (df = 4549) 271,577,126.000 (df = 4551)
F Statistic                267.223*** (df = 49; 4549)  278.627*** (df = 47; 4551) 
==================================================================================
Note:                                                  *p<0.1; **p<0.05; ***p<0.01

Predictions

reg1 <- 
lm(formula = df_clean$Spotify.Streams ~ df_clean$TikTok.Views)


round(sum(reg1$residuals), 15)
[1] 0.0044052
sum(reg1$residuals^2)
[1] 1.299444e+21
sum(reg1$residuals)^2
[1] 1.940579e-05
ggplot2::ggplot(mapping = aes(x = df_clean$Spotify.Streams, y = reg1$fitted.values)) + geom_point()+geom_line()

summary(reg1)

Call:
lm(formula = df_clean$Spotify.Streams ~ df_clean$TikTok.Views)

Residuals:
       Min         1Q     Median         3Q        Max 
-1.369e+09 -3.662e+08 -1.992e+08  1.682e+08  3.829e+09 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)    
(Intercept)           4.378e+08  7.973e+06  54.915  < 2e-16 ***
df_clean$TikTok.Views 4.042e-03  1.499e-03   2.696  0.00704 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 531700000 on 4597 degrees of freedom
Multiple R-squared:  0.001579,  Adjusted R-squared:  0.001362 
F-statistic:  7.27 on 1 and 4597 DF,  p-value: 0.007037
var(x = df_clean$TikTok.Views)
[1] 2.735463e+19
cov(y = df_clean$Spotify.Streams, x = df_clean$TikTok.Views)
[1] 1.105716e+17
beta1 <- cov(y = df_clean$Spotify.Streams, x = df_clean$TikTok.Views) / var(x = df_clean$TikTok.Views) 
beta1
[1] 0.004042152
x_bar <- mean(x = df_clean$TikTok.Views)
y_bar <- mean(x = df_clean$Spotify.Streams)

beta0 <- y_bar - beta1 * x_bar
beta0
[1] 437844887
library(stargazer)
?stargazer

stargazer(reg1, type = "text")

===============================================
                        Dependent variable:    
                    ---------------------------
                          Spotify.Streams      
-----------------------------------------------
TikTok.Views                 0.004***          
                              (0.001)          
                                               
Constant                437,844,887.000***     
                          (7,973,128.000)      
                                               
-----------------------------------------------
Observations                   4,599           
R2                             0.002           
Adjusted R2                    0.001           
Residual Std. Error 531,669,245.000 (df = 4597)
F Statistic           7.270*** (df = 1; 4597)  
===============================================
Note:               *p<0.1; **p<0.05; ***p<0.01
plot(reg1)

reg2 <- 
lm(formula = df_clean$Spotify.Streams ~ sqrt(df_clean$TikTok.Views))

plot(reg2)

df_clean$predictions <- predict(object = reg1, newdata = df_clean)

summary(df_clean$predictions)
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
4.378e+08 4.382e+08 4.389e+08 4.418e+08 4.404e+08 1.381e+09 
df_clean$predicted_Spotify.Streams <- ifelse(test = df_clean$predictions > .5, yes = 1, no = 0)

head(table(df_clean$Spotify.Streams, df_clean$predicted_Spotify.Streams))
      
       1
  1071 1
  1186 1
  1224 1
  1332 1
  1384 1
  1537 1