# Fix the random seed so any later random steps are reproducible.
set.seed(seed = 123)

# Read the track table from the working directory and list its column names.
dataset <- read.csv(file = "spotify-2023.csv")
colnames(dataset)
##  [1] "track_name"           "artist.s._name"       "artist_count"        
##  [4] "released_year"        "released_month"       "released_day"        
##  [7] "in_spotify_playlists" "in_spotify_charts"    "streams"             
## [10] "in_apple_playlists"   "in_apple_charts"      "in_deezer_playlists" 
## [13] "in_deezer_charts"     "in_shazam_charts"     "bpm"                 
## [16] "key"                  "mode"                 "danceability_."      
## [19] "valence_."            "energy_."             "acousticness_."      
## [22] "instrumentalness_."   "liveness_."           "speechiness_."
# Convert a count column that read.csv() left as text into integers.
#
# Calling as.integer() directly on these columns raised "NAs introduced by
# coercion", i.e. some entries were not plain digit strings -- presumably
# thousands separators such as "1,021" (NOTE(review): confirm against the CSV).
# Removing the commas first keeps those large counts instead of silently
# turning them into NA. Blank entries still become NA.
#
# x: character (or numeric) vector of counts.
# Returns an integer vector of the same length as x.
parse_count <- function(x) {
  as.integer(gsub(",", "", as.character(x), fixed = TRUE))
}

dataset$in_deezer_playlists <- parse_count(dataset$in_deezer_playlists)
print(typeof(dataset$in_deezer_playlists))
## [1] "integer"
dataset$in_shazam_charts <- parse_count(dataset$in_shazam_charts)
print(typeof(dataset$in_shazam_charts))
## [1] "integer"

# Total playlist inclusions: sum of the three *_playlists columns only.
# in_shazam_charts is a chart column (it is already part of
# average_chart_position below), so it is no longer added to this total.
dataset$total_playlist_inclusions <- dataset$in_spotify_playlists +
  dataset$in_apple_playlists +
  dataset$in_deezer_playlists

# Mean of the four per-platform chart columns; the result is NA for a row
# whenever any one of the four values is NA.
dataset$average_chart_position <- (
  dataset$in_spotify_charts +
    dataset$in_apple_charts +
    dataset$in_deezer_charts +
    dataset$in_shazam_charts
) / 4

# NOTE: outputs recorded further down in this file (NA counts, correlation
# coefficients) were produced before this fix; re-run to refresh them.
plot(
  dataset$total_playlist_inclusions,
  dataset$streams,
  xlab = "Total Playlist Inclusions",
  ylab = "Streams",
  main = "Relationship between Total Playlist Inclusions and Streams"
)

Points to Consider:

# Scatter plot of streams against the averaged per-platform chart column.
plot(
  x = dataset$average_chart_position,
  y = dataset$streams,
  main = "Relationship between Average Chart Position and Streams",
  xlab = "Average Chart Position",
  ylab = "Streams"
)

Points to Consider:

# Count missing values in the two derived predictors and in the response.
print(sum(is.na(dataset$total_playlist_inclusions)))
## [1] 123
print(sum(is.na(dataset$average_chart_position)))
## [1] 57
print(sum(is.na(dataset$streams)))
## [1] 0

# Calculating correlation coefficients (Pearson, cor()'s default method).
#
# Missing predictor values used to be overwritten with 0 before this step.
# A missing value is not a measured zero, so zero-filling distorts the
# coefficients. Rows with a missing value are now excluded from each
# correlation instead, and the columns keep their NAs.
# NOTE(review): any later step that cannot handle NA should filter rows
# explicitly (e.g. with complete.cases()) rather than rely on zero-filling.
cor1 <- cor(
  dataset$total_playlist_inclusions,
  dataset$streams,
  use = "complete.obs"
)
cor2 <- cor(
  dataset$average_chart_position,
  dataset$streams,
  use = "complete.obs"
)

# Printing correlation coefficients
cat("Correlation coefficient between Total Playlist Inclusions and Streams:", cor1, "\n")
cat("Correlation coefficient between Average Chart Position and Streams:", cor2, "\n")
# NOTE: the values recorded before this change (0.2636325 and 0.08899804)
# were computed on the zero-filled columns; re-run to obtain current values.

1. Total Playlist Inclusions vs. Streams (correlation coefficient: 0.2636)

2. Average Chart Position vs. Streams (correlation coefficient: 0.0890)

Why the Values Make Sense:

Overall, the correlation coefficients confirm what we observed visually in the graphs. There is a weak positive linear association between playlist inclusions and streams, while average chart position has a negligible linear relationship with streams on Spotify in this data. Note that a correlation by itself does not show that playlist inclusions cause higher stream counts.

# Confidence interval for the response variable ('streams'):
# the 95% interval for its mean, taken from a one-sample t-test.
confidence_interval <- t.test(x = dataset$streams, conf.level = 0.95)[["conf.int"]]
cat("Confidence Interval for Streams:", confidence_interval, "\n")
## Confidence Interval for Streams: 477566048 549629814

Conclusion:

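We can be 95% confident that the mean number of streams for the population of songs this dataset represents lies within this range; that is, intervals constructed this way would capture the true mean in about 95% of repeated samples. In other words, the average stream count is estimated to be between approximately 478 million and 550 million. This is a statement about the mean, not about how many streams any individual or "typical" song received.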

Important Considerations:

Overall, the confidence interval provides a valuable estimate of the average number of streams for songs on Spotify in 2023, along with a measure of uncertainty associated with that estimate.