# 1. Loading the data set ----
# readr supplies read_csv(); dplyr supplies the %>% pipe and the verbs
# (filter, arrange, count, mutate, ...) used by the rest of this script.
library(readr)
library(dplyr)

# NOTE(review): absolute, machine-specific path -- adjust when running
# on another machine.
data <- read_csv("C:/Users/ACER/Desktop/Most Streamed Spotify Songs 2024.csv")
# 2. Exploratory data analysis ----
# 1. Understanding the data set

# Drop columns that consist entirely of missing values.
data <- data[, colSums(is.na(data)) < nrow(data)]

# Replace the missing entries of every numeric column with that column's mean.
# vapply() guarantees a logical vector even when `data` has zero columns.
numeric_columns <- names(data)[vapply(data, is.numeric, logical(1))]
for (col in numeric_columns) {
  data[[col]][is.na(data[[col]])] <- mean(data[[col]], na.rm = TRUE)
}

# Only non-numeric columns can still contain NA at this point.
missing_values <- colSums(is.na(data))
print("Missing values per column after handling:")
print(missing_values)

# a. Column names are very important as they provide context and meaning to
#    the data contained within each column.
print(colnames(data))

# b. Are there any missing or duplicate values in the dataset?
# Check for missing values
missing_values <- colSums(is.na(data))
print(missing_values)

# Count rows that are exact duplicates of an earlier row.
duplicate_rows <- sum(duplicated(data))
print(paste("Number of duplicate rows:", duplicate_rows))
# c. Convert the date column to Date format (assuming the column is named
#    'date') and report the time range it covers. When no such column exists
#    a message is printed instead.
if ("date" %in% colnames(data)) {
  data$date <- as.Date(data$date, format = "%Y-%m-%d")

  # Find the time range
  time_range <- range(data$date, na.rm = TRUE)
  print(time_range)
} else {
  print("No ‘date’ column found in the dataset.")
}
# e. Display data types and structure of the dataset
print("Dataset structure:")
str(data)

# Identify and apply necessary data transformations
print("Applying necessary transformations…")

# Convert the listed columns to factors. Names are matched exactly
# (case-sensitive); a name that is not a column of `data` is skipped silently.
# NOTE(review): later sections refer to columns such as `Artist` and
# `Spotify Streams` -- confirm these lower-case names match the dataset.
categorical_columns <- c("artist", "genre", "region", "label")
for (col in categorical_columns) {
  if (col %in% colnames(data)) {
    data[[col]] <- as.factor(data[[col]])
    print(paste("Converted", col, "to factor (categorical)."))
  }
}

# Convert the listed columns to numeric, with the same exact-name matching.
numeric_columns <- c(
  "streams", "daily_streams", "social_mentions", "artist_popularity"
)
for (col in numeric_columns) {
  if (col %in% colnames(data)) {
    # Remove commas (thousands separators) before conversion
    data[[col]] <- as.numeric(gsub(",", "", data[[col]]))
    print(paste("Converted", col, "to numeric."))
  }
}

print("Updated dataset structure:")
str(data)
# (a) How many songs have more than 100 million streams?
# Filter songs with more than 100 million streams. The column name contains a
# space, so it must be wrapped in backticks.
high_stream_songs <- data %>%
  filter(`Spotify Streams` > 100000000)

num_high_stream_songs <- nrow(high_stream_songs)

print(paste(
  "Number of songs with more than 100 million streams:",
  num_high_stream_songs
))
# (b) Which artist has the most songs in the top 100 streamed songs?
# Select the top 100 streamed songs
top_100_songs <- data %>%
  arrange(desc(`Spotify Streams`)) %>%
  head(100)

# Number of songs per artist within those 100 rows, most frequent first.
artist_song_count <- top_100_songs %>%
  count(Artist, sort = TRUE)

# The first row is the artist with the highest count.
top_artist <- artist_song_count %>%
  slice(1)

print(top_artist)
# (c) What percentage of songs belong to the top 5 record labels?
# Count the number of songs per record label. The column must be referenced
# with backticks: a quoted string would be counted as a single constant value,
# giving one group that holds every row (and therefore always 100%).
# NOTE(review): confirm the dataset actually has a `Record Label` column.
label_song_count <- data %>%
  count(`Record Label`, sort = TRUE)

# The five labels with the most songs.
top_5_labels <- label_song_count %>%
  head(5)

total_songs <- nrow(data)
top_5_percentage <- sum(top_5_labels$n) / total_songs * 100

print(paste(
  "Percentage of songs from top 5 record labels:",
  round(top_5_percentage, 2),
  "%"
))
# (d) How do Spotify Streams compare to YouTube Views? (Are songs with high
#     Spotify streams also popular on YouTube?)
# Correlation over the rows where both values are present.
correlation <- cor(
  data$`Spotify Streams`,
  data$`YouTube Views`,
  use = "complete.obs"
)

print(paste(
  "Correlation between Spotify Streams and YouTube Views:",
  round(correlation, 2)
))
# 3. Grouping & Summarisation ----
# (a) What is the average number of streams per song?
# Calculate the average number of streams per song, ignoring missing values.
avg_streams <- mean(data$`Spotify Streams`, na.rm = TRUE)

print(paste("Average number of streams per song:", round(avg_streams, 2)))
# (b) Which platform contributes the most to the overall track popularity?
# Sum engagement by platform. The order of the totals must match the order of
# the platform labels.
platform_totals <- data.frame(
  Platform = c("Spotify", "YouTube", "TikTok", "Soundcloud", "Pandora"),
  Total_Engagement = c(
    sum(data$`Spotify Streams`, na.rm = TRUE),
    sum(data$`YouTube Views`, na.rm = TRUE),
    sum(data$`TikTok Views`, na.rm = TRUE),
    sum(data$`Soundcloud Streams`, na.rm = TRUE),
    sum(data$`Pandora Streams`, na.rm = TRUE)
  )
)

# Largest total first.
platform_totals <- platform_totals %>%
  arrange(desc(Total_Engagement))
print(platform_totals)
# (c) Which artist has the highest combined social media reach
#     (YouTube + TikTok + Spotify)?
# Calculate total social media reach per song. A song with NA in any of the
# three inputs gets NA here and is skipped by the na.rm sum below.
data <- data %>%
  mutate(
    Social_Reach = `YouTube Views` + `TikTok Views` + `Spotify Playlist Reach`
  )

# Total reach per artist, highest first.
artist_reach <- data %>%
  group_by(Artist) %>%
  summarise(Total_Reach = sum(Social_Reach, na.rm = TRUE)) %>%
  arrange(desc(Total_Reach))

# Print the table built above (the original printed an undefined
# `artist_streams` object).
print(artist_reach)
# 4. Sorting and Ranking Data ----
# (a) Which songs stayed in the top 10 "All Time Rank" the longest?
# Count occurrences of songs in the top 10 of "All Time Rank": keep rows
# ranked 10 or better, then count how many such rows each track has.
top_10_songs <- data %>%
  filter(`All Time Rank` <= 10) %>%
  group_by(Track) %>%
  summarise(Top10_Appearances = n()) %>%
  arrange(desc(Top10_Appearances))

print(top_10_songs)
# (b) What is the correlation between YouTube Views and Spotify Streams?
# Calculate correlation between YouTube Views and Spotify Streams over the
# rows where both values are present.
correlation <- cor(
  data$`YouTube Views`,
  data$`Spotify Streams`,
  use = "complete.obs"
)

print(paste(
  "Correlation between YouTube Views and Spotify Streams:",
  round(correlation, 2)
))
# (c) Which song had the most consistent ranking across multiple platforms?
# Select ranking-related columns (adjust column names based on dataset)
ranking_columns <- c(
  "Spotify Popularity", "YouTube Views", "TikTok Likes",
  "Apple Music Playlist Count"
)

# Per-row variance across the selected columns; the smallest variance is
# treated as the most consistent. ungroup() drops the rowwise grouping once
# the per-row value has been computed.
# NOTE(review): the columns are compared on their raw values -- confirm they
# are on comparable scales.
ranking_variability <- data %>%
  rowwise() %>%
  mutate(
    Rank_Variance = var(c_across(all_of(ranking_columns)), na.rm = TRUE)
  ) %>%
  ungroup() %>%
  select(Track, Rank_Variance) %>%
  arrange(Rank_Variance)

print(ranking_variability %>% head(10))
# (d) Which songs were trending on both TikTok and Spotify simultaneously?
# Define the threshold for top 10% in both TikTok and Spotify
# (the 90th percentile of each column).
spotify_threshold <- quantile(data$`Spotify Streams`, 0.90, na.rm = TRUE)
tiktok_threshold <- quantile(data$`TikTok Posts`, 0.90, na.rm = TRUE)

# Keep songs that reach both thresholds.
trending_songs <- data %>%
  filter(
    `Spotify Streams` >= spotify_threshold &
      `TikTok Posts` >= tiktok_threshold
  ) %>%
  select(Track, Artist, `Spotify Streams`, `TikTok Posts`)

print(trending_songs)
# Composite popularity score: the sum of the standardised (z-scored) values of
# six popularity metrics. scale() returns a one-column matrix, so the sum is
# flattened with as.numeric() to store an ordinary numeric column.
data <- data %>%
  mutate(
    Composite_Popularity_Score = as.numeric(
      scale(`Spotify Streams`) +
        scale(`YouTube Views`) +
        scale(`TikTok Views`) +
        scale(`Spotify Playlist Count`) +
        scale(`Spotify Popularity`) +
        scale(`Shazam Counts`)
    )
  )

print("Summary of Composite Popularity Score:")
print(summary(data$Composite_Popularity_Score))
# Engagement ratios. Each ratio is NA when its denominator is not positive,
# which avoids dividing by zero.
data <- data %>%
  mutate(
    Views_per_Playlist = ifelse(
      `YouTube Playlist Reach` > 0,
      `YouTube Views` / `YouTube Playlist Reach`,
      NA
    ),
    TikTok_Engagement_Rate = ifelse(
      `TikTok Posts` > 0,
      `TikTok Likes` / `TikTok Posts`,
      NA
    ),
    Spotify_Reach_per_Stream = ifelse(
      `Spotify Streams` > 0,
      `Spotify Playlist Reach` / `Spotify Streams`,
      NA
    )
  )

print("First 5 rows of Engagement Ratios:")
print(
  data %>%
    select(
      Views_per_Playlist, TikTok_Engagement_Rate, Spotify_Reach_per_Stream
    ) %>%
    head(5)
)
# Parse the release date and compute the number of days elapsed between it
# and today's date.
# NOTE(review): assumes `Release Date` is stored as YYYY-MM-DD; values in any
# other format become NA -- confirm against the CSV.
data <- data %>%
  mutate(
    Release_Date_Formatted = as.Date(`Release Date`, format = "%Y-%m-%d"),
    Days_Since_Release = as.numeric(Sys.Date() - Release_Date_Formatted)
  )

print("Summary of Days Since Release:")
print(summary(data$Days_Since_Release))
# Platform count: for each song, the number of the seven listed platform
# metrics that are not missing. `.` is the data frame supplied by %>%.
# NOTE(review): if these columns had their NAs replaced by the column mean
# earlier in the script, every row will count all seven -- confirm intent.
data <- data %>%
  mutate(
    Platform_Count = rowSums(!is.na(select(
      .,
      `Spotify Streams`,
      `YouTube Views`,
      `TikTok Views`,
      `Apple Music Playlist Count`,
      `Pandora Streams`,
      `Deezer Playlist Count`,
      `Soundcloud Streams`
    )))
  )

print("Distribution of Platform Count:")
print(table(data$Platform_Count))

print("Updated structure of dataset:")
str(data)

# View() opens a data viewer; only attempt it in an interactive session.
if (interactive()) {
  View(data)
}