# 1. Loading the data set ----
# readr supplies read_csv(); dplyr supplies %>% and the data verbs (filter,
# arrange, count, mutate, ...) used throughout the rest of this script.
library(readr)
library(dplyr)

# The assignment was previously fused into the heading comment, so `data` was
# never created.
data <- read_csv("C:/Users/ACER/Desktop/Most Streamed Spotify Songs 2024.csv")

#2.Exploratory data analysis

#1.Understanding the data set

# Remove columns that contain only NA values.
# drop = FALSE keeps a data frame even if a single column survives.
all_na <- colSums(is.na(data)) == nrow(data)
data <- data[, !all_na, drop = FALSE]

# Fill NA values in numeric columns with the mean of the observed values.
# vapply() guarantees a logical vector, whereas sapply() changes its return
# type on empty input.
numeric_columns <- names(data)[vapply(data, is.numeric, logical(1))]

for (col in numeric_columns) {
  data[[col]][is.na(data[[col]])] <- mean(data[[col]], na.rm = TRUE)
}

# Check whether missing values have been handled.
missing_values <- colSums(is.na(data))
print("Missing values per column after handling:")
print(missing_values)

# a. Column names are very important as they provide context and meaning to
# the data contained within each column.
print(colnames(data))

# b. Are there any missing or duplicate values in the dataset?

# Check for missing values: one NA count per column.
missing_values <- colSums(is.na(data))
print(missing_values)

# Check for duplicate rows: duplicated() flags every repeat of an earlier row.
duplicate_rows <- sum(duplicated(data))
print(paste("Number of duplicate rows:", duplicate_rows))

# c. Convert the date column to Date format (assuming the column is named
# 'date') and report the time range it covers.
if ("date" %in% colnames(data)) {
  data$date <- as.Date(data$date, format = "%Y-%m-%d")

  # Find the time range (earliest and latest non-missing date).
  time_range <- range(data$date, na.rm = TRUE)
  print(time_range)
} else {
  print("No 'date' column found in the dataset.")
}

# e. Display data types and structure of the dataset.
print("Dataset structure:")
str(data)

# Identify and apply necessary data transformations.
print("Applying necessary transformations...")

# Convert character columns that should be categorical (factor).
# Columns that are absent from the dataset are skipped silently.
# NOTE(review): these names are lower-case, while later sections reference
# columns such as `Artist`; confirm the names match the dataset, otherwise
# nothing is converted.
categorical_columns <- c("artist", "genre", "region", "label")
for (col in categorical_columns) {
  if (col %in% colnames(data)) {
    data[[col]] <- as.factor(data[[col]])
    print(paste("Converted", col, "to factor (categorical)."))
  }
}

# Convert numeric columns stored as characters to numeric.
numeric_columns <- c(
  "streams", "daily_streams", "social_mentions", "artist_popularity"
)
for (col in numeric_columns) {
  if (col %in% colnames(data)) {
    # Remove commas (thousands separators) before conversion.
    data[[col]] <- as.numeric(gsub(",", "", data[[col]]))
    print(paste("Converted", col, "to numeric."))
  }
}

# Final check on dataset structure after transformations.
print("Updated dataset structure:")
str(data)

# 2. Data Extraction & Filtering ----

# (a) How many songs have more than 100 million streams?

# Filter songs with more than 100 million streams. The column name contains a
# space, so it must be wrapped in backticks.
high_stream_songs <- data %>%
  filter(`Spotify Streams` > 100000000)

# Count the number of such songs.
num_high_stream_songs <- nrow(high_stream_songs)

# Print the result.
print(paste(
  "Number of songs with more than 100 million streams:",
  num_high_stream_songs
))

# (b) Which artist has the most songs in the top 100 streamed songs?

# Select the top 100 streamed songs (highest `Spotify Streams` first).
top_100_songs <- data %>%
  arrange(desc(`Spotify Streams`)) %>%
  head(100)

# Count the number of songs per artist, most frequent first.
artist_song_count <- top_100_songs %>%
  count(Artist, sort = TRUE)

# Print the artist with the most songs (first row after sorting).
top_artist <- artist_song_count %>%
  slice(1)

print(top_artist)

# (c) What percentage of songs belong to the top 5 record labels?

# Count the number of songs per record label. The column is referenced with
# backticks: a quoted string would be counted as a single constant value and
# always yield one row covering 100% of the songs.
label_song_count <- data %>%
  count(`Record Label`, sort = TRUE)

# Get the top 5 record labels.
top_5_labels <- label_song_count %>%
  head(5)

# Calculate the percentage of songs from these labels.
total_songs <- nrow(data)
top_5_percentage <- sum(top_5_labels$n) / total_songs * 100

# Print the result.
print(paste(
  "Percentage of songs from top 5 record labels:",
  round(top_5_percentage, 2),
  "%"
))

# (d) How do Spotify Streams compare to YouTube Views? (Are songs with high
# Spotify streams also popular on YouTube?)

# Compute correlation between Spotify Streams and YouTube Views.
# use = "complete.obs" drops rows where either value is missing.
correlation <- cor(
  data$`Spotify Streams`,
  data$`YouTube Views`,
  use = "complete.obs"
)
print(paste(
  "Correlation between Spotify Streams and YouTube Views:",
  round(correlation, 2)
))

# 3. Grouping & Summarisation ----

# (a) What is the average number of streams per song?

# Calculate the average number of streams per song, ignoring missing values.
avg_streams <- mean(data$`Spotify Streams`, na.rm = TRUE)

# Print the result.
print(paste("Average number of streams per song:", round(avg_streams, 2)))

# (b) Which platform contributes the most to the overall track popularity?

# Sum engagement by platform. Each total ignores missing values; the order of
# the totals matches the order of the Platform labels.
platform_totals <- data.frame(
  Platform = c("Spotify", "YouTube", "TikTok", "Soundcloud", "Pandora"),
  Total_Engagement = c(
    sum(data$`Spotify Streams`, na.rm = TRUE),
    sum(data$`YouTube Views`, na.rm = TRUE),
    sum(data$`TikTok Views`, na.rm = TRUE),
    sum(data$`Soundcloud Streams`, na.rm = TRUE),
    sum(data$`Pandora Streams`, na.rm = TRUE)
  )
)

# Sort by total engagement, largest first.
platform_totals <- platform_totals %>%
  arrange(desc(Total_Engagement))
print(platform_totals)

# (c) Which artist has the highest combined social media reach
# (YouTube + TikTok + Spotify)?

# Calculate total social media reach per song. A song with NA in any of the
# three inputs gets NA here and is dropped from the per-artist sum below.
data <- data %>%
  mutate(
    Social_Reach = `YouTube Views` + `TikTok Views` + `Spotify Playlist Reach`
  )

# Group by artist and sum their total reach, largest first.
artist_reach <- data %>%
  group_by(Artist) %>%
  summarise(Total_Reach = sum(Social_Reach, na.rm = TRUE)) %>%
  arrange(desc(Total_Reach))

# Print the result. The original printed `artist_streams`, which is never
# defined; the table built above is `artist_reach`.
print(artist_reach)

#4. Sorting and Ranking Data

# (a) Which songs stayed in the top 10 "All Time Rank" the longest?

# Count occurrences of songs in the top 10 of "All Time Rank": keep rows
# ranked 10 or better, then count rows per track.
top_10_songs <- data %>%
  filter(`All Time Rank` <= 10) %>%
  group_by(Track) %>%
  summarise(Top10_Appearances = n()) %>%
  arrange(desc(Top10_Appearances))

# Print the result.
print(top_10_songs)

# (b) What is the correlation between YouTube Views and Spotify Streams?

# Calculate correlation between YouTube Views and Spotify Streams, using only
# rows where both values are present.
correlation <- cor(
  data$`YouTube Views`,
  data$`Spotify Streams`,
  use = "complete.obs"
)

# Print correlation result.
print(paste(
  "Correlation between YouTube Views and Spotify Streams:",
  round(correlation, 2)
))

# (c) Which song had the most consistent ranking across multiple platforms?

# Select ranking-related columns (adjust column names based on dataset).
# NOTE(review): these look like raw metrics on different scales rather than
# ranks; confirm that an unscaled variance across them is what is wanted.
ranking_columns <- c(
  "Spotify Popularity",
  "YouTube Views",
  "TikTok Likes",
  "Apple Music Playlist Count"
)

# Calculate variance in rankings for each song. rowwise() makes var() run
# once per row; ungroup() then removes the row-wise grouping so the result
# behaves like an ordinary data frame downstream.
ranking_variability <- data %>%
  rowwise() %>%
  mutate(
    Rank_Variance = var(c_across(all_of(ranking_columns)), na.rm = TRUE)
  ) %>%
  ungroup() %>%
  select(Track, Rank_Variance) %>%
  arrange(Rank_Variance)

# Print the songs with the most consistent ranking (lowest variance).
print(ranking_variability %>% head(10))

# (d) Which songs were trending on both TikTok and Spotify simultaneously?

# Define the threshold for the top 10% on both platforms (90th percentile).
spotify_threshold <- quantile(data$`Spotify Streams`, 0.90, na.rm = TRUE)
tiktok_threshold <- quantile(data$`TikTok Posts`, 0.90, na.rm = TRUE)

# Filter songs that meet both conditions.
trending_songs <- data %>%
  filter(
    `Spotify Streams` >= spotify_threshold &
      `TikTok Posts` >= tiktok_threshold
  ) %>%
  select(Track, Artist, `Spotify Streams`, `TikTok Posts`)

# Print the result.
print(trending_songs)

# 5. Feature Engineering ----

# (a) Composite Popularity Score (standardized sum of key metrics).
# scale() returns a one-column matrix; as.numeric() flattens each z-score to
# a plain vector so the new column is an ordinary numeric column rather than
# a matrix embedded in the data frame.
data <- data %>%
  mutate(
    Composite_Popularity_Score =
      as.numeric(scale(`Spotify Streams`)) +
      as.numeric(scale(`YouTube Views`)) +
      as.numeric(scale(`TikTok Views`)) +
      as.numeric(scale(`Spotify Playlist Count`)) +
      as.numeric(scale(`Spotify Popularity`)) +
      as.numeric(scale(`Shazam Counts`))
  )

# Print summary of new composite score.
print("Summary of Composite Popularity Score:")
print(summary(data$Composite_Popularity_Score))

# (b) Engagement Ratios.
# Each ratio is NA when its denominator is zero or negative, which avoids
# division by zero.
data <- data %>%
  mutate(
    Views_per_Playlist = ifelse(
      `YouTube Playlist Reach` > 0,
      `YouTube Views` / `YouTube Playlist Reach`,
      NA
    ),
    TikTok_Engagement_Rate = ifelse(
      `TikTok Posts` > 0,
      `TikTok Likes` / `TikTok Posts`,
      NA
    ),
    Spotify_Reach_per_Stream = ifelse(
      `Spotify Streams` > 0,
      `Spotify Playlist Reach` / `Spotify Streams`,
      NA
    )
  )

# Print example rows for engagement ratios.
print("First 5 rows of Engagement Ratios:")
print(
  data %>%
    select(
      Views_per_Playlist,
      TikTok_Engagement_Rate,
      Spotify_Reach_per_Stream
    ) %>%
    head(5)
)

# (c) Days Since Release.
# NOTE(review): the parse assumes `Release Date` is in YYYY-MM-DD form; values
# in any other format become NA - confirm against the CSV.
data <- data %>%
  mutate(
    Release_Date_Formatted = as.Date(`Release Date`, format = "%Y-%m-%d"),
    # Difference between today and the release date, in whole days.
    Days_Since_Release = as.numeric(Sys.Date() - Release_Date_Formatted)
  )

# Print summary of days since release.
print("Summary of Days Since Release:")
print(summary(data$Days_Since_Release))

# (d) Platform Count (non-NA count across key platforms).
# `.` is the magrittr placeholder for the piped data frame; rowSums() counts,
# per song, how many of the selected platform columns are not NA.
data <- data %>%
  mutate(
    Platform_Count = rowSums(!is.na(select(
      .,
      `Spotify Streams`,
      `YouTube Views`,
      `TikTok Views`,
      `Apple Music Playlist Count`,
      `Pandora Streams`,
      `Deezer Playlist Count`,
      `Soundcloud Streams`
    )))
  )

# Print frequency of Platform Count.
print("Distribution of Platform Count:")
print(table(data$Platform_Count))

# Final structure of the dataset.
print("Updated structure of dataset:")
str(data)

# View() opens a viewer and is only meaningful in an interactive session;
# guard it so that running the script non-interactively does not fail.
if (interactive()) {
  View(data)
}

Spotify Data Analysis

Armaan

Remove columns that contain all NA values

Fill NA values in numeric columns with their respective mean values

Check if missing values have been handled

Check for duplicate rows

Convert character columns that should be categorical (factor)

Convert numeric columns stored as characters to numeric

Final check on dataset structure after transformations

2. Data Extraction & Filtering

Count the number of such songs

Print the result

Count the number of songs per artist

Print the artist with the most songs

Get the top 5 record labels

Calculate the percentage of songs from these labels

Print the result

Compute correlation between Spotify Streams and YouTube Views

Print the result

Sort by total engagement

Group by artist and sum their total reach

Print the result

Print the result

Print correlation result

Calculate variance in rankings for each song

Print the song with the most consistent ranking (lowest variance)

Filter songs that meet both conditions

Print the result

5. Feature Engineering

(a). Composite Popularity Score (standardized sum of key metrics)

Print summary of new composite score

(b). Engagement Ratios

Print example rows for engagement ratios

(c). Days Since Release

Print summary of days since release

(d). Platform Count (non-NA count across key platforms)

Print frequency of Platform Count

Final structure of the dataset

Spotify Data Analysis

Armaan

Remove columns that contain all NA values

Fill NA values in numeric columns with their respective mean values

Check if missing values have been handled

Check for duplicate rows

Convert character columns that should be categorical (factor)

Convert numeric columns stored as characters to numeric

Final check on dataset structure after transformations

2. Data Extraction & Filtering

Count the number of such songs

Print the result

Count the number of songs per artist

Print the artist with the most songs

Get the top 5 record labels

Calculate the percentage of songs from these labels

Print the result

Compute correlation between Spotify Streams and YouTube Views

Print the result

Sort by total engagement

Group by artist and sum their total reach

Display the artist with the highest total social reach

Print the result

Print the result

Print correlation result

Calculate variance in rankings for each song

Print the song with the most consistent ranking (lowest variance)

Filter songs that meet both conditions

Print the result

5. Feature Engineering

(a). Composite Popularity Score (standardized sum of key metrics)

Print summary of new composite score

(b). Engagement Ratios

Print example rows for engagement ratios

(c). Days Since Release

Print summary of days since release

(d). Platform Count (non-NA count across key platforms)

Print frequency of Platform Count

Final structure of the dataset