# 1. Loading the data set ----
# The script calls read_csv() (readr) and uses %>% with dplyr verbs, so both
# packages are attached here before anything else runs.
library(readr)
library(dplyr)

# NOTE(review): absolute, machine-specific path -- adjust when running on
# another computer.
data <- read_csv("C:/Users/ACER/Desktop/Most Streamed Spotify Songs 2024.csv")

#2.Exploratory data analysis

#1.Understanding the data set

# Remove columns that contain all NA values.
# A column is kept only when its NA count is smaller than the number of rows.
keep_column <- colSums(is.na(data)) < nrow(data)
# drop = FALSE keeps a data frame even if a single column survives.
data <- data[, keep_column, drop = FALSE]

# Fill NA values in numeric columns with their respective mean values.
# vapply() always yields a logical vector, even for a data set with no columns.
numeric_columns <- names(data)[vapply(data, is.numeric, logical(1))]

for (col in numeric_columns) {
  is_missing <- is.na(data[[col]])
  # The mean is computed over the observed (non-NA) values only.
  data[[col]][is_missing] <- mean(data[[col]], na.rm = TRUE)
}

# Check if missing values have been handled.
missing_values <- colSums(is.na(data))
print("Missing values per column after handling:")
print(missing_values)

# a. Column names are very important as they provide context and meaning to
# the data contained within each column.
print(colnames(data))

# b. Are there any missing or duplicate values in the dataset?

# Check for missing values (NA count per column).
missing_values <- colSums(is.na(data))
print(missing_values)

# Check for duplicate rows: duplicated() flags every repeat of an earlier row.
duplicate_rows <- sum(duplicated(data))
print(paste("Number of duplicate rows:", duplicate_rows))

# c. Convert the date column to Date format (assuming the column is named
# 'date') and report the time range it covers.
if ("date" %in% colnames(data)) {
  data$date <- as.Date(data$date, format = "%Y-%m-%d")

  # Find the time range (earliest and latest non-missing date).
  time_range <- range(data$date, na.rm = TRUE)
  print(time_range)
} else {
  print("No 'date' column found in the dataset.")
}

# e. Display data types and structure of the dataset.
print("Dataset structure:")
str(data)

# Identify and apply necessary data transformations.
print("Applying necessary transformations...")

# Convert character columns that should be categorical (factor).
# Only the candidate names that actually exist in the data are converted.
categorical_columns <- c("artist", "genre", "region", "label")
for (col in intersect(categorical_columns, colnames(data))) {
  data[[col]] <- as.factor(data[[col]])
  print(paste("Converted", col, "to factor (categorical)."))
}

# Convert numeric columns stored as characters to numeric.
# Only the candidate names that actually exist in the data are converted.
numeric_columns <- c(
  "streams", "daily_streams", "social_mentions", "artist_popularity"
)
for (col in intersect(numeric_columns, colnames(data))) {
  # Remove commas (thousands separators) before conversion.
  data[[col]] <- as.numeric(gsub(",", "", data[[col]], fixed = TRUE))
  print(paste("Converted", col, "to numeric."))
}

# Final check on dataset structure after transformations.
print("Updated dataset structure:")
str(data)

# 2. Data Extraction & Filtering ----

# (a) How many songs have more than 100 million streams?
# Filter songs with more than 100 million streams. The column name contains a
# space, so it must be wrapped in backticks.
high_stream_songs <- data %>%
  filter(`Spotify Streams` > 100000000)

# Count the number of such songs.
num_high_stream_songs <- nrow(high_stream_songs)

# Count the number of songs per artist, most frequent artist first.
# NOTE(review): `top_100_songs` is not created anywhere in the visible part of
# this script -- confirm it is defined before this statement runs.
artist_song_count <- top_100_songs %>%
  count(Artist, sort = TRUE)

# Get the top 5 record labels (the first five rows of the label counts).
# NOTE(review): `label_song_count` is not created anywhere in the visible part
# of this script; the code below reads its `n` column and presumably expects
# it to be sorted in descending order -- confirm.
top_5_labels <- label_song_count %>%
  head(5)

# Calculate the percentage of songs from these labels.
total_songs <- nrow(data)
top_5_percentage <- sum(top_5_labels$n) / total_songs * 100

# Compute correlation between Spotify Streams and YouTube Views.
# use = "complete.obs" drops every pair in which either value is NA.
correlation <- cor(
  data$`Spotify Streams`,
  data$`YouTube Views`,
  use = "complete.obs"
)
print(paste(
  "Correlation between Spotify Streams and YouTube Views:",
  round(correlation, 2)
))

# 3. Grouping & Summarisation ----

# (a) What is the average number of streams per song?
# Calculate the average number of streams per song, ignoring missing values.
avg_streams <- mean(data$`Spotify Streams`, na.rm = TRUE)

# Sort by total engagement, largest first.
# NOTE(review): `platform_totals` (with a `Total_Engagement` column) is not
# created anywhere in the visible part of this script -- confirm it is defined
# before this statement runs.
platform_totals <- platform_totals %>%
  arrange(desc(Total_Engagement))
print(platform_totals)

# (c) Which artist has the highest combined social media reach
# (YouTube + TikTok + Spotify)?

# Calculate total social media reach per song. A song with an NA in any of the
# three inputs gets an NA reach.
data <- data %>%
  mutate(
    Social_Reach = `YouTube Views` + `TikTok Views` + `Spotify Playlist Reach`
  )

# Group by artist and sum their total reach (NA song-level values are skipped).
artist_reach <- data %>%
  group_by(Artist) %>%
  summarise(Total_Reach = sum(Social_Reach, na.rm = TRUE)) %>%
  arrange(desc(Total_Reach))

# Display the artist with the highest total social reach.
top_social_artist <- artist_reach %>%
  slice(1)
print(top_social_artist)

# (d) What is the total number of streams per artist?
# Summarize total streams by artist, ignoring missing stream counts.
artist_streams <- data %>%
  group_by(Artist) %>%
  summarize(total_streams = sum(`Spotify Streams`, na.rm = TRUE))

# Calculate variance in rankings for each song, most consistent song first.
# NOTE(review): `ranking_columns` (a character vector of column names) is not
# created anywhere in the visible part of this script -- confirm it is defined
# before this statement runs.
ranking_variability <- data %>%
  rowwise() %>%
  mutate(
    Rank_Variance = var(c_across(all_of(ranking_columns)), na.rm = TRUE)
  ) %>%
  # Leave row-wise mode so the result behaves like an ordinary table.
  ungroup() %>%
  select(Track, Rank_Variance) %>%
  arrange(Rank_Variance)

# Filter songs that meet both conditions: Spotify streams and TikTok posts at
# or above their respective thresholds.
# NOTE(review): `spotify_threshold` and `tiktok_threshold` are not created
# anywhere in the visible part of this script -- confirm they are defined
# before this statement runs.
trending_songs <- data %>%
  filter(
    `Spotify Streams` >= spotify_threshold &
      `TikTok Posts` >= tiktok_threshold
  ) %>%
  select(Track, Artist, `Spotify Streams`, `TikTok Posts`)

# 5. Feature Engineering ----

# (a) Composite Popularity Score (standardized sum of key metrics).
# scale() returns a one-column matrix; as.numeric() flattens it so the new
# column is a plain numeric vector rather than a matrix column.
z_score <- function(x) {
  as.numeric(scale(x))
}

# A song with an NA in any of the six metrics gets an NA score.
data <- data %>%
  mutate(
    Composite_Popularity_Score =
      z_score(`Spotify Streams`) +
      z_score(`YouTube Views`) +
      z_score(`TikTok Views`) +
      z_score(`Spotify Playlist Count`) +
      z_score(`Spotify Popularity`) +
      z_score(`Shazam Counts`)
  )

# (b) Engagement Ratios.
# Each ratio is only computed when its denominator is positive; otherwise the
# result is NA. NA_real_ keeps the columns numeric even if no row qualifies.
data <- data %>%
  mutate(
    Views_per_Playlist = ifelse(
      `YouTube Playlist Reach` > 0,
      `YouTube Views` / `YouTube Playlist Reach`,
      NA_real_
    ),
    TikTok_Engagement_Rate = ifelse(
      `TikTok Posts` > 0,
      `TikTok Likes` / `TikTok Posts`,
      NA_real_
    ),
    Spotify_Reach_per_Stream = ifelse(
      `Spotify Streams` > 0,
      `Spotify Playlist Reach` / `Spotify Streams`,
      NA_real_
    )
  )

# (c) Days Since Release.
# NOTE(review): assumes `Release Date` is stored as YYYY-MM-DD -- confirm
# against the CSV; a different format needs a different `format` string.
data <- data %>%
  mutate(
    Release_Date_Formatted = as.Date(`Release Date`, format = "%Y-%m-%d"),
    # Difference between today's date and the release date, as a plain number.
    Days_Since_Release = as.numeric(Sys.Date() - Release_Date_Formatted)
  )

# (d) Platform Count (non-NA count across key platforms).
# For each song, count how many of the listed platform columns hold a value.
data <- data %>%
  mutate(
    Platform_Count = rowSums(!is.na(select(
      .,
      `Spotify Streams`,
      `YouTube Views`,
      `TikTok Views`,
      `Apple Music Playlist Count`,
      `Pandora Streams`,
      `Deezer Playlist Count`,
      `Soundcloud Streams`
    )))
  )

# Final structure of the dataset.
print("Updated structure of dataset:")
str(data)

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(data)
}