library(dplyr)
library(readr)
library(tidyverse)
library(ggplot2)
library(conflicted)

Selecting Data

# Load the raw track data from disk
data <- read.csv(file = "dataset.csv")

# Resolve the filter() name clash in favour of dplyr's version
conflicted::conflicts_prefer(dplyr::filter)
[conflicted] Removing existing preference.[conflicted] Will prefer dplyr::filter over any other package.
# Keep only explicit tracks and draw a reproducible sample of up to 9,000 rows.
set.seed(4)  # fixed seed so the sampled rows are the same on every run

# `explicit` may arrive as text ("True") or already parsed to logical (TRUE),
# depending on how the CSV reader converts the column; comparing as character
# accepts both forms.
sample_data <- data |>
  filter(as.character(explicit) %in% c("True", "TRUE")) |>
  # slice_sample() caps n at the number of available rows, whereas sample_n()
  # raises an error when fewer than 9,000 rows match the filter
  slice_sample(n = 9000)
data <- sample_data
nrow(data)
[1] 9000
# Preview the leading rows of the sampled data
data |> head()

Making Random Subsamples

# Draw five random subsamples (with replacement), each half the size of `data`.
set.seed(4)  # fixed seed so the five subsamples are reproducible across runs

# Subsample size: 50% of the rows, capped at 9,000
sample_size <- min(9000, round(nrow(data) * 0.5))

# slice_sample() is the maintained replacement for the superseded sample_n().
# Sampling is with replacement, so a row can occur more than once in a subsample.
df_1 <- data |> slice_sample(n = sample_size, replace = TRUE)
df_2 <- data |> slice_sample(n = sample_size, replace = TRUE)
df_3 <- data |> slice_sample(n = sample_size, replace = TRUE)
df_4 <- data |> slice_sample(n = sample_size, replace = TRUE)
df_5 <- data |> slice_sample(n = sample_size, replace = TRUE)

# Verify that every subsample has the expected dimensions
dim(df_1); dim(df_2); dim(df_3); dim(df_4); dim(df_5)
[1] 4500   21
[1] 4500   21
[1] 4500   21
[1] 4500   21
[1] 4500   21

Summarising df_1

# Per-genre profile of subsample 1, ordered from most to least popular genre
df_1 |>
  group_by(track_genre) |>
  summarise(
    count = n(),
    # one avg_<feature> column per audio feature, ignoring missing values
    across(
      c(popularity, danceability, energy, acousticness),
      \(feature) mean(feature, na.rm = TRUE),
      .names = "avg_{.col}"
    ),
    # most frequent key in the genre: first name of the descending tally
    mode_key = (table(key) |> sort(decreasing = TRUE) |> names())[1]
  ) |>
  arrange(desc(avg_popularity))

Summarising df_2

# Per-genre profile of subsample 2, ordered from most to least popular genre
df_2 |>
  group_by(track_genre) |>
  summarise(
    count = n(),
    # one avg_<feature> column per audio feature, ignoring missing values
    across(
      c(popularity, danceability, energy, acousticness),
      \(feature) mean(feature, na.rm = TRUE),
      .names = "avg_{.col}"
    ),
    # most frequent key in the genre: name of the top entry of the tally
    mode_key = sort(table(key), decreasing = TRUE)[1] |> names()
  ) |>
  arrange(desc(avg_popularity))

Summarising df_3

# Per-genre profile of subsample 3, ordered from most to least popular genre
df_3 |>
  group_by(track_genre) |>
  summarise(
    count = n(),
    # one avg_<feature> column per audio feature, ignoring missing values
    across(
      c(popularity, danceability, energy, acousticness),
      \(feature) mean(feature, na.rm = TRUE),
      .names = "avg_{.col}"
    ),
    # most frequent key in the genre: name of the top entry of the tally
    mode_key = sort(table(key), decreasing = TRUE)[1] |> names()
  ) |>
  arrange(desc(avg_popularity))

Summarising df_4

# Per-genre profile of subsample 4, ordered from most to least popular genre
df_4 |>
  group_by(track_genre) |>
  summarise(
    count = n(),
    # one avg_<feature> column per audio feature, ignoring missing values
    across(
      c(popularity, danceability, energy, acousticness),
      \(feature) mean(feature, na.rm = TRUE),
      .names = "avg_{.col}"
    ),
    # most frequent key in the genre: name of the top entry of the tally
    mode_key = sort(table(key), decreasing = TRUE)[1] |> names()
  ) |>
  arrange(desc(avg_popularity))

Summarising df_5

# Per-genre profile of subsample 5, ordered from most to least popular genre
df_5 |>
  group_by(track_genre) |>
  summarise(
    count = n(),
    # one avg_<feature> column per audio feature, ignoring missing values
    across(
      c(popularity, danceability, energy, acousticness),
      \(feature) mean(feature, na.rm = TRUE),
      .names = "avg_{.col}"
    ),
    # most frequent key in the genre: name of the top entry of the tally
    mode_key = sort(table(key), decreasing = TRUE)[1] |> names()
  ) |>
  arrange(desc(avg_popularity))

Summarising all dfs as a list using `lapply`

# Build the per-genre summary for all five subsamples in one pass.
# (Base-R alternative kept for reference:
#  lapply(list(df_1, df_2, df_3, df_4, df_5), summary))

dfs <- list(df_1, df_2, df_3, df_4, df_5)

# Same summary as the individual sections, applied to each list element
summarized_dfs <- lapply(dfs, \(subsample) {
  by_genre <- group_by(subsample, track_genre)
  genre_stats <- summarise(
    by_genre,
    count = n(),
    # one avg_<feature> column per audio feature, ignoring missing values
    across(
      c(popularity, danceability, energy, acousticness),
      \(feature) mean(feature, na.rm = TRUE),
      .names = "avg_{.col}"
    ),
    # most frequent key in the genre: name of the top entry of the tally
    mode_key = sort(table(key), decreasing = TRUE)[1] |> names()
  )
  arrange(genre_stats, desc(avg_popularity))
})

# Print the list of summary tables
summarized_dfs
[[1]]

[[2]]

[[3]]

[[4]]

[[5]]
NA

1. How different are they?

For comparing random sub-samples of tracks,

2. What would you have called an anomaly in one sub-sample that you wouldn’t in another?

For this dataset, anomalies vary across sub-samples like:

3. Are there aspects of the data that are consistent among all sub-samples?

Conclusion

We analysed summaries of random subsamples of the Spotify dataset. The summarisation grouped each subsample by track_genre and calculated several summary statistics: the count of records, the average popularity, danceability, energy, and acousticness, and the most frequent key for each genre. Comparing these summaries across subsamples gave us the key findings about the dataset.

In the future, this experience suggests that it’s important to: 1. Examine sampling methods carefully to ensure that both categorical and numeric variables are appropriately represented. 2. Use stratified sampling or ensure a balanced representation of categories if we’re working with imbalanced datasets or when certain groups are crucial for the analysis. 3. Consider the context of missing variables when interpreting results—if a certain category is underrepresented, it might skew results or lead to faulty conclusions about the relationships within the data.

Monte Carlo Simulation

# Monte Carlo estimate of the sampling distribution of the mean track length:
# draw `n_simulations` subsamples of `sample_size` rows (with replacement) and
# record the mean of `Track_Length` in each one.
# NOTE(review): `music_data` is not created anywhere in the visible part of
# this file (the data frame loaded earlier is named `data`) -- confirm where it
# comes from before running this section.
set.seed(4)  # fixed seed so the simulated distribution is reproducible
n_simulations <- 1000
sample_size <- 4500

# Extract the column once instead of resampling every column of the data frame
# in each iteration; only `Track_Length` is used by the simulation.
track_lengths <- music_data$Track_Length

# Fail fast: `$` on a missing column returns NULL and mean(NULL) is NA, so
# without this check every simulated mean would silently be NA.
if (is.null(track_lengths)) {
  stop("`music_data` has no `Track_Length` column", call. = FALSE)
}

# vapply() guarantees a plain numeric vector with one mean per simulation
mc_results <- vapply(
  seq_len(n_simulations),
  function(i) {
    # sample row positions with replacement, then average the drawn values
    drawn <- sample.int(length(track_lengths), sample_size, replace = TRUE)
    mean(track_lengths[drawn], na.rm = TRUE)
  },
  numeric(1)
)

# Plotting the histogram of the simulated means
hist(
  mc_results,
  breaks = 30,
  col = "skyblue",
  border = "white",
  main = "Monte Carlo Simulation: Distribution of Average Track Length",
  xlab = "Average Track Length (seconds)",
  ylab = "Frequency"
)

Explanation:

The histogram of the simulation results shows a roughly normal distribution centered around an average track length of approximately 268 seconds. The spread (standard deviation) of the average track lengths is relatively small, indicating consistency in the sample averages.

Next Steps:

I could expand the simulation to track how often different categories appear and explore whether some categories are more prone to being sampled than others. I could also validate whether the assumption of normality holds for the distribution of average track lengths, or whether an alternative distribution fits the data better.

