#Loading and Preparing the Data

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(lubridate)
# Location of the Spotify songs CSV. The path can be overridden by setting the
# SPOTIFY_SONGS_CSV environment variable, so the script is not tied to one
# machine; when the variable is unset the original local path is used.
spotify_csv_path <- Sys.getenv(
  "SPOTIFY_SONGS_CSV",
  unset = "C:/Users/priya/Downloads/spotify_songs.csv"
)

# Read the dataset into a tibble; column types are guessed by readr.
spotify_songs <- readr::read_csv(spotify_csv_path)
## Rows: 32833 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): track_id, track_name, track_artist, track_album_id, track_album_na...
## dbl (13): track_popularity, danceability, energy, key, loudness, mode, speec...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Check the column names of the dataset

# List every column name in the dataset
names(spotify_songs)
##  [1] "track_id"                 "track_name"              
##  [3] "track_artist"             "track_popularity"        
##  [5] "track_album_id"           "track_album_name"        
##  [7] "track_album_release_date" "playlist_name"           
##  [9] "playlist_id"              "playlist_genre"          
## [11] "playlist_subgenre"        "danceability"            
## [13] "energy"                   "key"                     
## [15] "loudness"                 "mode"                    
## [17] "speechiness"              "acousticness"            
## [19] "instrumentalness"         "liveness"                
## [21] "valence"                  "tempo"                   
## [23] "duration_ms"
# Convert the release-date column to Date.
# NOTE(review): the column is read as character and a strict "%Y-%m-%d" parse
# leaves some rows as NA, which suggests that some release dates are stored at
# reduced precision ("YYYY" or "YYYY-MM") - confirm against the raw file.
# `truncated = 2` allows the month and/or day to be missing, so such values
# are parsed (as the first month / first day) instead of becoming NA.
spotify_songs$track_album_release_date <- lubridate::ymd(
  spotify_songs$track_album_release_date,
  truncated = 2
)

Exploring and Cleaning the Data

# Compact overview: one line per column with its type and first values
dplyr::glimpse(spotify_songs)
## Rows: 32,833
## Columns: 23
## $ track_id                 <chr> "6f807x0ima9a1j3VPbc7VN", "0r7CVbZTWZgbTCYdfa…
## $ track_name               <chr> "I Don't Care (with Justin Bieber) - Loud Lux…
## $ track_artist             <chr> "Ed Sheeran", "Maroon 5", "Zara Larsson", "Th…
## $ track_popularity         <dbl> 66, 67, 70, 60, 69, 67, 62, 69, 68, 67, 58, 6…
## $ track_album_id           <chr> "2oCs0DGTsRO98Gh5ZSl2Cx", "63rPSO264uRjW1X5E6…
## $ track_album_name         <chr> "I Don't Care (with Justin Bieber) [Loud Luxu…
## $ track_album_release_date <date> 2019-06-14, 2019-12-13, 2019-07-05, 2019-07-…
## $ playlist_name            <chr> "Pop Remix", "Pop Remix", "Pop Remix", "Pop R…
## $ playlist_id              <chr> "37i9dQZF1DXcZDD7cfEKhW", "37i9dQZF1DXcZDD7cf…
## $ playlist_genre           <chr> "pop", "pop", "pop", "pop", "pop", "pop", "po…
## $ playlist_subgenre        <chr> "dance pop", "dance pop", "dance pop", "dance…
## $ danceability             <dbl> 0.748, 0.726, 0.675, 0.718, 0.650, 0.675, 0.4…
## $ energy                   <dbl> 0.916, 0.815, 0.931, 0.930, 0.833, 0.919, 0.8…
## $ key                      <dbl> 6, 11, 1, 7, 1, 8, 5, 4, 8, 2, 6, 8, 1, 5, 5,…
## $ loudness                 <dbl> -2.634, -4.969, -3.432, -3.778, -4.672, -5.38…
## $ mode                     <dbl> 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, …
## $ speechiness              <dbl> 0.0583, 0.0373, 0.0742, 0.1020, 0.0359, 0.127…
## $ acousticness             <dbl> 0.10200, 0.07240, 0.07940, 0.02870, 0.08030, …
## $ instrumentalness         <dbl> 0.00e+00, 4.21e-03, 2.33e-05, 9.43e-06, 0.00e…
## $ liveness                 <dbl> 0.0653, 0.3570, 0.1100, 0.2040, 0.0833, 0.143…
## $ valence                  <dbl> 0.518, 0.693, 0.613, 0.277, 0.725, 0.585, 0.1…
## $ tempo                    <dbl> 122.036, 99.972, 124.008, 121.956, 123.976, 1…
## $ duration_ms              <dbl> 194754, 162600, 176616, 169093, 189052, 16304…

Creating Grouped Data Frames

Group 1: By Playlist Category

# One row per playlist genre: average danceability and number of tracks,
# ordered from the genre with the fewest tracks to the one with the most.
group_by_category <- spotify_songs |>
  group_by(playlist_genre) |>
  summarise(
    mean_danceability = mean(danceability, na.rm = TRUE),
    count = n()
  ) |>
  arrange(count)

# Show the per-genre summary
group_by_category |> print()
## # A tibble: 6 × 3
##   playlist_genre mean_danceability count
##   <chr>                      <dbl> <int>
## 1 rock                       0.521  4951
## 2 latin                      0.713  5155
## 3 r&b                        0.670  5431
## 4 pop                        0.639  5507
## 5 rap                        0.718  5746
## 6 edm                        0.655  6043

Insights

The analysis shows the distribution of songs across various playlist genres, highlighting which genres are most and least populated in the dataset. This information reflects the popularity or market presence of different music genres.

Significance

Understanding the popularity distribution of music genres can inform marketing strategies and content curation, and can even influence promotional activities within the music industry. It may indicate consumer preferences and potential areas for growth or investment.

Further Questions

Group 2: By Artist and Release Year

library(dplyr)
library(lubridate)

# Average popularity and track count for every artist / release-year pair.
# `track_album_release_date` has already been converted to Date earlier in the
# script, so the year is extracted from it directly instead of re-parsing it.
# Rows whose release date is NA fall into an NA-year group for that artist.
group_by_artist_year <- spotify_songs %>%
  group_by(track_artist, year = year(track_album_release_date)) %>%
  summarise(
    mean_popularity = mean(track_popularity, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  # Ascending order, so the smallest artist-year groups are listed first
  arrange(count)

# View results
print(group_by_artist_year)
## # A tibble: 16,944 × 4
##    track_artist                                      year mean_popularity count
##    <chr>                                            <dbl>           <dbl> <int>
##  1 "!deladap"                                          NA              25     1
##  2 "\"Dear Evan Hansen\" August 2018 Broadway Cast"  2018              39     1
##  3 "#TocoParaVos"                                    2019              43     1
##  4 "$ANFI"                                           2018              12     1
##  5 "$IFRA"                                           2019              34     1
##  6 "(G)I-DLE"                                        2018              66     1
##  7 "(Sandy) Alex G"                                  2019              57     1
##  8 "(dc)"                                            2015              28     1
##  9 "*NSYNC"                                          1997              54     1
## 10 "*NSYNC"                                          2000              71     1
## # ℹ 16,934 more rows

Insights Gathered

The analysis calculates the mean popularity of tracks for each artist by release year, providing insight into how each artist’s popularity trends over time and varies from year to year.

Significance

This can reveal artists’ career peaks, identify rising stars, or track declines in popularity, useful for industry stakeholders like producers, marketers, or analysts focusing on trends and potential investments.

Further Questions

Group 3: By Energy Levels in Different Genres

library(dplyr)
library(ggplot2)

# Bucket every track into an energy quartile (quartile breaks are computed
# over the whole dataset, before grouping), then summarise mean valence and
# track count for each genre / energy-quartile pair, smallest groups first.
group_by_energy_genre <- spotify_songs |>
  mutate(
    energy_group = cut(
      energy,
      breaks = quantile(energy, probs = seq(0, 1, 0.25), na.rm = TRUE),
      include.lowest = TRUE
    )
  ) |>
  group_by(playlist_genre, energy_group) |>
  summarise(
    mean_valence = mean(valence, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) |>
  arrange(count)

group_by_energy_genre |> print()
## # A tibble: 24 × 4
##    playlist_genre energy_group     mean_valence count
##    <chr>          <fct>                   <dbl> <int>
##  1 r&b            (0.84,1]                0.680   445
##  2 edm            [0.000175,0.581]        0.393   487
##  3 rap            (0.84,1]                0.560   767
##  4 r&b            (0.721,0.84]            0.621   903
##  5 rock           (0.581,0.721]           0.579   961
##  6 latin          [0.000175,0.581]        0.487   994
##  7 edm            (0.581,0.721]           0.415  1000
##  8 rock           (0.721,0.84]            0.601  1022
##  9 latin          (0.84,1]                0.685  1036
## 10 rock           [0.000175,0.581]        0.457  1100
## # ℹ 14 more rows

Insights Gathered

The code analyzes how the valence (a measure of musical positivity) correlates with energy levels across different music genres. By grouping songs based on their energy and assessing average valence, it identifies patterns in the emotional content of songs across varying intensities of energy.

Significance

Understanding these correlations helps in curating playlists and marketing to target audiences more effectively, enhancing listener experience by aligning song characteristics with user preferences or moods.

Further Questions

Analysis of the Smallest Groups

# For each of the three groupings, keep only the rows whose count equals that
# grouping's minimum, tag them with the grouping they came from, and stack the
# results into one data frame (columns absent from a grouping are NA).
smallest_groups <- list(
  category = group_by_category,
  artist_year = group_by_artist_year,
  energy_genre = group_by_energy_genre
) |>
  imap(function(groups, label) {
    groups |>
      filter(count == min(count)) |>
      mutate(group_type = label)
  }) |>
  bind_rows()

# Examining smallest groups
smallest_groups |> print()
## # A tibble: 11,315 × 9
##    playlist_genre mean_danceability count group_type  track_artist          year
##    <chr>                      <dbl> <int> <chr>       <chr>                <dbl>
##  1 rock                       0.521  4951 category     <NA>                   NA
##  2 <NA>                      NA         1 artist_year "!deladap"              NA
##  3 <NA>                      NA         1 artist_year "\"Dear Evan Hansen…  2018
##  4 <NA>                      NA         1 artist_year "#TocoParaVos"        2019
##  5 <NA>                      NA         1 artist_year "$ANFI"               2018
##  6 <NA>                      NA         1 artist_year "$IFRA"               2019
##  7 <NA>                      NA         1 artist_year "(G)I-DLE"            2018
##  8 <NA>                      NA         1 artist_year "(Sandy) Alex G"      2019
##  9 <NA>                      NA         1 artist_year "(dc)"                2015
## 10 <NA>                      NA         1 artist_year "*NSYNC"              1997
## # ℹ 11,305 more rows
## # ℹ 3 more variables: mean_popularity <dbl>, energy_group <fct>,
## #   mean_valence <dbl>

Insights Gathered

The code identifies the smallest groups within three distinct categorizations in the Spotify dataset: genres, artist-year combinations, and energy-genre groupings. It specifically pinpoints which of these categories have the least occurrences.

Significance

Understanding the least common groups can offer insights into niche markets or underexplored areas within the music industry. This information is crucial for strategies aiming to explore or expand into less saturated genres or artist collaborations.

Further Questions

Building Visualizations for Each Grouping

Visualization for Group 1: By Playlist Genre

# Bar chart of track counts per genre, bars ordered from fewest to most tracks
ggplot(group_by_category) +
  geom_col(
    aes(x = reorder(playlist_genre, count), y = count),
    fill = "steelblue"
  ) +
  ggtitle("Distribution of Songs by Playlist Genre") +
  xlab("Playlist Genre") +
  ylab("Count") +
  theme_minimal()

Insights Gathered

The visualization showcases the distribution of songs across different playlist genres, indicating which genres are most and least populated within the dataset.

Significance

This insight is vital for understanding market trends and consumer preferences in the music industry. It can guide strategic decisions regarding marketing efforts, content curation, and resource allocation towards genres with varying levels of popularity.

Further Questions

Visualization for Group 2: By Artist and Release Year

library(ggplot2)

# Scatter plot of mean popularity against release year, one point per
# artist-year group. Point size follows the group's track count and point
# colour follows the artist.
group_by_artist_year |>
  ggplot(aes(x = year, y = mean_popularity)) +
  geom_point(aes(size = count, color = track_artist), alpha = 0.6) +
  # Widen the size range so differences in track count are visible
  scale_size(range = c(1, 10)) +
  labs(
    title = "Artist Popularity by Release Year",
    x = "Year",
    y = "Average Popularity",
    size = "Track Count"
  ) +
  theme_minimal() +
  # Hide the legend to keep the plot readable
  theme(legend.position = "none")
## Warning: Removed 903 rows containing missing values or values outside the scale range
## (`geom_point()`).

Insights Gathered

The visualization provides a detailed look at the trends in artist popularity over various years, mapped by the volume of tracks released. It highlights how popularity metrics and productivity have evolved for different artists across time.

Significance

This insight is crucial for understanding how artists maintain relevance in the industry and helps identify key periods of activity and success. It can inform decisions on marketing strategies and artist development.

Further Questions

Visualization for Group 3: By Energy Levels in Different Genres

# Side-by-side bars of mean valence per energy quartile, one bar per genre
group_by_energy_genre |>
  ggplot(aes(x = energy_group, y = mean_valence, fill = playlist_genre)) +
  geom_col(position = "dodge") +
  scale_fill_viridis_d() +
  labs(
    title = "Valence by Energy Levels across Genres",
    x = "Energy Group",
    y = "Mean Valence"
  ) +
  theme_minimal()

Insights Gathered

The visualization provides a detailed analysis of how valence, a measure of musical mood and positivity, varies across different energy levels within music genres. It highlights how more energetic songs might correlate with different emotional expressions in various genres.

Significance

This insight is valuable for understanding listener engagement and can inform decisions in music production, marketing, and playlist curation targeted at specific emotional responses or activity levels.

Further Questions

#Build a Data Frame of All Combinations

library(dplyr)
library(tidyr)

# Data frame of the genre / subgenre combinations that actually occur
existing_combinations <- spotify_songs %>%
  distinct(playlist_genre, playlist_subgenre)

# Every possible genre x subgenre pairing.
# `stringsAsFactors = FALSE` keeps both columns as character; expand.grid()
# otherwise converts them to factors, which would make the join below compare
# factor keys against the character keys of `existing_combinations`.
all_combinations <- expand.grid(
  playlist_genre = unique(spotify_songs$playlist_genre),
  playlist_subgenre = unique(spotify_songs$playlist_subgenre),
  stringsAsFactors = FALSE
)

# Pairings that are possible but never occur in the data
missing_combinations <- anti_join(
  all_combinations,
  existing_combinations,
  by = c("playlist_genre", "playlist_subgenre")
)

# Displaying missing combinations
print(missing_combinations)
##     playlist_genre         playlist_subgenre
## 1              rap                 dance pop
## 2             rock                 dance pop
## 3            latin                 dance pop
## 4              r&b                 dance pop
## 5              edm                 dance pop
## 6              rap             post-teen pop
## 7             rock             post-teen pop
## 8            latin             post-teen pop
## 9              r&b             post-teen pop
## 10             edm             post-teen pop
## 11             rap                electropop
## 12            rock                electropop
## 13           latin                electropop
## 14             r&b                electropop
## 15             edm                electropop
## 16             rap           indie poptimism
## 17            rock           indie poptimism
## 18           latin           indie poptimism
## 19             r&b           indie poptimism
## 20             edm           indie poptimism
## 21             pop                   hip hop
## 22            rock                   hip hop
## 23           latin                   hip hop
## 24             r&b                   hip hop
## 25             edm                   hip hop
## 26             pop          southern hip hop
## 27            rock          southern hip hop
## 28           latin          southern hip hop
## 29             r&b          southern hip hop
## 30             edm          southern hip hop
## 31             pop              gangster rap
## 32            rock              gangster rap
## 33           latin              gangster rap
## 34             r&b              gangster rap
## 35             edm              gangster rap
## 36             pop                      trap
## 37            rock                      trap
## 38           latin                      trap
## 39             r&b                      trap
## 40             edm                      trap
## 41             pop                album rock
## 42             rap                album rock
## 43           latin                album rock
## 44             r&b                album rock
## 45             edm                album rock
## 46             pop              classic rock
## 47             rap              classic rock
## 48           latin              classic rock
## 49             r&b              classic rock
## 50             edm              classic rock
## 51             pop            permanent wave
## 52             rap            permanent wave
## 53           latin            permanent wave
## 54             r&b            permanent wave
## 55             edm            permanent wave
## 56             pop                 hard rock
## 57             rap                 hard rock
## 58           latin                 hard rock
## 59             r&b                 hard rock
## 60             edm                 hard rock
## 61             pop                  tropical
## 62             rap                  tropical
## 63            rock                  tropical
## 64             r&b                  tropical
## 65             edm                  tropical
## 66             pop                 latin pop
## 67             rap                 latin pop
## 68            rock                 latin pop
## 69             r&b                 latin pop
## 70             edm                 latin pop
## 71             pop                 reggaeton
## 72             rap                 reggaeton
## 73            rock                 reggaeton
## 74             r&b                 reggaeton
## 75             edm                 reggaeton
## 76             pop             latin hip hop
## 77             rap             latin hip hop
## 78            rock             latin hip hop
## 79             r&b             latin hip hop
## 80             edm             latin hip hop
## 81             pop        urban contemporary
## 82             rap        urban contemporary
## 83            rock        urban contemporary
## 84           latin        urban contemporary
## 85             edm        urban contemporary
## 86             pop                   hip pop
## 87             rap                   hip pop
## 88            rock                   hip pop
## 89           latin                   hip pop
## 90             edm                   hip pop
## 91             pop            new jack swing
## 92             rap            new jack swing
## 93            rock            new jack swing
## 94           latin            new jack swing
## 95             edm            new jack swing
## 96             pop                  neo soul
## 97             rap                  neo soul
## 98            rock                  neo soul
## 99           latin                  neo soul
## 100            edm                  neo soul
## 101            pop             electro house
## 102            rap             electro house
## 103           rock             electro house
## 104          latin             electro house
## 105            r&b             electro house
## 106            pop                  big room
## 107            rap                  big room
## 108           rock                  big room
## 109          latin                  big room
## 110            r&b                  big room
## 111            pop                   pop edm
## 112            rap                   pop edm
## 113           rock                   pop edm
## 114          latin                   pop edm
## 115            r&b                   pop edm
## 116            pop progressive electro house
## 117            rap progressive electro house
## 118           rock progressive electro house
## 119          latin progressive electro house
## 120            r&b progressive electro house

Insights Gathered

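The analysis identifies combinations of playlist genres and subgenres that do not exist in the dataset. This reveals which genre–subgenre pairings are absent from the Spotify dataset; in the output above, every subgenre is missing from exactly five of the six genres, i.e. each subgenre appears under only one genre.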

Significance

Understanding missing combinations can highlight underexplored or niche areas in the music catalog, potentially guiding new content creation or curation strategies to fill these gaps.

Further Questions

Analyze Most and Least Common Combinations

# Number of tracks for each genre / subgenre pair, largest groups first
combination_counts <- spotify_songs |>
  count(playlist_genre, playlist_subgenre, name = "count", sort = TRUE)

# Displaying the most common combinations
combination_counts |>
  head() |>
  print()
## # A tibble: 6 × 3
##   playlist_genre playlist_subgenre         count
##   <chr>          <chr>                     <int>
## 1 edm            progressive electro house  1809
## 2 rap            southern hip hop           1675
## 3 pop            indie poptimism            1672
## 4 latin          latin hip hop              1656
## 5 r&b            neo soul                   1637
## 6 edm            pop edm                    1517
# Displaying the least common combinations (the last rows of the
# descending-by-count table)
combination_counts |>
  tail() |>
  print()
## # A tibble: 6 × 3
##   playlist_genre playlist_subgenre count
##   <chr>          <chr>             <int>
## 1 edm            big room           1206
## 2 r&b            new jack swing     1133
## 3 pop            post-teen pop      1129
## 4 rock           permanent wave     1105
## 5 rock           album rock         1065
## 6 latin          reggaeton           949

Insights Gathered

The code identifies the most and least common genre-subgenre combinations in the Spotify dataset, highlighting which music categories are most prevalent and which are less frequent.

Significance

This analysis provides valuable insights into listener preferences and market trends, helping to understand which musical niches are saturated and which may offer opportunities for growth or further exploration.

Further Questions

Visualize the Combinations

library(ggplot2)

# Side-by-side bars of track counts, grouped by genre and coloured by subgenre
combination_counts |>
  ggplot(aes(x = playlist_genre, y = count, fill = playlist_subgenre)) +
  geom_col(position = "dodge") +
  labs(
    title = "Frequency of Playlist Genre and Subgenre Combinations",
    x = "Playlist Genre",
    y = "Count"
  ) +
  theme_minimal()

Insights Gathered

The visualization illustrates the frequency of different genre and subgenre combinations, showing which pairings are most and least common in the dataset. This allows for an easy comparison of how various subgenres are distributed within broader genres.

Significance

Understanding the distribution of genre-subgenre combinations helps identify market trends, popular combinations, and potential gaps. This can inform decisions about content creation, playlist curation, and marketing strategies to meet listener demands.

Further Questions

Hypothesis for Group 1 (Playlist Genre): Hypothesis: “Genres that are more niche or specialized, such as Jazz or Classical, will have fewer songs as they target a more specific audience compared to more popular genres like Pop or Hip-hop.” Rationale: Niche genres may not be as commercially viable or mainstream, leading to less production and promotion. Test: Analyze the distribution of genres across various platforms or sales data to confirm if niche genres consistently show lower song counts and engagement.

Hypothesis for Group 2: “Artists who release more songs in a given year tend to have a higher average song popularity than those who release fewer songs.” This hypothesis can be tested by examining the correlation between the number of songs an artist releases in a year (count) and their average song popularity (mean_popularity). If there is a positive correlation, it would support the hypothesis that releasing more songs is associated with higher popularity.

Hypothesis for Group 3 (Energy Levels in Different Genres): Hypothesis: “Songs in genres characterized as calm or soothing, such as Classical, will have lower energy levels and, consequently, smaller groups in high-energy brackets.” Rationale: The intrinsic characteristics of some genres may not align with high energy, influencing production trends. Test: Analyze genre characteristics in relation to energy levels and song output to validate if low-energy genres produce fewer high-energy songs.