MusicBrainzAPIPractice

Author

Austin Blumenthal

Synopsis

The purpose of this script is to use the MusicBrainz API to retrieve music metadata using HTTP GET requests.

Setup

First, we will load the necessary packages to handle the data and web requests.

# Load packages
library(tidyverse) # All the tidy things
library(jsonlite)  # Converting JSON data into data frames
library(magrittr)  # Extracting items from list objects using piping grammar
library(httr)      # Interacting with HTTP verbs

# NOTE(review): this expression only prints 2 and its result is never stored;
# it looks like a leftover smoke test -- confirm whether it is still wanted.
1 + 1

[1] 2

Introduction

MusicBrainz is a free, open music database that collects information about musicians, songs, and albums from across the internet. No API key is needed to use MusicBrainz, which is quite nice.

Here is the link to the official API documentation:

https://musicbrainz.org/doc/MusicBrainz_API

## Here is the first endpoint: the base URL that every request in this
## script starts from. The entity name and the query are appended to it.
mb_endpoint <- "https://musicbrainz.org/ws/2/"

## Since there is no API key, we need to follow ethical API usage like
## our overlord Joel taught us, so we will have a user agent.
## The string below is sent as the `User-Agent` header of each request and
## holds an application name/version followed by a contact address.
user_agent <- "MusicBrainzRTutorial/1.0 (CollegeDropout@xavier.edu)"

Defining the Query

The formatting for this API is quite unique, and will require a few extra steps that I had to find on the site.

# Entity type to search for; it becomes the last part of the URL path
entity <- "artist"

# Search criteria: separate the field:value pairs you care about with AND.
# The available fields are listed on the MusicBrainz website.
query <- "tag:rap AND country:US"

# The query contains spaces, which are not acceptable in a URL, so it has to
# be percent-encoded before it is pasted into the request.
query_encoded <- utils::URLencode(query)

# Number of results requested per page (not the number of pages)
limit <- "&limit=100"

# Number of results to skip, which is how you move between pages.
# Offset 0 is the equivalent of page 1.
offset <- "&offset=0"

# Ask for JSON; the data from this API originally comes in XML format.
fmt <- "&fmt=json"

### Glue the pieces together into the final request URL
mb_api_url <- paste0(
  mb_endpoint, entity,
  "?query=", query_encoded,
  limit, offset, fmt
)

## Print the generated URL so it can be inspected and tested by hand
print(mb_api_url)

[1] "https://musicbrainz.org/ws/2/artist?query=tag:rap%20AND%20country:US&limit=100&offset=0&fmt=json"

Bringing this data into R

## Send the GET request, passing our identification in the User-Agent header.
mb_api_response <- GET(mb_api_url, add_headers(`User-Agent` = user_agent))

## Quick sanity check: a status code of 200 means the request succeeded.
mb_api_response$status_code

[1] 200

## Read the response body as UTF-8 text, then parse that JSON into an R list.
mb_api_data <-
  fromJSON(content(mb_api_response, as = "text", encoding = "UTF-8"))

## Choose which portions of the results you are interested in, and put them
## into a dataframe. The already-parsed `mb_api_data` list is reused here, so
## the same page is not requested from the API a second time.
artist_data <-
  mb_api_data %>%
  use_series(artists) %>%                                 # table of artists
  select(id, name, country, disambiguation, score) %>%    # keep 5 columns
  relocate(id, name)                                      # id and name first

Creating Reusable Functions

Turn the building of these URLs and data retrieval into functions:

mb_api_GET_url <-
  # Build a MusicBrainz search URL that asks for JSON output.
  #
  # Arguments:
  #   entity: entity type appended to the endpoint path, e.g. "artist".
  #   query:  raw (not yet encoded) search query,
  #           e.g. "tag:rap AND country:US".
  #   limit:  number of results to request per page.
  #   offset: number of results to skip (0 starts at the first result).
  #
  # Returns: the complete request URL as a single character string.
  function(entity, query, limit, offset) {
    mb_endpoint <- "https://musicbrainz.org/ws/2/"

    # reserved = TRUE also escapes characters such as "&", "=", "+" and "#".
    # Left unescaped they are read as URL syntax and cut the query short.
    query_encoded <- URLencode(query, reserved = TRUE)

    # format() keeps large numbers in plain digits; paste0() on its own would
    # write 100000 as "1e+05", which is not a usable paging value.
    limit_param <-
      paste0("&limit=", format(limit, scientific = FALSE, trim = TRUE))
    offset_param <-
      paste0("&offset=", format(offset, scientific = FALSE, trim = TRUE))
    fmt <- "&fmt=json"

    paste0(mb_endpoint, entity, "?query=", query_encoded,
           limit_param, offset_param, fmt)
  }

## Second function: grab one page of data and place it into a dataframe.
##
## Arguments:
##   entity, query, limit, offset: passed on to mb_api_GET_url() to build the
##     request URL.
##   user_agent: string sent as the `User-Agent` header of the request.
##
## Returns: a data frame with whichever of the columns id, name, country,
##   disambiguation and score are present in the response, or an empty data
##   frame when the response holds no table of artists.
##
## Note: the result is read from the `artists` element of the response, so
## this function only yields rows for searches that return that element.
mb_api_GET_df <-
  function(entity, query, limit, offset, user_agent) {
    # The first function will create the URL
    mb_url <- mb_api_GET_url(entity, query, limit, offset)

    # Fail with a clear HTTP error (e.g. when the server refuses the request)
    # instead of trying to parse an error page as if it were search results.
    mb_response <-
      mb_url %>%
      GET(add_headers(`User-Agent` = user_agent)) %>%
      stop_for_status()

    mb_artists <-
      mb_response %>%
      content(as = "text", encoding = "UTF-8") %>%
      fromJSON() %>%
      use_series(artists)

    # With no matches there is no table to select columns from.
    if (!is.data.frame(mb_artists)) {
      return(data.frame())
    }

    # any_of() tolerates a page on which an optional field (for example
    # disambiguation) is absent from every record; bind_rows() in the caller
    # fills such gaps with NA.
    mb_artists %>%
      select(any_of(c("id", "name", "country", "disambiguation", "score"))) %>%
      relocate(any_of(c("id", "name")))
  }

## Third function: find out how many results a query matches in total.
## It does not download the results themselves; the total is what lets the
## caller work out how many pages have to be requested.
##
## Arguments:
##   entity, query: passed on to mb_api_GET_url() to build the request URL.
##   user_agent: string sent as the `User-Agent` header of the request.
##
## Returns: the value of the `count` field of the response.
mb_api_GET_count <-
  function(entity, query, user_agent) {
    # Only the `count` field is read, so a one-result page is requested.
    mb_url <- mb_api_GET_url(entity, query, limit = 1, offset = 0)

    # This shows the quantity of total results. stop_for_status() raises a
    # clear HTTP error before any attempt to parse a failed response.
    total_count <-
      mb_url %>%
      GET(add_headers(`User-Agent` = user_agent)) %>%
      stop_for_status() %>%
      content(as = "text", encoding = "UTF-8") %>%
      fromJSON() %>%
      use_series(count)

    # A missing field would otherwise be handed back as NULL without a word.
    if (is.null(total_count)) {
      stop("MusicBrainz response did not contain a `count` field.",
           call. = FALSE)
    }

    total_count
  }

## The final function that puts it all together: it downloads every page of
## results for a query and stacks the pages into one dataframe.
##
## Arguments:
##   entity, query: passed on to mb_api_GET_count() and mb_api_GET_df().
##   user_agent: string sent as the `User-Agent` header of each request.
##   limit: number of results requested per page (default 100).
##
## Returns: one data frame holding the rows of all collected pages; an empty
##   data frame when the query matches nothing.
##
## Side effects: prints a progress line per page and pauses 1.5 seconds
##   between requests.
mb_api_GET_all <-
  function(entity, query, user_agent, limit = 100) {
    stopifnot(is.numeric(limit), length(limit) == 1,
              !is.na(limit), limit >= 1)

    # The API documentation allows at most 100 results per request. A larger
    # page size would make the offsets below skip over records.
    if (limit > 100) {
      warning("MusicBrainz allows at most 100 results per request; ",
              "using limit = 100.", call. = FALSE)
      limit <- 100
    }

    # Grabs the total results. The check matters because min(NULL, 10000)
    # is 10000, which would otherwise trigger 100 pointless requests.
    total <- mb_api_GET_count(entity, query, user_agent)
    stopifnot(is.numeric(total), length(total) == 1, !is.na(total))

    # Collect at most 10,000 results, then work out how many pages that is
    total <- min(total, 10000)
    pages <- ceiling(total / limit)

    # Nothing matched: return early rather than looping (1:0 counts down
    # and would request two pages, one of them at a negative offset).
    if (pages == 0) {
      return(data.frame())
    }

    # One slot per page; the pages are combined once, after the loop
    page_dfs <- vector("list", pages)

    # LOOP
    for (i in seq_len(pages)) {
      # Tracks the page you are on
      current_offset <- (i - 1) * limit

      # Grabs the data from each page
      page_dfs[[i]] <-
        mb_api_GET_df(entity = entity,
                      query = query,
                      limit = limit,
                      offset = current_offset,
                      user_agent = user_agent)

      # Loading screen
      print(paste("Page", i, "of", pages, "collected."))

      # Delay for requests to prevent a ban
      if (i < pages) {
        Sys.sleep(1.5)
      }
    }

    # Combine the data from the pages together
    bind_rows(page_dfs)
  }

Collecting the Full Dataset

#### Example of US rappers ####

# Reuse the `user_agent` string defined during setup so that every request in
# this script identifies itself with exactly the same contact details.
all_us_rappers <-
  mb_api_GET_all(entity = "artist",
                 query = "tag:rap AND country:US",
                 user_agent = user_agent,
                 limit = 100)

[1] "Page 1 of 11 collected."
[1] "Page 2 of 11 collected."
[1] "Page 3 of 11 collected."
[1] "Page 4 of 11 collected."
[1] "Page 5 of 11 collected."
[1] "Page 6 of 11 collected."
[1] "Page 7 of 11 collected."
[1] "Page 8 of 11 collected."
[1] "Page 9 of 11 collected."
[1] "Page 10 of 11 collected."
[1] "Page 11 of 11 collected."

# Create a csv from the data so that it can be exported.
# The file "us_rappers.csv" is written using a relative path.
write_csv(all_us_rappers, "us_rappers.csv")

Final Output

# Load the saved data back in from the csv file written earlier.
# No column types are specified here, so read_csv() works them out itself
# and reports the column specification it used.
mb_data <- read_csv("us_rappers.csv")

Rows: 1038 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): id, name, country, disambiguation
dbl (1): score

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# View the first few rows (head() shows six by default) to check that the
# data came back with the expected columns.
head(mb_data)

# A tibble: 6 × 5
  id                                   name         country disambiguation score
  <chr>                                <chr>        <chr>   <chr>          <dbl>
1 b95ce3ff-3d05-4e87-9e01-c97b66af13d4 Eminem       US      US rapper        100
2 f27ec8db-af05-4f36-916e-3d57f91ecf5e Michael Jac… US      “King of Pop”     99
3 1550f952-c91b-40d7-9b4d-d26a259ee932 Lil B        US      US rapper, a.…    99
4 8bfac288-ccc5-448d-9573-c33ea2aa5c30 Red Hot Chi… US      <NA>              98
5 164f0d73-1234-4e2c-8743-d77bf2191051 Ye           US      formerly Kany…    96
6 f59c5520-5f46-4d2c-b2c4-822eabf53419 Linkin Park  US      American rock…    96