OpenFDA Data Structure

OpenFDA Data Dictionary

OpenFDA API call

Option 1: OpenFDA call

#Required R libraries

library(httr) #To send GET requests
## Warning: package 'httr' was built under R version 4.3.3
library(jsonlite) #To parse through JSON
## Warning: package 'jsonlite' was built under R version 4.3.3
library(data.table)

extract_data <- function(date,limit,api_key,skip){ #This function automates extraction

#Setting up parameters for the search

params <- list( #setting parameters
  date = date,
  limit = limit, #maximum number of cases to return
  api_key = api_key, #My API Key
  skip = skip
)

#Building the API call (GET request)

url <- paste0("api.fda.gov/drug/event.json?search=receivedate:", #API site
              params$date, #date
              "&", #seperator
              "api_key=",
              api_key,
              "&",
              "limit=",
              params$limit,
              "&",
              "skip=",
              skip)
timing <- system.time({
response <- GET(url) #Sending the GET request
})


if (status_code(response) != 200) { #200 is the status code for a successful GET request
  stop("API request failed with status: ", status_code(response)) #notifies failed request
}

print(paste("Time taken for API call:", timing["elapsed"], "seconds"))


timing2 <- system.time({
json <- fromJSON(content(response, as = "text", encoding = "UTF-8")) #Parse the JSON file
})

print(paste("Time taken for parsing:", timing2["elapsed"], "seconds"))

parsed <- as.data.table(json$results)

return(parsed)
}
date = "[20040101+TO+20081231]"
limit = 1000 #Let's say we want 1000 cases
api_key = "bUInb91viNAQQsRh6YCNyTDCd9fm6a7SS7muQ7gg" #My API Key
skip = 0

data <- extract_data(date,limit,api_key,skip)
## [1] "Time taken for API call: 10.77 seconds"
## [1] "Time taken for parsing: 5.05 seconds"

OpenFDA Paging

  • The limit of each OpenFDA API call is 1,000 cases. However, we can paginate with the skip parameter to get more cases with multiple API calls.
#Creating a 'wrapper' function
paginate <- function(iterations){
  all_results <- data.table()
  
  for (i in 0:(iterations-1))  {
    
    cat(paste0("processing iteration ", i+1 , "\n"))
    
    data <- extract_data(date,limit,api_key,skip=limit*i)

    all_results <- rbind(all_results,data, fill = TRUE)
  }
  return(all_results)
}
  • Let’s say we want 3,000 cases
date = "[20040101+TO+20081231]" 
limit = 1000 #Let's say we want 1000 cases 
api_key = "bUInb91viNAQQsRh6YCNyTDCd9fm6a7SS7muQ7gg" #My API Key 
iterations = 3 #3 iterations of 1,000 cases each (skip = 0, 1000, 2000)

combined <- paginate(iterations)
## processing iteration 1
## [1] "Time taken for API call: 9.84 seconds"
## [1] "Time taken for parsing: 4.88 seconds"
## processing iteration 2
## [1] "Time taken for API call: 10.06 seconds"
## [1] "Time taken for parsing: 5.16 seconds"
## processing iteration 3
## [1] "Time taken for API call: 10.19 seconds"
## [1] "Time taken for parsing: 4.98 seconds"
nrow(combined)
## [1] 3000

API Call Limitations

Table 1. OpenFDA Rate Limits
               Without API Key (per IP)   With API Key (per key)
Per minute     240 requests               240 requests
Per day        1,000 requests             120,000 requests
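  • Within these limits, the per-minute ceiling is what matters for large pulls: 240 requests per minute implies at most one request every 0.25 seconds. Below is a minimal sketch of a paced GET wrapper; the 0.3-second default is an assumed safety margin, not an openFDA requirement. The pause argument of search_after_paginate() further below plays the same role.
library(httr)

rate_limited_get <- function(url, pause = 0.3) { #pause chosen to stay under 240 requests/minute
  Sys.sleep(pause) #wait before each request
  GET(url)
}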

Search_After

  • Step 1: execute initial search with large number of matches.

    • Must exclude the skip parameter
extract_data2 <- function(date,limit,api_key){ #This function automates extraction

#Setting up parameters for the search

params <- list( #setting parameters
  date = date,
  limit = limit, #maximum number of cases to return
  api_key = api_key #My API Key
) #Excluded skip parameter

#Building the API call (GET request)

url <- paste0("api.fda.gov/drug/event.json?search=receivedate:", #API site
              params$date, #date
              "&", #seperator
              "api_key=",
              api_key,
              "&",
              "limit=",
              params$limit,
              "&")

timing <- system.time({
response <- GET(url) #Sending the GET request
link_header <- headers(response)[["link"]]
})


if (status_code(response) != 200) { #200 is the status code for a successful GET request
  stop("API request failed with status: ", status_code(response)) #notifies failed request
}

print(paste("Time taken for API call:", timing["elapsed"], "seconds"))


timing2 <- system.time({
json <- fromJSON(content(response, as = "text", encoding = "UTF-8")) #Parse the JSON file
})

print(paste("Time taken for parsing:", timing2["elapsed"], "seconds"))

parsed <- as.data.table(json$results)

list(parsed=parsed, link_header=link_header)
}
date = "[20040101+TO+20081231]" 
limit = 1000 #Let's say we want 1000 cases 
api_key = "bUInb91viNAQQsRh6YCNyTDCd9fm6a7SS7muQ7gg" #My API Key 

result <- extract_data2(date,limit,api_key)
## [1] "Time taken for API call: 9.72999999999999 seconds"
## [1] "Time taken for parsing: 5.42999999999999 seconds"
  • Step 2: Extract the link HTTP header contained in the response
result$link_header
## [1] "<https://api.fda.gov/drug/event.json?search=receivedate%3A%5B20040101%20TO%2020081231%5D&limit=1000&skip=0&search_after=0%3D3654572>; rel=\"next\""
next_url <- sub(".*<([^>]+)>; rel=\"next\".*", "\\", result$link_header)
  • Automating the extraction
library(httr)
library(jsonlite)
library(data.table)

search_after_paginate <- function(date, limit = 1000, api_key, max_pages = Inf, pause = 1) {
  all_results <- list()
  page <- 1
  next_url <- NULL
  
  repeat {
    # Build the first URL if next_url is not yet set
    if (is.null(next_url)) {
      next_url <- paste0(
        "https://api.fda.gov/drug/event.json?search=receivedate:", 
        date,
        "&limit=", limit,
        "&sort=receivedate:asc",
        "&api_key=", api_key
      )
    }
    
    cat("Processing page", page, "\n")
    
    # Perform GET request
    timing <- system.time({
      response <- GET(next_url)
    })
    cat("Time for API call:", round(timing["elapsed"], 2), "seconds\n")
    
    # Check for errors
    if (status_code(response) != 200) {
      stop("API request failed with status: ", status_code(response))
    }
    
    # Extract Link header and JSON content
    link_header <- headers(response)[["link"]]
    timing2 <- system.time({
      json <- fromJSON(content(response, as = "text", encoding = "UTF-8"))
    })
    cat("Time for parsing:", round(timing2["elapsed"], 2), "seconds\n")
    
    # Convert to data.table and store
    parsed <- as.data.table(json$results)
    all_results[[page]] <- parsed
    
    # Check for end of pagination
    if (is.null(link_header) || page >= max_pages) {
      break
    }
    
    # Extract next_url from Link header
    next_url <- sub(".*<([^>]+)>; rel=\"next\".*", "\\1", link_header)
    
    # Be polite to the server
    Sys.sleep(pause)
    page <- page + 1
  }
  
  # Combine all results into one data.table
  result_dt <- rbindlist(all_results, fill = TRUE)
  return(result_dt)
}
api_key <- "bUInb91viNAQQsRh6YCNyTDCd9fm6a7SS7muQ7gg"
date_range <- "[20120101+TO+20121231]"

data_all <- search_after_paginate(
  date = date_range,
  limit = 100,           
  api_key = api_key,
  max_pages = 5         
)
## Processing page 1 
## Time for API call: 1.53 seconds
## Time for parsing: 0.62 seconds
## Processing page 2 
## Time for API call: 0.76 seconds
## Time for parsing: 0.33 seconds
## Processing page 3 
## Time for API call: 1.13 seconds
## Time for parsing: 0.38 seconds
## Processing page 4 
## Time for API call: 0.92 seconds
## Time for parsing: 0.32 seconds
## Processing page 5 
## Time for API call: 0.84 seconds
## Time for parsing: 0.36 seconds
  • This allows extraction of all matching reports, retrieving up to 1,000 cases per call.

Sanity/Feasibility check

  • Extracting and parsing each batch of 1,000 cases takes about 15 seconds in total.

  • Assuming 2 million reports, that is about 2,000 calls × 15 seconds ≈ 8.3 hours.

  • This would not be feasible unless we are working with a smaller amount of data.
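  • As a back-of-envelope check (the 2 million report count and 15-second per-batch timing are the assumptions from the bullets above):
reports <- 2e6 #assumed total number of reports
per_call <- 1000 #cases returned per call
sec_per_call <- 15 #approximate time for one API call plus parsing
(reports / per_call) * sec_per_call / 3600 #total hours, roughly 8.3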

Option 2: Direct Data Download

Automated openFDA download

  • Quarterly JSON files are available at https://open.fda.gov/data/downloads/.

  • The issue is that each quarter is split into several different JSON files.

  • The challenge here is to automatically identify how many files there are for each quarter, and download them accordingly.

library(httr)
library(glue)
## Warning: package 'glue' was built under R version 4.3.3
library(data.table)

start_year <- 2012
end_year <- 2013
download_dir <- "H:/OpenFDA"

base_url <- "https://download.open.fda.gov/drug/event/"
quarters <- sprintf("%dq%d", rep(start_year:end_year, each=4), 1:4)

# Initialize empty data.table
all_files <- data.table()

# Loop over each quarter
for (i in seq_along(quarters)) {
  this_quarter <- quarters[i]
  
  # Probe candidate URLs of the form drug-event-NNNN-of-NNNN (0001-of-0001 through 0060-of-0060);
  # only a quarter's final part matches this pattern, so a hit at NNNN suggests the quarter has NNNN parts
  temp <- data.table(
    quarter = this_quarter,
    part = 1:60,
    url = glue("{base_url}{this_quarter}/drug-event-{sprintf('%04d', 1:60)}-of-{sprintf('%04d', 1:60)}.json.zip")
  )
  
  all_files <- rbind(all_files, temp)
}


# Check if each URL exists (status code 200)
all_files[, exists := sapply(url, function(u) {
  status_code(HEAD(u)) == 200
})]

all_files <- all_files[exists == TRUE]
  • This is problematic: for some quarters there appear to be two versions of the JSON files, one split into 11 parts and the other into 12, so we cannot reliably infer the part count this way. It might be better to manually code in how many files each quarter has, as sketched below.
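  • The sketch below hard-codes the number of parts per quarter; the counts shown are placeholder values, not verified counts, and would need to be checked against the listing on https://open.fda.gov/data/downloads/.
library(glue)

base_url <- "https://download.open.fda.gov/drug/event/"
download_dir <- "H:/OpenFDA"

#Manually coded number of parts per quarter (placeholder counts, to be verified)
parts_per_quarter <- c("2012q1" = 12, "2012q2" = 12, "2012q3" = 11, "2012q4" = 12)

for (q in names(parts_per_quarter)) {
  n_parts <- parts_per_quarter[[q]]
  for (p in seq_len(n_parts)) {
    file_name <- glue("drug-event-{sprintf('%04d', p)}-of-{sprintf('%04d', n_parts)}.json.zip")
    dest <- file.path(download_dir, glue("{q}-{file_name}"))
    download.file(glue("{base_url}{q}/{file_name}"), destfile = dest, mode = "wb") #binary mode for zip files
  }
}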