We will use the httr, jsonlite, and dplyr libraries to fetch JSON data from the API and convert it into an R data frame.
library(httr)
library(jsonlite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Sign up for the NY Times developer APIs and activate an API key to access them. I chose the Books API; the base URL comes from the API documentation and is used to build the API call.
# Define API key and endpoint
# The key is a secret, so it is read from the NYT_API_KEY environment variable
# (for example set in ~/.Renviron) instead of being stored in the source.
api_key <- Sys.getenv("NYT_API_KEY")
if (!nzchar(api_key)) {
  # Fail early with a clear message rather than sending a request without a key
  stop(
    "Set the NYT_API_KEY environment variable to your NY Times API key.",
    call. = FALSE
  )
}
# Endpoint of the NY Times Books API that the request is sent to
base_url <- "https://api.nytimes.com/svc/books/v3/lists/full-overview.json"
Send the request to the constructed URL and check the response status code.
# Construct the full URL with query parameters
# The query string starts directly with its first parameter ("?api-key=");
# there is no stray "&" right after the "?".
url <- paste0(base_url, "?api-key=", api_key)
# Show the request URL with the key masked so the secret is not written into
# the rendered output
print(paste0(base_url, "?api-key=", "<redacted>"))
## [1] "https://api.nytimes.com/svc/books/v3/lists/full-overview.json?api-key=<redacted>"
# Make the GET request
response <- GET(url)
# Check if the request was successful
# Anything other than HTTP 200 aborts the script; the status code is included
# in the message so different failure causes can be told apart.
if (status_code(response) != 200) {
  stop(
    "API request failed! (HTTP status ", status_code(response), ")",
    call. = FALSE
  )
}
Parse the JSON response data, then convert it to a data frame.
# Read the response body as UTF-8 text and parse that text as JSON
json_data <- response %>%
  content("text", encoding = "UTF-8") %>%
  fromJSON()
# Take the `lists` element of the parsed results as a data frame
book_lists <- as.data.frame(json_data$results$lists)
# Keep the `books` element of the lists separately; it is iterated over below
books <- json_data[["results"]][["lists"]][["books"]]
# create an identifier for join later
# row_number() numbers the lists 1..n for however many lists were returned,
# instead of assuming there are exactly 18.
book_lists <- book_lists %>% mutate(counter = row_number())

# parsing the list of books
# Build one data frame per element of `books`, tagged with that element's
# position (`counter`), and stack them in a single bind_rows() call rather than
# growing the result with rbind() inside a loop.
books_df <- bind_rows(
  # Zero-row template: fixes the column names and types, so the result has the
  # expected columns even when `books` is empty
  data.frame(
    counter = integer(),
    title = character(),
    author = character(),
    rank = integer(),
    last_rank = integer(),
    publisher = character(),
    stringsAsFactors = FALSE
  ),
  lapply(seq_along(books), function(item) {
    book <- books[[item]]
    data.frame(
      # rep() keeps `counter` the same length as the other columns, so an
      # element with no books yields zero rows instead of an error
      counter = rep(item, NROW(book)),
      title = book$title,
      author = book$author,
      rank = book$rank,
      last_rank = book$rank_last_week,
      publisher = book$publisher,
      stringsAsFactors = FALSE
    )
  })
)
# Create data frame for best sellers
# Attach the list columns (list_id, list_name, updated) to every book row.
# The join key is stated explicitly so it cannot change if the two tables ever
# share another column name, and relationship = "many-to-one" makes the join
# fail if a book row matches more than one list.
best_sellers <- books_df %>%
  left_join(
    book_lists %>% select(counter, list_id, list_name, updated),
    by = join_by(counter),
    relationship = "many-to-one"
  ) %>%
  # `counter` was only needed as the join key
  select(-counter)
# Which author was on the best seller list the most times
# Tally rows per author, order by descending count, keep the first six
best_author <- best_sellers %>%
  count(author, sort = TRUE) %>%
  head(n = 6)
best_author
## author n
## 1 Freida McFadden 6
## 2 Kristin Hannah 6
## 3 Rebecca Yarros 5
## 4 Sarah J. Maas 5
## 5 Dav Pilkey 4
## 6 Malcolm Gladwell 4
# Bar chart of row counts per author, restricted to the authors in best_author
ggplot(
  data = filter(best_sellers, author %in% best_author$author),
  mapping = aes(x = author)
) +
  geom_bar()
# Which publisher produced the most best sellers
# Tally rows per publisher, order by descending count, keep the first three
best_publisher <- best_sellers %>%
  count(publisher, sort = TRUE) %>%
  head(n = 3)
best_publisher
## publisher n
## 1 Scholastic 15
## 2 Crown 9
## 3 Random House Audio 9
# Bar chart of row counts per publisher, restricted to the publishers in
# best_publisher
ggplot(
  data = filter(best_sellers, publisher %in% best_publisher$publisher),
  mapping = aes(x = publisher)
) +
  geom_bar()
Freida McFadden and Kristin Hannah appear most often (six entries each) across the different best-seller categories, as of the lists from December. Scholastic is the publisher with the most best-seller entries (15).