Working with XML and JSON in R

Reading and printing an HTML file

# Libraries for loading the html file and extracting the table
library(rvest)
library(xml2)

# Location of the HTML file to load, echoed so the chosen file shows in the output
file_path <- "/Users/candace/Desktop/CUNY_SEM1_Masters/books.html"
cat(sprintf("Selected file: %s \n", file_path))

## Selected file: /Users/candace/Desktop/CUNY_SEM1_Masters/books.html

# Parse the HTML document stored at file_path
html_file <- read_html(file_path)

# html_table() yields a list of tables; keep only the first one
html_data <- html_table(html_file)[[1]]
print(html_data)

## # A tibble: 3 × 6
##      id title                   authors           copyright page_count publisher
##   <int> <chr>                   <chr>                 <int>      <int> <chr>    
## 1     1 Key Person of Influence Daniel Priestly …      2014        201 Rethink …
## 2     2 Million Dolllar Weekend Noah Kagan and T…      2024        218 Penguin …
## 3     3 The 12 Week Work Year   Brian P. Morgan …      2013        190 Wiley

Reading and printing an XML file

library(xml2)
library(xml2)

# Parse the XML document; the relative path is resolved against the working
# directory.
xml_book_table <- read_xml("books.xml")

# Select every <book> node once so each row below is built from a single book.
book_nodes <- xml_find_all(xml_book_table, "//book")

# Text of one child element per book. xml_find_first() returns exactly one
# result per <book> (a missing node when the child is absent), so all columns
# have the same length and a book lacking a field gets NA instead of shifting
# the values of later books up a row.
book_field <- function(field) {
  xml_text(xml_find_first(book_nodes, paste0("./", field)))
}

# xml_text() always returns character; convert the numeric fields so they have
# the same integer type as the corresponding columns of the HTML table.
xml_table <- data.frame(
  id = as.integer(book_field("id")),
  title = book_field("title"),
  authors = book_field("authors"),
  copyright = as.integer(book_field("copyright")),
  page_count = as.integer(book_field("page_count")),
  publisher = book_field("publisher")
)

print(xml_table)

##   id                   title                                authors copyright
## 1  1 Key Person of Influence          Daniel Priestly and Mike Reid      2014
## 2  2  Million Dollar Weekend                Noah Kagan and Tahl Raz      2024
## 3  3   The 12 Week Work Year Brian P. Morgan and Michael Lennington      2013
##   page_count                publisher
## 1        201            Rethink Press
## 2        218 Penguin Random House LLC
## 3        190                    Wiley

# NOTE(review): this chunk repeats the XML chunk directly above it (same input
# file, same resulting table); one of the two can be dropped.

# Parse the XML document; the relative path is resolved against the working
# directory.
xml_book_table <- read_xml("books.xml")

# Select every <book> node once so each row below is built from a single book.
book_nodes <- xml_find_all(xml_book_table, "//book")

# Text of one child element per book. xml_find_first() returns exactly one
# result per <book> (a missing node when the child is absent), so all columns
# have the same length and a book lacking a field gets NA instead of shifting
# the values of later books up a row.
book_field <- function(field) {
  xml_text(xml_find_first(book_nodes, paste0("./", field)))
}

# xml_text() always returns character; convert the numeric fields so they have
# the same integer type as the corresponding columns of the HTML table.
xml_table <- data.frame(
  id = as.integer(book_field("id")),
  title = book_field("title"),
  authors = book_field("authors"),
  copyright = as.integer(book_field("copyright")),
  page_count = as.integer(book_field("page_count")),
  publisher = book_field("publisher")
)

print(xml_table)

##   id                   title                                authors copyright
## 1  1 Key Person of Influence          Daniel Priestly and Mike Reid      2014
## 2  2  Million Dollar Weekend                Noah Kagan and Tahl Raz      2024
## 3  3   The 12 Week Work Year Brian P. Morgan and Michael Lennington      2013
##   page_count                publisher
## 1        201            Rethink Press
## 2        218 Penguin Random House LLC
## 3        190                    Wiley

Reading and printing a JSON file

library(jsonlite)
library(jsonlite)

# Single place for the JSON file location used by both the write and the read
json_path <- "/Users/candace/Desktop/CUNY_SEM1_Masters/favorite_books.json"

# Assemble the book records, one vector per column
books_data <- data.frame(
  id = c(1, 2, 3),
  title = c(
    "Key Person of Influence",
    "Million Dollar Weekend",
    "The 12 Week Work Year"
  ),
  authors = c(
    "Daniel Priestly and Mike Reid",
    "Noah Kagan and Tahl Raz",
    "Brian P. Morgan and Michael Lennington"
  ),
  copyright = c(2014, 2024, 2013),
  page_count = c(201, 218, 190),
  publisher = c(
    "Rethink Press",
    "Penguin Random House LLC",
    "Wiley"
  )
)

# Save the records as pretty-printed JSON
write_json(books_data, json_path, pretty = TRUE)

# Load the file again, simplifying the parsed records back into a data frame
json_data <- read_json(json_path, simplifyVector = TRUE)
print(json_data)

##   id                   title                                authors copyright
## 1  1 Key Person of Influence          Daniel Priestly and Mike Reid      2014
## 2  2  Million Dollar Weekend                Noah Kagan and Tahl Raz      2024
## 3  3   The 12 Week Work Year Brian P. Morgan and Michael Lennington      2013
##   page_count                publisher
## 1        201            Rethink Press
## 2        218 Penguin Random House LLC
## 3        190                    Wiley

My analysis of loading the .xml, .html, and .json file formats into R

The HTML file was the easiest to set up and read into R. In contrast, the XML and JSON files were a bit more challenging to configure and import. When examining the outputs, the HTML file appeared the most organized—it displayed data neatly in rows and columns similar to a table. Meanwhile, the XML and JSON outputs required more space and printed their content across multiple lines, making them less straightforward to read compared to the HTML output.