Overview

This R Markdown document reads three hand-written source files describing favorite books:

books.html (HTML table)
books.xml (XML document)
books.json (JSON array)

It loads each into its own data frame, then standardizes columns so we can compare whether the data frames are identical.

library(tidyverse)
library(rvest)     # parse HTML
library(xml2)      # parse XML
library(jsonlite)  # parse JSON
library(knitr)

# Standardize an authors field into one comma-separated string per record.
#
# Args:
#   x: The authors column as parsed from a source file. Handled shapes:
#      - list: each element holds one record's authors; it is flattened
#        with unlist() and joined with ", ".
#      - matrix: one record per row; each row is joined with ", ".
#        NOTE(review): this is the shape jsonlite is expected to produce
#        when every record has the same number of authors -- confirm
#        against the actual JSON if that case matters.
#      - character vector: returned unchanged.
#      - anything else: coerced with as.character().
#
# Returns:
#   A character vector with one element per record (for list and matrix
#   input), or the input itself / its as.character() coercion otherwise.
collapse_authors <- function(x) {
  if (is.list(x)) {
    # vapply() pins the result type: an empty list yields character(0),
    # whereas sapply() would return an empty list.
    vapply(x, function(v) paste(unlist(v), collapse = ", "), character(1))
  } else if (is.matrix(x)) {
    vapply(
      seq_len(nrow(x)),
      function(i) paste(x[i, ], collapse = ", "),
      character(1)
    )
  } else if (is.character(x)) {
    x
  } else {
    as.character(x)
  }
}

Read HTML

# Read the book table from the HTML file into `html_df`.
html_path <- "books.html"
doc <- read_html(html_path)

# html_table() is called on the whole document and its result is indexed
# as a list; the book data is taken from the first entry.
html_tables <- html_table(doc, header = TRUE)
if (length(html_tables) == 0) {
  stop("No <table> found in ", html_path, call. = FALSE)
}

# Fail early, with a readable message, if the table header does not carry
# the column names the rest of the report relies on.
html_cols <- c("Title", "Authors", "Year", "Pages", "Subject", "ISBN")
html_missing <- setdiff(html_cols, names(html_tables[[1]]))
if (length(html_missing) > 0) {
  stop(
    "Columns missing from ", html_path, ": ",
    paste(html_missing, collapse = ", "),
    call. = FALSE
  )
}

html_df <- html_tables[[1]] %>%
  as_tibble() %>%
  mutate(
    # Coerce explicitly so the types match the XML and JSON data frames.
    Year = as.integer(Year),
    Pages = as.integer(Pages),
    Authors = as.character(Authors)
  )

kable(html_df, caption = "Data from HTML")

Data from HTML
Title	Authors	Year	Pages	Subject	ISBN
Forecasting: Principles and Practice (3rd ed.)	Rob J. Hyndman, George Athanasopoulos	2021	412	Time Series / Forecasting	978-0987507136
Hands-On Programming with R	Garrett Grolemund	2014	250	R Programming	978-1449359010
Deep Learning with Python (2nd ed.)	François Chollet	2021	504	Deep Learning	978-1617296864

Read XML

# Read the XML file and build `xml_df` from its <book> nodes.
xml_path <- "books.xml"
x <- read_xml(xml_path)

book_nodes <- xml_find_all(x, "//book")

xml_df <- tibble(
  Title   = xml_text(xml_find_first(book_nodes, "title")),
  # A book may list several <author> children; join them into one string.
  # vapply() keeps this a character column even when no <book> nodes match,
  # where sapply() would return an empty list.
  Authors = vapply(
    book_nodes,
    function(node) {
      paste(xml_text(xml_find_all(node, "authors/author")), collapse = ", ")
    },
    character(1)
  ),
  Year    = as.integer(xml_text(xml_find_first(book_nodes, "year"))),
  Pages   = as.integer(xml_text(xml_find_first(book_nodes, "pages"))),
  Subject = xml_text(xml_find_first(book_nodes, "subject")),
  ISBN    = xml_text(xml_find_first(book_nodes, "isbn"))
)

kable(xml_df, caption = "Data from XML")

Data from XML
Title	Authors	Year	Pages	Subject	ISBN
Forecasting: Principles and Practice (3rd ed.)	Rob J. Hyndman, George Athanasopoulos	2021	412	Time Series / Forecasting	978-0987507136
Hands-On Programming with R	Garrett Grolemund	2014	250	R Programming	978-1449359010
Deep Learning with Python (2nd ed.)	François Chollet	2021	504	Deep Learning	978-1617296864

Read JSON

# Read the JSON file and build `json_df` from its fields.
json_path <- "books.json"
j <- fromJSON(json_path, flatten = TRUE)

# A missing or misspelled key would make the extraction below return NULL,
# and tibble() silently drops NULL columns. Fail early with a clear message.
json_fields <- c("title", "authors", "year", "pages", "subject", "isbn")
json_missing <- setdiff(json_fields, names(j))
if (length(json_missing) > 0) {
  stop(
    "Fields missing from ", json_path, ": ",
    paste(json_missing, collapse = ", "),
    call. = FALSE
  )
}

# `[[` matches names exactly; `$` would also accept partial matches.
json_df <- tibble(
  Title   = j[["title"]],
  Authors = collapse_authors(j[["authors"]]),
  Year    = as.integer(j[["year"]]),
  Pages   = as.integer(j[["pages"]]),
  Subject = j[["subject"]],
  ISBN    = j[["isbn"]]
)

kable(json_df, caption = "Data from JSON")

Data from JSON
Title	Authors	Year	Pages	Subject	ISBN
Forecasting: Principles and Practice (3rd ed.)	Rob J. Hyndman, George Athanasopoulos	2021	412	Time Series / Forecasting	978-0987507136
Hands-On Programming with R	Garrett Grolemund	2014	250	R Programming	978-1449359010
Deep Learning with Python (2nd ed.)	François Chollet	2021	504	Deep Learning	978-1617296864

Standardize & Compare

We sort rows by Title and ensure identical column order and types before comparison.

# Bring a book data frame into the canonical shape used for comparison.
#
# Args:
#   df: A data frame containing the columns Title, Authors, Year, Pages,
#       Subject and ISBN.
#
# Returns:
#   `df` reduced to those six columns in that order, with rows sorted by
#   Title, text columns coerced to character and Year/Pages to integer.
std <- function(df) {
  text_cols <- c("Title", "Authors", "Subject", "ISBN")
  count_cols <- c("Year", "Pages")

  # Select and sort first; the type coercion is applied to the sorted rows.
  picked <- select(df, Title, Authors, Year, Pages, Subject, ISBN)
  sorted <- arrange(picked, Title)

  mutate(
    sorted,
    across(all_of(text_cols), as.character),
    across(all_of(count_cols), as.integer)
  )
}

# Put all three data frames through the same standardization step.
html_std <- html_df %>% std()
xml_std  <- xml_df %>% std()
json_std <- json_df %>% std()

# Pairwise exact comparison of the standardized data frames.
ident_html_xml  <- html_std %>% identical(xml_std)
ident_html_json <- html_std %>% identical(json_std)
ident_xml_json  <- xml_std %>% identical(json_std)

# Print the HTML-vs-XML result.
cat("Are HTML and XML identical (after standardization)? ", ident_html_xml, "\n")

## Are HTML and XML identical (after standardization)?  TRUE

# Print the result of the HTML-vs-JSON comparison.
cat("Are HTML and JSON identical (after standardization)?", ident_html_json, "\n")

## Are HTML and JSON identical (after standardization)? TRUE

# Print the result of the XML-vs-JSON comparison.
cat("Are XML and JSON identical (after standardization)? ", ident_xml_json, "\n")

## Are XML and JSON identical (after standardization)?  TRUE

Working with XML and JSON in R

Sabina Baraili

2025-10-08

Overview

Read HTML

Read XML

Read JSON

Standardize & Compare