Overview

This R Markdown document reads three hand-written source files about favorite books: `books.html`, `books.xml`, and `books.json`.

It loads each into its own data frame, then standardizes columns so we can compare whether the data frames are identical.

library(tidyverse)
library(rvest)     # parse HTML
library(xml2)      # parse XML
library(jsonlite)  # parse JSON
library(knitr)

# Helper to standardize authors (lists/arrays) into comma-separated strings.
#
# x: a list (one element per book, each holding that book's author names),
#    a character vector, or any other atomic vector.
# Returns a character vector:
#   - list input: one string per element, names joined with ", ";
#   - character input: returned unchanged;
#   - anything else: coerced with as.character().
collapse_authors <- function(x) {
  if (is.list(x)) {
    # vapply() guarantees a character vector even for an empty list, where
    # sapply() would return list() and silently change the column type.
    vapply(
      x,
      function(v) paste(unlist(v), collapse = ", "),
      character(1)
    )
  } else if (is.character(x)) {
    x
  } else {
    as.character(x)
  }
}

Read HTML

# Read the first <table> of the HTML file into a tibble.
html_path <- "books.html"
doc <- read_html(html_path)

# html_table() on a whole document yields one data frame per <table>;
# fail with a clear message instead of "subscript out of bounds".
html_tables <- html_table(doc, header = TRUE)
if (length(html_tables) == 0) {
  stop("No <table> element found in ", html_path, call. = FALSE)
}

# Explicitly verify the expected columns (the previous self-rename
# `Title = Title, ...` only served as an implicit existence check).
html_required <- c("Title", "Authors", "Year", "Pages", "Subject", "ISBN")
html_missing <- setdiff(html_required, names(html_tables[[1]]))
if (length(html_missing) > 0) {
  stop(
    "Missing column(s) in ", html_path, ": ",
    paste(html_missing, collapse = ", "),
    call. = FALSE
  )
}

html_df <- html_tables[[1]] %>%
  as_tibble() %>%
  mutate(
    Year = as.integer(Year),
    Pages = as.integer(Pages),
    Authors = as.character(Authors)
  )

kable(html_df, caption = "Data from HTML")
Data from HTML
Title Authors Year Pages Subject ISBN
Forecasting: Principles and Practice (3rd ed.) Rob J. Hyndman, George Athanasopoulos 2021 412 Time Series / Forecasting 978-0987507136
Hands-On Programming with R Garrett Grolemund 2014 250 R Programming 978-1449359010
Deep Learning with Python (2nd ed.) François Chollet 2021 504 Deep Learning 978-1617296864

Read XML

# Read the XML file and build one row per <book> node.
xml_path <- "books.xml"
x <- read_xml(xml_path)

book_nodes <- xml_find_all(x, "//book")

xml_df <- tibble(
  Title   = xml_text(xml_find_first(book_nodes, "title")),
  # A book may have several <author> children; join them with ", ".
  # vapply() keeps this a character vector even when no <book> nodes match
  # (sapply() would return list() and turn Authors into a list column).
  Authors = vapply(
    book_nodes,
    function(node) {
      paste(xml_text(xml_find_all(node, "authors/author")), collapse = ", ")
    },
    character(1)
  ),
  Year    = as.integer(xml_text(xml_find_first(book_nodes, "year"))),
  Pages   = as.integer(xml_text(xml_find_first(book_nodes, "pages"))),
  Subject = xml_text(xml_find_first(book_nodes, "subject")),
  ISBN    = xml_text(xml_find_first(book_nodes, "isbn"))
)

kable(xml_df, caption = "Data from XML")
Data from XML
Title Authors Year Pages Subject ISBN
Forecasting: Principles and Practice (3rd ed.) Rob J. Hyndman, George Athanasopoulos 2021 412 Time Series / Forecasting 978-0987507136
Hands-On Programming with R Garrett Grolemund 2014 250 R Programming 978-1449359010
Deep Learning with Python (2nd ed.) François Chollet 2021 504 Deep Learning 978-1617296864

Read JSON

# Parse the JSON file, then pull out each field before assembling the tibble.
json_path <- "books.json"
j <- fromJSON(txt = json_path, flatten = TRUE)

# Extract and coerce the individual fields up front.
json_titles   <- j$title
json_authors  <- collapse_authors(j$authors)
json_years    <- as.integer(j$year)
json_pages    <- as.integer(j$pages)
json_subjects <- j$subject
json_isbns    <- j$isbn

# Assemble the columns under the same names used for the other sources.
json_df <- tibble(
  Title = json_titles,
  Authors = json_authors,
  Year = json_years,
  Pages = json_pages,
  Subject = json_subjects,
  ISBN = json_isbns
)

kable(x = json_df, caption = "Data from JSON")
Data from JSON
Title Authors Year Pages Subject ISBN
Forecasting: Principles and Practice (3rd ed.) Rob J. Hyndman, George Athanasopoulos 2021 412 Time Series / Forecasting 978-0987507136
Hands-On Programming with R Garrett Grolemund 2014 250 R Programming 978-1449359010
Deep Learning with Python (2nd ed.) François Chollet 2021 504 Deep Learning 978-1617296864

Standardize & Compare

We sort rows by Title and ensure identical column order and types before comparison.

# Standardize a books data frame so that data frames from different sources
# can be compared with identical().
#
# df: a data frame containing at least the columns Title, Authors, Year,
#     Pages, Subject and ISBN.
# Returns df restricted to those six columns (in that order), with character
# and integer types enforced, and rows sorted by Title.
std <- function(df) {
  df %>%
    select(Title, Authors, Year, Pages, Subject, ISBN) %>%
    mutate(
      Title = as.character(Title),
      Authors = as.character(Authors),
      Year = as.integer(Year),
      Pages = as.integer(Pages),
      Subject = as.character(Subject),
      ISBN = as.character(ISBN)
    ) %>%
    # Sort only after coercion: a factor Title would otherwise be ordered by
    # its levels rather than its text, giving source-dependent row order.
    arrange(Title)
}

# Bring each source's data frame into the common standardized form.
html_std <- html_df %>% std()
xml_std  <- xml_df %>% std()
json_std <- json_df %>% std()

# Pairwise exact-equality checks between the standardized data frames.
ident_html_xml  <- identical(x = html_std, y = xml_std)
ident_html_json <- identical(x = html_std, y = json_std)
ident_xml_json  <- identical(x = xml_std, y = json_std)

# Report each comparison; the `##` lines are the rendered console output.
cat("Are HTML and XML identical (after standardization)? ", ident_html_xml, "\n", sep = " ")
## Are HTML and XML identical (after standardization)?  TRUE
cat("Are HTML and JSON identical (after standardization)?", ident_html_json, "\n", sep = " ")
## Are HTML and JSON identical (after standardization)? TRUE
cat("Are XML and JSON identical (after standardization)? ", ident_xml_json, "\n", sep = " ")
## Are XML and JSON identical (after standardization)?  TRUE