book

basePath <- "."

# 1) HTML -> dfHtml
# Parse every <table> in books.html; the first table is used as the book list.
htmlPath <- file.path(basePath, "books.html")
# `fill` is intentionally not passed: it is deprecated since rvest 1.0.0,
# where short rows are always padded with NA, so supplying it only triggers a
# deprecation warning.
htmlTabs <- read_html(htmlPath) |>
  html_table(header = TRUE, trim = TRUE)

# Fail with a readable message instead of "subscript out of bounds" when the
# page contains no table at all.
if (length(htmlTabs) == 0L) {
  stop("No <table> found in ", htmlPath, call. = FALSE)
}

# Lower-case the column names so they match the names normalizeDf() expects,
# and normalise the author separator to exactly "; ".
dfHtml <- htmlTabs[[1]] |>
  dplyr::rename_with(tolower) |>
  mutate(authors = stringr::str_replace_all(authors, "\\s*;\\s*", "; "))

# 2) XML -> dfXml
xmlPath <- file.path(basePath, "books.xml")
xmlDoc  <- read_xml(xmlPath)
xmlBooks <- xml_find_all(xmlDoc, "//book")

# One row per <book>. xml_find_first() yields exactly one result per book (a
# missing node, read back as NA, when the child element is absent), so every
# column stays aligned with its book. xml_find_all() on the whole node set
# would drop absent children and shift later values onto the wrong rows, or
# make tibble() fail on a length mismatch.
dfXml <- tibble(
  title = xml_text(xml_find_first(xmlBooks, "./title")),
  # A book may have several <author> children; join them with "; ".
  authors = purrr::map_chr(xmlBooks, \(book) {
    xml_find_all(book, "./authors/author") |>
      xml_text() |>
      paste(collapse = "; ")
  }),
  year = as.integer(xml_text(xml_find_first(xmlBooks, "./year"))),
  publisher = xml_text(xml_find_first(xmlBooks, "./publisher")),
  isbn = xml_text(xml_find_first(xmlBooks, "./isbn")),
  # Single `subject` attribute read from the document node and recycled to
  # every row; presumably it is set on the root element - verify against
  # books.xml.
  subject = xml_attr(xmlDoc, "subject")
)

# 3) JSON -> dfJson
# Read books.json into a tibble, then collapse each element of the `authors`
# column into one "; "-separated string.
jsonPath <- file.path(basePath, "books.json")
dfJson <- mutate(
  tibble::as_tibble(jsonlite::fromJSON(jsonPath, flatten = TRUE)),
  authors = purrr::map_chr(
    authors,
    \(bookAuthors) paste(bookAuthors, collapse = "; ")
  )
)

# Bring a book data frame into a canonical form for comparison:
#   - title, authors, publisher, isbn and subject are coerced to character,
#   - year is coerced to integer,
#   - only those six columns are kept, in a fixed order,
#   - rows are sorted by title.
# `df` must contain all six columns. Returns the normalized data frame.
normalizeDf <- function(df) {
  typed <- mutate(
    df,
    across(c(title, authors, publisher, isbn, subject), as.character),
    year = as.integer(year)
  )
  ordered <- select(typed, title, authors, year, publisher, isbn, subject)
  arrange(ordered, title)
}

# Normalize each source's data frame with normalizeDf() (same column types,
# same column order, rows sorted by title) so they can be compared directly.
dfHtmlN <- normalizeDf(dfHtml)
dfXmlN  <- normalizeDf(dfXml)
dfJsonN <- normalizeDf(dfJson)

# Pairwise exact-equality checks between the three normalized data frames.
# The `## [1] ...` lines below appear to be the recorded console output of
# each call.
identical(dfHtmlN, dfXmlN)

## [1] TRUE

identical(dfHtmlN, dfJsonN)

## [1] TRUE

identical(dfXmlN, dfJsonN)

## [1] TRUE

All three pairwise `identical()` comparisons returned `TRUE`: after being normalized in R, the three data sources—HTML, XML, and JSON—contained exactly the same information about the selected books.

book

2025-10-11