Data 607 Assignment 7

Overview

This report loads the same book information from three different file formats (HTML, XML, and JSON) in order to understand the similarities and differences between them. I normalize and clean the data from each source so that I can check whether all three formats preserve the same data.

Load Libraries

library(dplyr)
library(rvest)
library(xml2)
library(jsonlite)

Load Data

# URLs of the three source files (books.html, books.xml, books.json); each is
# read by the matching loader further down.
raw_html <- "https://raw.githubusercontent.com/JDO-MSDS/DATA-607/refs/heads/main/Assignment7/books.html"
raw_xml <- "https://raw.githubusercontent.com/JDO-MSDS/DATA-607/refs/heads/main/Assignment7/books.xml"
raw_json <- "https://raw.githubusercontent.com/JDO-MSDS/DATA-607/refs/heads/main/Assignment7/books.json"

Data Frames

# read html and transform into data frame
doc_html <- read_html(raw_html)
# convert = FALSE keeps every cell as text instead of letting html_table()
# guess column types. ISBNs are identifiers: a numeric guess on an all-digit
# column would drop leading zeros before as.character() could preserve them.
tbl_html <- doc_html %>%
  html_element("table") %>%
  html_table(convert = FALSE)

# normalization for comparison: text columns are trimmed character vectors,
# year is an integer
df_html <- tbl_html %>%
  mutate(
    title = trimws(as.character(title)),
    authors = trimws(as.character(authors)),
    # a year that cannot be parsed becomes NA; the coercion warning is
    # suppressed
    year = suppressWarnings(as.integer(year)),
    publisher = trimws(as.character(publisher)),
    isbn = trimws(as.character(isbn))
  )

df_html

## # A tibble: 3 × 5
##   title                                       authors       year publisher isbn 
##   <chr>                                       <chr>        <int> <chr>     <chr>
## 1 Futebol - Observacao, Analise e Intervencao Antonio Bar…  2022 Prime Bo… 9789…
## 2 Mourinho Anatomy of a Winner                Patrick Bar…  2006 Orion Pu… 9780…
## 3 The Young Soccer Player                     Gary Lineker  1994 Dorling … 978-…

# read xml and transform into data frame
doc_xml <- read_xml(raw_xml)
books_node <- xml_find_all(doc_xml, ".//book")

# Build a one-row data frame from a single <book> node.
book_to_row <- function(node) {
  # Text of the first child of this node that matches `xpath`.
  child_text <- function(xpath) {
    xml_text(xml_find_first(node, xpath))
  }
  # All <author> entries under <authors>, joined into one string below.
  author_names <- xml_text(xml_find_all(node, "./authors/author"))

  data.frame(
    title = child_text("./title"),
    authors = paste(author_names, collapse = "; "),
    year = as.integer(child_text("./year")),
    publisher = child_text("./publisher"),
    isbn = child_text("./isbn"),
    stringsAsFactors = FALSE
  )
}

# One row per book, then stack the rows into a single data frame.
rows <- lapply(books_node, book_to_row)
df_xml <- do.call(rbind, rows)

df_xml

##                                         title                      authors year
## 1 Futebol - Observacao, Analise e Intervencao Antonio Barbosa; Rui Resende 2022
## 2                Mourinho Anatomy of a Winner              Patrick Barclay 2006
## 3                     The Young Soccer Player                 Gary Lineker 1994
##                           publisher           isbn
## 1                       Prime Books  9789896554620
## 2   Orion Publishing Group, Limited  9780752877655
## 3 Dorling Kindersley Publishers Ltd 978-0751351651

# read json and transform into data frame
# simplifyVector = FALSE: work with the parsed JSON as nested lists, one list
# element per book, rather than letting jsonlite simplify it.
j <- fromJSON(raw_json, simplifyVector = FALSE)

# Extract one character field from every book in `j`.
json_chr <- function(field) {
  vapply(j, function(book) book[[field]], character(1))
}

# Join each book's author entries into a single "; "-separated string.
json_authors <- vapply(
  j,
  function(book) paste(book$authors, collapse = "; "),
  character(1)
)

# Year is read as a number first and then coerced to integer.
json_year <- as.integer(vapply(j, function(book) book[["year"]], numeric(1)))

df_json <- data.frame(
  title = json_chr("title"),
  authors = json_authors,
  year = json_year,
  publisher = json_chr("publisher"),
  isbn = json_chr("isbn"),
  stringsAsFactors = FALSE
)

df_json

##                                         title                      authors year
## 1 Futebol - Observacao, Analise e Intervencao Antonio Barbosa; Rui Resende 2022
## 2                Mourinho Anatomy of a Winner              Patrick Barclay 2006
## 3                     The Young Soccer Player                 Gary Lineker 1994
##                           publisher           isbn
## 1                       Prime Books  9789896554620
## 2   Orion Publishing Group, Limited  9780752877655
## 3 Dorling Kindersley Publishers Ltd 978-0751351651

# Columns every source must provide, in the order used for the comparison.
desired_cols <- c("title", "authors", "year", "publisher", "isbn")

# Normalize a books data frame so frames built from different sources can be
# compared directly.
#
# df: a data frame with at least the columns listed in desired_cols.
#
# Returns df restricted to desired_cols (in that order), with the text columns
# as whitespace-trimmed character vectors, year as integer, and rows sorted by
# title and then isbn.
norm_types <- function(df) {
  text_cols <- setdiff(desired_cols, "year")

  df %>%
    mutate(
      # Trim here, not only in the HTML loader, so that every source gets the
      # same treatment and stray padding in the XML or JSON text cannot cause
      # a false mismatch.
      across(all_of(text_cols), function(x) trimws(as.character(x))),
      year = as.integer(year)
    ) %>%
    select(all_of(desired_cols)) %>%
    arrange(title, isbn)
}

df_html_n <- norm_types(df_html)
df_xml_n <- norm_types(df_xml)
df_json_n <- norm_types(df_json)

df_html_n

## # A tibble: 3 × 5
##   title                                       authors       year publisher isbn 
##   <chr>                                       <chr>        <int> <chr>     <chr>
## 1 Futebol - Observacao, Analise e Intervencao Antonio Bar…  2022 Prime Bo… 9789…
## 2 Mourinho Anatomy of a Winner                Patrick Bar…  2006 Orion Pu… 9780…
## 3 The Young Soccer Player                     Gary Lineker  1994 Dorling … 978-…

df_xml_n

##                                         title                      authors year
## 1 Futebol - Observacao, Analise e Intervencao Antonio Barbosa; Rui Resende 2022
## 2                Mourinho Anatomy of a Winner              Patrick Barclay 2006
## 3                     The Young Soccer Player                 Gary Lineker 1994
##                           publisher           isbn
## 1                       Prime Books  9789896554620
## 2   Orion Publishing Group, Limited  9780752877655
## 3 Dorling Kindersley Publishers Ltd 978-0751351651

df_json_n

##                                         title                      authors year
## 1 Futebol - Observacao, Analise e Intervencao Antonio Barbosa; Rui Resende 2022
## 2                Mourinho Anatomy of a Winner              Patrick Barclay 2006
## 3                     The Young Soccer Player                 Gary Lineker 1994
##                           publisher           isbn
## 1                       Prime Books  9789896554620
## 2   Orion Publishing Group, Limited  9780752877655
## 3 Dorling Kindersley Publishers Ltd 978-0751351651

# Compare two normalized frames with all.equal(), passing
# check.attributes = FALSE. The result is TRUE when they match; otherwise
# all.equal() returns a character description of the differences.
compare_frames <- function(target, current) {
  all.equal(target, current, check.attributes = FALSE)
}

cmp_html_xml <- compare_frames(df_html_n, df_xml_n)
cmp_html_json <- compare_frames(df_html_n, df_json_n)
cmp_json_xml <- compare_frames(df_json_n, df_xml_n)

# Report each pairwise result.
cat("html vs xml: ", cmp_html_xml, "\n")

## html vs xml:  TRUE

cat("html vs json: ", cmp_html_json, "\n")

## html vs json:  TRUE

cat("json vs xml: ", cmp_json_xml, "\n")

## json vs xml:  TRUE

Data 607 Assignment 7

Joao De Oliveira

2025-10-08

Overview

Load Libraries

Load Data

Data Frames

Conclusion