# Public raw GitHub URLs.
# All three data files live under the same repository path, so the shared
# prefix is written once and each URL is built from it; the resulting values
# are the same full URLs as before.
repo_base_url <- "https://raw.githubusercontent.com/klkid969/DATA607_Week7/refs/heads/main/"
html_url <- paste0(repo_base_url, "books.html")
xml_url  <- paste0(repo_base_url, "books.xml")
json_url <- paste0(repo_base_url, "books.json")
# ---- 1) Load HTML table (from URL) ----
# Parse the page, extract its tables as a list, and keep the first table as a
# base data frame.
html_doc   <- read_html(html_url)
html_list  <- html_doc |> html_table(fill = TRUE)
html_books <- html_list[[1]] |> as.data.frame()
# ---- 2) Load XML (collapse multiple authors) ----
xml_doc    <- read_xml(xml_url)
book_nodes <- xml_find_all(xml_doc, ".//book")

# Convert a single <book> node into a one-row data frame. Every <author>
# found under <authors> is joined into one "; "-separated string, so each
# book stays on exactly one row.
book_to_row <- function(b) {
  author_names <- xml_text(xml_find_all(b, "./authors/author"))

  data.frame(
    Title = xml_text(xml_find_first(b, "./title")),
    Authors = paste(author_names, collapse = "; "),
    Year = as.integer(xml_text(xml_find_first(b, "./year"))),
    Genre = xml_text(xml_find_first(b, "./genre")),
    Adapted_to_Film = xml_text(xml_find_first(b, "./adapted_to_film")),
    stringsAsFactors = FALSE
  )
}

# One row per book, stacked into a single table.
xml_books <- bind_rows(lapply(book_nodes, book_to_row))
# ---- 3) Load JSON ----
# Download and parse the JSON document (flatten = TRUE is passed through to
# jsonlite::fromJSON).
json_books_list <- fromJSON(json_url, flatten = TRUE)

# Rebuild the same five columns used for the HTML and XML tables.
# Each book's authors are collapsed into one "; "-separated string.
# vapply() is used instead of sapply() so the result is always a character
# vector with exactly one string per book; USE.NAMES = FALSE keeps it unnamed
# even if the authors field arrives as a plain character vector, which
# sapply() would otherwise label with its own values.
json_books <- data.frame(
  Title = json_books_list$title,
  Authors = vapply(
    json_books_list$authors,
    function(x) paste(x, collapse = "; "),
    character(1),
    USE.NAMES = FALSE
  ),
  Year = as.integer(json_books_list$year),
  Genre = json_books_list$genre,
  Adapted_to_Film = json_books_list$adapted_to_film,
  stringsAsFactors = FALSE
)
# ---- 4) Make columns consistent ----
# The HTML table is relabelled purely by position, so check the column count
# first: with extra columns the assignment would quietly leave NA names
# behind, and the mismatch would only surface later in the comparison.
book_columns <- c("Title", "Authors", "Year", "Genre", "Adapted_to_Film")
if (ncol(html_books) != length(book_columns)) {
  stop(
    "Expected ", length(book_columns), " columns in the HTML table, found ",
    ncol(html_books), ".",
    call. = FALSE
  )
}
names(html_books) <- book_columns
# Match the integer Year column produced for the XML and JSON tables.
html_books$Year   <- as.integer(html_books$Year)

# ---- 5) Compare (no native |> pipes) ----
# Bring a books table into a canonical form so that tables built from
# different sources can be compared with identical().
#
# df: a data frame with Title, Authors, Year, Genre and Adapted_to_Film
#   columns.
# Returns: df with Year coerced to integer, the other four columns coerced to
#   character, and rows sorted by Title.
normalize_df <- function(df) {
  # Written as plain nested calls so the function body matches the
  # "no native pipes" note in the section header.
  typed <- mutate(
    df,
    Title = as.character(Title),
    Authors = as.character(Authors),
    Year = as.integer(Year),
    Genre = as.character(Genre),
    Adapted_to_Film = as.character(Adapted_to_Film)
  )
  # A fixed row order makes the comparison independent of source ordering.
  arrange(typed, Title)
}

# Run each source table through the same normalisation step.
h <- normalize_df(html_books)
x <- normalize_df(xml_books)
j <- normalize_df(json_books)

# Exact pairwise comparisons; each result is printed on its own line.
html_eq_xml <- identical(h, x)
cat("HTML vs XML identical? ", html_eq_xml, "\n")
## HTML vs XML identical?  TRUE
xml_eq_json <- identical(x, j)
cat("XML vs JSON identical? ", xml_eq_json, "\n")
## XML vs JSON identical?  TRUE
html_eq_json <- identical(h, j)
cat("HTML vs JSON identical? ", html_eq_json, "\n")
## HTML vs JSON identical?  TRUE
# Display the final tidy table: the normalised HTML copy, auto-printed.
# The lines starting with "##" below are the recorded output of that print.
h
##               Title                 Authors Year                   Genre
## 1 A Game of Thrones     George R. R. Martin 1996    Fantasy / Epic Drama
## 2              Dune           Frank Herbert 1965      Sci-Fi / Adventure
## 3       The Martian Andy Weir; Ridley Scott 2011 Sci-Fi / Space Survival
##   Adapted_to_Film
## 1             Yes
## 2             Yes
## 3             Yes
# ---- 6) Optional: view or save ----
# Summarise the three pairwise checks. When any pair differs, show the first
# rows of each table for manual review.
all_match <- identical(h, x) && identical(x, j) && identical(h, j)
if (all_match) {
  cat("✅ All three formats match perfectly!\n")
} else {
  cat("Differences detected. Here are head()s for review:\n")
  print(head(h))
  print(head(x))
  print(head(j))
}
## ✅ All three formats match perfectly!
# Save the normalised HTML table as CSV, without a row-name column.
write.csv(h, "books_tidy.csv", row.names = FALSE)

Reflection

As I worked on creating the HTML, XML, and JSON files by hand, I began to notice how similar HTML and XML appear at first glance—they both use tags and a structured layout.
The most significant difference I noticed is that HTML is designed to be displayed in a browser for people to read, while XML is built for storing and exchanging data between systems.
The declaration at the top of the HTML and XML files (such as a doctype in HTML or an XML declaration) tells the program how to handle the content, which is why an HTML file opens as a rendered page in a browser, while an XML file is displayed as raw data.
JSON, on the other hand, was much easier for me to understand because it’s cleaner, more compact, and feels closer to how modern APIs share data.

I know this is nothing new for most people, but for me—someone who only sees these formats in class and doesn’t work with them regularly—it was very eye-opening to see how they connect to the real world.
Creating each file by hand really helped me see how the same information can be formatted differently depending on who—or what—is supposed to read it.