Week 7 HTML and JSON

library(rvest)
library(jsonlite)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
html_url <- "https://raw.githubusercontent.com/Jeovany97/Data-607/refs/heads/main/Assignment%206/books.html"
json_url <- "https://raw.githubusercontent.com/Jeovany97/Data-607/refs/heads/main/Assignment%206/books.json"
# Load html data into dataframe
html_raw <- read_html(html_url)
html_df <- html_raw %>% 
  html_table() %>%
  .[[1]]

# LOad Json data into dataframe
json_df <- fromJSON(json_url)


# Normalizing the two dataframes since JSON store them as a string and HTML use tables
#Converting JSON authors to a comma separated string to match HTML
json_df_normalized <- json_df %>%
  mutate(authors = sapply(authors, function(x) paste(x, collapse = ", ")))

# Comparing the two data frames
are_identical <- all.equal(html_df, json_df_normalized)

if (!isTRUE(are_identical)) {
  print("Differences found:")
  print(are_identical)
}
[1] "Differences found:"
[1] "Names: 5 string mismatches"                                                            
[2] "Attributes: < Component \"class\": Lengths (3, 1) differ (string compare on first 1) >"
[3] "Attributes: < Component \"class\": 1 string mismatch >"