# Packages used to read the HTML, XML and JSON sources and to tidy the results.
pkgs <- c("rvest", "xml2", "jsonlite", "dplyr", "tibble", "stringr")

# Install only the packages that are not already present in the library.
need <- setdiff(pkgs, rownames(installed.packages()))
if (length(need) > 0) {
  install.packages(need, repos = "https://cloud.r-project.org")
}

# Attach every package. lapply() returns a list that would otherwise be
# auto-printed at top level, hence invisible().
invisible(lapply(pkgs, library, character.only = TRUE))
# Bring a books table into one canonical shape.
#
# df: a data frame with the columns Title, Authors, Year, Publisher and ISBN.
#
# Returns the table with Title, Publisher and ISBN coerced to character,
# Authors whitespace-squished, Year coerced to integer, rows ordered by Title,
# and only those five columns kept, in that order.
normalize_books <- function(df) {
  typed <- dplyr::mutate(
    df,
    Title = as.character(Title),
    Authors = stringr::str_squish(Authors),
    Year = as.integer(Year),
    Publisher = as.character(Publisher),
    ISBN = as.character(ISBN)
  )
  by_title <- dplyr::arrange(typed, Title)
  dplyr::select(by_title, Title, Authors, Year, Publisher, ISBN)
}
# ---- HTML source ----
html_path <- "data/books.html"

# Parse the first <table> in the document, using its first row as the header.
# convert = FALSE keeps every cell as text so ISBNs are not turned into
# numbers (which would drop leading zeros); normalize_books() casts Year.
html_tbl <- read_html(html_path) |>
  html_element("table") |>
  html_table(header = TRUE, trim = TRUE, convert = FALSE)

df_html <- html_tbl |>
  tibble::as_tibble() |>
  normalize_books()
# ---- XML source ----
xml_path <- "data/books.xml"
doc <- read_xml(xml_path)
bk <- xml_find_all(doc, "//book")

# One row per <book>. xml_find_first() returns exactly one result per book
# (a missing node, read as NA, when the child is absent), so the columns stay
# aligned even if a book lacks a field.
df_xml <- tibble(
  Title = xml_text(xml_find_first(bk, "title")),
  # A book may list several authors; join them into a single string.
  # vapply() guarantees a character vector, including for zero books.
  Authors = vapply(
    bk,
    function(n) {
      paste(xml_text(xml_find_all(n, "authors/author")), collapse = ",")
    },
    character(1),
    USE.NAMES = FALSE
  ),
  Year = as.integer(xml_text(xml_find_first(bk, "year"))),
  Publisher = xml_text(xml_find_first(bk, "publisher")),
  ISBN = xml_text(xml_find_first(bk, "isbn"))
) |>
  normalize_books()
# ---- JSON source ----
json_path <- "data/books.json"
j <- fromJSON(json_path, simplifyVector = TRUE)

# Build one row per entry of j$books.
df_json <- tibble(
  Title = j$books$title,
  # Join each entry's authors into a single string. vapply() always returns a
  # character vector, and USE.NAMES = FALSE keeps it free of a names
  # attribute, which would otherwise make identical() comparisons fail.
  Authors = vapply(
    j$books$authors,
    function(x) paste(x, collapse = ","),
    character(1),
    USE.NAMES = FALSE
  ),
  Year = as.integer(j$books$year),
  Publisher = j$books$publisher,
  ISBN = j$books$isbn
) |>
  normalize_books()
# Show the three normalised tables. print() is explicit so the tables are
# also displayed when the script is run through source(), where top-level
# values are not auto-printed.
print(df_html)
print(df_xml)
print(df_json)
# Pairwise exact comparison of the three tables. identical() is strict: it
# is TRUE only when values, column types and attributes all match.
ident_html_xml <- identical(df_html, df_xml)
ident_html_json <- identical(df_html, df_json)
ident_xml_json <- identical(df_xml, df_json)
# Report each comparison on its own line. The trailing "" combined with
# sep = "\n" terminates the last line with a newline.
cat(
  paste("HTML & XML identical?", ident_html_xml),
  paste("HTML & JSON identical?", ident_html_json),
  paste("XML & JSON identical?", ident_xml_json),
  "",
  sep = "\n"
)