# Week 7: HTML and JSON
#
# Reads the same book table from an HTML file and from a JSON file, reconciles
# the known differences between them, and checks whether the two resulting
# data frames are identical.

# Load packages ----
library(rvest)    # read_html(), html_element(), html_table()
library(jsonlite) # fromJSON()
library(dplyr)    # mutate(), across(), everything()
# 1. Load HTML into R data frame ----
html_url <- "https://raw.githubusercontent.com/Chingiz1492/html-and-json/refs/heads/main/3booksRH.html"

# Download the page, select the first <table> element, and parse it into a
# data frame, treating the first row as the column headers.
df_html <- html_url |>
  read_html() |>
  html_element("table") |>
  html_table(header = TRUE)

# cat() does not append a newline, so one is included explicitly to keep the
# banner on its own line above the printed table.
cat("=== HTML data frame ===\n")
print(df_html)
# 2. Load JSON into a separate R data frame ----
json_url <- "https://raw.githubusercontent.com/Chingiz1492/html-and-json/refs/heads/main/3booksRH.json"

# fromJSON() accepts a URL directly. The rest of the script treats the result
# as a data frame (assumes the JSON is an array of records -- TODO confirm).
df_json_raw <- fromJSON(json_url)

# cat() does not append a newline, so one is included explicitly.
cat("=== JSON data frame (raw) ===\n")
print(df_json_raw)
# 3. Fix the three differences in JSON ----
# NOTE(review): the heading mentions three differences, but only Fix 2 and
# Fix 3 are present in this script; no "Fix 1" step exists here. Confirm
# whether it was dropped or is handled elsewhere (e.g. by the normalisation
# in step 4).

# Fix 2: Rename "Date of publication" to "Date_of_publication"
names(df_json_raw)[names(df_json_raw) == "Date of publication"] <-
  "Date_of_publication"

# Fix 3: Replace NA with "none" in ISBN_10
missing_isbn <- is.na(df_json_raw$ISBN_10)
df_json_raw$ISBN_10[missing_isbn] <- "none"

df_json <- df_json_raw

# cat() does not append a newline, so one is included explicitly.
cat("=== JSON data frame (fixed) ===\n")
print(df_json)
# 4. Normalize both data frames for comparison ----

# Put the JSON columns in the same order as the HTML columns.
# drop = FALSE keeps a data frame even if only one column is selected.
df_json <- df_json[, names(df_html), drop = FALSE]

# Reset row names so they cannot cause a spurious mismatch.
rownames(df_html) <- NULL
rownames(df_json) <- NULL

# identical() also compares the class attribute. Per the package docs,
# rvest::html_table() returns a tibble while jsonlite::fromJSON() returns a
# base data.frame, so both are coerced to a plain data.frame first; otherwise
# the comparison can never be TRUE even when every value matches.
# Every column is then converted to character so that differences in parsed
# column types (e.g. numeric vs. string) do not count as differences.
df_html_norm <- df_html |>
  as.data.frame() |>
  mutate(across(everything(), as.character))

df_json_norm <- df_json |>
  as.data.frame() |>
  mutate(across(everything(), as.character))
# 5. Compare using identical() ----
result <- identical(df_html_norm, df_json_norm)

if (result) {
  cat("✅ identical() = TRUE: data frames are fully identical\n")
} else {
  cat("❌ identical() = FALSE: data frames still differ\n")

  # Report which individual columns match, to help locate the difference.
  cat("Column-by-column comparison:\n")
  for (col in names(df_html_norm)) {
    col_matches <- identical(df_html_norm[[col]], df_json_norm[[col]])
    # Plain if/else: the condition is a single TRUE/FALSE, not a vector.
    status <- if (col_matches) "✅ match" else "❌ differ"
    cat(" -", col, ":", status, "\n")
  }
}
# 6. Final output ----
# Print both data frames as they stand at the end of the script.
# cat() does not append a newline, so one is included explicitly.
cat("=== FINAL: HTML data frame ===\n")
print(df_html)

cat("=== FINAL: JSON data frame ===\n")
print(df_json)