# Week 7: HTML and JSON
#
# Author: Chingiz Rabdanov

# Load packages
# One library() call per line: several calls on a single line without a
# separator do not parse.
library(rvest)     # read_html(), html_element(), html_table()
library(jsonlite)  # fromJSON()
library(dplyr)     # mutate(), across(), everything()

# 1. Load HTML into R data frame
# Download the page, select its first <table> element and parse it, treating
# the first row as the column names.
html_url <- "https://raw.githubusercontent.com/Chingiz1492/html-and-json/refs/heads/main/3booksRH.html"

df_html <- html_url |>
  read_html() |>
  html_element("table") |>
  html_table(header = TRUE)

# The trailing "\n" keeps the banner on its own line above the printed table.
cat("\n=== HTML data frame ===\n")
print(df_html)

# 2. Load JSON into a separate R data frame
# fromJSON() accepts a URL directly. The result is kept untouched in
# df_json_raw so it can be printed before any fix-ups are applied.
# NOTE(review): later steps index the result by column, so the JSON is
# assumed to simplify to a data frame (an array of records) -- confirm.
json_url <- "https://raw.githubusercontent.com/Chingiz1492/html-and-json/refs/heads/main/3booksRH.json"

df_json_raw <- fromJSON(json_url)

cat("\n=== JSON data frame (raw) ===\n")
print(df_json_raw)

# 3. Fix the three differences in JSON

# Fix 1: Convert Authors from list to string
# Each element of the Authors column is collapsed into one comma-separated
# string after trimming surrounding whitespace from every name.
# vapply() guarantees a character vector with one string per row (sapply()
# would return a list for empty input), and USE.NAMES = FALSE keeps the
# resulting column free of element names.
df_json_raw$Authors <- vapply(
  df_json_raw$Authors,
  function(x) paste(trimws(x), collapse = ","),
  character(1),
  USE.NAMES = FALSE
)

# Fix 2: Rename "Date of publication" to "Date_of_publication"
# Only the column whose name matches exactly is renamed; if no column
# matches, the names are left unchanged.
is_date_col <- names(df_json_raw) == "Date of publication"
names(df_json_raw)[is_date_col] <- "Date_of_publication"

# Fix 3: Replace NA with "none" in ISBN_10
# Missing ISBN_10 values become the literal string "none". Assigning a
# string coerces the column to character if it was not character already.
df_json_raw$ISBN_10[is.na(df_json_raw$ISBN_10)] <- "none"

# From here on the fixed copy is referred to as df_json.
df_json <- df_json_raw

cat("\n=== JSON data frame (fixed) ===\n")
print(df_json)

# 4. Normalize both data frames for comparison
# Put the JSON columns in the same order as the HTML columns. List-style
# indexing (df[cols]) always returns a data frame, even for a single column.
df_json <- df_json[names(df_html)]

rownames(df_html) <- NULL
rownames(df_json) <- NULL

# Bring a data frame into a canonical form for identical():
#   * every column converted to character, so type guessing done by the
#     HTML/JSON parsers does not matter;
#   * coerced to a plain data.frame -- html_table() returns a tibble in
#     rvest >= 1.0 while fromJSON() returns a base data.frame, and
#     identical() also compares the class attribute;
#   * default row names.
normalize_for_compare <- function(df) {
  out <- df |>
    mutate(across(everything(), as.character)) |>
    as.data.frame()
  rownames(out) <- NULL
  out
}

df_html_norm <- normalize_for_compare(df_html)
df_json_norm <- normalize_for_compare(df_json)

# 5. Compare using identical()
# identical() is strict: values, column names, column order and attributes
# must all agree for the result to be TRUE.
result <- identical(df_html_norm, df_json_norm)

if (result) {
  cat("\n✅ identical() = TRUE: data frames are fully identical\n")
} else {
  cat("\n❌ identical() = FALSE: data frames still differ\n")

  # Report which individual columns agree, to help locate the difference.
  cat("\nColumn-by-column comparison:\n")
  for (col in names(df_html_norm)) {
    # Named col_matches rather than match to avoid masking base::match().
    col_matches <- identical(df_html_norm[[col]], df_json_norm[[col]])
    cat(" -", col, ":", if (col_matches) "✅ match" else "❌ differ", "\n")
  }
}

# 6. Final output
# Print both data frames as they stand after the fixes and the column
# reordering (not the all-character copies used for the comparison).
cat("\n=== FINAL: HTML data frame ===\n")
print(df_html)

cat("\n=== FINAL: JSON data frame ===\n")
print(df_json)