Working with XML and JSON in R

#1: Generate the html data frame

# Load the required packages (install them first if they are missing)
library(XML)
library(xml2)
library(jsonlite)
library(rvest)
library(httr)

# Location of the HTML page that holds the books table.
books_html <- "https://raw.githubusercontent.com/hawa1983/WK7Assignment/main/books.html"

# Download and parse the page, pull out every <table> it contains, and keep
# the first one as the books data frame.
html_df <- html_table(read_html(books_html), fill = TRUE)[[1]]

# Treat empty cells as missing values.
html_df[html_df == ""] <- NA

html_df

## # A tibble: 3 × 5
##   Title                Author_1      Author_2        Attribute_1 Attribute_2    
##   <chr>                <chr>         <chr>           <chr>       <chr>          
## 1 The Story of Art     E.H. Gombrich <NA>            Classic     Comprehensive  
## 2 Ways of Seeing       John Berger   <NA>            Iconic      Thought-Provok…
## 3 Art Through the Ages Helen Gardner Fred S. Kleiner In-depth    Educational

#2: Generate the xml data frame

# Load the required packages (install them first if they are missing)
library(XML)
library(jsonlite)
library(rvest)
library(httr)


# Fetch the XML content. The file can also be viewed directly in a browser at the URL below.
books_xml <- GET(url = "https://raw.githubusercontent.com/hawa1983/WK7Assignment/main/books.xml")

# Stop right away unless the HTTP request succeeded; otherwise parse the
# response body (decoded as UTF-8 text) into an XML document.
if (http_status(books_xml)$category != "Success") {
  stop("Failed to fetch the XML from GitHub.")
}
FBooksXml1 <- xmlParse(content(books_xml, as = "text", encoding = "UTF-8"))

# Collect every <book> node found anywhere in the document.
FBooksXml2 <- getNodeSet(FBooksXml1, "//book")

# Convert a single <book> node into a one-row data frame.
#
# Arguments:
#   book  An XML node for one book, e.g. an element of the node set returned
#         by getNodeSet(FBooksXml1, "//book").
#
# Returns:
#   A one-row data.frame with the character columns title, author_1,
#   author_2, author_3, attribute_1, attribute_2 and attribute_3. A field
#   whose child element is absent from the node is NA.
parseBook <- function(book) {
  fields <- c(
    "title",
    "author_1", "author_2", "author_3",
    "attribute_1", "attribute_2", "attribute_3"
  )

  # Text of the first direct child named `field`, or a character NA when the
  # node has no such child. NA_character_ (rather than the logical NA) keeps
  # every column of type character even when a field is missing from all
  # books, so the column types do not depend on the data.
  first_value <- function(field) {
    matches <- xpathSApply(book, paste0("./", field), xmlValue)
    if (length(matches) > 0) {
      as.character(matches[[1]])
    } else {
      NA_character_
    }
  }

  values <- vapply(fields, first_value, character(1))

  # as.list() turns the named character vector into one column per field.
  as.data.frame(as.list(values), stringsAsFactors = FALSE)
}

# Turn each <book> node into a one-row data frame, then stack the rows into
# a single data frame.
FBooksXml3 <- lapply(X = FBooksXml2, FUN = parseBook)
FBooksXml4 <- do.call(what = rbind, args = FBooksXml3)

# Treat empty strings as missing values, then display the result.
FBooksXml4[FBooksXml4 == ""] <- NA
FBooksXml4

##                  title      author_1        author_2 author_3 attribute_1
## 1     The Story of Art E.H. Gombrich            <NA>       NA     Classic
## 2       Ways of Seeing   John Berger            <NA>       NA      Iconic
## 3 Art Through the Ages Helen Gardner Fred S. Kleiner       NA    In-depth
##         attribute_2 attribute_3
## 1     Comprehensive          NA
## 2 Thought-Provoking          NA
## 3       Educational          NA

#3: Generate the json data frame

# Load the required packages (install them first if they are missing)
library(XML)
library(jsonlite)
library(rvest)
library(httr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ purrr::flatten()        masks jsonlite::flatten()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)

# Load data from JSON

# Load JSON data into an R data frame
# Download and parse the JSON document, then coerce the parsed result to a
# data frame.
json_data <- fromJSON(txt = "https://raw.githubusercontent.com/hawa1983/WK7Assignment/main/books.json")
json_df <- as.data.frame(x = json_data)

# Treat empty strings as missing values.
json_df[json_df == ""] <- NA

json_df

##            books.title books.author_1  books.author_2 books.attribute_1
## 1     The Story of Art  E.H. Gombrich            <NA>           Classic
## 2       Ways of Seeing    John Berger            <NA>            Iconic
## 3 Art Through the Ages  Helen Gardner Fred S. Kleiner          In-depth
##   books.attribute_2
## 1     Comprehensive
## 2 Thought-Provoking
## 3       Educational

#4: The raw data and data types in the data frames are the same, but the data frames are not identical (they differ in class, column names, and number of columns)

# Check if the data frames are identical.
# identical() is an exact comparison, so each pair below is tested as whole
# objects rather than cell by cell.

# HTML-derived data frame vs. XML-derived data frame
identical(html_df, FBooksXml4)

## [1] FALSE

# HTML-derived data frame vs. JSON-derived data frame
identical(html_df, json_df)

## [1] FALSE

# XML-derived data frame vs. JSON-derived data frame
identical(FBooksXml4, json_df)

## [1] FALSE

Working with XML and JSON in R

Fomba Kassoh

2023-10-21