#1: Generate the html data frame
# Load the required packages
library(XML)
library(xml2)
library(jsonlite)
library(rvest)
library(httr)
# Address of the HTML page that contains the books table.
books_html <- "https://raw.githubusercontent.com/hawa1983/WK7Assignment/main/books.html"
# Parse the page, convert its tables to data frames, and keep the first one.
html_df <- html_table(read_html(books_html), fill = TRUE)[[1]]
# Replace empty strings with NA.
html_df <- replace(html_df, html_df == "", NA)
html_df
## # A tibble: 3 × 5
## Title Author_1 Author_2 Attribute_1 Attribute_2
## <chr> <chr> <chr> <chr> <chr>
## 1 The Story of Art E.H. Gombrich <NA> Classic Comprehensive
## 2 Ways of Seeing John Berger <NA> Iconic Thought-Provok…
## 3 Art Through the Ages Helen Gardner Fred S. Kleiner In-depth Educational
#2: Generate the xml data frame
# Load the required packages
library(XML)
library(jsonlite)
library(rvest)
library(httr)
# Download the XML document; the same file can also be opened directly in a
# browser using the URL below.
books_xml <- GET("https://raw.githubusercontent.com/hawa1983/WK7Assignment/main/books.xml")
# Stop unless the request succeeded; otherwise parse the response text as XML.
if (http_status(books_xml)$category != "Success") {
  stop("Failed to fetch the XML from GitHub.")
}
FBooksXml1 <- xmlParse(content(books_xml, as = "text", encoding = "UTF-8"))
# Collect every <book> node in the parsed document.
FBooksXml2 <- getNodeSet(FBooksXml1, "//book")
# Convert one <book> node into a one-row data frame.
#
# Arguments:
#   book  An XML node; its direct children named title, author_1..author_3 and
#         attribute_1..attribute_3 are read with xpathSApply()/xmlValue().
#
# Returns:
#   A data frame with one row and seven character columns (title, author_1,
#   author_2, author_3, attribute_1, attribute_2, attribute_3). A column is
#   NA_character_ when the book has no child element of that name, so every
#   column is character even when a field is absent from all books.
parseBook <- function(book) {
  fields <- c(
    "title",
    "author_1", "author_2", "author_3",
    "attribute_1", "attribute_2", "attribute_3"
  )

  # Text of the first direct child called `field`, or a character NA when no
  # such child exists. Only the first match is kept if there are several.
  first_value <- function(field) {
    matches <- xpathSApply(book, paste0("./", field), xmlValue)
    if (length(matches) > 0) {
      as.character(matches[[1L]])
    } else {
      NA_character_
    }
  }

  # vapply() guarantees exactly one string per field and keeps the field names.
  values <- vapply(fields, first_value, character(1))
  as.data.frame(as.list(values), stringsAsFactors = FALSE)
}
# Build one single-row data frame per <book> node, then stack the rows.
FBooksXml3 <- lapply(FBooksXml2, function(node) parseBook(node))
FBooksXml4 <- do.call(rbind, FBooksXml3)
# Replace empty strings with NA.
FBooksXml4 <- replace(FBooksXml4, FBooksXml4 == "", NA)
FBooksXml4
## title author_1 author_2 author_3 attribute_1
## 1 The Story of Art E.H. Gombrich <NA> NA Classic
## 2 Ways of Seeing John Berger <NA> NA Iconic
## 3 Art Through the Ages Helen Gardner Fred S. Kleiner NA In-depth
## attribute_2 attribute_3
## 1 Comprehensive NA
## 2 Thought-Provoking NA
## 3 Educational NA
#3: Generate the json data frame
# Load the required packages
library(XML)
library(jsonlite)
library(rvest)
library(httr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Read the remote JSON document and convert the parsed result to a data frame.
json_data <- fromJSON(txt = "https://raw.githubusercontent.com/hawa1983/WK7Assignment/main/books.json")
json_df <- as.data.frame(json_data)
# Replace empty strings with NA.
json_df <- replace(json_df, json_df == "", NA)
json_df
## books.title books.author_1 books.author_2 books.attribute_1
## 1 The Story of Art E.H. Gombrich <NA> Classic
## 2 Ways of Seeing John Berger <NA> Iconic
## 3 Art Through the Ages Helen Gardner Fred S. Kleiner In-depth
## books.attribute_2
## 1 Comprehensive
## 2 Thought-Provoking
## 3 Educational
#4: The three data frames hold the same book data, but they are not identical.
# identical() requires the objects to match exactly, including class and
# column names. The printed results above show why each comparison is FALSE:
# html_df is a tibble with capitalised column names, the XML data frame has
# additional author_3/attribute_3 columns, and the JSON data frame's column
# names carry a "books." prefix.
# Check if the data frames are identical
identical(html_df, FBooksXml4)
## [1] FALSE
identical(html_df, json_df)
## [1] FALSE
identical(FBooksXml4, json_df)
## [1] FALSE