Working with XML and JSON in R

Pick three of your favorite books on one of your favorite subjects. At least one of the books should have more than one author. For each book, include the title, authors, and two or three other attributes that you find interesting. Take the information that you’ve selected about these three books, and separately create three files that store the books’ information in HTML (using an HTML table), XML, and JSON formats (e.g. “books.html”, “books.xml”, and “books.json”).

--------------------------------------------------------------------------------
library(DT)
library(dplyr)
library(htmltools)
library (readr)
library(aws.s3)
library(jsonlite)
library(rvest)
library(XML)
library(lares)

# Configure AWS access for the aws.s3 calls that follow.
# The stored credentials are fetched once instead of calling get_creds() for
# every field; the same `aws.s3` entry supplies the bucket name, the key pair
# and the region.
aws_creds <- get_creds()$`aws.s3`
bucket <- aws_creds$bucket
Sys.setenv(
  "AWS_ACCESS_KEY_ID" = aws_creds$accessKeyId,
  "AWS_SECRET_ACCESS_KEY" = aws_creds$accessKey,
  "AWS_DEFAULT_REGION" = aws_creds$region
)

# The following three books, with their respective authors, are used for this
# assignment.
# CSV
# Read "Books.csv" from the S3 bucket with read.csv and render it as an
# interactive table.
books.csv <- s3read_using(FUN = read.csv, object = "Books.csv", bucket = bucket)

# `pageLength` used to be supplied twice (10 and FALSE); only the numeric value
# is kept so the option list is unambiguous. The search box stays disabled and
# row names stay hidden.
datatable(
  books.csv,
  options = list(pageLength = 10, searching = FALSE, filter = FALSE),
  rownames = FALSE
)

# JSON
# Download "Books.json" as a raw vector, decode it to text and parse it.
books.vector <- get_object(bucket = bucket, object = "Books.json")
books.json <- fromJSON(rawToChar(books.vector))

# The book records are stored under the top-level "Books" element.
dataframe_json <- books.json$Books

# `pageLength` used to be supplied twice (10 and FALSE); only the numeric value
# is kept so the option list is unambiguous.
datatable(
  dataframe_json,
  options = list(pageLength = 10, searching = FALSE, filter = FALSE),
  rownames = FALSE
)

# XML
# Download "Books.xml" as a raw vector, decode it to text and parse it.
books.vector <- get_object(bucket = bucket, object = "Books.xml")
books.xml <- xmlParse(rawToChar(books.vector))

# For every child of the root node, collect the text value of each of its own
# children, then transpose the result.
books.xml <- xmlSApply(xmlRoot(books.xml), function(x) xmlSApply(x, xmlValue))
# NOTE(review): the recorded all.equal() output at the end of this report
# describes this object as a character matrix, not a data.frame.
dataframe_xml <- t(books.xml)

# `pageLength` used to be supplied twice (10 and FALSE); only the numeric value
# is kept so the option list is unambiguous.
datatable(
  dataframe_xml,
  options = list(pageLength = 10, searching = FALSE, filter = FALSE),
  rownames = FALSE
)

# HTML
# Download "Books.html" as a raw vector and decode it to text.
books.vector <- get_object(bucket = bucket, object = "Books.html")
books.vector <- rawToChar(books.vector)

# readHTMLTable() returns a list of tables; keep the first element.
dataframe_html <- readHTMLTable(books.vector)[[1]]

# `pageLength` used to be supplied twice (10 and FALSE); only the numeric value
# is kept so the option list is unambiguous.
datatable(
  dataframe_html,
  options = list(pageLength = 10, searching = FALSE, filter = FALSE),
  rownames = FALSE
)

# Check whether the resulting data frames are identical.
# Compare the JSON-derived table with the XML-derived one. all.equal() returns
# TRUE when the objects match and otherwise a character vector describing the
# differences; the recorded output follows.
all.equal(dataframe_json, dataframe_xml)
## [1] "Modes: list, character"
## [2] "Lengths: 6, 18"
## [3] "names for target but not for current"
## [4] "Attributes: < Names: 2 string mismatches >"
## [5] "Attributes: < Component 1: Modes: character, numeric >"
## [6] "Attributes: < Component 1: Lengths: 1, 2 >"
## [7] "Attributes: < Component 1: target is character, current is numeric >"
## [8] "Attributes: < Component 2: Modes: numeric, list >"
## [9] "Attributes: < Component 2: Lengths: 3, 2 >"
## [10] "Attributes: < Component 2: target is numeric, current is list >"
## [11] "current is not list-like"
# Compare the JSON-derived table with the HTML-derived one. The recorded output
# reports differences in the "Author", "Publisher" and "Year" components.
all.equal(dataframe_json, dataframe_html)
## [1] "Component \"Author\": Modes: list, character"
## [2] "Component \"Author\": Component 2: Lengths (2, 1) differ (string compare on first 1)"
## [3] "Component \"Author\": Component 2: 1 string mismatch"
## [4] "Component \"Author\": Component 3: Lengths (3, 1) differ (string compare on first 1)"
## [5] "Component \"Author\": Component 3: 1 string mismatch"
## [6] "Component \"Publisher\": 1 string mismatch"
## [7] "Component \"Year\": Modes: numeric, character"
## [8] "Component \"Year\": target is numeric, current is character"
# Compare the XML-derived table with the HTML-derived one. The recorded output
# shows the target is a matrix while the current object is a data.frame.
all.equal(dataframe_xml, dataframe_html)
## [1] "Modes: character, list"
## [2] "Lengths: 18, 6"
## [3] "names for current but not for target"
## [4] "Attributes: < Names: 2 string mismatches >"
## [5] "Attributes: < Component 1: Modes: numeric, character >"
## [6] "Attributes: < Component 1: Lengths: 2, 1 >"
## [7] "Attributes: < Component 1: target is numeric, current is character >"
## [8] "Attributes: < Component 2: Modes: list, numeric >"
## [9] "Attributes: < Component 2: Length mismatch: comparison on first 2 components >"
## [10] "Attributes: < Component 2: Component 1: Modes: character, numeric >"
## [11] "Attributes: < Component 2: Component 1: Lengths: 3, 1 >"
## [12] "Attributes: < Component 2: Component 1: target is character, current is numeric >"
## [13] "Attributes: < Component 2: Component 2: Modes: character, numeric >"
## [14] "Attributes: < Component 2: Component 2: Lengths: 6, 1 >"
## [15] "Attributes: < Component 2: Component 2: target is character, current is numeric >"
## [16] "target is matrix, current is data.frame"