Objective

Create a table of favorite books in HTML, XML, and JSON formats, then load each file into R as a data frame.

Packages used

library(jsonlite)
library(tidyverse)
library(XML)

GitHub repository

The repository contains the tables in the requested formats: https://github.com/robertwelk/DATA607_w7Assignment

Load HTML table

# readHTMLTable() returns a list (here holding a single data frame), not a
# bare data frame. Spell out TRUE: `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
html <- readHTMLTable('book.html', header = TRUE)
str(html)
## List of 1
##  $ NULL:'data.frame':    3 obs. of  5 variables:
##   ..$ Title           : Factor w/ 3 levels "1984","Good Omens",..: 1 2 3
##   ..$ Author1         : Factor w/ 3 levels "Ernest Cline",..: 2 3 1
##   ..$ Author2         : Factor w/ 2 levels "NA","Terry Pratchett": 1 2 1
##   ..$ Genre           : Factor w/ 2 levels "Fantasy","Sci-Fi": 2 1 2
##   ..$ Main Protagonist: Factor w/ 3 levels "Crowley","Wade Watts",..: 3 1 2
# row-bind the elements of the list to collapse it into one data frame
html <- do.call(what = rbind, args = html)
print(html)
##                   Title       Author1         Author2   Genre
## NULL.1             1984 George Orwell              NA  Sci-Fi
## NULL.2       Good Omens   Neil Gaiman Terry Pratchett Fantasy
## NULL.3 Ready Player One  Ernest Cline              NA  Sci-Fi
##        Main Protagonist
## NULL.1    Winston Smith
## NULL.2          Crowley
## NULL.3       Wade Watts

Load JSON table

# fromJSON() hands back a data frame directly, so no reshaping is needed
json <- jsonlite::fromJSON(txt = 'book.json')
str(json)
## 'data.frame':    3 obs. of  5 variables:
##  $ Title           : chr  "1984" "Good Omens" "Ready Player One"
##  $ Author1         : chr  "George Orwell" "Neil Gaiman" "Ernest Cline"
##  $ Author2         : chr  "NA" "Terry Pratchett" "NA"
##  $ Genre           : chr  "Sci-Fi" "Fantasy" "Sci-Fi"
##  $ Main Protagonist: chr  "Winston Smith" "Crowley" "Wade Watts"

Check that the JSON and HTML tables are the same

# Element-wise comparison: yields one TRUE/FALSE per cell of the table.
# json holds character columns while html holds factors (see the str()
# output of each), so this compares values, not storage types.
json == html
##   Title Author1 Author2 Genre Main Protagonist
## 1  TRUE    TRUE    TRUE  TRUE             TRUE
## 2  TRUE    TRUE    TRUE  TRUE             TRUE
## 3  TRUE    TRUE    TRUE  TRUE             TRUE

Load XML table

# Reading the XML file takes more steps than the other two formats.
# Approach adapted from:
# https://stackoverflow.com/questions/17198658/how-to-parse-xml-to-r-data-frame
xml.load <- xmlTreeParse('book.xml')
xml <- xmlRoot(xml.load)
# for every child of the root, collect the text value of each of its fields
xml <- xmlSApply(xml, function(book) xmlSApply(book, xmlValue))
# fields come back as rows, so transpose to get one row per book
xml <- as.data.frame(t(xml))
str(xml)
## 'data.frame':    3 obs. of  5 variables:
##  $ title      : Factor w/ 3 levels "1984","Good Omens",..: 1 2 3
##   ..- attr(*, "names")= chr  "Book1" "Book2" "Book3"
##  $ author1    : Factor w/ 3 levels "Ernest Cline",..: 2 3 1
##   ..- attr(*, "names")= chr  "Book1" "Book2" "Book3"
##  $ author2    : Factor w/ 2 levels "NA","Terry Pratchett": 1 2 1
##   ..- attr(*, "names")= chr  "Book1" "Book2" "Book3"
##  $ genre      : Factor w/ 2 levels "Fantasy","Sci-Fi": 2 1 2
##   ..- attr(*, "names")= chr  "Book1" "Book2" "Book3"
##  $ protagonist: Factor w/ 3 levels "Crowley","Wade Watts",..: 3 1 2
##   ..- attr(*, "names")= chr  "Book1" "Book2" "Book3"
# confirm the transposed result really is a data frame
inherits(xml, "data.frame")
## [1] TRUE

Check that XML table is the same as HTML table

# Element-wise comparison, one TRUE/FALSE per cell. The column names differ
# between the two frames (e.g. "title" vs "Title", "protagonist" vs
# "Main Protagonist"), so columns are paired by position, not by name.
xml == html
##       title author1 author2 genre protagonist
## Book1  TRUE    TRUE    TRUE  TRUE        TRUE
## Book2  TRUE    TRUE    TRUE  TRUE        TRUE
## Book3  TRUE    TRUE    TRUE  TRUE        TRUE