Objective

Create a table of favorite books in HTML, XML, and JSON formats, then load each one into R as a data frame

Packages used

library(jsonlite)
library(tidyverse)
library(XML)

GitHub repository

Contains the tables in the requested formats: https://github.com/robertwelk/DATA607_w7Assignment

Load HTML table

# The HTML file loads as a list (here holding a single data frame), not as a
# data frame itself. Spell out TRUE: the shorthand T is an ordinary variable
# that can be reassigned.
html <- readHTMLTable("book.html", header = TRUE)
str(html)

## List of 1
##  $ NULL:'data.frame':    3 obs. of  5 variables:
##   ..$ Title           : Factor w/ 3 levels "1984","Good Omens",..: 1 2 3
##   ..$ Author1         : Factor w/ 3 levels "Ernest Cline",..: 2 3 1
##   ..$ Author2         : Factor w/ 2 levels "NA","Terry Pratchett": 1 2 1
##   ..$ Genre           : Factor w/ 2 levels "Fantasy","Sci-Fi": 2 1 2
##   ..$ Main Protagonist: Factor w/ 3 levels "Crowley","Wade Watts",..: 3 1 2

# The list holds one data frame; row-bind its elements to get a single
# data frame. The row names pick up the list element's name as a prefix
# (hence "NULL.1", "NULL.2", ... in the printout).
html <- do.call(rbind, html)
print(html)

##                   Title       Author1         Author2   Genre
## NULL.1             1984 George Orwell              NA  Sci-Fi
## NULL.2       Good Omens   Neil Gaiman Terry Pratchett Fantasy
## NULL.3 Ready Player One  Ernest Cline              NA  Sci-Fi
##        Main Protagonist
## NULL.1    Winston Smith
## NULL.2          Crowley
## NULL.3       Wade Watts

Load JSON table

# The JSON file parses straight into a data frame; no unwrapping is needed.
json <- fromJSON("book.json")
str(json)

## 'data.frame':    3 obs. of  5 variables:
##  $ Title           : chr  "1984" "Good Omens" "Ready Player One"
##  $ Author1         : chr  "George Orwell" "Neil Gaiman" "Ernest Cline"
##  $ Author2         : chr  "NA" "Terry Pratchett" "NA"
##  $ Genre           : chr  "Sci-Fi" "Fantasy" "Sci-Fi"
##  $ Main Protagonist: chr  "Winston Smith" "Crowley" "Wade Watts"

Check that the JSON and HTML tables are the same

# Cell-by-cell comparison: the tables match when every entry is TRUE.
# Author2 holds the literal string "NA" (not a missing value) in both tables,
# so those cells compare as TRUE rather than NA.
json == html

##   Title Author1 Author2 Genre Main Protagonist
## 1  TRUE    TRUE    TRUE  TRUE             TRUE
## 2  TRUE    TRUE    TRUE  TRUE             TRUE
## 3  TRUE    TRUE    TRUE  TRUE             TRUE

Load XML table

# XML needs more work than the other two formats: parse the document, walk
# every child of the root collecting the text of its fields, then transpose
# so that each book becomes a row before converting to a data frame.
# Approach adapted from:
# https://stackoverflow.com/questions/17198658/how-to-parse-xml-to-r-data-frame
xml.load <- xmlTreeParse("book.xml")
xml <- xmlRoot(xml.load) %>%
  xmlSApply(function(book) xmlSApply(book, xmlValue)) %>%
  t() %>%
  as.data.frame()
str(xml)

## 'data.frame':    3 obs. of  5 variables:
##  $ title      : Factor w/ 3 levels "1984","Good Omens",..: 1 2 3
##   ..- attr(*, "names")= chr  "Book1" "Book2" "Book3"
##  $ author1    : Factor w/ 3 levels "Ernest Cline",..: 2 3 1
##   ..- attr(*, "names")= chr  "Book1" "Book2" "Book3"
##  $ author2    : Factor w/ 2 levels "NA","Terry Pratchett": 1 2 1
##   ..- attr(*, "names")= chr  "Book1" "Book2" "Book3"
##  $ genre      : Factor w/ 2 levels "Fantasy","Sci-Fi": 2 1 2
##   ..- attr(*, "names")= chr  "Book1" "Book2" "Book3"
##  $ protagonist: Factor w/ 3 levels "Crowley","Wade Watts",..: 3 1 2
##   ..- attr(*, "names")= chr  "Book1" "Book2" "Book3"

# Confirm that the transposed result was converted to a data frame.
is.data.frame(xml)

## [1] TRUE

Check that XML table is the same as HTML table

# Cell-by-cell comparison: the tables match when every entry is TRUE.
# The column names differ between the two tables (e.g. "title" vs "Title"),
# and the result is labelled with the names from xml.
xml == html

##       title author1 author2 genre protagonist
## Book1  TRUE    TRUE    TRUE  TRUE        TRUE
## Book2  TRUE    TRUE    TRUE  TRUE        TRUE
## Book3  TRUE    TRUE    TRUE  TRUE        TRUE

DATA 607 - Week 7 Assignment

Rob Welk

March 17, 2019

Objective

Packages used

GitHub repository

Load HTML table

Load JSON table

Check that the JSON and HTML tables are the same

Load XML table

Check that XML table is the same as HTML table