Create a table of favorite books in HTML, XML, and JSON formats, then load each one into R as a data frame.
library(jsonlite)
library(tidyverse)
library(XML)
The source tables in the requested formats are available at https://github.com/robertwelk/DATA607_w7Assignment
# readHTMLTable() returns a list of data frames rather than a single data
# frame; for book.html that is a list of length one.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
html <- readHTMLTable("book.html", header = TRUE)
str(html)
## List of 1
## $ NULL:'data.frame': 3 obs. of 5 variables:
## ..$ Title : Factor w/ 3 levels "1984","Good Omens",..: 1 2 3
## ..$ Author1 : Factor w/ 3 levels "Ernest Cline",..: 2 3 1
## ..$ Author2 : Factor w/ 2 levels "NA","Terry Pratchett": 1 2 1
## ..$ Genre : Factor w/ 2 levels "Fantasy","Sci-Fi": 2 1 2
## ..$ Main Protagonist: Factor w/ 3 levels "Crowley","Wade Watts",..: 3 1 2
# Row-bind the elements of the list from readHTMLTable() into one data frame.
# The row names pick up the list element's name as a prefix.
html <- do.call(what = rbind, args = html)
print(html)
## Title Author1 Author2 Genre
## NULL.1 1984 George Orwell NA Sci-Fi
## NULL.2 Good Omens Neil Gaiman Terry Pratchett Fantasy
## NULL.3 Ready Player One Ernest Cline NA Sci-Fi
## Main Protagonist
## NULL.1 Winston Smith
## NULL.2 Crowley
## NULL.3 Wade Watts
# fromJSON() hands back a data frame directly, so no reshaping is needed
json <- jsonlite::fromJSON(txt = "book.json")
str(json)
## 'data.frame': 3 obs. of 5 variables:
## $ Title : chr "1984" "Good Omens" "Ready Player One"
## $ Author1 : chr "George Orwell" "Neil Gaiman" "Ernest Cline"
## $ Author2 : chr "NA" "Terry Pratchett" "NA"
## $ Genre : chr "Sci-Fi" "Fantasy" "Sci-Fi"
## $ Main Protagonist: chr "Winston Smith" "Crowley" "Wade Watts"
# Cell-by-cell check that the JSON and HTML versions hold the same values;
# `==` on two data frames yields a logical matrix
json == html
## Title Author1 Author2 Genre Main Protagonist
## 1 TRUE TRUE TRUE TRUE TRUE
## 2 TRUE TRUE TRUE TRUE TRUE
## 3 TRUE TRUE TRUE TRUE TRUE
# XML takes more work than the other two formats.
# Approach adapted from:
# https://stackoverflow.com/questions/17198658/how-to-parse-xml-to-r-data-frame
xml.load <- xmlTreeParse("book.xml")
xml <- xmlRoot(xml.load)
# The inner xmlSApply() pulls the text value of each child of a node; the
# outer call repeats that for every child of the root
xml <- xmlSApply(xml, function(node) xmlSApply(node, xmlValue))
# Transpose so each book becomes a row, then convert to a data frame
xml <- as.data.frame(t(xml))
str(xml)
## 'data.frame': 3 obs. of 5 variables:
## $ title : Factor w/ 3 levels "1984","Good Omens",..: 1 2 3
## ..- attr(*, "names")= chr "Book1" "Book2" "Book3"
## $ author1 : Factor w/ 3 levels "Ernest Cline",..: 2 3 1
## ..- attr(*, "names")= chr "Book1" "Book2" "Book3"
## $ author2 : Factor w/ 2 levels "NA","Terry Pratchett": 1 2 1
## ..- attr(*, "names")= chr "Book1" "Book2" "Book3"
## $ genre : Factor w/ 2 levels "Fantasy","Sci-Fi": 2 1 2
## ..- attr(*, "names")= chr "Book1" "Book2" "Book3"
## $ protagonist: Factor w/ 3 levels "Crowley","Wade Watts",..: 3 1 2
## ..- attr(*, "names")= chr "Book1" "Book2" "Book3"
# Confirm that the XML result is a data frame
inherits(xml, "data.frame")
## [1] TRUE
# Compare the XML and HTML data frames cell by cell. Columns are matched by
# position, not by name: the two frames use different column names, and the
# result carries xml's lowercase ones.
xml == html
## title author1 author2 genre protagonist
## Book1 TRUE TRUE TRUE TRUE TRUE
## Book2 TRUE TRUE TRUE TRUE TRUE
## Book3 TRUE TRUE TRUE TRUE TRUE