# Set echo = TRUE as the default chunk option for the knitted report
knitr::opts_chunk$set(echo = TRUE)
# We need these packages
library(RCurl)    # getURLContent() to download the raw files
library(XML)      # readHTMLTable(), xmlParse(), xmlToDataFrame()
library(jsonlite) # fromJSON()
library(knitr)    # kable() for rendering the tables

Overview: We are tasked with creating HTML, XML, and JSON files describing 3 books on a favorite subject, loading each of them into R, and verifying whether the resulting data frames are the same.

First, read the HTML file.

# Download the raw HTML, parse the tables it contains, and keep the first one
html_url <- "https://raw.githubusercontent.com/jmhsi/DATA_607/master/books.html"
html_cont <- getURLContent(html_url)
html_tables <- readHTMLTable(html_cont)
books_html <- html_tables[[1]]
kable(books_html)
title author ISBN-13 pages
Money and Growth: Selected Papers of Allyn Abbott Young (Routledge Studies in the History of Economics) Perry G. Mehrling, Roger J. Sandilands 978-0415191555 464
The New Lombard Street: How the Fed Became the Dealer of Last Resort Perry Mehrling 978-0691143989 192
Debt, Crisis, and Recovery: The 1930s and the 1990s (Columbia University Seminar Series) Albert G. Hart, Perry Mehrling 978-1563246388 388

Next, read the XML file.

# Download the raw XML, parse it into a document, then flatten it to a data frame
xml_url <- "https://raw.githubusercontent.com/jmhsi/DATA_607/master/books.xml"
xml_cont <- getURLContent(xml_url)
xml_doc <- xmlParse(xml_cont)
books_xml <- xmlToDataFrame(xml_doc)
kable(books_xml)
title author ISBN-13 pages
Money and Growth: Selected Papers of Allyn Abbott Young (Routledge Studies in the History of Economics) Perry G. Mehrling, Roger J. Sandilands 978-0415191555 464
The New Lombard Street: How the Fed Became the Dealer of Last Resort Perry Mehrling 978-0691143989 192
Debt, Crisis, and Recovery: The 1930s and the 1990s (Columbia University Seminar Series) Albert G. Hart, Perry Mehrling 978-1563246388 388

Finally, read the JSON file.

# Parse the JSON straight from its URL and keep the first top-level element
json_url <- "https://raw.githubusercontent.com/jmhsi/DATA_607/master/books.json"
json_parsed <- fromJSON(json_url)
books_json <- json_parsed[[1]]
kable(books_json)
title author ISBN-13 pages
Money and Growth: Selected Papers of Allyn Abbott Young (Routledge Studies in the History of Economics) Perry G. Mehrling, Roger J. Sandilands 978-0415191555 464
The New Lombard Street: How the Fed Became the Dealer of Last Resort Perry Mehrling 978-0691143989 192
Debt, Crisis, and Recovery: The 1930s and the 1990s (Columbia University Seminar Series) Albert G. Hart, Perry Mehrling 978-1563246388 388

The JSON result already looks different: the author column is read as a list of character vectors rather than as a plain column of strings. Let us compare the data frames directly.

# all.equal() returns TRUE on a match; otherwise it returns a character vector describing each difference
all.equal(books_html, books_xml) # the HTML and XML data frames are the same
## [1] TRUE
all.equal(books_json, books_xml) # but the JSON data frame differs from the XML one
##  [1] "Component \"title\": Modes: character, numeric"                        
##  [2] "Component \"title\": Attributes: < target is NULL, current is list >"  
##  [3] "Component \"title\": target is character, current is factor"           
##  [4] "Component \"author\": Modes: list, numeric"                            
##  [5] "Component \"author\": Attributes: < target is NULL, current is list >" 
##  [6] "Component \"author\": current is not list-like"                        
##  [7] "Component \"ISBN-13\": Modes: character, numeric"                      
##  [8] "Component \"ISBN-13\": Attributes: < target is NULL, current is list >"
##  [9] "Component \"ISBN-13\": target is character, current is factor"         
## [10] "Component \"pages\": Attributes: < target is NULL, current is list >"  
## [11] "Component \"pages\": target is numeric, current is factor"
# Let's look at the structure of each data frame to see how the column types differ
str(books_html)
## 'data.frame':    3 obs. of  4 variables:
##  $ title  : Factor w/ 3 levels "Debt, Crisis, and Recovery: The 1930s and the 1990s (Columbia University Seminar Series)",..: 2 3 1
##  $ author : Factor w/ 3 levels "Albert G. Hart, Perry Mehrling",..: 2 3 1
##  $ ISBN-13: Factor w/ 3 levels "978-0415191555",..: 1 2 3
##  $ pages  : Factor w/ 3 levels "192","388","464": 3 1 2
# Structure of the data frame parsed from the XML file
str(books_xml)
## 'data.frame':    3 obs. of  4 variables:
##  $ title  : Factor w/ 3 levels "Debt, Crisis, and Recovery: The 1930s and the 1990s (Columbia University Seminar Series)",..: 2 3 1
##  $ author : Factor w/ 3 levels "Albert G. Hart, Perry Mehrling",..: 2 3 1
##  $ ISBN-13: Factor w/ 3 levels "978-0415191555",..: 1 2 3
##  $ pages  : Factor w/ 3 levels "192","388","464": 3 1 2
# Structure of the data frame parsed from the JSON file
str(books_json)
## 'data.frame':    3 obs. of  4 variables:
##  $ title  : chr  "Money and Growth: Selected Papers of Allyn Abbott Young (Routledge Studies in the History of Economics)" "The New Lombard Street: How the Fed Became the Dealer of Last Resort" "Debt, Crisis, and Recovery: The 1930s and the 1990s (Columbia University Seminar Series)"
##  $ author :List of 3
##   ..$ : chr "Perry G. Mehrling, Roger J. Sandilands"
##   ..$ : chr "Perry Mehrling"
##   ..$ : chr "Albert G. Hart, Perry Mehrling"
##  $ ISBN-13: chr  "978-0415191555" "978-0691143989" "978-1563246388"
##  $ pages  : int  464 192 388

Conclusions: We see that the data frames loaded from the XML and HTML files are identical; by default, both loaders turn every variable into a factor. For JSON, the variables are typed as we’d expect them to be (characters, integers, etc.). In this case, it may be more desirable to work with JSON, since the parser tries to infer data types, which may cut out one step of our data cleaning process.