# Set the default chunk option so that each chunk's source code is echoed.
knitr::opts_chunk$set(echo = TRUE)
# We need these packages
library(RCurl)
library(XML)
library(jsonlite)
library(knitr)
Overview: We are tasked with creating HTML, XML, and JSON files describing 3 books on a favorite subject, loading each file into R, and checking whether the resulting data frames are the same.
First, read the HTML file.
# Fetch the raw HTML page from GitHub, parse its tables, and keep the first
# one as the book data frame; then render it as a table.
html_url <- "https://raw.githubusercontent.com/jmhsi/DATA_607/master/books.html"
html_cont <- getURLContent(html_url)
html_tables <- readHTMLTable(html_cont)
books_html <- html_tables[[1]]
kable(books_html)
| title | author | ISBN-13 | pages |
|---|---|---|---|
| Money and Growth: Selected Papers of Allyn Abbott Young (Routledge Studies in the History of Economics) | Perry G. Mehrling, Roger J. Sandilands | 978-0415191555 | 464 |
| The New Lombard Street: How the Fed Became the Dealer of Last Resort | Perry Mehrling | 978-0691143989 | 192 |
| Debt, Crisis, and Recovery: The 1930s and the 1990s (Columbia University Seminar Series) | Albert G. Hart, Perry Mehrling | 978-1563246388 | 388 |
Next, read the XML file.
# Download the XML document, parse it, and convert the parsed document into
# a data frame; then render it as a table.
xml_url <- "https://raw.githubusercontent.com/jmhsi/DATA_607/master/books.xml"
xml_cont <- getURLContent(xml_url)
xml_doc <- xmlParse(xml_cont)
books_xml <- xmlToDataFrame(xml_doc)
kable(books_xml)
| title | author | ISBN-13 | pages |
|---|---|---|---|
| Money and Growth: Selected Papers of Allyn Abbott Young (Routledge Studies in the History of Economics) | Perry G. Mehrling, Roger J. Sandilands | 978-0415191555 | 464 |
| The New Lombard Street: How the Fed Became the Dealer of Last Resort | Perry Mehrling | 978-0691143989 | 192 |
| Debt, Crisis, and Recovery: The 1930s and the 1990s (Columbia University Seminar Series) | Albert G. Hart, Perry Mehrling | 978-1563246388 | 388 |
Finally, read the JSON file.
# fromJSON() is given the URL directly; the book records are taken from the
# first element of the parsed result, then rendered as a table.
json_url <- "https://raw.githubusercontent.com/jmhsi/DATA_607/master/books.json"
json_parsed <- fromJSON(json_url)
books_json <- json_parsed[[1]]
kable(books_json)
| title | author | ISBN-13 | pages |
|---|---|---|---|
| Money and Growth: Selected Papers of Allyn Abbott Young (Routledge Studies in the History of Economics) | Perry G. Mehrling, Roger J. Sandilands | 978-0415191555 | 464 |
| The New Lombard Street: How the Fed Became the Dealer of Last Resort | Perry Mehrling | 978-0691143989 | 192 |
| Debt, Crisis, and Recovery: The 1930s and the 1990s (Columbia University Seminar Series) | Albert G. Hart, Perry Mehrling | 978-1563246388 | 388 |
The JSON result already looks different: the author column is read in as a list of character vectors (a book can have more than one author) rather than as a single column of strings. Let us actually compare the data frames.
# Compare the HTML- and XML-derived data frames: they are the same.
all.equal(books_html, books_xml)
## [1] TRUE
# Compare the JSON- and XML-derived data frames: they are not the same, and
# all.equal() reports the differences per column.
all.equal(books_json, books_xml)
## [1] "Component \"title\": Modes: character, numeric"
## [2] "Component \"title\": Attributes: < target is NULL, current is list >"
## [3] "Component \"title\": target is character, current is factor"
## [4] "Component \"author\": Modes: list, numeric"
## [5] "Component \"author\": Attributes: < target is NULL, current is list >"
## [6] "Component \"author\": current is not list-like"
## [7] "Component \"ISBN-13\": Modes: character, numeric"
## [8] "Component \"ISBN-13\": Attributes: < target is NULL, current is list >"
## [9] "Component \"ISBN-13\": target is character, current is factor"
## [10] "Component \"pages\": Attributes: < target is NULL, current is list >"
## [11] "Component \"pages\": target is numeric, current is factor"
# Let's look at the structure of each data frame, starting with the HTML one.
str(books_html)
## 'data.frame': 3 obs. of 4 variables:
## $ title : Factor w/ 3 levels "Debt, Crisis, and Recovery: The 1930s and the 1990s (Columbia University Seminar Series)",..: 2 3 1
## $ author : Factor w/ 3 levels "Albert G. Hart, Perry Mehrling",..: 2 3 1
## $ ISBN-13: Factor w/ 3 levels "978-0415191555",..: 1 2 3
## $ pages : Factor w/ 3 levels "192","388","464": 3 1 2
# Structure of the XML-derived data frame.
str(books_xml)
## 'data.frame': 3 obs. of 4 variables:
## $ title : Factor w/ 3 levels "Debt, Crisis, and Recovery: The 1930s and the 1990s (Columbia University Seminar Series)",..: 2 3 1
## $ author : Factor w/ 3 levels "Albert G. Hart, Perry Mehrling",..: 2 3 1
## $ ISBN-13: Factor w/ 3 levels "978-0415191555",..: 1 2 3
## $ pages : Factor w/ 3 levels "192","388","464": 3 1 2
# Structure of the JSON-derived data frame.
str(books_json)
## 'data.frame': 3 obs. of 4 variables:
## $ title : chr "Money and Growth: Selected Papers of Allyn Abbott Young (Routledge Studies in the History of Economics)" "The New Lombard Street: How the Fed Became the Dealer of Last Resort" "Debt, Crisis, and Recovery: The 1930s and the 1990s (Columbia University Seminar Series)"
## $ author :List of 3
## ..$ : chr "Perry G. Mehrling, Roger J. Sandilands"
## ..$ : chr "Perry Mehrling"
## ..$ : chr "Albert G. Hart, Perry Mehrling"
## $ ISBN-13: chr "978-0415191555" "978-0691143989" "978-1563246388"
## $ pages : int 464 192 388
Conclusions: We see that the data frames loaded from XML and HTML are identical; by default, both loaders turn all the variables into factors. For JSON, we see that the variables are typed as we’d expect them to be (characters, integers, etc.), although author is stored as a list column. In this case, it may be more desirable to work with JSON, as the parser has tried to infer the column types, which may cut out one step of our data cleaning process.