Working with books data in 3 formats

HTML

Here’s how a book with multiple authors looks like in HTML file

Package Installs

install.packages(“tidyverse”)
install.packages(“XML”)

library(tidyverse)
library(XML)
path <- "https://raw.githubusercontent.com/simon63/Assignment7/master/books.html"
download.file(path, destfile = "~/books.html")
path <- file.path("books.html")
booksHtml <- htmlParse(path)
html_tb <- readHTMLTable(booksHtml, stringAsFactors = FALSE)
df.html <- html_tb[[1]] %>% tbl_df()
df.html
## # A tibble: 3 x 6
##   title      authors              subject   publisher  `ISBN-10` `ISBN-13`
##   <fct>      <fct>                <fct>     <fct>      <fct>     <fct>    
## 1 Design Pa~ "Erich Gamma\r\n\t\~ Computer~ Addison-W~ 0-201-63~ ""       
## 2 Object-Or~ Arthur J. Riel       Computer~ Addison-W~ 0-201-63~ ""       
## 3 SQL Cookb~ Anthony Molinaro     Computer~ O'Reilly   05960097~ 978-0596~

Viewing HTML data frame using knir::kable gives unexpected results
due to multiple authors for a book

knitr::kable(df.html)
title authors subject publisher ISBN-10 ISBN-13
Design Patterns Erich Gamma
Richard Helm
Ralph Johnson
John Vlissides Computer Science Addison-Wesley 0-201-63361-2
Object-Oriented Design Heuristics Arthur J. Riel Computer Science Addison-Wesley 0-201-63385-X
SQL Cookbook Anthony Molinaro Computer Science O’Reilly 0596009763 978-0596009762

The multiple authors are separated by “\r\n\t\t” sequence

as.vector(df.html$authors)
## [1] "Erich Gamma\r\n\t\t  Richard Helm\r\n\t\t  Ralph Johnson\r\n\t\t  John Vlissides"
## [2] "Arthur J. Riel"                                                            
## [3] "Anthony Molinaro"

XML

Here’s how the books data looks like in XML file

path <- "https://raw.githubusercontent.com/simon63/Assignment7/master/books.xml"
download.file(path, destfile = "~/books.xml")
path <- file.path("books.xml")
booksXML <- xmlParse(path)

Examining XML root’s name, number of elements and the 1st element

root <- xmlRoot(booksXML)
xmlName(root)
## [1] "favorite_books"
xmlSize(root)
## [1] 4
root[[1]]
## <!-- Favorite Computer Science Books -->

View 1st Book XML element

root[["book"]]
## <book id="1">
##   <title>Design Patterns</title>
##   <authors>
##     <author>Erich Gamma</author>
##     <author>Richard Helm</author>
##     <author>Ralph Johnson</author>
##     <author>John Vlissides</author>
##   </authors>
##   <subject>Computer Science</subject>
##   <publisher>Addison-Wesley</publisher>
##   <ISBN-10>0-201-63361-2</ISBN-10>
## </book>

View all Book elements

root["book"]
## $book
## <book id="1">
##   <title>Design Patterns</title>
##   <authors>
##     <author>Erich Gamma</author>
##     <author>Richard Helm</author>
##     <author>Ralph Johnson</author>
##     <author>John Vlissides</author>
##   </authors>
##   <subject>Computer Science</subject>
##   <publisher>Addison-Wesley</publisher>
##   <ISBN-10>0-201-63361-2</ISBN-10>
## </book> 
## 
## $book
## <book id="2">
##   <title>Object-Oriented Design Heuristics</title>
##   <authors>
##     <author>Arthur J. Riel</author>
##   </authors>
##   <subject>Computer Science</subject>
##   <publisher>Addison-Wesley</publisher>
##   <ISBN-10>0-201-63385-X</ISBN-10>
##   <ISBN-13/>
## </book> 
## 
## $book
## <book id="3">
##   <title>SQL Cookbook</title>
##   <authors>
##     <author>Anthony Molinaro</author>
##   </authors>
##   <subject>Computer Science</subject>
##   <publisher>O'Reilly</publisher>
##   <ISBN-10>0596009763</ISBN-10>
##   <ISBN-13>978-0596009762</ISBN-13>
## </book> 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"

Trying xmlToDataFrame function

df.xml <- xmlToDataFrame(root, stringsAsFactors = F) %>% tbl_df()
df.xml
## # A tibble: 4 x 6
##   title        authors           subject   publisher  `ISBN-10` `ISBN-13` 
##   <chr>        <chr>             <chr>     <chr>      <chr>     <chr>     
## 1 <NA>         <NA>              <NA>      <NA>       <NA>      <NA>      
## 2 Design Patt~ Erich GammaRicha~ Computer~ Addison-W~ 0-201-63~ <NA>      
## 3 Object-Orie~ Arthur J. Riel    Computer~ Addison-W~ 0-201-63~ ""        
## 4 SQL Cookbook Anthony Molinaro  Computer~ O'Reilly   05960097~ 978-05960~

Obervations:
1. There is a blank row for the Comment line in XML
2. Multiple Authors are concatenated without any separation (unlike the HTML-parsing case)
3. Structure (in terms of columns) is similar to the HTML-parsing case

JSON

Here’s how the books data looks like in JSON file

library(RJSONIO)

Check if JSON file is valid

path <- "https://raw.githubusercontent.com/simon63/Assignment7/master/books.json"
download.file(path, destfile = "~/books.json")
path <- file.path("books.json")
isValidJSON(path)
## [1] TRUE

Tryng fromJSON() function to parse the file

booksJSON <- fromJSON(content = path)
class(booksJSON)
## [1] "list"
glimpse(booksJSON)
## List of 3
##  $ :List of 6
##   ..$ id       : chr "1"
##   ..$ title    : chr "Design Patterns"
##   ..$ authors  : chr [1:4] "Erich Gamma" "Richard Helm" "Ralph Johnson" "John Vlissides"
##   ..$ subject  : chr "Computer Science"
##   ..$ publisher: chr "Addison-Wesley"
##   ..$ ISBN-10  : chr "0-201-63361-2"
##  $ :List of 7
##   ..$ id       : chr "2"
##   ..$ title    : chr "Object-Oriented Design Heuristics"
##   ..$ authors  : chr "Arthur J. Riel"
##   ..$ subject  : chr "Computer Science"
##   ..$ publisher: chr "Addison-Wesley"
##   ..$ ISBN-10  : chr "0-201-63385-X"
##   ..$ ISBN-13  : NULL
##  $ :List of 7
##   ..$ id       : chr "3"
##   ..$ title    : chr "SQL Cookbook"
##   ..$ authors  : chr "Anthony Molinaro"
##   ..$ subject  : chr "Computer Science"
##   ..$ publisher: chr "O'Reilly"
##   ..$ ISBN-10  : chr "0596009763"
##   ..$ ISBN-13  : chr "978-0596009762"

Trying to use unlist() function and extracting the book titles

book.vec <- unlist(booksJSON, recursive = T, use.names = T)
names(book.vec)
##  [1] "id"        "title"     "authors1"  "authors2"  "authors3" 
##  [6] "authors4"  "subject"   "publisher" "ISBN-10"   "id"       
## [11] "title"     "authors"   "subject"   "publisher" "ISBN-10"  
## [16] "id"        "title"     "authors"   "subject"   "publisher"
## [21] "ISBN-10"   "ISBN-13"
book.vec[str_detect(names(book.vec), "title")]
##                               title                               title 
##                   "Design Patterns" "Object-Oriented Design Heuristics" 
##                               title 
##                      "SQL Cookbook"

Note: The 4 Authors for book 1 got labeled as “authors1”, “authors2”, “authors3”, “authors4”

Converting each JSON book class into separate data frames
using a combination of lapply(), do.call() and rbind functions

do.call("rbind", lapply(booksJSON[[1]], data.frame, stringsAsFactors = F))
##                     X..i..
## id                       1
## title      Design Patterns
## authors.1      Erich Gamma
## authors.2     Richard Helm
## authors.3    Ralph Johnson
## authors.4   John Vlissides
## subject   Computer Science
## publisher   Addison-Wesley
## ISBN-10      0-201-63361-2
do.call("rbind", lapply(booksJSON[[2]], data.frame, stringsAsFactors = F))
##                                      X..i..
## id                                        2
## title     Object-Oriented Design Heuristics
## authors                      Arthur J. Riel
## subject                    Computer Science
## publisher                    Addison-Wesley
## ISBN-10                       0-201-63385-X
do.call("rbind", lapply(booksJSON[[3]], data.frame, stringsAsFactors = F))
##                     X..i..
## id                       3
## title         SQL Cookbook
## authors   Anthony Molinaro
## subject   Computer Science
## publisher         O'Reilly
## ISBN-10         0596009763
## ISBN-13     978-0596009762

Note: This provides 3 data frames with slightly different structures between each other

Now trying jsonlite package ####Package Installs install.packages(“jsonlite”)

library(jsonlite)

Tryng fromJSON() function to parse the file

booksJSON <- fromJSON(txt = path)
class(booksJSON)
glimpse(booksJSON)

Got “Error in parse_con” as the parser failed interpret ISBN field value

Trying with a different JSON file without the troubling ISBN value

path <- "https://raw.githubusercontent.com/simon63/Assignment7/master/books-fix.json"
download.file(path, destfile = "~/books-fix.json")
path <- file.path("books-fix.json")
booksJSON <- fromJSON(txt = path)
class(booksJSON)
## [1] "data.frame"
glimpse(booksJSON)
## Observations: 3
## Variables: 7
## $ id        <chr> "1", "2", "3"
## $ title     <chr> "Design Patterns", "Object-Oriented Design Heuristic...
## $ authors   <list> [<"Erich Gamma", "Richard Helm", "Ralph Johnson", "...
## $ subject   <chr> "Computer Science", "Computer Science", "Computer Sc...
## $ publisher <chr> "Addison-Wesley", "Addison-Wesley", "O'Reilly"
## $ `ISBN-10` <chr> "0-201-63361-2", NA, "0596009763"
## $ `ISBN-13` <chr> NA, NA, "978-0596009762"

Conclusion Using jsonlite::fromJSON() is convenient and gives a nice and expected data frame structure, but it seems to be very sensitive to and problematic with the textual representation of data values inside JSON files.