library("rvest")
library("knitr")
library("jsonlite")
library("RCurl")

Overview

The purpose of this assignment is to explore the difference between JSON, XML and HTML files. Create a file of each type to store information on 3 books. Then parse the files into a data frame. Are all three data frames the same?

Parse XML

#Create empty data frame
xml_books_df <- data.frame(id = character(),
                           title = character(),
                          authors = I ( list() ) ,
                           price = numeric()
                           )

XML file looks like: . . .

#read file from github                           
xml_file <- getURL ("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607/master/Week7-HTMLKMLJSON/books.xml") %>% 
  read_xml()

#loop through all of the "book" nodes in the file
for ( i in 1:length(xml_nodes(xml_file, "book"))) {

  book_node <-html_nodes(xml_file, "book")[i]
  
  # get the ID
  id<- html_nodes(book_node, "id") %>% 
    xml_text()
  
  # get the title 
  title<- html_nodes(book_node, "title") %>% 
    xml_text()
    
  #get the authors list   
  authors_node <- html_nodes(book_node, "authors") 
  
  authors_list <- lapply(authors_node, function(x)  html_nodes(authors_node, "author") %>% 
    xml_text() ) %>% 
    I()
    
  # get the price 
  price <- html_nodes(book_node, "price") %>% 
    xml_text()
 
 #create a row in a temporary data frame
 temp_df <- data.frame(
   id = id,
   title = title,
   price = price,
   authors = authors_list
 )

#add the temp row to the main data frame
xml_books_df <- rbind(xml_books_df, temp_df)
}
    

kable(xml_books_df)
id title price authors
Book1 R Cookbook 46.83 Paul Teetor
Book2 R in Action 33.96 Robert Kabacoff
Book3 R for Data Science 39.99 Hadley Wickham, Garret Grolemund

Parse HTML

#Read file from git hub and extract the table
html_nodes  <- getURL ("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607/master/Week7-HTMLKMLJSON/books.html") %>% 
  read_html() %>% 
  html_nodes( xpath="//table[1]")

table <- html_nodes[1] 

#Convert to cells to a data frame
 html_books_df <- html_table(table, fill=TRUE, header = TRUE)[[1]]

kable(html_books_df)
book id title authors price
Book1 R Cookbook Paul Teetor 46.83
Book2 R In Action Robert Kabacoff 33.96
Book3 R for Data Science Hadley Wickham, Garret Grolemund 39.99

Parse JSON

#Read JSON file from github
json_file <-  getURL ("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607/master/Week7-HTMLKMLJSON/books.json")

#Convert JSON to a data frame
json_books_df <- fromJSON(json_file)

kable(json_books_df)
book id title authors price
Book1 R Cookbook Paul Teetor 46.83
Book2 R In Action Robert Kabacoff 33.96
Book2 R for Data Science Hadley Wickham, Garret Grolemund 39.99

Looks at Structure of Data.Frames

str(xml_books_df)
## 'data.frame':    3 obs. of  4 variables:
##  $ id     : Factor w/ 3 levels "Book1","Book2",..: 1 2 3
##  $ title  : Factor w/ 3 levels "R Cookbook","R in Action",..: 1 2 3
##  $ price  : Factor w/ 3 levels "\n        \t46.83\n        ",..: 1 2 3
##  $ authors:List of 3
##   ..$ : chr "Paul Teetor"
##   ..$ : chr "Robert Kabacoff"
##   ..$ : chr  "Hadley Wickham" "Garret Grolemund"
##   ..- attr(*, "class")= chr "AsIs"
str(html_books_df)
## 'data.frame':    3 obs. of  4 variables:
##  $ book id: chr  "Book1" "Book2" "Book3"
##  $ title  : chr  "R Cookbook" "R In Action" "R for Data Science"
##  $ authors: chr  "Paul Teetor" "Robert Kabacoff" "Hadley Wickham, Garret Grolemund"
##  $ price  : num  46.8 34 40
str(json_books_df)
## 'data.frame':    3 obs. of  4 variables:
##  $ book id: chr  "Book1" "Book2" "Book2"
##  $ title  : chr  "R Cookbook" "R In Action" "R for Data Science"
##  $ authors:List of 3
##   ..$ : chr "Paul Teetor"
##   ..$ : chr "Robert Kabacoff"
##   ..$ : chr  "Hadley Wickham" "Garret Grolemund"
##  $ price  : num  46.8 34 40

Conclusions

The data frames created from the XML and JSON files are identical. The only difference between the HTML file and the other files is that the authors column is a list object in the XML and JSON files while in the HTML file the authors field is just a text comma seperated list.