library("rvest")
library("knitr")
library("jsonlite")
library("RCurl")

Overview

The purpose of this assignment is to explore the differences between JSON, XML and HTML files. Create a file of each type to store information on 3 books, then parse each file into its own data frame. Are all three data frames the same?

Parse XML

# Empty template frame that the XML rows are appended to.
# `authors` is a list column (wrapped in I()) so that a single book can
# carry several author names.
xml_books_df <- data.frame(
  id = character(0),
  title = character(0),
  authors = I(list()),
  price = numeric(0)
)

XML file looks like: . . .

# Download the raw XML text from GitHub and parse it into an XML document
xml_file <- read_xml(
  getURL("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607/master/Week7-HTMLKMLJSON/books.xml")
)

# Turn every <book> node of the parsed XML document into one row of
# `xml_books_df`.
#
# Relies on two objects created earlier in the script:
#   xml_file      the parsed XML document
#   xml_books_df  the (empty) data frame the rows are appended to
book_nodes <- html_nodes(xml_file, "book")

# Text content of the `selector` descendants of `node`, with the
# surrounding whitespace (newlines, tabs, spaces) removed.
node_text <- function(node, selector) {
  node %>%
    html_nodes(selector) %>%
    xml_text() %>%
    trimws()
}

# seq_along() gives an empty sequence when no <book> node is found,
# whereas 1:length(x) would iterate over c(1, 0).
book_rows <- lapply(seq_along(book_nodes), function(i) {
  # Single-bracket subsetting keeps a node set of length one
  book_node <- book_nodes[i]

  # All <author> entries below this book's <authors> element
  author_names <- book_node %>%
    html_nodes("authors") %>%
    node_text("author")

  data.frame(
    id = node_text(book_node, "id"),
    title = node_text(book_node, "title"),
    # The price text is padded with newlines and tabs in the XML file;
    # node_text() trims it so it can be stored as a number.
    price = as.numeric(node_text(book_node, "price")),
    # One list element per book holding all of its author names
    authors = I(list(author_names)),
    stringsAsFactors = FALSE
  )
})

# Bind all rows in one call instead of growing the data frame inside the
# loop. With no books, do.call() returns NULL and the frame is unchanged.
xml_books_df <- rbind(xml_books_df, do.call(rbind, book_rows))


kable(xml_books_df)

id	title	price	authors
Book1	R Cookbook	46.83	Paul Teetor
Book2	R in Action	33.96	Robert Kabacoff
Book3	R for Data Science	39.99	Hadley Wickham, Garret Grolemund

Parse HTML

# Download the HTML page from GitHub and select its first <table> element.
# The node set is stored as `html_tables` so that the variable does not
# mask rvest's html_nodes() function.
html_tables <- getURL("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607/master/Week7-HTMLKMLJSON/books.html") %>%
  read_html() %>%
  html_nodes(xpath = "//table[1]")

# Keep only the first matched table; named `books_table` rather than
# `table` to avoid masking base::table().
books_table <- html_tables[1]

# Convert the table cells to a data frame; html_table() is indexed with
# [[1]] to pull out the data frame for that single table.
html_books_df <- html_table(books_table, fill = TRUE, header = TRUE)[[1]]

kable(html_books_df)

book id	title	authors	price
Book1	R Cookbook	Paul Teetor	46.83
Book2	R In Action	Robert Kabacoff	33.96
Book3	R for Data Science	Hadley Wickham, Garret Grolemund	39.99

Parse JSON

# Fetch the raw JSON text from GitHub
json_file <- getURL("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607/master/Week7-HTMLKMLJSON/books.json")

# Parse the JSON text; the result is used as a data frame below
json_books_df <- json_file %>%
  fromJSON()

kable(json_books_df)

book id	title	authors	price
Book1	R Cookbook	Paul Teetor	46.83
Book2	R In Action	Robert Kabacoff	33.96
Book2	R for Data Science	Hadley Wickham, Garret Grolemund	39.99

Look at the Structure of the Data Frames

# Show the column types of the data frame built from the XML file
utils::str(xml_books_df)

## 'data.frame':    3 obs. of  4 variables:
##  $ id     : Factor w/ 3 levels "Book1","Book2",..: 1 2 3
##  $ title  : Factor w/ 3 levels "R Cookbook","R in Action",..: 1 2 3
##  $ price  : Factor w/ 3 levels "\n        \t46.83\n        ",..: 1 2 3
##  $ authors:List of 3
##   ..$ : chr "Paul Teetor"
##   ..$ : chr "Robert Kabacoff"
##   ..$ : chr  "Hadley Wickham" "Garret Grolemund"
##   ..- attr(*, "class")= chr "AsIs"

# Show the column types of the data frame built from the HTML table
utils::str(html_books_df)

## 'data.frame':    3 obs. of  4 variables:
##  $ book id: chr  "Book1" "Book2" "Book3"
##  $ title  : chr  "R Cookbook" "R In Action" "R for Data Science"
##  $ authors: chr  "Paul Teetor" "Robert Kabacoff" "Hadley Wickham, Garret Grolemund"
##  $ price  : num  46.8 34 40

# Show the column types of the data frame built from the JSON file
utils::str(json_books_df)

## 'data.frame':    3 obs. of  4 variables:
##  $ book id: chr  "Book1" "Book2" "Book2"
##  $ title  : chr  "R Cookbook" "R In Action" "R for Data Science"
##  $ authors:List of 3
##   ..$ : chr "Paul Teetor"
##   ..$ : chr "Robert Kabacoff"
##   ..$ : chr  "Hadley Wickham" "Garret Grolemund"
##  $ price  : num  46.8 34 40

Conclusions

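The data frames created from the XML and JSON files are nearly identical: in both, the authors column is a list, so a book with several authors keeps each name as a separate element. They still differ in minor details, such as the id column being named "id" in the XML result and "book id" in the JSON result. The main difference between the HTML result and the other two is that its authors field is a single comma-separated text string rather than a list.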

CUNY-DATA-607-Week7-JSON-XML-HTML

Raphael Nash