library("rvest")
library("knitr")
library("jsonlite")
library("RCurl")
The purpose of this assignment is to explore the difference between JSON, XML and HTML files. Create a file of each type to store information on 3 books. Then parse the files into a data frame. Are all three data frames the same?
#Create empty data frame
xml_books_df <- data.frame(id = character(),
title = character(),
authors = I ( list() ) ,
price = numeric()
)
XML file looks like:
#read file from github
xml_file <- getURL ("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607/master/Week7-HTMLKMLJSON/books.xml") %>%
read_xml()
#loop through all of the "book" nodes in the file
for ( i in 1:length(xml_nodes(xml_file, "book"))) {
book_node <-html_nodes(xml_file, "book")[i]
# get the ID
id<- html_nodes(book_node, "id") %>%
xml_text()
# get the title
title<- html_nodes(book_node, "title") %>%
xml_text()
#get the authors list
authors_node <- html_nodes(book_node, "authors")
authors_list <- lapply(authors_node, function(x) html_nodes(authors_node, "author") %>%
xml_text() ) %>%
I()
# get the price
price <- html_nodes(book_node, "price") %>%
xml_text()
#create a row in a temporary data frame
temp_df <- data.frame(
id = id,
title = title,
price = price,
authors = authors_list
)
#add the temp row to the main data frame
xml_books_df <- rbind(xml_books_df, temp_df)
}
kable(xml_books_df)
| id | title | price | authors |
|---|---|---|---|
| Book1 | R Cookbook | 46.83 | Paul Teetor |
| Book2 | R in Action | 33.96 | Robert Kabacoff |
| Book3 | R for Data Science | 39.99 | Hadley Wickham, Garret Grolemund |
#Read file from git hub and extract the table
html_nodes <- getURL ("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607/master/Week7-HTMLKMLJSON/books.html") %>%
read_html() %>%
html_nodes( xpath="//table[1]")
table <- html_nodes[1]
#Convert to cells to a data frame
html_books_df <- html_table(table, fill=TRUE, header = TRUE)[[1]]
kable(html_books_df)
| book id | title | authors | price |
|---|---|---|---|
| Book1 | R Cookbook | Paul Teetor | 46.83 |
| Book2 | R In Action | Robert Kabacoff | 33.96 |
| Book3 | R for Data Science | Hadley Wickham, Garret Grolemund | 39.99 |
#Read JSON file from github
json_file <- getURL ("https://raw.githubusercontent.com/RaphaelNash/CUNY-DATA-607/master/Week7-HTMLKMLJSON/books.json")
#Convert JSON to a data frame
json_books_df <- fromJSON(json_file)
kable(json_books_df)
| book id | title | authors | price |
|---|---|---|---|
| Book1 | R Cookbook | Paul Teetor | 46.83 |
| Book2 | R In Action | Robert Kabacoff | 33.96 |
| Book2 | R for Data Science | Hadley Wickham, Garret Grolemund | 39.99 |
str(xml_books_df)
## 'data.frame': 3 obs. of 4 variables:
## $ id : Factor w/ 3 levels "Book1","Book2",..: 1 2 3
## $ title : Factor w/ 3 levels "R Cookbook","R in Action",..: 1 2 3
## $ price : Factor w/ 3 levels "\n \t46.83\n ",..: 1 2 3
## $ authors:List of 3
## ..$ : chr "Paul Teetor"
## ..$ : chr "Robert Kabacoff"
## ..$ : chr "Hadley Wickham" "Garret Grolemund"
## ..- attr(*, "class")= chr "AsIs"
str(html_books_df)
## 'data.frame': 3 obs. of 4 variables:
## $ book id: chr "Book1" "Book2" "Book3"
## $ title : chr "R Cookbook" "R In Action" "R for Data Science"
## $ authors: chr "Paul Teetor" "Robert Kabacoff" "Hadley Wickham, Garret Grolemund"
## $ price : num 46.8 34 40
str(json_books_df)
## 'data.frame': 3 obs. of 4 variables:
## $ book id: chr "Book1" "Book2" "Book2"
## $ title : chr "R Cookbook" "R In Action" "R for Data Science"
## $ authors:List of 3
## ..$ : chr "Paul Teetor"
## ..$ : chr "Robert Kabacoff"
## ..$ : chr "Hadley Wickham" "Garret Grolemund"
## $ price : num 46.8 34 40
The data frames created from the XML and JSON files are identical. The only difference between the HTML file and the other files is that the authors column is a list object in the XML and JSON files while in the HTML file the authors field is just a text comma seperated list.