DATA607 Week 7 Homework

This week’s assignment required the creation of three files, in HTML table, XML, and JSON formats, containing information about three of our favorite books, with at least one book having multiple authors.

I created each of the book files by hand and published them at the specified GitHub URLs.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(XML)       # For HTML and XML manipulation
library(RCurl)
## Loading required package: bitops
library(tidyjson)  # For JSON manipulation
library(knitr)
## Warning: package 'knitr' was built under R version 3.3.3

HTML Parser

# Import HTML file
# Download the raw text of books.html from GitHub into a single string.
htmlFile <- getURL("https://raw.githubusercontent.com/sortega7878/DATA607WEEK7/master/books.html")
# Show the file verbatim. Passing the raw markup to kable() places the tags
# inside the rendered report, where they are interpreted rather than displayed,
# so the listing comes out incomplete. writeLines() prints the text as plain
# chunk output instead.
writeLines(htmlFile)
HTML FILE
x
|
<caption>Sergio's Top 3</caption>
<tr bgcolor=silver>
    <th id="title">Book Title</th>
    <th id="year">Year</th>
    <th id="isbn">ISBN</th>
    <th id="award">Awards</th>
    <th id="film">Film Adaptation</th>
    <th id="author">Author</th>
</tr><tr>
    <td headers="title">The End of Eternity</td>
    <td headers="year">1995</td>
    <td headers="isbn">978-0739435571</td>
    <td headers="author">Isaac Asimov</td>
</tr><tr>
    <td headers="title">One Hundred Years of Solitude</td>
    <td headers="year">1967</td>
    <td headers="isbn">978-0060883287</td>
    <td headers="author">Gabriel Garcia Marquez</td>
</tr><tr>
    <td headers="title">C Programming Language</td>
    <td headers="year">1978</td>
    <td headers="isbn">978-0131103627</td>
    <td headers="author">Brian W. Kernighan</td>
    <td headers="author2">Dennis M. Ritchie</td>
</tr>
# Parse HTML table
# readHTMLTable() returns a list holding one data frame per <table> found in
# the document. Pick the first (and only expected) table explicitly instead of
# calling as.data.frame() on the whole list, which would column-bind every
# table in the document.
booksHTML <- readHTMLTable(htmlFile, header = TRUE)[[1]]

# Adjust column names: four common fields plus a second-author column that is
# only filled for books with two authors.
colnames(booksHTML) <- c("title", "year", "isbn", "author", "author2")
kable(booksHTML, caption = "HTML DATAFRAME")
HTML DATAFRAME
title year isbn author author2
The End of Eternity 1995 978-0739435571 Isaac Asimov NA
One Hundred Years of Solitude 1967 978-0060883287 Gabriel Garcia Marquez NA
C Programming Language 1978 978-0131103627 Brian W. Kernighan Dennis M. Ritchie

XML PARSER

# Import XML file
# Download the raw text of books.xml from GitHub into a single string.
xmlFile <- getURL("https://raw.githubusercontent.com/sortega7878/DATA607WEEK7/master/books.xml")
# Show the file verbatim. Passing the raw XML to kable() places the tags inside
# the rendered report, where they are treated as markup and dropped from the
# listing. writeLines() prints the text as plain chunk output instead.
writeLines(xmlFile)
XML FILE
x
| The End of Eternity
    <author>Isaac Asimov</author>
    <year>1995</year>
    <isbn>978-0739435571</isbn>       
</book>
<book id="2">
    <title>>One Hundred Years of Solitude</title>
    <author>Gabriel Garcia Marquez</author>
    <year>1995</year>
    <isbn>978-0739435571</isbn>
</book>
<book id="3">
    <title>C Programming Language</title>
    <author>Brian W. Kernighan</author>
<author2>Dennis M. Ritchie</author2>
    <year>1978</year>
    <isbn>978-0131103627</isbn>        
</book>

|

# Parse the downloaded XML text and take the top-level node of the document.
# booksXML holds the parsed document until the data frame replaces it below.
booksXML <- xmlFile %>% xmlParse()
root <- booksXML %>% xmlRoot()

# Flatten the children of the root node into a data frame and display it.
booksXML <- root %>% xmlToDataFrame()
booksXML %>% kable(caption = "XML Dataframe")
XML Dataframe
title author year isbn author2
The End of Eternity Isaac Asimov 1995 978-0739435571 NA
>One Hundred Years of Solitude Gabriel Garcia Marquez 1995 978-0739435571 NA
C Programming Language Brian W. Kernighan 1978 978-0131103627 Dennis M. Ritchie

JSON PARSER

# Import JSON file
# Download the raw text of books.json from GitHub into a single string.
jsonFile <- getURL("https://raw.githubusercontent.com/sortega7878/DATA607WEEK7/master/books.json")
# Show the file verbatim. Passing the raw JSON to kable() places it in a table
# cell of the rendered report, where it is re-typeset rather than shown
# literally. writeLines() prints the text as plain chunk output instead.
writeLines(jsonFile)
JSON FILE
x
{"sergios_books" : [
{
"title" : "The End of Eternity",
"author" : "Isaac Asimov",
"year" : "1995",
"isbn" : "978-0739435571"
},
{
"title" : "One Hundred Years of Solitude",
"author" : "Gabriel Garcia Marquez",
"year" : "1967",
"isbn" : "978-0060883287"
},
{
"title" : "C Programming Language",
"author" : "Brian W. Kernighan",
"author2" : "Brian W. Kernighan",
"year" : "1978",
"isbn" : "978-0131103627"
}]

} |

# Build the books data frame from the JSON text:
#   1. step into the "sergios_books" array and emit one row per element,
#      recording the element's position in the book.id column;
#   2. pull the named fields of each book object out into columns.
booksJSON <- spread_values(
  jsonFile %>%
    enter_object("sergios_books") %>%
    gather_array("book.id"),
  title = jstring("title"),
  author = jstring("author"),
  author2 = jstring("author2"),
  year = jnumber("year"),
  isbn = jstring("isbn")
)

# Display the resulting data frame.
booksJSON %>% kable(caption = "JSON Dataframe")
JSON Dataframe
document.id book.id title author author2 year isbn
1 1 The End of Eternity Isaac Asimov NA 1995 978-0739435571
1 2 One Hundred Years of Solitude Gabriel Garcia Marquez NA 1967 978-0060883287
1 3 C Programming Language Brian W. Kernighan Brian W. Kernighan 1978 978-0131103627