DATA607 Week 7 Homework
This week’s assignment required the creation of three files, in HTML table, XML, and JSON formats, containing information about three of our favorite books, with at least one book having multiple authors.
I created the book file in each format by hand and published them at the specified GitHub URLs.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(XML) # For HTML and XML manipulation
library(RCurl)
## Loading required package: bitops
library(tidyjson) # For JSON manipulation
library(knitr)
## Warning: package 'knitr' was built under R version 3.3.3
HTML Parser
# Download the hand-written HTML file from GitHub as one character string
htmlFile <- getURL(
  "https://raw.githubusercontent.com/sortega7878/DATA607WEEK7/master/books.html"
)
# Display the downloaded markup as-is
htmlFile %>% kable(caption = "HTML FILE")
|
<caption>Sergio's Top 3</caption>
<tr bgcolor=silver>
<th id="title">Book Title</th>
<th id="year">Year</th>
<th id="isbn">ISBN</th>
<th id="award">Awards</th>
<th id="film">Film Adaptation</th>
<th id="author">Author</th>
</tr><tr>
<td headers="title">The End of Eternity</td>
<td headers="year">1995</td>
<td headers="isbn">978-0739435571</td>
<td headers="author">Isaac Asimov</td>
</tr><tr>
<td headers="title">One Hundred Years of Solitude</td>
<td headers="year">1967</td>
<td headers="isbn">978-0060883287</td>
<td headers="author">Gabriel Garcia Marquez</td>
</tr><tr>
<td headers="title">C Programming Language</td>
<td headers="year">1978</td>
<td headers="isbn">978-0131103627</td>
<td headers="author">Brian W. Kernighan</td>
<td headers="author2">Dennis M. Ritchie</td>
</tr>
# Parse the books table out of the downloaded HTML.
# `which = 1` asks readHTMLTable() for the first table as a data frame, rather
# than a list of every table on the page that then has to be coerced; the
# coercion only behaves when the page contains exactly one table.
# `stringsAsFactors = FALSE` keeps titles, ISBNs and author names as plain
# character columns instead of factors.
booksHTML <- readHTMLTable(
  htmlFile,
  header = TRUE,
  which = 1,
  stringsAsFactors = FALSE
)
# Assign short, consistent column names (one per parsed column)
colnames(booksHTML) <- c("title", "year", "isbn", "author", "author2")
kable(booksHTML, caption = "HTML DATAFRAME")
HTML DATAFRAME
| The End of Eternity |
1995 |
978-0739435571 |
Isaac Asimov |
NA |
| One Hundred Years of Solitude |
1967 |
978-0060883287 |
Gabriel Garcia Marquez |
NA |
| C Programming Language |
1978 |
978-0131103627 |
Brian W. Kernighan |
Dennis M. Ritchie |
XML PARSER
# Download the hand-written XML file from GitHub as one character string
xmlFile <- getURL(
  "https://raw.githubusercontent.com/sortega7878/DATA607WEEK7/master/books.xml"
)
# Display the downloaded markup as-is
xmlFile %>% kable(caption = "XML FILE")
|
The End of Eternity
<author>Isaac Asimov</author>
<year>1995</year>
<isbn>978-0739435571</isbn>
</book>
<book id="2">
<title>>One Hundred Years of Solitude</title>
<author>Gabriel Garcia Marquez</author>
<year>1995</year>
<isbn>978-0739435571</isbn>
</book>
<book id="3">
<title>C Programming Language</title>
<author>Brian W. Kernighan</author>
<author2>Dennis M. Ritchie</author2>
<year>1978</year>
<isbn>978-0131103627</isbn>
</book>
|
# Parse XML and get root
booksXML <- xmlParse(xmlFile)
root <- xmlRoot(booksXML)
# Convert main fields to data frame
booksXML <- xmlToDataFrame(root)
kable(booksXML, caption = "XML Dataframe")
XML Dataframe
| The End of Eternity |
Isaac Asimov |
1995 |
978-0739435571 |
NA |
| >One Hundred Years of Solitude |
Gabriel Garcia Marquez |
1995 |
978-0739435571 |
NA |
| C Programming Language |
Brian W. Kernighan |
1978 |
978-0131103627 |
Dennis M. Ritchie |
JSON PARSER
# Download the hand-written JSON file from GitHub as one character string
jsonFile <- getURL(
  "https://raw.githubusercontent.com/sortega7878/DATA607WEEK7/master/books.json"
)
# Display the downloaded text as-is
jsonFile %>% kable(caption = "JSON FILE")
JSON FILE
| {“sergios_books” : [ |
{
"title" : "The End of Eternity",
"author" : "Isaac Asimov",
"year" : "1995",
"isbn" : "978-0739435571"
},
{
"title" : "One Hundred Years of Solitude",
"author" : "Gabriel Garcia Marquez",
"year" : "1967",
"isbn" : "978-0060883287"
},
{
"title" : "C Programming Language",
"author" : "Brian W. Kernighan",
"author2" : "Brian W. Kernighan",
"year" : "1978",
"isbn" : "978-0131103627"
}]
} |
# Parse book data
booksJSON <- jsonFile %>%
enter_object("sergios_books") %>%
gather_array("book.id") %>%
spread_values(title = jstring("title"),
author = jstring("author"),
author2 = jstring("author2"),
year = jnumber("year"),
isbn = jstring("isbn"))
kable(booksJSON, caption = "JSON Dataframe")
JSON Dataframe
| 1 |
1 |
The End of Eternity |
Isaac Asimov |
NA |
1995 |
978-0739435571 |
| 1 |
2 |
One Hundred Years of Solitude |
Gabriel Garcia Marquez |
NA |
1967 |
978-0060883287 |
| 1 |
3 |
C Programming Language |
Brian W. Kernighan |
Brian W. Kernighan |
1978 |
978-0131103627 |