Assignment Overview

In this assignment, we are required to create three files: an HTML table, an XML file, and a JSON file. The files contain information about three of our favorite books in a particular subject area, with at least one book having multiple authors.

My choices are not necessarily my favorite books: two of the chosen books are excellent references in cryptography, my research field.

We use R to load the information from each of the three sources into separate R data frames. We then investigate whether the three data frames are identical.


Load Packages

library(knitr)
library(kableExtra)# manipulate table styles
library(XML)
suppressMessages(library(RCurl))
library(jsonlite)

Read and Parse Data

HTML

# Location of the HTML version of the book list.
html.url <- "https://raw.githubusercontent.com/bsosnovski/DATA607/master/Week7Assignment/mybooks.html"

# Download the page source as text, then parse that text as UTF-8 HTML.
htmlData <- getURL(url = html.url)
html_parsed <- htmlParse(file = htmlData, encoding = "UTF-8")

# Show the parsed document tree.
print(html_parsed)
## <!DOCTYPE html>
## <html>
## <head>
## <meta name="keywords" content="html, table">
## <meta name="author" content="Sosnovski, Bianca">
## <meta name="description" content="This is html document that contains a table with information about some of the books that I use to study and/or do research.">
## <meta charset="ISO-8859-1">
## <meta name="robots" content="noindex, nofollow">
## <title> book.html </title>
## </head>
## <body>
##     <h1> My Book List </h1>
##     <table border="1" bordercolor="silver" cellspacing="3" cellpadding="3">
## <tr>
## <th>ID</th>
##         <th>Title</th>
##         <th>Author 1</th>
##         <th>Author 2</th>
##         <th>Publication Year</th>
##         <th>Publiher</th>
##         <th>Number of Pages</th>
##       </tr>
## <tr>
## <th>1</th>
##         <td>Introduction to Modern Cryptography: Principles and Protocols</td>
##         <td>Jonathan Katz</td>
##         <td>Yehuda Lindell</td>
##         <td>2007</td>
##         <td>Chapman and Hall/CRC</td>
##         <td>552</td>
##       </tr>
## <tr>
## <th>2</th>
##         <td>An Introduction to Mathematical Cryptography</td>
##         <td>Jeffrey Hoffstein</td>
##         <td>Jill Pipher</td>
##         <td>2014</td>
##         <td>Springer</td>
##         <td>538</td>
##       </tr>
## <tr>
## <th>3</th>
##         <td>Thank You for Being Late: An Optimist's Guide to Thriving in the Age of Accelerations</td>
##         <td>Thomas L. Friedman</td>
##         <td></td>
##         <td>2016</td>
##         <td>Farrar, Straus and Giroux</td>
##         <td>497</td>
##       </tr>
## </table>
## </body>
## </html>
## 
# Inspect the class of the parsed HTML document.
class(html_parsed)
## [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" 
## [4] "XMLAbstractDocument"

# Read the tables out of the document, keeping strings as character vectors.
htmlData <- readHTMLTable(doc = html_parsed, stringsAsFactors = FALSE)

# readHTMLTable() hands back a list; pull out its first element so we work
# with the data frame itself (printing the list wrapper also shows a 'NULL'
# line above the table).
html_df <- htmlData[[1]]
class(html_df)
## [1] "data.frame"

# Render the data frame as a striped table.
kable_styling(kable(html_df), bootstrap_options = "striped")
ID Title Author 1 Author 2 Publication Year Publiher Number of Pages
1 Introduction to Modern Cryptography: Principles and Protocols Jonathan Katz Yehuda Lindell 2007 Chapman and Hall/CRC 552
2 An Introduction to Mathematical Cryptography Jeffrey Hoffstein Jill Pipher 2014 Springer 538
3 Thank You for Being Late: An Optimist’s Guide to Thriving in the Age of Accelerations Thomas L. Friedman 2016 Farrar, Straus and Giroux 497

XML

# Location of the XML version of the book list.
xml.url <- "https://raw.githubusercontent.com/bsosnovski/DATA607/master/Week7Assignment/mybooks.xml"

# Download the XML source as text and parse it into a document tree.
xmlData <- getURL(url = xml.url)
xml_parsed <- xmlParse(file = xmlData)

# Show the parsed document tree.
print(xml_parsed)
## <?xml version="1.0" encoding="UTF-8"?>
## <book_list>
##   <book id="1">
##     <id>1</id>
##     <title>Introduction to Modern Cryptography: Principles and Protocols</title>
##     <author1>Jonathan Katz</author1>
##     <author2>Yehuda Lindell</author2>
##     <year>2007</year>
##     <publiher>Chapman and Hall/CRC</publiher>
##     <pages>552</pages>
##   </book>
##   <book id="2">
##     <id>2</id>
##     <title>An Introduction to Mathematical Cryptography</title>
##     <author1>Jeffrey Hoffstein</author1>
##     <author2>Jill Pipher</author2>
##     <year>2014</year>
##     <publiher>Springer</publiher>
##     <pages>538</pages>
##   </book>
##   <book id="3">
##     <id>3</id>
##     <title>Thank You for Being Late: An Optimist' s Guide to Thriving in the Age of Accelerations</title>
##     <author1>Thomas L. Friedman</author1>
##     <author2/>
##     <year>2016</year>
##     <publiher>Farrar, Straus and Giroux</publiher>
##     <pages>497</pages>
##   </book>
## </book_list>
## 
# Inspect the class of the parsed XML document.
class(xml_parsed)
## [1] "XMLInternalDocument" "XMLAbstractDocument"

# Flatten the <book> records into a data frame, one row per book.
xml_df <- xmlToDataFrame(doc = xml_parsed)

# Replace the XML tag names with descriptive column headers.
xml_labels <- c(
  "ID", "Title", "Author 1", "Author 2",
  "Publication Year", "Publisher", "Number of Pages"
)
names(xml_df) <- xml_labels
class(xml_df)
## [1] "data.frame"

# Render the data frame as a striped table.
kable_styling(kable(xml_df), bootstrap_options = "striped")
ID Title Author 1 Author 2 Publication Year Publisher Number of Pages
1 Introduction to Modern Cryptography: Principles and Protocols Jonathan Katz Yehuda Lindell 2007 Chapman and Hall/CRC 552
2 An Introduction to Mathematical Cryptography Jeffrey Hoffstein Jill Pipher 2014 Springer 538
3 Thank You for Being Late: An Optimist’ s Guide to Thriving in the Age of Accelerations Thomas L. Friedman 2016 Farrar, Straus and Giroux 497

JSON

# Location of the JSON version of the book list.
json.url <- "https://raw.githubusercontent.com/bsosnovski/DATA607/master/Week7Assignment/mybooks.json"

# Download the JSON source as text and show it with indentation.
jsonData <- getURL(url = json.url)
prettify(txt = jsonData)
## {
##     "Books": [
##         {
##             "id": 1,
##             "title": "Introduction to Modern Cryptography: Principles and Protocols",
##             "author1": "Jonathan Katz",
##             "author2": "Yehuda Lindell",
##             "year": 2007,
##             "publisher": "Chapman and Hall/CRC",
##             "pages": 552
##         },
##         {
##             "id": 2,
##             "title": "An Introduction to Mathematical Cryptography",
##             "author1": "Jeffrey Hoffstein",
##             "author2": "Jill Pipher",
##             "year": 2014,
##             "publisher": "Springer",
##             "pages": 538
##         },
##         {
##             "id": 3,
##             "title": "Thank You for Being Late: An Optimist&apos; s Guide to Thriving in the Age of Accelerations",
##             "author1": "Thomas L. Friedman",
##             "author2": "",
##             "year": 2016,
##             "publisher": "Farrar, Straus and Giroux",
##             "pages": 497
##         }
##     ]
## }
## 
# Convert the JSON text into R objects and display the result.
json_parsed <- fromJSON(txt = jsonData)
print(json_parsed)
## $Books
##   id
## 1  1
## 2  2
## 3  3
##                                                                                         title
## 1                               Introduction to Modern Cryptography: Principles and Protocols
## 2                                                An Introduction to Mathematical Cryptography
## 3 Thank You for Being Late: An Optimist&apos; s Guide to Thriving in the Age of Accelerations
##              author1        author2 year                 publisher pages
## 1      Jonathan Katz Yehuda Lindell 2007      Chapman and Hall/CRC   552
## 2  Jeffrey Hoffstein    Jill Pipher 2014                  Springer   538
## 3 Thomas L. Friedman                2016 Farrar, Straus and Giroux   497
# fromJSON() returned a list (holding the "Books" table), not a data frame.
class(json_parsed)
## [1] "list"

# Flatten that list into a single data frame.
json_df <- data.frame(json_parsed)

# Replace the generated column names with descriptive column headers.
json_labels <- c(
  "ID", "Title", "Author 1", "Author 2",
  "Publication Year", "Publisher", "Number of Pages"
)
names(json_df) <- json_labels
class(json_df)
## [1] "data.frame"

# Render the data frame as a striped table.
kable_styling(kable(json_df), bootstrap_options = "striped")
ID Title Author 1 Author 2 Publication Year Publisher Number of Pages
1 Introduction to Modern Cryptography: Principles and Protocols Jonathan Katz Yehuda Lindell 2007 Chapman and Hall/CRC 552
2 An Introduction to Mathematical Cryptography Jeffrey Hoffstein Jill Pipher 2014 Springer 538
3 Thank You for Being Late: An Optimist&apos; s Guide to Thriving in the Age of Accelerations Thomas L. Friedman 2016 Farrar, Straus and Giroux 497

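The data frames had different column names; after renaming the XML and JSON columns, the three tables look similar. However, as R objects they are not identical: for example, the HTML table keeps the misspelled header "Publiher", the JSON loader stores the ID, year, and page count as numbers rather than text, and the apostrophe in the third title is encoded differently in each source file.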

# Pairwise exact comparison of the three data frames. identical() is strict:
# any difference in column names, column types, or cell contents gives FALSE.

# HTML vs. XML (the HTML header row spells the column "Publiher").
identical(x = html_df, y = xml_df)
## [1] FALSE

# HTML vs. JSON.
identical(x = html_df, y = json_df)
## [1] FALSE

# XML vs. JSON (the third title is written differently in the two files).
identical(x = xml_df, y = json_df)
## [1] FALSE