Activity Definition:

Activity Definition

Setup:

#Load the required libraries for data analysis.
library(RCurl)
## Loading required package: bitops
library(XML)
library(rjson)

Files: books.xml, books.html, books.json

# Remote locations (raw GitHub URLs) of the books.xml, books.html and books.json files.
xml.url <- "https://raw.githubusercontent.com/psumank/DAMgmt/master/Assignment_9/books.xml"
html.url <- "https://raw.githubusercontent.com/psumank/DAMgmt/master/Assignment_9/books.html"
json.url <- "https://raw.githubusercontent.com/psumank/DAMgmt/master/Assignment_9/books.json"

Analysis:

1. Read books.xml into a data frame.

# Download "books.xml" into a local file, replacing any copy left by an earlier run.
fn <- "books_suman.xml"
if (file.exists(fn)) file.remove(fn)
# download.file() returns a non-zero status when the transfer fails; stop here
# rather than letting the parser fail later on a missing or partial file.
if (download.file(url = xml.url, destfile = fn, method = "curl") != 0) {
  stop("Download of ", xml.url, " failed", call. = FALSE)
}

# Parse the downloaded XML document (same path as the download target).
booksXml <- xmlParse(file = fn)

# Build one data-frame row per <book> element and stack the rows with rbind().
# The outer parentheses print the resulting data frame in addition to assigning it.
(dfXml <- do.call(rbind, xpathApply(booksXml, "/books/book", function(node) {
  # Text of every <Author> under <Authors>; vapply() always yields a character
  # vector, including when a book has no <Author> children.
  authors <- vapply(getNodeSet(node, "./Authors/Author"), xmlValue, character(1))

  data.frame(
    Title = xmlValue(node[["Title"]]),
    # All authors of a book are joined into a single comma-separated string.
    Authors = paste(authors, collapse = ","),
    Publisher = xmlValue(node[["Publisher"]]),
    # A missing ISBN attribute would otherwise give NULL, which data.frame()
    # drops, leaving rows with different column counts for rbind().
    ISBN = xmlGetAttr(node, "ISBN", default = NA_character_),
    stringsAsFactors = FALSE
  )
})))
# Review the structure of the data frame.
str(dfXml)
## 'data.frame':    3 obs. of  4 variables:
##  $ Title    : chr  "Design Patterns" "Agile Principles,Patterns,and Practices in C#" "Practical J2EE"
##  $ Authors  : chr  "Eric Gamma,Richard Helm,Ralph Johnson,John Vlissides" "Robert C.Martin,Micah Martin" "Nadir Gulzar"
##  $ Publisher: chr  "Pearson Education" "Prentice Hall" "dreamTech Press"
##  $ ISBN     : chr  "81-7808-135-0" "0-13-185725-8" "81-7722-331-3"

2. Read books.html into a data frame.

# Download "books.html" into a local file, replacing any copy left by an earlier run.
fn <- "books_suman.html"
if (file.exists(fn)) file.remove(fn)
# download.file() returns a non-zero status when the transfer fails; stop here
# rather than letting the parser fail later on a missing or partial file.
if (download.file(url = html.url, destfile = fn, method = "curl") != 0) {
  stop("Download of ", html.url, " failed", call. = FALSE)
}

# Parse the downloaded HTML document (same path as the download target).
booksHtml <- htmlParse(file = fn)

# Read the first table of the parsed page into a data frame,
# keeping text columns as character rather than factor.
dfHtmlTable <- readHTMLTable(doc = booksHtml, which = 1, stringsAsFactors = FALSE)
# Inspect the structure of the resulting data frame.
str(dfHtmlTable)
## 'data.frame':    3 obs. of  4 variables:
##  $ Title    : chr  "Design Patterns" "Agile Principles,Patterns,and Practices in C#" "Practical J2EE"
##  $ Authors  : chr  "Eric Gamma,Richard Helm,Ralph Johnson,John Vlissides" "Robert C.Martin,Micah Martin" "Nadir Gulzar"
##  $ Publisher: chr  "Pearson Education" "Prentice Hall" "dreamTech Press"
##  $ ISBN     : chr  "81-7808-135-0" "0-13-185725-8" "81-7722-331-3"

3. Read books.json into a data frame.

# Download "books.json" into a local file, replacing any copy left by an earlier run.
fn <- "books_suman.json"
if (file.exists(fn)) file.remove(fn)
# download.file() returns a non-zero status when the transfer fails; stop here
# rather than letting the parser fail later on a missing or partial file.
if (download.file(url = json.url, destfile = fn, method = "curl") != 0) {
  stop("Download of ", json.url, " failed", call. = FALSE)
}

# Parse the downloaded JSON file (same path as the download target).
json_file <- fromJSON(file = fn)

# Build one data-frame row per book document in the books collection and stack
# the rows with rbind(). The outer parentheses print the result as well as assigning it.
(dfJSON <- do.call(rbind, lapply(json_file$books$book, function(node) {
  # Fetch a field from the book; fall back to NA when the key is absent, because
  # a NULL column is dropped by data.frame() and would break rbind().
  field <- function(name) {
    value <- node[[name]]
    if (is.null(value)) NA_character_ else value
  }

  data.frame(
    Title = field("Title"),
    # The elements of the 'Authors' array are joined into one comma-separated string.
    Authors = paste(node[["Authors"]], collapse = ","),
    Publisher = field("Publisher"),
    ISBN = field("ISBN"),
    stringsAsFactors = FALSE
  )
})))
# Review the structure of the data frame.
str(dfJSON)
## 'data.frame':    3 obs. of  4 variables:
##  $ Title    : chr  "Design Patterns" "Agile Principles,Patterns,and Practices in C#" "Practical J2EE"
##  $ Authors  : chr  "Eric Gamma,Richard Helm,Ralph Johnson,John Vlissides" "Robert C.Martin,Micah Martin" "Nadir Gulzar"
##  $ Publisher: chr  "Pearson Education" "Prentice Hall" "dreamTech Press"
##  $ ISBN     : chr  "81-7808-135-0" "0-13-185725-8" "81-7722-331-3"

4. Let's make sure that the three data frames prepared above from XML, HTML and JSON are identical.

# Compare the XML-derived data frame with the HTML-derived one.
identical(dfXml, dfHtmlTable)
## [1] TRUE
# Compare the XML-derived data frame with the JSON-derived one.
identical(dfXml, dfJSON)
## [1] TRUE