#Load the required libraries for data analysis.
library(RCurl)
## Loading required package: bitops
library(XML)
library(rjson)
Files: books.xml, books.html, books.json
# Locations of the three sample files. They sit in the same GitHub folder and
# differ only by extension, so the HTML and JSON addresses are derived from the
# XML one by swapping the trailing extension.
xml.url <- "https://raw.githubusercontent.com/psumank/DAMgmt/master/Assignment_9/books.xml"
html.url <- sub("\\.xml$", ".html", xml.url)
json.url <- sub("\\.xml$", ".json", xml.url)
1. Read books.xml into a data frame.
# Download "books.xml" into a local file, removing any copy left by an earlier
# run. The same `fn` is used for removal, download and parsing so the three
# steps can never point at different files.
fn <- "books_suman.xml"
if (file.exists(fn)) file.remove(fn)
download.file(url = xml.url, destfile = fn, method = "curl")
# Parse the downloaded XML document.
booksXml <- xmlParse(file = fn)
# Build one single-row data frame per /books/book node and stack the rows.
# The <Author> children of <Authors> are joined into one comma-separated string.
# The outer parentheses print the resulting data frame.
(dfXml <- do.call(rbind, xpathApply(booksXml, "/books/book", function(node) {
  Title <- xmlValue(node[["Title"]])
  # vapply() always yields a character vector, even when no <Author> is found.
  Authors <- paste(
    vapply(getNodeSet(node, "./Authors/Author"), xmlValue, character(1)),
    collapse = ","
  )
  Publisher <- xmlValue(node[["Publisher"]])
  # ISBN is an attribute of <book>. Fall back to NA when it is absent; a NULL
  # here would prevent this row from lining up with the others.
  ISBN <- xmlGetAttr(node, "ISBN", default = NA_character_)
  data.frame(Title, Authors, Publisher, ISBN, stringsAsFactors = FALSE)
})))
# Review the structure of the data frame.
str(dfXml)
## 'data.frame': 3 obs. of 4 variables:
## $ Title : chr "Design Patterns" "Agile Principles,Patterns,and Practices in C#" "Practical J2EE"
## $ Authors : chr "Eric Gamma,Richard Helm,Ralph Johnson,John Vlissides" "Robert C.Martin,Micah Martin" "Nadir Gulzar"
## $ Publisher: chr "Pearson Education" "Prentice Hall" "dreamTech Press"
## $ ISBN : chr "81-7808-135-0" "0-13-185725-8" "81-7722-331-3"
2. Read books.html into a data frame.
# Download "books.html" into a local file, removing any copy left by an earlier
# run. The same `fn` is used for removal, download and parsing so the three
# steps can never point at different files.
fn <- "books_suman.html"
if (file.exists(fn)) file.remove(fn)
download.file(url = html.url, destfile = fn, method = "curl")
# Parse the downloaded HTML document.
booksHtml <- htmlParse(file = fn)
# Read the first HTML table in the document, keeping text columns as character.
dfHtmlTable <- readHTMLTable(doc = booksHtml, which = 1, stringsAsFactors = FALSE)
# Review the structure of the data frame.
str(dfHtmlTable)
## 'data.frame': 3 obs. of 4 variables:
## $ Title : chr "Design Patterns" "Agile Principles,Patterns,and Practices in C#" "Practical J2EE"
## $ Authors : chr "Eric Gamma,Richard Helm,Ralph Johnson,John Vlissides" "Robert C.Martin,Micah Martin" "Nadir Gulzar"
## $ Publisher: chr "Pearson Education" "Prentice Hall" "dreamTech Press"
## $ ISBN : chr "81-7808-135-0" "0-13-185725-8" "81-7722-331-3"
3. Read books.json into a data frame.
# Download "books.json" into a local file, removing any copy left by an earlier
# run. The same `fn` is used for removal, download and parsing so the three
# steps can never point at different files.
fn <- "books_suman.json"
if (file.exists(fn)) file.remove(fn)
download.file(url = json.url, destfile = fn, method = "curl")
# Parse the downloaded JSON file.
json_file <- fromJSON(file = fn)
# Build one single-row data frame per book entry within the books collection
# and stack the rows. The elements of each book's "Authors" array are joined
# into one comma-separated string. `[[` is used instead of `$` so element names
# are matched exactly rather than partially.
# The outer parentheses print the resulting data frame.
(dfJSON <- do.call(rbind, lapply(json_file[["books"]][["book"]], function(node) {
  Title <- node[["Title"]]
  Authors <- paste(node[["Authors"]], collapse = ",")
  Publisher <- node[["Publisher"]]
  ISBN <- node[["ISBN"]]
  data.frame(Title, Authors, Publisher, ISBN, stringsAsFactors = FALSE)
})))
# Review the structure of the data frame.
str(dfJSON)
## 'data.frame': 3 obs. of 4 variables:
## $ Title : chr "Design Patterns" "Agile Principles,Patterns,and Practices in C#" "Practical J2EE"
## $ Authors : chr "Eric Gamma,Richard Helm,Ralph Johnson,John Vlissides" "Robert C.Martin,Micah Martin" "Nadir Gulzar"
## $ Publisher: chr "Pearson Education" "Prentice Hall" "dreamTech Press"
## $ ISBN : chr "81-7808-135-0" "0-13-185725-8" "81-7722-331-3"
4. Let's make sure that the three data frames prepared from XML, HTML and JSON are identical.
# The XML-derived frame must match the HTML-derived one exactly.
identical(x = dfXml, y = dfHtmlTable)
## [1] TRUE
# The XML-derived frame must also match the JSON-derived one exactly.
identical(x = dfXml, y = dfJSON)
## [1] TRUE