Assignment Summary:

Below is the summary of the assignment -

R Libraries:

Load necessary libraries -

library(RCurl)
library(kableExtra)
library(stringr)
library(plyr)
library(XML)
library(jsonlite)
library(rvest)

1. Source Files (.html, .xml & .json) Creation :

Below are the steps to be followed to generate the final output -

  1. I created the following files by hand containing the details of my favourite Data Warehouse books:

1.1. books.html:

1.2. books.xml

1.3. books.json

2. Data import from Files :

  1. Import data into R:

2.1 HTML File:

#Reading in the HTML Code
htmlURL <- "http://htmlpreview.github.io/?https://github.com/soumya2g/R-CUNY-MSDS/blob/master/DATA-607/Data%20Import%20from%20XML%20%26%20JSON/Source%20Files/books.html"


#Parsing the Code using htmlParse
parsedHTML <- htmlParse(file = htmlURL)
parsedHTML
## <!DOCTYPE html>
## <html>
## <head>
## <meta charset="utf-8">
## <title>GitHub &amp; BitBucket HTML Preview</title>
## <style>
##  body {
##      font: 12px 'Helvetica Neue', Helvetica, Arial, freesans, clean, sans-serif;
##      color: #333;
##  }
##  h1 {
##      font-size: 20px;
##  }
##  a {
##      color: #666;
##  }
##  form {
##      padding: 20px; 
##      text-align: center;
##  }
##  strong {
##      color: #333;
##      background-color: #FAFFA6;
##      padding: 0.1em;
##  }
##  #footer {
##      margin: 20px 0;
##      font-size: 10px;
##      color: #666;
##  }
##  </style>
## </head>
## <body>
##  <form id="previewform" action="">
##      <h1>GitHub &amp; BitBucket HTML Preview</h1>
##      <p>Enter URL of the HTML file to preview: <input type="url" id="file" value="" placeholder="e.g. https://github.com/user/repo/blob/master/index.html" size="60" autofocus><input type="submit" value="»"></p>
##      <p>or prepend to the URL: <strong>http://htmlpreview.github.io/?</strong>https://github.com/twbs/bootstrap/blob/gh-pages/2.3.2/index.html</p>
##      <p>or use this bookmarklet while browsing GitHub or BitBucket: <a href="javascript:void('http://htmlpreview.github.io/'==window.location?alert('Drag%20me%20to%20your%20bookmarks%20bar!'):window.location='http://htmlpreview.github.io/?'+window.location)"><strong>HTMLPreview</strong></a></p>
##      <p id="footer">Developed by <a href="https://github.com/niutech">niu tech</a> | Contribute on <a href="https://github.com/htmlpreview/htmlpreview.github.com">GitHub</a></p>
##  </form>
##  <script src="/htmlpreview.min.js"></script><script>HTMLPreview.init();</script>
## </body>
## </html>
## 

2.2 XML File:

#Reading in the XML Code

xmlData <- getURL('https://raw.githubusercontent.com/soumya2g/R-CUNY-MSDS/master/DATA-607/Data%20Import%20from%20XML%20%26%20JSON/Source%20Files/books.xml', ssl.verifyhost=FALSE, ssl.verifypeer=FALSE)

#Parsing the Code using xmlParse
parsedXML <- xmlParse(file = xmlData[1])
parsedXML
## <?xml version="1.0" encoding="ISO-8859-1"?>
## <dwh_books>
##   <book id="1">
##     <title>Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema</title>
##     <authors>
##       <author>Jim Stagnitto</author>
##       <author>Lawrence Corr</author>
##     </authors>
##     <publish_date>April 16, 2011</publish_date>
##     <price>$35.37</price>
##     <publisher>Decision Press</publisher>
##   </book>
##   <book id="2">
##     <title>The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling</title>
##     <authors>
##       <author>Ralph Kimball</author>
##     </authors>
##     <publish_date>March 16, 1996</publish_date>
##     <price>$16.97</price>
##     <publisher>Wiley</publisher>
##   </book>
##   <book id="3">
##     <title>Data Warehouse Systems: Design and Implementation</title>
##     <authors>
##       <author>Alejandro Vaisman</author>
##       <author>Esteban Zimányi</author>
##     </authors>
##     <publish_date>September 10, 2014</publish_date>
##     <price>$89.99</price>
##     <publisher>Springer</publisher>
##   </book>
## </dwh_books>
## 

2.3 JSON File:

#Reading in the JSON Code
jsonURL <- "https://raw.githubusercontent.com/soumya2g/R-CUNY-MSDS/master/DATA-607/Data%20Import%20from%20XML%20%26%20JSON/Source%20Files/books.json"

#Parsing the Code using fromJSON
parsedJSON <- fromJSON(jsonURL)
parsedJSON
## $dwh_books
##   book#
## 1     1
## 2     2
## 3     3
##                                                                                             title
## 1 Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema
## 2                          The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling
## 3                                               Data Warehouse Systems: Design and Implementation
##                               authors       publish_date  price
## 1        Jim Stagnitto, Lawrence Corr     April 16, 2011 $35.37
## 2                       Ralph Kimball     March 16, 1996 $16.97
## 3 Alejandro Vaisman, Esteban Zimányi September 10, 2014 $89.99
##        publisher
## 1 Decision Press
## 2          Wiley
## 3       Springer

3. Parsing data and loading into data frames :

  1. Load parsed source files into R data frames:

3.1 HTML File:

### rvest Method ####
htmlURL <- "https://raw.githubusercontent.com/soumya2g/R-CUNY-MSDS/master/DATA-607/Data%20Import%20from%20XML%20%26%20JSON/Source%20Files/books.html"


#Parsing the Code using htmlParse
readHtml <- read_html(htmlURL)
tables <- html_nodes(readHtml,"table")
tables_ls <- html_table(tables, fill = TRUE)
tableDF <- as.data.frame(tables_ls)


head(tableDF) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
Book. Title Author.s. Published.On Price Publisher
1 Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema Jim Stagnitto, Lawrence Corr April 16, 2011 $35.37 Decision Press
2 The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling Ralph Kimball March 16, 1996 $16.97 Wiley
3 Data Warehouse Systems: Design and Implementation Alejandro Vaisman, Esteban Zimányi September 10, 2014 $89.99 Springer

3.2 XML File:

#Extract XML file's root
root <- xmlRoot(parsedXML)

#Loading into Data Frame
booksXMLDF <- xmlToDataFrame(root)

head(booksXMLDF) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
title authors publish_date price publisher
Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema Jim StagnittoLawrence Corr April 16, 2011 $35.37 Decision Press
The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling Ralph Kimball March 16, 1996 $16.97 Wiley
Data Warehouse Systems: Design and Implementation Alejandro VaismanEsteban Zimányi September 10, 2014 $89.99 Springer

I had to extract the ‘id’ attribute of the ‘book’ tag as part of the data frame. Also, for the books with multiple authors, the values from the child tag ‘author’ have been concatenated. I have converted those concatenated strings into comma separated values.

### Create commas separated list for books with multiple authors
authorsList <- unlist(lapply((xpathSApply(parsedXML,"//authors",fun=xmlToList)), function(X){
  paste(unlist(X), collapse = ", ")
} ))

booksXMLDF$authors <- authorsList

### Extract 'id' Tag attribute
ids <- as.data.frame(xmlSApply(root, xmlGetAttr, "id"))
names(ids) <- "book#"

### Merge using cbind
booksXMLDF <- cbind(ids, booksXMLDF)

head(booksXMLDF) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
book# title authors publish_date price publisher
1 Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema Jim Stagnitto, Lawrence Corr April 16, 2011 $35.37 Decision Press
2 The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling Ralph Kimball March 16, 1996 $16.97 Wiley
3 Data Warehouse Systems: Design and Implementation Alejandro Vaisman, Esteban Zimányi September 10, 2014 $89.99 Springer

3.3 JSON File:

#Parsing the Code using fromJSON
booksJSONDF <- parsedJSON$dwh_books
booksJSONDF <- as.data.frame(booksJSONDF)

head(booksJSONDF) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
book# title authors publish_date price publisher
1 Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema c(“Jim Stagnitto”, “Lawrence Corr”) April 16, 2011 $35.37 Decision Press
2 The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling Ralph Kimball March 16, 1996 $16.97 Wiley
3 Data Warehouse Systems: Design and Implementation c(“Alejandro Vaisman”, “Esteban Zimányi”) September 10, 2014 $89.99 Springer

Converted the Character vector created for books with muliple authors into comma seprated list :

authorsList <- unlist(lapply(booksJSONDF$authors, function(X){
  paste(unlist(X), collapse = ", ")
} ))

booksJSONDF$authors <- authorsList

head(booksJSONDF) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
book# title authors publish_date price publisher
1 Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema Jim Stagnitto, Lawrence Corr April 16, 2011 $35.37 Decision Press
2 The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling Ralph Kimball March 16, 1996 $16.97 Wiley
3 Data Warehouse Systems: Design and Implementation Alejandro Vaisman, Esteban Zimányi September 10, 2014 $89.99 Springer

Conclusion:

The three data frames generated from the .html, ,xml and .json files are identical.