Below is the summary of the assignment -
Load necessary libraries -
library(RCurl)
library(kableExtra)
library(stringr)
library(plyr)
library(XML)
library(jsonlite)
library(rvest)
Below are the steps to be followed to generate the final output -
#Reading in the HTML Code
htmlURL <- "http://htmlpreview.github.io/?https://github.com/soumya2g/R-CUNY-MSDS/blob/master/DATA-607/Data%20Import%20from%20XML%20%26%20JSON/Source%20Files/books.html"
#Parsing the Code using htmlParse
parsedHTML <- htmlParse(file = htmlURL)
parsedHTML
## <!DOCTYPE html>
## <html>
## <head>
## <meta charset="utf-8">
## <title>GitHub & BitBucket HTML Preview</title>
## <style>
## body {
## font: 12px 'Helvetica Neue', Helvetica, Arial, freesans, clean, sans-serif;
## color: #333;
## }
## h1 {
## font-size: 20px;
## }
## a {
## color: #666;
## }
## form {
## padding: 20px;
## text-align: center;
## }
## strong {
## color: #333;
## background-color: #FAFFA6;
## padding: 0.1em;
## }
## #footer {
## margin: 20px 0;
## font-size: 10px;
## color: #666;
## }
## </style>
## </head>
## <body>
## <form id="previewform" action="">
## <h1>GitHub & BitBucket HTML Preview</h1>
## <p>Enter URL of the HTML file to preview: <input type="url" id="file" value="" placeholder="e.g. https://github.com/user/repo/blob/master/index.html" size="60" autofocus><input type="submit" value="»"></p>
## <p>or prepend to the URL: <strong>http://htmlpreview.github.io/?</strong>https://github.com/twbs/bootstrap/blob/gh-pages/2.3.2/index.html</p>
## <p>or use this bookmarklet while browsing GitHub or BitBucket: <a href="javascript:void('http://htmlpreview.github.io/'==window.location?alert('Drag%20me%20to%20your%20bookmarks%20bar!'):window.location='http://htmlpreview.github.io/?'+window.location)"><strong>HTMLPreview</strong></a></p>
## <p id="footer">Developed by <a href="https://github.com/niutech">niu tech</a> | Contribute on <a href="https://github.com/htmlpreview/htmlpreview.github.com">GitHub</a></p>
## </form>
## <script src="/htmlpreview.min.js"></script><script>HTMLPreview.init();</script>
## </body>
## </html>
##
#Reading in the XML Code
xmlData <- getURL('https://raw.githubusercontent.com/soumya2g/R-CUNY-MSDS/master/DATA-607/Data%20Import%20from%20XML%20%26%20JSON/Source%20Files/books.xml', ssl.verifyhost=FALSE, ssl.verifypeer=FALSE)
#Parsing the Code using xmlParse
parsedXML <- xmlParse(file = xmlData[1])
parsedXML
## <?xml version="1.0" encoding="ISO-8859-1"?>
## <dwh_books>
## <book id="1">
## <title>Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema</title>
## <authors>
## <author>Jim Stagnitto</author>
## <author>Lawrence Corr</author>
## </authors>
## <publish_date>April 16, 2011</publish_date>
## <price>$35.37</price>
## <publisher>Decision Press</publisher>
## </book>
## <book id="2">
## <title>The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling</title>
## <authors>
## <author>Ralph Kimball</author>
## </authors>
## <publish_date>March 16, 1996</publish_date>
## <price>$16.97</price>
## <publisher>Wiley</publisher>
## </book>
## <book id="3">
## <title>Data Warehouse Systems: Design and Implementation</title>
## <authors>
## <author>Alejandro Vaisman</author>
## <author>Esteban Zimányi</author>
## </authors>
## <publish_date>September 10, 2014</publish_date>
## <price>$89.99</price>
## <publisher>Springer</publisher>
## </book>
## </dwh_books>
##
#Reading in the JSON Code
jsonURL <- "https://raw.githubusercontent.com/soumya2g/R-CUNY-MSDS/master/DATA-607/Data%20Import%20from%20XML%20%26%20JSON/Source%20Files/books.json"
#Parsing the Code using fromJSON
parsedJSON <- fromJSON(jsonURL)
parsedJSON
## $dwh_books
## book#
## 1 1
## 2 2
## 3 3
## title
## 1 Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema
## 2 The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling
## 3 Data Warehouse Systems: Design and Implementation
## authors publish_date price
## 1 Jim Stagnitto, Lawrence Corr April 16, 2011 $35.37
## 2 Ralph Kimball March 16, 1996 $16.97
## 3 Alejandro Vaisman, Esteban Zimányi September 10, 2014 $89.99
## publisher
## 1 Decision Press
## 2 Wiley
## 3 Springer
### rvest Method ####
htmlURL <- "https://raw.githubusercontent.com/soumya2g/R-CUNY-MSDS/master/DATA-607/Data%20Import%20from%20XML%20%26%20JSON/Source%20Files/books.html"
#Parsing the Code using htmlParse
readHtml <- read_html(htmlURL)
tables <- html_nodes(readHtml,"table")
tables_ls <- html_table(tables, fill = TRUE)
tableDF <- as.data.frame(tables_ls)
head(tableDF) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
Book. | Title | Author.s. | Published.On | Price | Publisher |
---|---|---|---|---|---|
1 | Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema | Jim Stagnitto, Lawrence Corr | April 16, 2011 | $35.37 | Decision Press |
2 | The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling | Ralph Kimball | March 16, 1996 | $16.97 | Wiley |
3 | Data Warehouse Systems: Design and Implementation | Alejandro Vaisman, Esteban Zimányi | September 10, 2014 | $89.99 | Springer |
#Extract XML file's root
root <- xmlRoot(parsedXML)
#Loading into Data Frame
booksXMLDF <- xmlToDataFrame(root)
head(booksXMLDF) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
title | authors | publish_date | price | publisher |
---|---|---|---|---|
Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema | Jim StagnittoLawrence Corr | April 16, 2011 | $35.37 | Decision Press |
The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling | Ralph Kimball | March 16, 1996 | $16.97 | Wiley |
Data Warehouse Systems: Design and Implementation | Alejandro VaismanEsteban Zimányi | September 10, 2014 | $89.99 | Springer |
I had to extract the ‘id’ attribute of the ‘book’ tag as part of the data frame. Also, for the books with multiple authors, the values from the child tag ‘author’ have been concatenated. I have converted those concatenated strings into comma separated values.
### Create commas separated list for books with multiple authors
authorsList <- unlist(lapply((xpathSApply(parsedXML,"//authors",fun=xmlToList)), function(X){
paste(unlist(X), collapse = ", ")
} ))
booksXMLDF$authors <- authorsList
### Extract 'id' Tag attribute
ids <- as.data.frame(xmlSApply(root, xmlGetAttr, "id"))
names(ids) <- "book#"
### Merge using cbind
booksXMLDF <- cbind(ids, booksXMLDF)
head(booksXMLDF) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
book# | title | authors | publish_date | price | publisher |
---|---|---|---|---|---|
1 | Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema | Jim Stagnitto, Lawrence Corr | April 16, 2011 | $35.37 | Decision Press |
2 | The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling | Ralph Kimball | March 16, 1996 | $16.97 | Wiley |
3 | Data Warehouse Systems: Design and Implementation | Alejandro Vaisman, Esteban Zimányi | September 10, 2014 | $89.99 | Springer |
#Parsing the Code using fromJSON
booksJSONDF <- parsedJSON$dwh_books
booksJSONDF <- as.data.frame(booksJSONDF)
head(booksJSONDF) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
book# | title | authors | publish_date | price | publisher |
---|---|---|---|---|---|
1 | Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema | c(“Jim Stagnitto”, “Lawrence Corr”) | April 16, 2011 | $35.37 | Decision Press |
2 | The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling | Ralph Kimball | March 16, 1996 | $16.97 | Wiley |
3 | Data Warehouse Systems: Design and Implementation | c(“Alejandro Vaisman”, “Esteban Zimányi”) | September 10, 2014 | $89.99 | Springer |
Converted the Character vector created for books with muliple authors into comma seprated list :
authorsList <- unlist(lapply(booksJSONDF$authors, function(X){
paste(unlist(X), collapse = ", ")
} ))
booksJSONDF$authors <- authorsList
head(booksJSONDF) %>% kable() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% scroll_box(width="100%",height="300px")
book# | title | authors | publish_date | price | publisher |
---|---|---|---|---|---|
1 | Agile Data Warehouse Design: Collaborative Dimensional Modeling, from Whiteboard to Star Schema | Jim Stagnitto, Lawrence Corr | April 16, 2011 | $35.37 | Decision Press |
2 | The Data Warehouse Toolkit: The Complete Guide to Dimensional Modeling | Ralph Kimball | March 16, 1996 | $16.97 | Wiley |
3 | Data Warehouse Systems: Design and Implementation | Alejandro Vaisman, Esteban Zimányi | September 10, 2014 | $89.99 | Springer |
The three data frames generated from the .html, ,xml and .json files are identical.