Assignment - Working with XML and JSON in R

Libraries Used:

library(RCurl)

## Warning: package 'RCurl' was built under R version 3.5.3

## Loading required package: bitops

library(kableExtra)

## Warning: package 'kableExtra' was built under R version 3.5.3

library(stringr)

## Warning: package 'stringr' was built under R version 3.5.3

library(plyr)
library(XML)
library(jsonlite)

## Warning: package 'jsonlite' was built under R version 3.5.3

library(rvest)

## Warning: package 'rvest' was built under R version 3.5.3

## Loading required package: xml2

## Warning: package 'xml2' was built under R version 3.5.3

## 
## Attaching package: 'rvest'

## The following object is masked from 'package:XML':
## 
##     xml

library(knitr)

## Warning: package 'knitr' was built under R version 3.5.3

library(png)
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following object is masked from 'package:kableExtra':
## 
##     group_rows

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

HTML Source

# Local image file displayed under the "HTML Source" heading.
imgage <- "C:/Users/jpsim/Documents/DATA Acquisition and Management/html.png"
# Embed the image in the knitted report.
knitr::include_graphics(path = imgage)

XML Source

# Local image file displayed under the "XML Source" heading.
imgage <- "C:/Users/jpsim/Documents/DATA Acquisition and Management/xml.png"
# Embed the image in the knitted report.
knitr::include_graphics(path = imgage)

JSON Source

# Local image file displayed under the "JSON Source" heading.
imgage <- "C:/Users/jpsim/Documents/DATA Acquisition and Management/json.png"
# Embed the image in the knitted report.
knitr::include_graphics(path = imgage)

HTML Import

# Download the raw HTML book table from GitHub and parse it into an
# internal-node document with the XML package.
htmlurl <- "https://raw.githubusercontent.com/josephsimone/DATA607/master/ds_books.html"
htmldata <- RCurl::getURL(url = htmlurl)
html_doc <- XML::htmlParse(file = htmldata, useInternalNodes = TRUE)
# Print the parsed document so it appears in the report.
html_doc

## <!DOCTYPE html>
## <html>
## <head><title>Book Table</title></head>
## <body>
## 
## 
## 
## <table border="6" width="75%" cellpadding="4" cellspacing="3">
## <tr>
## <th colspan="6">
## <br><h3>Data Science Books</h3>
##       </th>
##    </tr>
## <tr>
## <th>Book #</th>
##       <th>Title</th>
##    <th>Author(s)</th>
##       <th>Published On</th>
##       <th>Price</th>
##       <th>Publisher</th>
##    </tr>
## <tr align="CENTER">
## <td>Book #1</td>
##       <td>R for Data Science: Import, Tidy, transform, Visualiza, and Model Data</td>
##    <td>Hadley Wickham, Garraett Grolemund </td>
##       <td>December 2016</td>
##    <td>$18.17</td>
##       <td>O'Reilly</td>
##    </tr>
## <tr align="CENTER">
## <td>Book #2</td>
##       <td>An Introduction to Statistical Learning: with Applications in R</td>
##    <td>Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani</td>
##       <td>September 2017</td>
##    <td>$22.99</td>
##       <td>Springer</td>
##    </tr>
## <tr align="CENTER">
## <td>Book #3</td>
##       <td>Advanced R</td>
##    <td>Hadley Wickham</td>
##       <td>September 2014</td>
##    <td>$45.03</td>
##       <td>CRC Press</td>
##    </tr>
## </table>
## </body>
## </html>
##

DataFrame out of HTML Page

# Build a data frame from the <table> in the HTML page and display it.
html_source <- html_doc
readHtml <- read_html(htmlurl)
tables <- html_nodes(readHtml, "table")
tables_ls <- html_table(tables, fill = TRUE)

# The page holds a single table whose first <tr> is a title cell spanning all
# columns, so html_table() takes that title as the column names and returns
# the real header row ("Book #", "Title", ...) as the first data row.
# Promote that row to the column names and drop it from the data.
tableDF <- as.data.frame(tables_ls[[1]], stringsAsFactors = FALSE)
names(tableDF) <- as.character(unlist(tableDF[1, ], use.names = FALSE))
tableDF <- tableDF[-1, , drop = FALSE]
rownames(tableDF) <- NULL

head(tableDF) %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")

Data.Science.Books	Data.Science.Books.1	Data.Science.Books.2	Data.Science.Books.3	Data.Science.Books.4	Data.Science.Books.5
Book #	Title	Author(s)	Published On	Price	Publisher
Book #1	R for Data Science: Import, Tidy, transform, Visualiza, and Model Data	Hadley Wickham, Garraett Grolemund	December 2016	$18.17	O’Reilly
Book #2	An Introduction to Statistical Learning: with Applications in R	Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani	September 2017	$22.99	Springer
Book #3	Advanced R	Hadley Wickham	September 2014	$45.03	CRC Press

XML Import

# Download the raw XML book list from GitHub and parse it into an
# internal-node document with the XML package.
xmlurl <- "https://raw.githubusercontent.com/josephsimone/DATA607/master/ds_books.xml"
xmldata <- RCurl::getURL(url = xmlurl)
doc <- XML::xmlParse(file = xmldata, useInternalNodes = TRUE)
# Print the parsed document so it appears in the report.
doc

## <?xml version="1.0" encoding="ISO-8859-1"?>
## <ds_books>
##   <book id="1"><title>R for Data Science: Import, Tidy, transform, Visualiza, and Model Data</title><authors><author>Hadley Wickham</author><author>Garraett Grolemund </author></authors><publish_date>December 2016</publish_date><price>$18.17</price><publisher>O'Reilly</publisher>]
##  </book>
##   <book id="2">
##     <title>An Introduction to Statistical Learning: with Applications in R</title>
##     <authors>
##       <author>Gareth James</author>
##       <author>Daniela Witten</author>
##       <author>Trevor Hastie</author>
##       <author>Robert Tibshirani </author>
##     </authors>
##     <publish_date>September 2017</publish_date>
##     <price>$22.99</price>
##     <publisher>Springer</publisher>
##   </book>
##   <book id="3">
##     <title>Advanced R</title>
##     <authors>
##       <author>Hadley Wickham</author>
##     </authors>
##     <publish_date>September 2014</publish_date>
##     <price>$45.03</price>
##     <publisher>CRC Press</publisher>
##   </book>
## </ds_books>
##

# Convert the parsed XML document into a data frame with one row per child
# of the root node; each child's sub-elements become columns.
xml_source <- doc
root <- xmlRoot(xml_source)
booksXMLDF <- xmlToDataFrame(root)

Tidying XML import by Authors

# Collapse each book's <author> children into one comma-separated string.
# vapply() keeps the result a character vector regardless of how many
# authors each book has; trimws() removes padding present in the source XML.
author_nodes <- getNodeSet(xml_source, "//authors")
authorsList <- vapply(
  author_nodes,
  function(node) {
    author_names <- unlist(xpathSApply(node, "./author", xmlValue))
    paste(trimws(author_names), collapse = ", ")
  },
  character(1)
)

# xmlToDataFrame() pastes the nested <author> values together with no
# separator, so replace that column with the properly delimited strings.
# Assumes exactly one <authors> element per book, in document order.
stopifnot(length(authorsList) == nrow(booksXMLDF))
booksXMLDF$authors <- authorsList

# Prepend each book's "id" attribute as the "book#" column.
ids <- as.data.frame(xmlSApply(root, xmlGetAttr, "id"))
names(ids) <- "book#"
booksXMLDF <- cbind(ids, booksXMLDF)

# Drop the spurious `text` column (presumably created by the stray "]" after
# the first book's <publisher> element); a no-op if that column is absent.
booksXMLDF <- booksXMLDF[, setdiff(names(booksXMLDF), "text"), drop = FALSE]

DataFrame out of XML Import

# Render the first rows of the XML-derived data frame as a styled,
# scrollable table.
booksXMLDF %>%
  head() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")

book#	title	authors	publish_date	price	publisher
1	R for Data Science: Import, Tidy, transform, Visualiza, and Model Data	Hadley WickhamGarraett Grolemund	December 2016	$18.17	O’Reilly
2	An Introduction to Statistical Learning: with Applications in R	Gareth JamesDaniela WittenTrevor HastieRobert Tibshirani	September 2017	$22.99	Springer
3	Advanced R	Hadley Wickham	September 2014	$45.03	CRC Press

JSON Import

jsonurl <- "https://raw.githubusercontent.com/josephsimone/DATA607/master/ds_books.json"
# Fetch and parse the JSON book list straight from its URL; the wrapping
# parentheses print the parsed object so it appears in the report.
(parsedJSON <- jsonlite::fromJSON(txt = jsonurl))

## $ds_books
##   book#
## 1     1
## 2     2
## 3     3
##                                                                    title
## 1 R for Data Science: Import, Tidy, transform, Visualiza, and Model Data
## 2        An Introduction to Statistical Learning: with Applications in R
## 3                                                             Advanced R
##                                                          authors
## 1                             Hadley Wickham, Garraett Grolemund
## 2 Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani
## 3                                                 Hadley Wickham
##     publish_date  price publisher
## 1  December 2016 $18.17  O'Reilly
## 2 September 2017 $22.99  Springer
## 3 September 2014 $45.03 CRC Press

Tidying and Creation of DataFrame out of JSON File

# Extract the book records from the parsed JSON. The top-level element is
# named `ds_books` (see the printed `parsedJSON`); the previous `dwh_books`
# does not exist, so `$` returned NULL and the table came out empty.
booksJSONDF <- parsedJSON$ds_books
stopifnot(!is.null(booksJSONDF))

booksJSONDF <- as.data.frame(booksJSONDF)
head(booksJSONDF) %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")

Conclusion

After creating, importing, tidying, and displaying the three different file types (HTML, XML, and JSON), I can see that all three files contain IDENTICAL data.

Assignment - Working with XML and JSON in R

Joseph Simone

10/10/2019

Libraries Used:

HTML Source

XML Source

JSON Source

HTML Import

DataFrame out of HTML Page

XML Import

Tidying XML import by Authors

DataFrame out of XML Import

JSON Import

Tidying and Creation of DataFrame out of JSON File

Conclusion