Libraries Used:

library(RCurl)
## Warning: package 'RCurl' was built under R version 3.5.3
## Loading required package: bitops
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.3
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.3
library(plyr)
library(XML)
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 3.5.3
library(rvest)
## Warning: package 'rvest' was built under R version 3.5.3
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.5.3
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
## 
##     xml
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.3
library(png)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

HTML Source

# Embed the image of the HTML source file (html.png) in the rendered document.
imgage <- "C:/Users/jpsim/Documents/DATA Acquisition and Management/html.png"
knitr::include_graphics(path = imgage)

XML Source

# Embed the image of the XML source file (xml.png) in the rendered document.
imgage <- "C:/Users/jpsim/Documents/DATA Acquisition and Management/xml.png"
knitr::include_graphics(path = imgage)

JSON Source

# Embed the image of the JSON source file (json.png) in the rendered document.
imgage <- "C:/Users/jpsim/Documents/DATA Acquisition and Management/json.png"
knitr::include_graphics(path = imgage)

HTML Import

# Download the raw HTML text from GitHub and parse it into an internal
# document tree.
htmlurl <- "https://raw.githubusercontent.com/josephsimone/DATA607/master/ds_books.html"
htmldata <- getURL(url = htmlurl)
html_doc <- htmlParse(file = htmldata, useInternalNodes = TRUE)
# Echo the parsed document so the page markup appears in the output.
html_doc
## <!DOCTYPE html>
## <html>
## <head><title>Book Table</title></head>
## <body>
## 
## 
## 
## <table border="6" width="75%" cellpadding="4" cellspacing="3">
## <tr>
## <th colspan="6">
## <br><h3>Data Science Books</h3>
##       </th>
##    </tr>
## <tr>
## <th>Book #</th>
##       <th>Title</th>
##    <th>Author(s)</th>
##       <th>Published On</th>
##       <th>Price</th>
##       <th>Publisher</th>
##    </tr>
## <tr align="CENTER">
## <td>Book #1</td>
##       <td>R for Data Science: Import, Tidy, transform, Visualiza, and Model Data</td>
##    <td>Hadley Wickham, Garraett Grolemund </td>
##       <td>December 2016</td>
##    <td>$18.17</td>
##       <td>O'Reilly</td>
##    </tr>
## <tr align="CENTER">
## <td>Book #2</td>
##       <td>An Introduction to Statistical Learning: with Applications in R</td>
##    <td>Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani</td>
##       <td>September 2017</td>
##    <td>$22.99</td>
##       <td>Springer</td>
##    </tr>
## <tr align="CENTER">
## <td>Book #3</td>
##       <td>Advanced R</td>
##    <td>Hadley Wickham</td>
##       <td>September 2014</td>
##    <td>$45.03</td>
##       <td>CRC Press</td>
##    </tr>
## </table>
## </body>
## </html>
## 

DataFrame out of HTML Page

# Build a data frame from the book table on the HTML page and display it.
html_source <- html_doc
readHtml <- read_html(htmlurl)
tables <- html_nodes(readHtml, "table")
tables_ls <- html_table(tables, fill = TRUE)

# The page holds a single <table>; extract it explicitly instead of coercing
# the whole list of tables.
tableDF <- as.data.frame(tables_ls[[1]], stringsAsFactors = FALSE)

# The first <tr> of the table is a caption cell spanning all six columns
# ("Data Science Books"), so html_table() uses it for the column names and the
# real header ("Book #", "Title", ...) ends up as the first data row.
# Promote that row to the column names and drop it from the data.
if (nrow(tableDF) > 0) {
  names(tableDF) <- as.character(unlist(tableDF[1, ], use.names = FALSE))
  tableDF <- tableDF[-1, , drop = FALSE]
  rownames(tableDF) <- NULL
}

head(tableDF) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "100%", height = "300px")
Data.Science.Books Data.Science.Books.1 Data.Science.Books.2 Data.Science.Books.3 Data.Science.Books.4 Data.Science.Books.5
Book # Title Author(s) Published On Price Publisher
Book #1 R for Data Science: Import, Tidy, transform, Visualiza, and Model Data Hadley Wickham, Garraett Grolemund December 2016 $18.17 O’Reilly
Book #2 An Introduction to Statistical Learning: with Applications in R Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani September 2017 $22.99 Springer
Book #3 Advanced R Hadley Wickham September 2014 $45.03 CRC Press

XML Import

# Download the raw XML text from GitHub and parse it into an internal
# document tree.
xmlurl <- "https://raw.githubusercontent.com/josephsimone/DATA607/master/ds_books.xml"
xmldata <- getURL(url = xmlurl)
doc <- xmlParse(file = xmldata, useInternalNodes = TRUE)
# Echo the parsed document so the XML markup appears in the output.
doc
## <?xml version="1.0" encoding="ISO-8859-1"?>
## <ds_books>
##   <book id="1"><title>R for Data Science: Import, Tidy, transform, Visualiza, and Model Data</title><authors><author>Hadley Wickham</author><author>Garraett Grolemund </author></authors><publish_date>December 2016</publish_date><price>$18.17</price><publisher>O'Reilly</publisher>]
##  </book>
##   <book id="2">
##     <title>An Introduction to Statistical Learning: with Applications in R</title>
##     <authors>
##       <author>Gareth James</author>
##       <author>Daniela Witten</author>
##       <author>Trevor Hastie</author>
##       <author>Robert Tibshirani </author>
##     </authors>
##     <publish_date>September 2017</publish_date>
##     <price>$22.99</price>
##     <publisher>Springer</publisher>
##   </book>
##   <book id="3">
##     <title>Advanced R</title>
##     <authors>
##       <author>Hadley Wickham</author>
##     </authors>
##     <publish_date>September 2014</publish_date>
##     <price>$45.03</price>
##     <publisher>CRC Press</publisher>
##   </book>
## </ds_books>
## 
# Keep a handle on the parsed XML document, take its root <ds_books> node,
# and flatten the <book> children into a data frame (one row per book).
# (The original assigned xml_source twice; the duplicate is removed.)
xml_source <- doc
root <- xmlRoot(xml_source)
booksXMLDF <- xmlToDataFrame(root)

Tidying XML import by Authors

# Collapse the <author> children of every <authors> node into one
# comma-separated string per book. xmlToDataFrame() pastes the child values
# together with no separator ("Hadley WickhamGarraett Grolemund"), so this
# vector replaces the authors column below.
authorsList <- vapply(
  getNodeSet(xml_source, "//authors"),
  function(authors_node) {
    author_names <- xpathSApply(authors_node, "./author", xmlValue)
    # Some names carry trailing whitespace in the source file.
    paste(trimws(author_names), collapse = ", ")
  },
  character(1)
)

# One entry per book is required before overwriting the column.
stopifnot(length(authorsList) == nrow(booksXMLDF))
booksXMLDF$authors <- authorsList

# Book identifiers come from the id attribute of each <book> node.
ids <- data.frame(
  unname(xmlSApply(root, xmlGetAttr, "id")),
  stringsAsFactors = FALSE
)
names(ids) <- "book#"
booksXMLDF <- cbind(ids, booksXMLDF)

# Drop the stray `text` column (produced by loose text inside a <book> node)
# when present; unlike select(-text), this does not fail if it is absent.
booksXMLDF <- booksXMLDF[, setdiff(names(booksXMLDF), "text"), drop = FALSE]

DataFrame out of XML Import

# Render the first rows of the XML-derived data frame as a styled,
# scrollable HTML table.
head(booksXMLDF) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "100%", height = "300px")
book# title authors publish_date price publisher
1 R for Data Science: Import, Tidy, transform, Visualiza, and Model Data Hadley WickhamGarraett Grolemund December 2016 $18.17 O’Reilly
2 An Introduction to Statistical Learning: with Applications in R Gareth JamesDaniela WittenTrevor HastieRobert Tibshirani September 2017 $22.99 Springer
3 Advanced R Hadley Wickham September 2014 $45.03 CRC Press

JSON Import

# Read the JSON file straight from GitHub; wrapping the assignment in
# parentheses echoes the parsed result.
jsonurl <- "https://raw.githubusercontent.com/josephsimone/DATA607/master/ds_books.json"
(parsedJSON <- jsonlite::fromJSON(txt = jsonurl))
## $ds_books
##   book#
## 1     1
## 2     2
## 3     3
##                                                                    title
## 1 R for Data Science: Import, Tidy, transform, Visualiza, and Model Data
## 2        An Introduction to Statistical Learning: with Applications in R
## 3                                                             Advanced R
##                                                          authors
## 1                             Hadley Wickham, Garraett Grolemund
## 2 Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani
## 3                                                 Hadley Wickham
##     publish_date  price publisher
## 1  December 2016 $18.17  O'Reilly
## 2 September 2017 $22.99  Springer
## 3 September 2014 $45.03 CRC Press

Tidying and Creation of DataFrame out of JSON File

# Extract the books table from the parsed JSON and display it.
# The top-level element is named `ds_books` (see the printed parsedJSON); the
# original read `dwh_books`, which does not exist, so the result was NULL and
# an empty table was rendered.
# `[[` is used instead of `$` to avoid partial name matching.
booksJSONDF <- parsedJSON[["ds_books"]]
if (is.null(booksJSONDF)) {
  stop("Element 'ds_books' not found in the parsed JSON.", call. = FALSE)
}

booksJSONDF <- as.data.frame(booksJSONDF, stringsAsFactors = FALSE)
head(booksJSONDF) %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  scroll_box(width = "100%", height = "300px")

Conclusion

After creating, importing, tidying, and displaying the three different file types (HTML, XML, and JSON), I can visualize that all three files are IDENTICAL.