Libraries Used:
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.5.3
## Loading required package: bitops
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.3
library(stringr)
## Warning: package 'stringr' was built under R version 3.5.3
library(plyr)
library(XML)
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 3.5.3
library(rvest)
## Warning: package 'rvest' was built under R version 3.5.3
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.5.3
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.3
library(png)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:kableExtra':
##
## group_rows
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
HTML Source
# Show the picture of the HTML source file in the rendered report.
# NOTE(review): this is an absolute path on one machine; the chunk will only
# find the image where that exact file exists -- confirm before re-knitting.
imgage <- "C:/Users/jpsim/Documents/DATA Acquisition and Management/html.png"
knitr::include_graphics(path = imgage)

XML Source
# Show the picture of the XML source file in the rendered report.
# NOTE(review): this is an absolute path on one machine; the chunk will only
# find the image where that exact file exists -- confirm before re-knitting.
imgage <- "C:/Users/jpsim/Documents/DATA Acquisition and Management/xml.png"
knitr::include_graphics(path = imgage)

JSON Source
# Show the picture of the JSON source file in the rendered report.
# NOTE(review): this is an absolute path on one machine; the chunk will only
# find the image where that exact file exists -- confirm before re-knitting.
imgage <- "C:/Users/jpsim/Documents/DATA Acquisition and Management/json.png"
knitr::include_graphics(path = imgage)

HTML Import
# Download the books page from GitHub, parse the downloaded text as HTML, and
# print the parsed document so the report shows what was imported.
htmlurl <- "https://raw.githubusercontent.com/josephsimone/DATA607/master/ds_books.html"
htmldata <- RCurl::getURL(htmlurl)
html_doc <- XML::htmlParse(htmldata, useInternalNodes = TRUE)
html_doc
## <!DOCTYPE html>
## <html>
## <head><title>Book Table</title></head>
## <body>
##
##
##
## <table border="6" width="75%" cellpadding="4" cellspacing="3">
## <tr>
## <th colspan="6">
## <br><h3>Data Science Books</h3>
## </th>
## </tr>
## <tr>
## <th>Book #</th>
## <th>Title</th>
## <th>Author(s)</th>
## <th>Published On</th>
## <th>Price</th>
## <th>Publisher</th>
## </tr>
## <tr align="CENTER">
## <td>Book #1</td>
## <td>R for Data Science: Import, Tidy, transform, Visualiza, and Model Data</td>
## <td>Hadley Wickham, Garraett Grolemund </td>
## <td>December 2016</td>
## <td>$18.17</td>
## <td>O'Reilly</td>
## </tr>
## <tr align="CENTER">
## <td>Book #2</td>
## <td>An Introduction to Statistical Learning: with Applications in R</td>
## <td>Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani</td>
## <td>September 2017</td>
## <td>$22.99</td>
## <td>Springer</td>
## </tr>
## <tr align="CENTER">
## <td>Book #3</td>
## <td>Advanced R</td>
## <td>Hadley Wickham</td>
## <td>September 2014</td>
## <td>$45.03</td>
## <td>CRC Press</td>
## </tr>
## </table>
## </body>
## </html>
##
DataFrame out of HTML Page
# Build a data frame from the <table> element of the HTML page and render it.
html_source <- html_doc
readHtml <- read_html(htmlurl)
tables <- html_nodes(readHtml, "table")
tables_ls <- html_table(tables, fill = TRUE)
# The page contains a single table; take it explicitly instead of
# column-binding every parsed table into one data frame.
tableDF <- as.data.frame(tables_ls[[1]])
# The table's first row is a title cell spanning all six columns, so the
# parser uses it as the header and the real column labels ("Book #", "Title",
# ...) land in the first data row. Promote that row to the header and drop it
# from the data so the frame holds only the three book records.
names(tableDF) <- vapply(
  tableDF[1, , drop = FALSE],
  as.character,
  character(1),
  USE.NAMES = FALSE
)
tableDF <- tableDF[-1, , drop = FALSE]
rownames(tableDF) <- NULL
head(tableDF) %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
|
Data.Science.Books
|
Data.Science.Books.1
|
Data.Science.Books.2
|
Data.Science.Books.3
|
Data.Science.Books.4
|
Data.Science.Books.5
|
|
Book #
|
Title
|
Author(s)
|
Published On
|
Price
|
Publisher
|
|
Book #1
|
R for Data Science: Import, Tidy, transform, Visualiza, and Model Data
|
Hadley Wickham, Garraett Grolemund
|
December 2016
|
$18.17
|
O’Reilly
|
|
Book #2
|
An Introduction to Statistical Learning: with Applications in R
|
Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani
|
September 2017
|
$22.99
|
Springer
|
|
Book #3
|
Advanced R
|
Hadley Wickham
|
September 2014
|
$45.03
|
CRC Press
|
XML Import
# Download the books XML file from GitHub, parse the downloaded text as XML,
# and print the parsed document so the report shows what was imported.
xmlurl <- "https://raw.githubusercontent.com/josephsimone/DATA607/master/ds_books.xml"
xmldata <- RCurl::getURL(xmlurl)
doc <- XML::xmlParse(xmldata, useInternalNodes = TRUE)
doc
## <?xml version="1.0" encoding="ISO-8859-1"?>
## <ds_books>
## <book id="1"><title>R for Data Science: Import, Tidy, transform, Visualiza, and Model Data</title><authors><author>Hadley Wickham</author><author>Garraett Grolemund </author></authors><publish_date>December 2016</publish_date><price>$18.17</price><publisher>O'Reilly</publisher>]
## </book>
## <book id="2">
## <title>An Introduction to Statistical Learning: with Applications in R</title>
## <authors>
## <author>Gareth James</author>
## <author>Daniela Witten</author>
## <author>Trevor Hastie</author>
## <author>Robert Tibshirani </author>
## </authors>
## <publish_date>September 2017</publish_date>
## <price>$22.99</price>
## <publisher>Springer</publisher>
## </book>
## <book id="3">
## <title>Advanced R</title>
## <authors>
## <author>Hadley Wickham</author>
## </authors>
## <publish_date>September 2014</publish_date>
## <price>$45.03</price>
## <publisher>CRC Press</publisher>
## </book>
## </ds_books>
##
# Keep a handle on the parsed XML document, take its root node, and convert
# the root's children (one <book> each) into the rows of a data frame.
# (The original assigned `xml_source <- doc` twice; once is enough.)
xml_source <- doc
root <- xmlRoot(xml_source)
booksXMLDF <- xmlToDataFrame(root)
Tidying XML import by Authors
# Collapse each book's <author> entries into one comma-separated string.
# xpathApply() always returns a list with one element per <authors> node, so
# the result keeps its shape even if every book has the same number of authors
# (xpathSApply() could simplify that case to a matrix).
authorsList <- vapply(
  xpathApply(xml_source, "//authors", xmlToList),
  function(authors) paste(trimws(unlist(authors)), collapse = ", "),
  character(1)
)
# xmlToDataFrame() left the author names of a book run together with no
# separator (e.g. "Hadley WickhamGarraett Grolemund"). The original computed
# authorsList but never used it; write it into the column here.
booksXMLDF$authors <- authorsList
# Prepend each <book>'s "id" attribute as the "book#" column.
ids <- as.data.frame(xmlSApply(root, xmlGetAttr, "id"))
names(ids) <- "book#"
booksXMLDF <- cbind(ids, booksXMLDF)
# Drop the extra `text` column.
# NOTE(review): presumably produced by the stray "]" text inside the first
# <book> element of the source file -- confirm against the XML.
booksXMLDF <- select(booksXMLDF, -c(text))
DataFrame out of XML Import
# Render the first rows of the XML-derived data frame as a styled,
# scrollable HTML table.
head(booksXMLDF) %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
|
book#
|
title
|
authors
|
publish_date
|
price
|
publisher
|
|
1
|
R for Data Science: Import, Tidy, transform, Visualiza, and Model Data
|
Hadley WickhamGarraett Grolemund
|
December 2016
|
$18.17
|
O’Reilly
|
|
2
|
An Introduction to Statistical Learning: with Applications in R
|
Gareth JamesDaniela WittenTrevor HastieRobert Tibshirani
|
September 2017
|
$22.99
|
Springer
|
|
3
|
Advanced R
|
Hadley Wickham
|
September 2014
|
$45.03
|
CRC Press
|
JSON Import
# Read the books JSON file straight from its GitHub URL and print the parsed
# result so the report shows what was imported.
jsonurl <- "https://raw.githubusercontent.com/josephsimone/DATA607/master/ds_books.json"
parsedJSON <- jsonlite::fromJSON(jsonurl)
parsedJSON
## $ds_books
## book#
## 1 1
## 2 2
## 3 3
## title
## 1 R for Data Science: Import, Tidy, transform, Visualiza, and Model Data
## 2 An Introduction to Statistical Learning: with Applications in R
## 3 Advanced R
## authors
## 1 Hadley Wickham, Garraett Grolemund
## 2 Gareth James, Daniela Witten, Trevor Hastie, Robert Tibshirani
## 3 Hadley Wickham
## publish_date price publisher
## 1 December 2016 $18.17 O'Reilly
## 2 September 2017 $22.99 Springer
## 3 September 2014 $45.03 CRC Press
Tidying and Creation of DataFrame out of JSON File
# The parsed JSON is a list whose books table is stored under `ds_books`.
# The original read `dwh_books`, which does not exist, so it got NULL and
# rendered an empty table. Fail loudly if the element is ever missing.
stopifnot(!is.null(parsedJSON[["ds_books"]]))
booksJSONDF <- parsedJSON[["ds_books"]]
booksJSONDF <- as.data.frame(booksJSONDF)
# Render the first rows as a styled, scrollable HTML table.
head(booksJSONDF) %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  ) %>%
  scroll_box(width = "100%", height = "300px")
Conclusion
After creating, importing, tidying, and displaying the three different file types (HTML, XML, and JSON), I can see that all three files are IDENTICAL.