Install packages

library(XML)
library(RJSONIO)
library(RCurl)

## Loading required package: bitops

library(stringr)
library(plyr)

XML

# Fetch the XML document from GitHub. Despite its name, `xml_url` holds the
# value returned by getURL() (the downloaded text), not the address.
xml_url <- getURL("https://raw.githubusercontent.com/hvasquez81/Data607-Assignment-7/master/Books.xml")
# Parse the text into an XML document and keep a handle on its root node.
books_xml <- xmlParse(xml_url)
root_xml <- xmlRoot(books_xml)
# Show which node classes the root belongs to.
class(root_xml)

## [1] "XMLInternalElementNode" "XMLInternalNode"       
## [3] "XMLAbstractNode"

# Tag name of the document's root element.
xmlName(root_xml)

## [1] "Books"

# Number of child nodes directly under the root.
xmlSize(root_xml)

## [1] 3

# Select the root's child nodes named "book".
root_xml["book"]

## $book
## <book>
##   <Title> Elon Musk: Tesla, SpaceX, and the Quest for a Fantastic Future </Title>
##   <Authors> Ashlee Vance </Authors>
##   <pages> 386 </pages>
## </book> 
## 
## $book
## <book>
##   <Title> The Dichotomy of Leadership </Title>
##   <Authors> Jocko Willink, Leif Babin </Authors>
##   <pages> 285 </pages>
## </book> 
## 
## $book
## <book>
##   <Title> 12 Rules for Life: An Antidote to Chaos </Title>
##   <Authors> Jordan B. Peterson </Authors>
##   <pages> 370 </pages>
## </book> 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"

# Flatten the document into a data frame; in the output below each <book>
# becomes a row and each of its child tags becomes a column.
# NOTE(review): the printed values appear to keep the padding spaces from the
# XML text (e.g. "386 "), so `pages` is presumably not numeric -- consider
# trimws() / as.numeric() before analysis.
xml_df <- xmlToDataFrame(root_xml)
xml_df

##                                                              Title
## 1  Elon Musk: Tesla, SpaceX, and the Quest for a Fantastic Future 
## 2                                     The Dichotomy of Leadership 
## 3                         12 Rules for Life: An Antidote to Chaos 
##                       Authors pages
## 1               Ashlee Vance   386 
## 2  Jocko Willink, Leif Babin   285 
## 3         Jordan B. Peterson   370

HTML

# Fetch the HTML page and parse every <table> it contains. Despite its name,
# `html_url` holds the value returned by getURL(), not the address.
html_url <- getURL("https://raw.githubusercontent.com/hvasquez81/Data607-Assignment-7/master/Books.html")
books_html <- htmlParse(html_url)
books_html <- readHTMLTable(books_html)

# readHTMLTable() returns a list of tables. Passing that whole list to
# data.frame() prefixed every column with the list element's name, which
# produced headers such as "NULL.Book.Title". Take the first table itself.
stopifnot(length(books_html) >= 1L)
html_df <- books_html[[1]]
# Normalise the header to syntactic names so it matches the JSON data frame
# ("Book.Title").
names(html_df) <- make.names(names(html_df))
html_df

##                                                       Book.Title
## 1 Elon Musk: Tesla, SpaceX, and the Quest for a Fantastic Future
## 2                                    The Dichotomy of Leadership
## 3                        12 Rules for Life: An Antidote to Chaos
##                     Authors pages
## 1              Ashlee Vance   386
## 2 Jocko Willink, Leif Babin   285
## 3        Jordan B. Peterson   370

JSON

# Check that the document at this address is valid JSON before parsing it.
isValidJSON("https://raw.githubusercontent.com/hvasquez81/Data607-Assignment-7/master/Books.json")

## [1] TRUE

# Parse the JSON; the result is a plain R list (see class() below).
books_json <- fromJSON("https://raw.githubusercontent.com/hvasquez81/Data607-Assignment-7/master/Books.json")
class(books_json)

## [1] "list"

# Flatten the whole nested list into one named vector, then keep only the
# entries whose name contains "Book Title".
json_vec <- unlist(books_json, recursive = TRUE, use.names = TRUE)
json_vec[str_detect(names(json_vec), "Book Title")]

##                                                 Books.Book Title 
## "Elon Musk: Tesla, SpaceX, and the Quest for a Fantastic Future" 
##                                                 Books.Book Title 
##                                    "The Dichotomy of Leadership" 
##                                                 Books.Book Title 
##                        "12 Rules for Life: An Antidote to Chaos"

# Flatten each book record into a named character vector. lapply() always
# returns a list; sapply() only did so here because the records have different
# lengths, and would collapse them into a matrix (breaking the per-record step
# below) as soon as every book had the same number of fields.
books_json_unlist <- lapply(books_json[[1]], unlist)
# Turn each record into a one-row data frame and stack them. rbind.fill()
# accepts a list of data frames and fills columns a record lacks with NA.
json_df <- rbind.fill(lapply(books_json_unlist, function(book) {
  data.frame(t(book), stringsAsFactors = FALSE)
}))
json_df

##                                                       Book.Title
## 1 Elon Musk: Tesla, SpaceX, and the Quest for a Fantastic Future
## 2                                    The Dichotomy of Leadership
## 3                        12 Rules for Life: An Antidote to Chaos
##              Authors pages      Authors1   Authors2
## 1       Ashlee Vance   386          <NA>       <NA>
## 2               <NA>   285 Jocko Willink Leif Babin
## 3 Jordan B. Peterson   370          <NA>       <NA>

Conclusion

The structures of all the data frames are similar. The HTML and XML data frames appear to be exactly the same; however, the JSON data frame differs slightly. Converting the JSON to a data frame created three author columns: for the book with two authors, the “Authors” column was left as NA, while Authors1 and Authors2 received the values.

DATA607 Assignment 7

Install packages

XML

HTML

JSON

Conclusion