Load packages

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: bitops
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
## 
##     xml

Parse XML

# Read in file
xmlPath <- "/Users/ryanweber/Desktop/CUNY/Data 607 Db/Assignments/Week 7/Assignment07/books.xml"
xmlDoc <- read_xml(xmlPath)

# Collect every <Book> element in the document
nodes <- xmlDoc %>% xml_nodes("Book")

# Fail early with a readable message: with zero matches the per-book
# extraction below has nothing to work on and the later separate() call
# would fail on a missing Author column.
if (length(nodes) == 0) {
  stop("No <Book> elements found in ", xmlPath, call. = FALSE)
}

# Build one named list per book, then bind them all at once instead of
# growing the data frame inside a loop. seq_along() is safe for any length.
# Title and Author come from child elements; Publisher, PublicationDate and
# Pages are read from attributes on the <Book> element itself.
bookRows <- lapply(seq_along(nodes), function(i) {
  node <- nodes[[i]]

  list(
    Title = node %>% xml_nodes("Title") %>% xml_text(),
    # A book may have several <Author> children; join them with commas so
    # each book stays on a single row.
    Author = node %>%
      xml_nodes("Author") %>%
      xml_text() %>%
      paste(collapse = ","),
    Publisher = node %>% xml_attr("Publisher"),
    PublicationDate = node %>% xml_attr("PublicationDate"),
    Pages = node %>% xml_attr("Pages")
  )
})

xmlDf <- bind_rows(bookRows)

# Separate out author values
# fill = "right" pads single-author books with NA in Author2 without
# raising the "Expected 2 pieces" warning.
xmlDf <- xmlDf %>%
  separate(Author, into = c("Author1", "Author2"), sep = ",", fill = "right")

# Attribute values are read as text; convert the page count to numeric
xmlDf$Pages <- as.numeric(xmlDf$Pages)

Parse JSON

# Read in file
jsonPath <- "/Users/ryanweber/Desktop/CUNY/Data 607 Db/Assignments/Week 7/Assignment07/books.json"
jsonOb <- fromJSON(jsonPath)
jsonDf <- as.data.frame(jsonOb, stringsAsFactors = FALSE)

# Convert author list to comma-separated string
# This is done row by row rather than by grouping on the title, so two books
# that happen to share a title never have their author lists merged.
# as_tibble() keeps the result a tibble, as the grouped pipeline did.
jsonDf <- jsonDf %>%
  mutate(
    book.author = vapply(
      book.author,
      function(authors) paste(unlist(authors), collapse = ","),
      character(1)
    )
  ) %>%
  as_tibble()

# Separate out author values
# NOTE(review): the rename is positional, so it assumes the JSON fields
# arrive in the order title, author, publisher, publication date, pages --
# confirm against books.json.
names(jsonDf) <- c("Title", "Author", "Publisher", "PublicationDate", "Pages")
# fill = "right" pads single-author books with NA in Author2 without
# raising the "Expected 2 pieces" warning.
jsonDf <- jsonDf %>%
  separate(Author, into = c("Author1", "Author2"), sep = ",", fill = "right")
jsonDf$Pages <- as.numeric(jsonDf$Pages)

Parse HTML

# Read in file
# which = 1 selects the first table in the page; header = TRUE takes the
# column names from the table's header row.
htmlPath <- "/Users/ryanweber/Desktop/CUNY/Data 607 Db/Assignments/Week 7/Assignment07/books.html"
htmlDf <- readHTMLTable(
  htmlPath,
  which = 1,
  stringsAsFactors = FALSE,
  header = TRUE
)

# Separate out author values
# The separator also consumes any whitespace after the comma, so an author
# cell written as "A, B" yields "B" rather than " B" in Author2.
# fill = "right" pads single-author books with NA in Author2 without
# raising the "Expected 2 pieces" warning.
htmlDf <- htmlDf %>%
  separate(
    Author,
    into = c("Author1", "Author2"),
    sep = ",\\s*",
    fill = "right"
  ) %>%
  rename(PublicationDate = Publication)

# Table cells are read as text; convert the page count to numeric
htmlDf$Pages <- as.numeric(htmlDf$Pages)

Compare dataframes

# Print the three data frames one after another so they can be compared by
# eye. Lines beginning with `##` are the captured console output.
# xmlDf and jsonDf print as tibbles; htmlDf prints as a plain data frame.
# NOTE(review): in the htmlDf output the Author2 column is one character
# wider than "Geoff Dyer", which suggests a leading space in row 3 -- confirm
# before treating the three frames as identical.
xmlDf
## # A tibble: 3 x 6
##   Title              Author1   Author2 Publisher     PublicationDate Pages
##   <chr>              <chr>     <chr>   <chr>         <chr>           <dbl>
## 1 Speaking the Lang… Raymond … <NA>    Cambridge Un… March 31, 1989    384
## 2 The Ongoing Moment Geoff Dy… <NA>    Vintage       March 13, 2007    304
## 3 Selected Essays    John Ber… Geoff … Vintage       March 11, 2003    608
jsonDf
## # A tibble: 3 x 6
##   Title              Author1   Author2 Publisher     PublicationDate Pages
##   <chr>              <chr>     <chr>   <chr>         <chr>           <dbl>
## 1 Speaking the Lang… Raymond … <NA>    Cambridge Un… March 31, 1989    384
## 2 The Ongoing Moment Geoff Dy… <NA>    Vintage       March 13, 2007    304
## 3 Selected Essays    John Ber… Geoff … Vintage       March 11, 2003    608
htmlDf
##                                                       Title        Author1
## 1 Speaking the Language of Desire: The Films of Carl Dreyer Raymond Carney
## 2                                        The Ongoing Moment     Geoff Dyer
## 3                                           Selected Essays    John Berger
##       Author2                  Publisher PublicationDate Pages
## 1        <NA> Cambridge University Press  March 31, 1989   384
## 2        <NA>                    Vintage  March 13, 2007   304
## 3  Geoff Dyer                    Vintage  March 11, 2003   608