Load packages
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: bitops
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
Parse XML
# Read the books XML document from disk.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
xmlDoc <- read_xml("/Users/ryanweber/Desktop/CUNY/Data 607 Db/Assignments/Week 7/Assignment07/books.xml")
# Collect every 'Book' element in the document.
# NOTE(review): xml_nodes() is reported as deprecated in newer rvest releases
# (html_nodes()/html_elements() are the replacements) - confirm against the
# installed version before upgrading.
nodes <- xmlDoc %>% xml_nodes('Book')
# One record per book is stored in a preallocated list and bound into a data
# frame once at the end, instead of re-binding the growing frame on every pass.
bookRows <- vector("list", length(nodes))
# seq_along() gives an empty sequence when no 'Book' nodes were found;
# 1:length(nodes) would iterate over c(1, 0) and fail on nodes[[1]].
for (i in seq_along(nodes)) {
  node <- nodes[[i]]
  bookRows[[i]] <- list(
    # Title comes from a child element.
    obTitle = node %>% xml_nodes('Title') %>% xml_text,
    # All 'Author' children are collapsed into one comma-separated string so
    # each book stays on a single row.
    obAuthor = node %>% xml_nodes('Author') %>% xml_text %>% paste(., collapse=","),
    # The remaining fields are read from attributes of the 'Book' element.
    obPub = node %>% xml_attr('Publisher'),
    obPubDate = node %>% xml_attr('PublicationDate'),
    obPages = node %>% xml_attr('Pages')
  )
}
# Stack the per-book records into a single data frame.
xmlDf <- bind_rows(bookRows)
# Apply the final column names (assigned by position).
xmlDf <- setNames(xmlDf, c("Title", "Author", "Publisher", "PublicationDate", "Pages"))
# Split the comma-joined author string into two author columns.
xmlDf <- separate(xmlDf, Author, into = c("Author1", "Author2"), sep = ",")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 2 rows [1,
## 2].
# Page counts were read as text; convert them to numbers.
xmlDf[["Pages"]] <- as.numeric(xmlDf[["Pages"]])
Parse JSON
# Read the books JSON document from disk.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
jsonOb <- fromJSON("/Users/ryanweber/Desktop/CUNY/Data 607 Db/Assignments/Week 7/Assignment07/books.json")
# Flatten the parsed object into a data frame, keeping strings as character.
jsonDf <- as.data.frame(jsonOb, stringsAsFactors = FALSE)
# Convert each book's author list to a comma-separated string.
# The collapse is done row by row: grouping by title instead would paste the
# authors of different books together whenever two books share a title.
jsonDf <- jsonDf %>%
  rowwise() %>%
  mutate(book.author = paste(unlist(book.author), collapse=",")) %>%
  ungroup()
# Apply the final column names (assigned by position).
jsonDf <- setNames(jsonDf, c("Title", "Author", "Publisher", "PublicationDate", "Pages"))
# Split the comma-joined author string into two author columns.
jsonDf <- separate(jsonDf, Author, into = c("Author1", "Author2"), sep = ",")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 2 rows [1,
## 2].
# Make sure the page counts are stored as numbers.
jsonDf[["Pages"]] <- as.numeric(jsonDf[["Pages"]])
Parse HTML
# Load the first table of the HTML page, using its first row as the header
# and keeping strings as character.
htmlDf <- readHTMLTable(
  "/Users/ryanweber/Desktop/CUNY/Data 607 Db/Assignments/Week 7/Assignment07/books.html",
  header = TRUE,
  which = 1,
  stringsAsFactors = FALSE
)
# Split the comma-joined author string into two author columns.
htmlDf <- separate(htmlDf, Author, into = c("Author1", "Author2"), sep = ",")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 2 rows [1,
## 2].
# Rename the date column to the name used by the other two data frames.
htmlDf <- rename(htmlDf, PublicationDate = Publication)
# Page counts were read as text; convert them to numbers.
htmlDf[["Pages"]] <- as.numeric(htmlDf[["Pages"]])
Compare dataframes
# Auto-print the data frame built from the XML file for visual comparison.
xmlDf
## # A tibble: 3 x 6
## Title Author1 Author2 Publisher PublicationDate Pages
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Speaking the Lang… Raymond … <NA> Cambridge Un… March 31, 1989 384
## 2 The Ongoing Moment Geoff Dy… <NA> Vintage March 13, 2007 304
## 3 Selected Essays John Ber… Geoff … Vintage March 11, 2003 608
# Auto-print the data frame built from the JSON file for visual comparison.
jsonDf
## # A tibble: 3 x 6
## Title Author1 Author2 Publisher PublicationDate Pages
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 Speaking the Lang… Raymond … <NA> Cambridge Un… March 31, 1989 384
## 2 The Ongoing Moment Geoff Dy… <NA> Vintage March 13, 2007 304
## 3 Selected Essays John Ber… Geoff … Vintage March 11, 2003 608
# Auto-print the data frame built from the HTML table for visual comparison.
htmlDf
## Title Author1
## 1 Speaking the Language of Desire: The Films of Carl Dreyer Raymond Carney
## 2 The Ongoing Moment Geoff Dyer
## 3 Selected Essays John Berger
## Author2 Publisher PublicationDate Pages
## 1 <NA> Cambridge University Press March 31, 1989 384
## 2 <NA> Vintage March 13, 2007 304
## 3 Geoff Dyer Vintage March 11, 2003 608