Week 8 assignment: XML & JSON

library(stringr)
library(XML)

## Warning: package 'XML' was built under R version 3.2.3

library(jsonlite)
library(RCurl)

## Loading required package: bitops

## Warning in inDL(x, as.logical(local), as.logical(now), ...): DLL attempted
## to change FPU control word from 8001f to 9001f

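Importing HTML tables and JSON into R is simple; the packages do all the work. Note that `fromJSON` returns a nested list here, with the actual data frame stored at `json_df$books$book`: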

# JSON: fromJSON() fetches and parses the URL in a single call.
json_df <- fromJSON(
  "https://raw.githubusercontent.com/fdsps/IS607/master/books.json"
)

# HTML: download the page text, extract its tables, then convert the result
# of readHTMLTable() into a data frame.
html_data <- readHTMLTable(
  getURL("https://raw.githubusercontent.com/fdsps/IS607/master/books.html")
)
html_df <- data.frame(html_data)

json_df

## $books
## $books$book
##           id                                              title
## 1 0688031838    The People's Almanac Presents the Book of Lists
## 2 0441172695 Blood Meridian: Or the Evening Redness in the West
## 3 0679728759                                       Dune Messiah
##                                            author           genre
## 1 David Wallechinsky, Irving Wallace, Amy Wallace     Non-fiction
## 2                                 Cormac McCarthy         Fiction
## 3                                   Frank Herbert Science Fiction
##   publish_year
## 1         1977
## 2         1992
## 3         1969

html_df

##                                          books.Title
## 1    The People's Almanac Presents the Book of Lists
## 2 Blood Meridian: Or the Evening Redness in the West
## 3                                       Dune Messiah
##                                       books.Author     books.Genre
## 1 David Wallechinsky, Irving Wallace, Amy Wallace.     Non-fiction
## 2                                  Cormac McCarthy         Fiction
## 3                                    Frank Herbert Science Fiction
##   books.Year   books.ID
## 1       1977 0688031838
## 2       1992 0441172695
## 3       1969 0679728759

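My XML file is problematic for `xmlToDataFrame` because each book's ID is stored as an `id` attribute, which is dropped from the result, and because the first title has multiple “author” tags, whose values get run together with no separator.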

# Download the XML text, parse it, and keep a handle on the root node.
books_xml <- xmlParse(
  getURL("https://raw.githubusercontent.com/fdsps/IS607/master/books.xml")
)
root <- xmlRoot(books_xml)

# Convert the root's children straight to a data frame and print the result.
xmlToDataFrame(root)

##                                                title
## 1    The People's Almanac Presents the Book of Lists
## 2 Blood Meridian: Or the Evening Redness in the West
## 3                                       Dune Messiah
##                                       authors           genre publish_year
## 1 David WallechinskyIrving WallaceAmy Wallace     Non-fiction         1977
## 2                             Cormac McCarthy         Fiction         1992
## 3                               Frank Herbert Science Fiction         1969

My solution is to grab the value of each child “author” node and combine the values into one semicolon-separated string per book.

# Build one author string per book from the children of each <authors> node.
#
# Output format (kept as-is so previously rendered results remain valid):
#   * several authors -> names joined with ";" followed by a trailing ";"
#   * exactly one     -> the bare name
#   * none            -> NA
#
# Each book is processed independently, so names can never carry over from a
# previously processed book, and the XPath query runs only once.

# Collapse the children of a single <authors> node into one string.
collapse_authors <- function(authors_node) {
  people <- unlist(
    lapply(xmlChildren(authors_node), xmlValue),
    use.names = FALSE
  )
  if (length(people) == 0L) {
    return(NA_character_)
  }
  if (length(people) == 1L) {
    return(people[[1L]])
  }
  paste0(paste(people, collapse = ";"), ";")
}

# getNodeSet() returns a plain list of nodes (no sapply-style simplification),
# and vapply() yields character(0) rather than failing when no node matches.
author <- vapply(
  getNodeSet(root, "//book/authors"),
  collapse_authors,
  character(1)
)

author

## [1] "David Wallechinsky;Irving Wallace;Amy Wallace;"
## [2] "Cormac McCarthy"                               
## [3] "Frank Herbert"

Now extract the other fields and build the data frame.

# Collect the remaining per-book fields and assemble the final data frame.
# Every field is selected through a "//book" XPath so that all columns are
# drawn from the same set of nodes.

# A book without an id attribute yields NA instead of the string "NULL".
ids <- xpathSApply(root, "//book", xmlGetAttr, "id", default = NA_character_)
ids <- as.character(ids)
titles <- xpathSApply(root, "//book/title", xmlValue)
genres <- xpathSApply(root, "//book/genre", xmlValue)
pub <- xpathSApply(root, "//book/publish_year", xmlValue)

# data.frame() silently recycles shorter columns when lengths divide evenly;
# fail loudly instead if any field is missing for some book.
stopifnot(
  length(titles) == length(ids),
  length(author) == length(ids),
  length(genres) == length(ids),
  length(pub) == length(ids)
)

# Keep text columns (including the leading-zero ids) as character vectors
# regardless of the R version's stringsAsFactors default.
xml_df <- data.frame(
  id = ids,
  title = titles,
  author = author,
  genre = genres,
  pub_year = pub,
  stringsAsFactors = FALSE
)

xml_df

##           id                                              title
## 1 0688031838    The People's Almanac Presents the Book of Lists
## 2 0441172695 Blood Meridian: Or the Evening Redness in the West
## 3 0679728759                                       Dune Messiah
##                                           author           genre pub_year
## 1 David Wallechinsky;Irving Wallace;Amy Wallace;     Non-fiction     1977
## 2                                Cormac McCarthy         Fiction     1992
## 3                                  Frank Herbert Science Fiction     1969

Week 8 assignment: XML & JSON

FD