library(stringr)
library(XML)
## Warning: package 'XML' was built under R version 3.2.3
library(jsonlite)
library(RCurl)
## Loading required package: bitops
## Warning in inDL(x, as.logical(local), as.logical(now), ...): DLL attempted
## to change FPU control word from 8001f to 9001f
Importing HTML tables and JSON into a data frame is simple; the packages do all the work:
# Both source documents are fetched from the same GitHub repository.
json_url <- "https://raw.githubusercontent.com/fdsps/IS607/master/books.json"
html_url <- "https://raw.githubusercontent.com/fdsps/IS607/master/books.html"

# fromJSON() downloads and parses the JSON document in a single call.
json_df <- fromJSON(json_url)

# Fetch the HTML page as text, let readHTMLTable() parse the tables it
# contains, then pass the result to data.frame().
html_page <- getURL(html_url)
html_data <- readHTMLTable(html_page)
html_df <- data.frame(html_data)

# Show the parsed JSON structure.
print(json_df)
## $books
## $books$book
## id title
## 1 0688031838 The People's Almanac Presents the Book of Lists
## 2 0441172695 Blood Meridian: Or the Evening Redness in the West
## 3 0679728759 Dune Messiah
## author genre
## 1 David Wallechinsky, Irving Wallace, Amy Wallace Non-fiction
## 2 Cormac McCarthy Fiction
## 3 Frank Herbert Science Fiction
## publish_year
## 1 1977
## 2 1992
## 3 1969
# Show the data frame built from the HTML table.
print(html_df)
## books.Title
## 1 The People's Almanac Presents the Book of Lists
## 2 Blood Meridian: Or the Evening Redness in the West
## 3 Dune Messiah
## books.Author books.Genre
## 1 David Wallechinsky, Irving Wallace, Amy Wallace. Non-fiction
## 2 Cormac McCarthy Fiction
## 3 Frank Herbert Science Fiction
## books.Year books.ID
## 1 1977 0688031838
## 2 1992 0441172695
## 3 1969 0679728759
My XML file is problematic: xmlToDataFrame() ignores the id attribute of each book, and it runs the multiple “author” tags of the first title together into a single string with no separator.
# Download the XML document as text and parse it into an internal tree.
xml_url <- "https://raw.githubusercontent.com/fdsps/IS607/master/books.xml"
xml_text <- getURL(xml_url)
books_xml <- xmlParse(xml_text)

# Keep a handle on the top-level node; the later extraction steps start here.
root <- xmlRoot(books_xml)

# Naive conversion, displayed for comparison with the manual extraction below.
print(xmlToDataFrame(root))
## title
## 1 The People's Almanac Presents the Book of Lists
## 2 Blood Meridian: Or the Evening Redness in the West
## 3 Dune Messiah
## authors genre publish_year
## 1 David WallechinskyIrving WallaceAmy Wallace Non-fiction 1977
## 2 Cormac McCarthy Fiction 1992
## 3 Frank Herbert Science Fiction 1969
My solution is to grab the value of each author child node and combine the values into one semicolon-delimited string per book.
# Collapse the children of one <authors> node into a single string.
#
# Several children: every value is followed by ";" (e.g. "A;B;C;"), matching
#   the format this script has always produced.
# One child: its value is returned unchanged, with no separator.
# No children: NA_character_ is returned instead of failing with a
#   subscript error.
#
# The result depends only on the node passed in, so the string built for one
# book can never leak into the next book's entry.
collapse_authors <- function(authors_node) {
  # unlist(lapply(...)) keeps this a plain character vector regardless of how
  # many children there are.
  values <- unlist(
    lapply(xmlChildren(authors_node), xmlValue),
    use.names = FALSE
  )
  if (length(values) == 0L) {
    return(NA_character_)
  }
  if (length(values) == 1L) {
    return(values)
  }
  paste0(values, ";", collapse = "")
}

# Evaluate the XPath query once; there is one <authors> node per matching book.
authors_nodes <- getNodeSet(root, "//book/authors")

# One string per <authors> node; an empty node set gives character(0).
author <- vapply(
  authors_nodes,
  collapse_authors,
  character(1),
  USE.NAMES = FALSE
)
author
## [1] "David Wallechinsky;Irving Wallace;Amy Wallace;"
## [2] "Cormac McCarthy"
## [3] "Frank Herbert"
Now extract the other fields and build the data frame.
# Return the text of the first child of `book_node` whose tag is `field`,
# or NA_character_ when the book has no such child (or the child has no value).
book_field <- function(book_node, field) {
  kids <- xmlChildren(book_node)
  hit <- which(names(kids) == field)
  if (length(hit) == 0L) {
    return(NA_character_)
  }
  value <- xmlValue(kids[[hit[1L]]])
  if (length(value) == 0L) NA_character_ else value
}

# Work book by book so every column has exactly one entry per <book>.
# Document-wide queries such as "//book/genre" return fewer values when a
# book lacks the element, which misaligns rows or makes data.frame() fail.
book_nodes <- getNodeSet(root, "//book")

# A missing id attribute becomes NA rather than the literal string "NULL".
ids <- vapply(
  book_nodes,
  function(node) as.character(xmlGetAttr(node, "id", default = NA_character_)),
  character(1),
  USE.NAMES = FALSE
)
titles <- vapply(book_nodes, book_field, character(1), field = "title",
                 USE.NAMES = FALSE)
genres <- vapply(book_nodes, book_field, character(1), field = "genre",
                 USE.NAMES = FALSE)
pub <- vapply(book_nodes, book_field, character(1), field = "publish_year",
              USE.NAMES = FALSE)

# `author` is built earlier in this script, one string per book; fail loudly
# rather than let data.frame() recycle it if the counts ever disagree.
stopifnot(length(author) == length(book_nodes))

xml_df <- data.frame(id = ids, title = titles, author = author,
                     genre = genres, pub_year = pub)
xml_df
## id title
## 1 0688031838 The People's Almanac Presents the Book of Lists
## 2 0441172695 Blood Meridian: Or the Evening Redness in the West
## 3 0679728759 Dune Messiah
## author genre pub_year
## 1 David Wallechinsky;Irving Wallace;Amy Wallace; Non-fiction 1977
## 2 Cormac McCarthy Fiction 1992
## 3 Frank Herbert Science Fiction 1969