Working with HTML, XML, and JSON

library(RCurl)
library(XML)
# Parse the HTML bookshelf table into a data frame with one row per book.
html_file <- "https://raw.githubusercontent.com/christinataylor/IS/master/books.html"
webpage <- getURL(html_file)
# Split the downloaded text into individual lines before parsing.
tc <- textConnection(webpage)
webpage <- readLines(tc)
close(tc)
# The error handler is a no-op, so problems reported by the HTML parser are
# discarded instead of interrupting the script.
pagetree <- htmlTreeParse(webpage, error = function(...) {}, useInternalNodes = TRUE)

# Extract the header cells, the body cells and the <p> values nested in cells.
th <- xpathSApply(pagetree, "//table[@id='bookshelf']/tr/th", xmlValue)
td <- xpathSApply(pagetree, "//table[@id='bookshelf']/tr/td", xmlValue)
p <- xpathSApply(pagetree, "//table[@id='bookshelf']/tr/td/p", xmlValue)

# Build the text of every body cell programmatically: a cell that holds <p>
# children is treated as a multi-value field and its values are joined with
# ";"; any other cell keeps its plain text. This replaces patching one
# hard-coded row by hand.
td_nodes <- getNodeSet(pagetree, "//table[@id='bookshelf']/tr/td")
cell_text <- vapply(
  td_nodes,
  function(cell) {
    parts <- vapply(getNodeSet(cell, "./p"), xmlValue, character(1))
    if (length(parts) > 0) {
      paste(parts, collapse = ";")
    } else {
      xmlValue(cell)
    }
  },
  character(1)
)

# The column count follows the header row; fail loudly on a ragged table
# rather than letting matrix() recycle values silently.
stopifnot(
  length(th) > 0,
  length(cell_text) %% length(th) == 0
)
content <- as.data.frame(
  matrix(cell_text, ncol = length(th), byrow = TRUE),
  stringsAsFactors = FALSE
)
names(content) <- th
# final result
content
##                 Title                        Author Year Price
## 1      R for Everyone                  Jared Lander 2013 25.99
## 2 How Not to be Wrong              Jordan Ellenberg 2014 12.99
## 3               Relic Douglas Preston;Lincoln Child 2003  8.99
library(XML)
library(RCurl)
library(plyr)
# Download the XML version of the bookshelf and parse it into a document.
xml_file <- "https://raw.githubusercontent.com/christinataylor/IS/master/books.xml"
xData <- getURL(xml_file)
doc <- xmlParse(xData)
# Convert the document to a nested list with attributes left out, then have
# ldply() build a data frame from each top-level element and combine them.
# As the printed result shows, a repeated child element (a second author)
# lands in its own column (author.1) rather than being merged.
book_list <- xmlToList(doc, addAttributes = FALSE)
ldply(book_list, data.frame)
##    .id               title           author year price      author.1
## 1 book      R for Everyone     Jared Lander 2013 25.99          <NA>
## 2 book How Not to Be Wrong Jordan Ellenberg 2014 12.99          <NA>
## 3 book               Relic  Douglas Preston 2003  8.99 Lincoln Child
#library("jsonlite")
#json_data <- fromJSON(json_file, simplifyDataFrame = TRUE)
#Note: the jsonlite approach above does not descend to the lowest level of nesting in the JSON file, so RJSONIO is used below instead
library("RJSONIO")
# Location of the JSON version of the bookshelf.
json_file <- "https://raw.githubusercontent.com/christinataylor/IS/master/books.json"
# Parse into a nested list, naming the package explicitly so it is clear
# which fromJSON() is meant.
json_data <- RJSONIO::fromJSON(json_file)
# Flatten the nested list all the way down to a single vector (unlist() is
# recursive and keeps names by default).
rows <- unlist(json_data)
# Only a flat laundry list at this point - it needs further reshaping before
# any downstream analysis.
as.data.frame(rows)
##                   rows
## 1       R for Everyone
## 2         Jared Lander
## 3                 2013
## 4                25.99
## 5  How Not to be Wrong
## 6     Jordan Ellenberg
## 7                 2014
## 8                12.99
## 9                Relic
## 10     Douglas Preston
## 11       Lincoln Child
## 12                2003
## 13                8.99