Working with HTML, XML, and JSON
library(RCurl)
library(XML)
# Parse the HTML bookshelf table into a data frame, one row per <tr>.
# A cell that holds several <p> children (e.g. a book with more than one
# author) is collapsed into a single ";"-separated string, so no row or
# column position has to be hard-coded.
html_file <- "https://raw.githubusercontent.com/christinataylor/IS/master/books.html"
webpage <- getURL(html_file)
# Split the downloaded text into lines; close the connection straight away.
tc <- textConnection(webpage)
webpage <- readLines(tc)
close(tc)
# The no-op handler discards whatever errors the HTML parser reports.
pagetree <- htmlTreeParse(webpage, error = function(...) {}, useInternalNodes = TRUE)

# Header cells supply the column names and, through their count, the width
# of the table.
th <- xpathSApply(pagetree, "//table[@id='bookshelf']/tr/th", xmlValue)

# Text of a single <td> node: its direct <p> children joined with ";" when
# there are any, otherwise the plain text value of the cell.
cell_text <- function(cell) {
  parts <- unlist(xpathApply(cell, "./p", xmlValue))
  if (length(parts) > 0) {
    paste(parts, collapse = ";")
  } else {
    xmlValue(cell)
  }
}
td <- vapply(
  getNodeSet(pagetree, "//table[@id='bookshelf']/tr/td"),
  cell_text,
  character(1)
)

# Fail loudly instead of letting matrix() recycle values into wrong cells.
stopifnot(length(th) > 0, length(td) %% length(th) == 0)
content <- as.data.frame(
  matrix(td, ncol = length(th), byrow = TRUE),
  stringsAsFactors = FALSE
)
names(content) <- th
# final result
content
## Title Author Year Price
## 1 R for Everyone Jared Lander 2013 25.99
## 2 How Not to be Wrong Jordan Ellenberg 2014 12.99
## 3 Relic Douglas Preston;Lincoln Child 2003 8.99
library(XML)
library(RCurl)
library(plyr)
# Location of the XML version of the book list.
xml_file <- "https://raw.githubusercontent.com/christinataylor/IS/master/books.xml"
# Download the raw XML text and parse it into a document tree.
xData <- getURL(xml_file)
doc <- xmlParse(xData)
# Convert the tree to a nested list with element attributes left out, then
# turn each top-level list element into a data frame and stack the results.
book_list <- xmlToList(doc, addAttributes = FALSE)
ldply(.data = book_list, .fun = data.frame)
## .id title author year price author.1
## 1 book R for Everyone Jared Lander 2013 25.99 <NA>
## 2 book How Not to Be Wrong Jordan Ellenberg 2014 12.99 <NA>
## 3 book Relic Douglas Preston 2003 8.99 Lincoln Child
# Alternative that was tried and set aside:
#   library("jsonlite")
#   json_data <- fromJSON(json_file, simplifyDataFrame = TRUE)
# Per the original note, that method will not go through the lowest level
# in the JSON file.
library("RJSONIO")
json_file <- "https://raw.githubusercontent.com/christinataylor/IS/master/books.json"
# Read the JSON into nested lists, then flatten every level into one vector
# (unlist() recurses and keeps names by default).
json_data <- fromJSON(json_file)
rows <- unlist(json_data)
# Only a flat list of values at this point; more transformation is needed
# before downstream analysis. The variable name `rows` becomes the column
# name of the printed data frame.
as.data.frame(rows)
## rows
## 1 R for Everyone
## 2 Jared Lander
## 3 2013
## 4 25.99
## 5 How Not to be Wrong
## 6 Jordan Ellenberg
## 7 2014
## 8 12.99
## 9 Relic
## 10 Douglas Preston
## 11 Lincoln Child
## 12 2003
## 13 8.99