In this assignment, I will read in an HTML file, an XML file, and a JSON file that I wrote, each of which lists three of my favorite books. I will then use R to build a meaningful table from the data stored in each format.
suppressWarnings(suppressMessages(library(XML)))
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(RCurl)))
suppressWarnings(suppressMessages(library(RJSONIO)))
# Download the HTML version of the book list. Despite its name, `htmllink`
# holds the text of the document returned by getURL(), not a link or path.
htmllink <- getURL("https://raw.githubusercontent.com/swigodsky/DATA-607/master/sarahfavbooks.html")
# asText = TRUE states explicitly that the first argument is the markup
# itself, so the parser does not first try to interpret it as a file name.
bookhtml <- htmlParse(file = htmllink, asText = TRUE)
# Create a pair of functions that share one private character vector.
#
# Returns a list with two elements:
#   th       - function(node, ...) that appends xmlValue(node) to the
#              shared vector; intended to be supplied as a parser handler.
#   returnth - function() that returns everything appended so far, in the
#              order it was appended.
# Each call to getth() starts with its own empty vector.
getth <- function() {
  collected <- character()

  record_th <- function(node, ...) {
    # `<<-` writes to `collected` in the enclosing getth() call frame, so
    # the values persist between handler invocations.
    collected <<- c(collected, xmlValue(node))
  }

  fetch_collected <- function() {
    collected
  }

  list(th = record_th, returnth = fetch_collected)
}
# Create a fresh collector and run the parser with it as the handler set;
# afterwards h1$returnth() yields the text of every <th> cell encountered.
h1 <- getth()
# `htmllink` contains the HTML text itself (the result of getURL()), so
# asText = TRUE tells the parser not to treat it as a file name.
# The call is left un-assigned, so its return value is printed.
htmlTreeParse(file = htmllink, handlers = h1, asText = TRUE)
## $th
## function (node, ...)
## {
## th_container <<- c(th_container, xmlValue(node))
## }
## <environment: 0x000000001c6c5de8>
##
## $returnth
## function ()
## th_container
## <environment: 0x000000001c6c5de8>
# Reshape the flat vector of <th> values collected by `h1` into a data frame.
# The first `n_cols` values are used as the column headings; every following
# run of `n_cols` consecutive values becomes one row of the table.
n_cols <- 6L
th_values <- h1$returnth()

# Fail loudly on a missing or ragged table instead of silently dropping a
# trailing partial row or mislabelling the columns.
if (length(th_values) <= n_cols || length(th_values) %% n_cols != 0L) {
  stop(
    "Expected a header row plus complete rows of ", n_cols,
    " <th> cells, but found ", length(th_values), " cells.",
    call. = FALSE
  )
}

header_idx <- seq_len(n_cols)
# byrow = TRUE because the values are stored one table row after another.
bookhtml <- as.data.frame(
  matrix(th_values[-header_idx], ncol = n_cols, byrow = TRUE),
  stringsAsFactors = FALSE
)
colnames(bookhtml) <- th_values[header_idx]
bookhtml.df <- bookhtml
bookhtml.df
## Book Author 1
## 1 The Goldfinch Donna Tartt
## 2 The Amazing Adventures of Kavalier & Clay Michael Chabon
## 3 I Am Malala Malala Yousafzai
## Author 2 Publisher ISBN
## 1 NULL Back Bay Books 978-0316055444
## 2 NULL Random House Publishing Group 978-0812983586
## 3 Christina Lamb Little, Brown and Company 978-0316322423
## Award
## 1 Pulitzer Prize for Fiction
## 2 Pulitzer Prize for Fiction
## 3 NULL
# Download the XML version of the book list. `xmllink` holds the document
# text returned by getURL(), so it is parsed with asText = TRUE rather than
# being interpreted as a file name.
xmllink <- getURL("https://raw.githubusercontent.com/swigodsky/DATA-607/master/sarahfavbooks2.xml")
bookxml <- xmlParse(file = xmllink, asText = TRUE)

# One row per child of the root element, one column per child element.
bookxml.df <- xmlToDataFrame(xmlRoot(bookxml))

# The author names are read from the attributes of each <author> element.
# NOTE(review): assumes every <author> carries the same two attributes, so
# that xpathSApply() simplifies to a matrix with one column per book and one
# row per attribute, and that there is exactly one <author> per book.
authors <- xpathSApply(bookxml, "//author", xmlAttrs)

# Take whole matrix rows instead of indexing books 1..3 individually, so the
# code works for any number of books; unname() drops any dimnames carried
# over from the matrix.
bookxml.df$author1 <- unname(authors[1, ])
bookxml.df$author2 <- unname(authors[2, ])

# Select by name: keeps the intended column order and drops the original
# `author` column produced by xmlToDataFrame().
bookxml.df <- bookxml.df %>%
  select(title, author1, author2, publisher, ISBN, award)
bookxml.df
## title author1
## 1 The Goldfinch Donna Tartt
## 2 The Amazing Adventures of Kavalier & Clay Michael Chabon
## 3 I am Malala Malala Yousafzai
## author2 publisher ISBN
## 1 NULL Back Bay Books 978-0316055444
## 2 NULL Random House Publishing Group 978-0812983586
## 3 Christina Lamb Little, Brown and Company 978-0316322423
## award
## 1 Pulizter Prize for Fiction
## 2 Pulizter Prize for Fiction
## 3 NULL
# Read the JSON version of the book list. JSON nulls are read as NA, and
# simplify = FALSE keeps the parsed result as nested lists.
bookjson <- fromJSON(
  "https://raw.githubusercontent.com/swigodsky/DATA-607/master/sarahfavbooks.json",
  nullValue = NA,
  simplify = FALSE
)

# Convert each top-level element into a data frame, then stack them.
book_rows <- lapply(bookjson, data.frame, stringsAsFactors = FALSE)
bookjson.df <- do.call("rbind", book_rows)

# Label the second and third columns as the two author columns.
colnames(bookjson.df)[2:3] <- c("author1", "author2")

# Show missing values as the literal string "NULL".
bookjson.df[is.na(bookjson.df)] <- "NULL"
bookjson.df
## title author1
## 1 The Goldfinch Donna Tartt
## 2 The Amazing Adventures of Kavalier & Clay Michael Chabon
## 3 I am Malala Malala Yousafzai
## author2 publisher ISBN
## 1 NULL Back Bay Books 978-0316055444
## 2 NULL Random House Publishing Group 978-0812983586
## 3 Christina Lamb Little, Brown and Company 978-0316322423
## award
## 1 Pulizter Prize for Fiction
## 2 Pulizter Prize for Fiction
## 3 NULL