HTML data
library(XML)
# Path to the hand-written HTML version of the book table.
htmldat <- "C:\\Users\\Andrew\\Desktop\\Cuny\\Data Acquisition\\Week 7\\IS607_W8_AG.html"

# The HTML file is a flat table written from scratch, so no element
# hierarchy had to be designed. Writing it by hand takes a while, but it
# reads back as a good table. readHTMLTable() returns a list of tables.
htmltable <- readHTMLTable(doc = htmldat)
print(htmltable)
## $`NULL`
## Title Authors Attributes
## 1 Harlot's Ghost Norman Mailer espionage
## 2 Harlot's Ghost Norman Mailer American literature
## 3 Rabbit, Run John Updike escape
## 4 Rabbit, Run John Updike American literature
## 5 Reading Myself and Others Philip Roth Controversies
## 6 Reading Myself and Others Philip Roth American literature
## 7 Reading Myself and Others Martin Asher Controversies
## 8 Reading Myself and Others Martin Asher American literature
XML data
library(XML)
# Parse the XML version of the book list into an internal document.
xmldat <- xmlParse(file = "C:\\Users\\Andrew\\Desktop\\Cuny\\Data Acquisition\\Week 7\\IS607_W8_AG.xml")

# The XML package has difficulty converting more complex, hierarchical XML
# into a data frame: nested author elements are concatenated into one cell
# and the attributes on <background> are simply ignored.
xmldataframe <- xmlToDataFrame(doc = xmldat)
print(xmldataframe)
## title authors background
## 1 Harlot's Ghost Norman Mailer
## 2 Rabbit, Run John Updike
## 3 Reading Myself and Others Philip RothMartin Asher
# Rebuild a tidy table from the parsed XML, recovering the data that
# xmlToDataFrame() lost (merged author names, ignored attributes).
favauthors <- data.frame(title = xmldataframe$title)
favauthors$main <- xpathSApply(xmldat, "//main", fun = xmlValue)

# Look up <secondary> inside each <authors> node instead of padding a
# document-wide match with a fixed number of blanks. That keeps every value
# on its own row no matter which (or how many) books have a secondary
# author. Books without one get "", as before.
# NOTE(review): assumes <secondary> is a direct child of <authors>, which is
# what the flattened `authors` column suggests -- confirm against the XML.
favauthors$secondary <- vapply(
  getNodeSet(xmldat, "//authors"),
  function(node) {
    secondary <- xpathSApply(node, "./secondary", xmlValue)
    if (length(secondary) == 0) "" else paste(secondary, collapse = ", ")
  },
  character(1)
)

# The two descriptors are stored as attributes on <background>.
favauthors$firstatt <- xpathSApply(xmldat, "//background", xmlGetAttr, "first")
favauthors$secondatt <- xpathSApply(xmldat, "//background", xmlGetAttr, "second")
favauthors
## title main secondary firstatt
## 1 Harlot's Ghost Norman Mailer espionage
## 2 Rabbit, Run John Updike escape
## 3 Reading Myself and Others Philip Roth Martin Asher controversies
## secondatt
## 1 American literature
## 2 American literature
## 3 American literature
JSON data, all there, but not tidy
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:utils':
##
## View
# Read the JSON version of the book list.
jsondat <- fromJSON(txt = "C:\\Users\\Andrew\\Desktop\\Cuny\\Data Acquisition\\Week 7\\IS607_W8_AG.json")

# JSON data is simpler and flatter, so it is easier to extract. jsonlite
# does a good job converting the data, and especially the attributes, into
# a data frame. Each top-level element becomes a data frame and the pieces
# are stacked row-wise.
json.df <- do.call(
  rbind,
  lapply(jsondat, function(part) data.frame(part, stringsAsFactors = FALSE))
)
print(json.df)
## title authors
## <favorite_writers.1 Harlot's Ghost Norman Mailer
## <favorite_writers.2 Rabbit, Run John Updike
## <favorite_writers.3 Reading Myself and Others Philip Roth, Martin Asher
## attributes
## <favorite_writers.1 espionage, American literature
## <favorite_writers.2 escape, American literature
## <favorite_writers.3 controversies, American literature