IS607 W8 AGoldberg

HTML data

library(XML)

# Path to the HTML version of the book table.
htmldat <- "C:\\Users\\Andrew\\Desktop\\Cuny\\Data Acquisition\\Week 7\\IS607_W8_AG.html"

# readHTMLTable() returns a list of the tables found in the document.
htmltable <- readHTMLTable(doc = htmldat)

# Note: without having to create a hierarchy, a good table can be written from
# scratch in HTML, although doing so takes a while.
htmltable

## $`NULL`
##                       Title       Authors          Attributes
## 1            Harlot's Ghost Norman Mailer           espionage
## 2            Harlot's Ghost Norman Mailer American literature
## 3               Rabbit, Run   John Updike              escape
## 4               Rabbit, Run   John Updike American literature
## 5 Reading Myself and Others   Philip Roth       Controversies
## 6 Reading Myself and Others   Philip Roth American literature
## 7 Reading Myself and Others  Martin Asher       Controversies
## 8 Reading Myself and Others  Martin Asher American literature

XML data

library(XML)

# Parse the XML version of the book data.
xmldat <- xmlParse(file = "C:\\Users\\Andrew\\Desktop\\Cuny\\Data Acquisition\\Week 7\\IS607_W8_AG.xml")

# Flatten the parsed document into a data frame.
xmldataframe <- xmlToDataFrame(doc = xmldat)

# Note: the XML package has difficulty converting more complex, hierarchical
# XML into a data table. Multiple attributes do not fit nicely and are simply
# ignored in this case.
xmldataframe

##                       title                 authors background
## 1            Harlot's Ghost           Norman Mailer           
## 2               Rabbit, Run             John Updike           
## 3 Reading Myself and Others Philip RothMartin Asher

# Add the data that xmlToDataFrame() lost and tidy up.
#
# Every column is extracted per book node, so the values stay aligned with the
# titles even when a book has no secondary author (or several). The previous
# version padded the secondary column with two hard-coded "" entries, which is
# only correct when exactly the last book has a secondary author.

# Element children of the root: one node per book, in document order.
books <- getNodeSet(xmldat, "/*/*")
# Guard against the node list and the flattened data frame disagreeing.
stopifnot(length(books) == nrow(xmldataframe))

# Text of all nodes matching `path` below `node`, joined with ", ".
# Returns "" when nothing matches.
node_text <- function(node, path) {
  paste(unlist(xpathSApply(node, path, xmlValue)), collapse = ", ")
}

# Attribute `name` of the first <background> node below `node`.
# Returns NA when the node or the attribute is missing, so the resulting
# column is always a plain character vector.
background_attr <- function(node, name) {
  found <- getNodeSet(node, ".//background")
  if (length(found) == 0L) {
    return(NA_character_)
  }
  xmlGetAttr(found[[1L]], name, default = NA_character_)
}

# stringsAsFactors = FALSE matches how json.df is built later in the file.
favauthors <- data.frame(title = xmldataframe$title, stringsAsFactors = FALSE)
favauthors$main <- vapply(books, node_text, character(1), path = ".//main")
favauthors$secondary <- vapply(books, node_text, character(1), path = ".//secondary")
favauthors$firstatt <- vapply(books, background_attr, character(1), name = "first")
favauthors$secondatt <- vapply(books, background_attr, character(1), name = "second")
favauthors

##                       title          main    secondary      firstatt
## 1            Harlot's Ghost Norman Mailer                  espionage
## 2               Rabbit, Run   John Updike                     escape
## 3 Reading Myself and Others   Philip Roth Martin Asher controversies
##             secondatt
## 1 American literature
## 2 American literature
## 3 American literature

JSON data, all there, but not tidy

library(jsonlite)

## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:utils':
## 
##     View

# Read the JSON version of the book data.
jsondat <- fromJSON("C:\\Users\\Andrew\\Desktop\\Cuny\\Data Acquisition\\Week 7\\IS607_W8_AG.json")
# Convert each top-level element of the parsed JSON to a data frame (keeping
# strings as character) and stack the results row-wise. The row names in the
# printed result carry the element name as a prefix.
json.df <- do.call("rbind", lapply(jsondat, data.frame, stringsAsFactors = FALSE))


# JSON data is simpler, flatter, and easier to extract. jsonlite does a good
# job of converting the data, and especially the attributes, into a data frame.
# Multiple authors/attributes end up comma-separated within a single cell.
json.df

##                                         title                   authors
## <favorite_writers.1            Harlot's Ghost             Norman Mailer
## <favorite_writers.2               Rabbit, Run               John Updike
## <favorite_writers.3 Reading Myself and Others Philip Roth, Martin Asher
##                                             attributes
## <favorite_writers.1     espionage, American literature
## <favorite_writers.2        escape, American literature
## <favorite_writers.3 controversies, American literature

IS607 W8 AGoldberg

Andrew Goldberg

October 17, 2015