## Loading required package: XML
## Loading required package: RJSONIO
## Loading required package: RCurl
## Loading required package: bitops

HTML

# Download the raw HTML page from GitHub as text
books_html <- getURL("https://raw.githubusercontent.com/jasonjgy2000/IS607/master/Assignments/Week%207/books.html")
# readHTMLTable() returns a list of tables; stack them row-wise into a
# single data frame
html_data_set <- do.call("rbind", readHTMLTable(books_html))
# Drop the row names picked up during the bind so rows get the default
# automatic numbering
rownames(html_data_set) <- NULL
html_data_set

##                          Title                      Authors Year Published
## 1 Doing Bayesian Data Analysis                John Kruschke           2015
## 2       Further Pure 1 for OCR Douglas Quadling, Hugh Neill           2004
## 3                Into Thin Air Jon Krakauer, Randy Rackliff           1999
##                    Publisher    ISBN-10
## 1             Academic Press 0124058884
## 2 Cambridge University Press 0521548985
## 3                     Anchor 0385494785

XML

# Download the raw XML document from GitHub as text
books_xml <- getURL("https://raw.githubusercontent.com/jasonjgy2000/IS607/master/Assignments/Week%207/books.xml")
# Parse the text into an XML document object
xml_doc <- xmlParse(books_xml)
# Take the document's root element and convert it to a data frame
xml_data_set <- xmlToDataFrame(xmlRoot(xml_doc))
xml_data_set

##                          Title                      Authors Year_Published
## 1 Doing Bayesian Data Analysis                John Kruschke           2015
## 2       Further Pure 1 for OCR Douglas Quadling, Hugh Neill           2004
## 3                Into Thin Air Jon Krakauer, Randy Rackliff           1999
##                    Publisher    ISBN-10
## 1             Academic Press 0124058884
## 2 Cambridge University Press 0521548985
## 3                     Anchor 0385494785

JSON

# load json file from github
# Fetch the JSON file itself (not books.xml) so this chunk matches its heading.
# NOTE(review): assumes books.json sits next to books.html / books.xml in the
# repository -- confirm the path.
url <- getURL("https://raw.githubusercontent.com/jasonjgy2000/IS607/master/Assignments/Week%207/books.json")
# Parse the downloaded text instead of a local "books.json", so the chunk does
# not depend on a file in the working directory. asText = TRUE tells
# fromJSON() that the argument is JSON content rather than a file name.
json_data_set <- fromJSON(url, asText = TRUE)
# unlisting data set from outer list: flatten every entry of the first
# top-level element. lapply() always returns a list, which do.call() requires;
# sapply() may simplify equally sized entries to a matrix and break the next
# step.
json_data_set <- lapply(json_data_set[[1]], unlist)
# bind the flattened entries row-wise; as the printed result shows, this
# yields a character matrix rather than a data frame
json_data_set <- do.call("rbind", json_data_set)
json_data_set

##      Title                          Authors                       
## [1,] "Doing Bayesian Data Analysis" "John Kruschke"               
## [2,] "Further Pure 1 for OCR"       "Douglas Quadling, Hugh Neill"
## [3,] "Into Thin Air"                "Jon Krakauer, Randy Rackliff"
##      Year_Published Publisher                    ISBN-10     
## [1,] "2015"         "Academic Press"             "0124058884"
## [2,] "2004"         "Cambridge University Press" "0521548985"
## [3,] "1999"         "Anchor"                     "0385494785"

Conclusion

Even though each file type required different processing, the data sets produced from all three files contain the same book records.

Working with XML and JSON

Jason Joseph

October 17, 2015

HTML

XML

JSON

Conclusion