library(RCurl)
library(XML)
library(jsonlite)
library(dplyr)
# Session setup ----
# The directory below is specific to the original author's machine. Only
# switch to it when it actually exists, so the script does not abort with
# "cannot change working directory" when it is run anywhere else.
week7_dir <- "C:/Users/bhao/Google Drive/CUNY/git/DATA607/Week7"
if (dir.exists(week7_dir)) {
  setwd(week7_dir)
}
# Use a 100-character console width for printed output.
options(width = 100)

HTML Table

# Download the HTML version of the book list. Note that `url_html` receives
# the result of getURL(), i.e. the fetched document, not the address itself.
url_html <- getURL("https://raw.githubusercontent.com/haobruce/CUNY/master/DATA607/Week7/Hao-Week7_HTML.html")
# readHTMLTable() returns a list of tables; keep the first one. `FALSE` is
# spelled out because `F` is a plain variable that can be reassigned.
books_html <- readHTMLTable(url_html, stringsAsFactors = FALSE)[[1]]
books_html
##                      BookName   AuthorName1   AuthorName2       Genre            Publisher
## 1 Applied Predictive Modeling      Max Kuhn Kjell Johnson Non-fiction             Springer
## 2   Unsoulded (Cradle Book 1)    Will Wight                   Fantasy               Amazon
## 3             The Forever War Joe Haldemann                    Sci-fi St. Martin's Griffin

XML

# Fetch the XML version of the book list, parse it into a document object,
# and convert that document to a data frame.
url_xml <- RCurl::getURL("https://raw.githubusercontent.com/haobruce/CUNY/master/DATA607/Week7/Hao-Week7_XML.xml")
data_xml <- XML::xmlParse(url_xml)
books_xml <- XML::xmlToDataFrame(data_xml)
books_xml
##                      bookName   authorName1   authorName2       genre            publisher
## 1 Applied Predictive Modeling      Max Kuhn Kjell Johnson Non-fiction             Springer
## 2    Unsouled (Cradle Book 1)    Will Wight          <NA>     Fantasy               Amazon
## 3             The Forever War Joe Haldemann          <NA>      Sci-fi St. Martin's Griffin
# Variant of the XML input that has an extra parent node: after parsing,
# descend to the first child of the root and convert that node instead.
url_xml2 <- RCurl::getURL("https://raw.githubusercontent.com/haobruce/CUNY/master/DATA607/Week7/Hao-Week7_XML2.xml")
data_xml2 <- XML::xmlRoot(XML::xmlParse(url_xml2))[[1]]
books_xml2 <- XML::xmlToDataFrame(data_xml2)
books_xml2
##                      bookName   authorName1   authorName2       genre            publisher
## 1 Applied Predictive Modeling      Max Kuhn Kjell Johnson Non-fiction             Springer
## 2    Unsouled (Cradle Book 1)    Will Wight          <NA>     Fantasy               Amazon
## 3             The Forever War Joe Haldemann          <NA>      Sci-fi St. Martin's Griffin

JSON

# Fetch the JSON version of the book list and keep the first element of the
# parsed result.
url_json <- RCurl::getURL("https://raw.githubusercontent.com/haobruce/CUNY/master/DATA607/Week7/Hao-Week7_JSON.json")
books_json <- jsonlite::fromJSON(txt = url_json)[[1]]
books_json
##                      bookName   authorName1   authorName2       genre            publisher
## 1 Applied Predictive Modeling      Max Kuhn Kjell Johnson Non-fiction             Springer
## 2    Unsouled (Cradle Book 1)    Will Wight          <NA>     Fantasy               Amazon
## 3             The Forever War Joe Haldemann          <NA>      Sci-fi St. Martin's Griffin

Are the three data frames identical?