# Data files can be downloaded from:
# https://github.com/wheremagichappens/an.dy/blob/master/DATA607/Assignment7/books.html
# https://github.com/wheremagichappens/an.dy/blob/master/DATA607/Assignment7/books.xml
# https://github.com/wheremagichappens/an.dy/blob/master/DATA607/Assignment7/books.json


# HTML ----
# Parse the local HTML copy of the books table and turn it into a data frame.
# NOTE(review): htmlParse()/readHTMLTable() and datatable() are not base R;
# presumably the XML and DT packages are attached before this point - confirm.
file_path <- "file:///C:/Users/ahwang/Documents/books.html"
html <- htmlParse(file_path)
# The argument is `stringsAsFactors` (plural "strings"). The earlier spelling
# `stringAsFactors` was not a recognised option, so it had no effect and the
# text columns came back as factors -- the all.equal() output recorded further
# down ("'current' is not a factor") was captured in that state and should be
# regenerated after this fix.
html_table <- readHTMLTable(html, stringsAsFactors = FALSE)
html_table
## $`NULL`
##                                      Title
## 1                 The Death of Ivan Ilyich
## 2 Harry Potter and the Philosopher's Stone
## 3                              Let it Snow
##                                        Author   Genre Country_Origin
## 1                                 Leo Tolstoy Fiction         Russia
## 2                               J. K. Rowling Fantasy United Kingdom
## 3 John Green, Maureen Johnson, Lauren Myracle Romance  United States
##   Year_Published
## 1           1886
## 2           1997
## 3           2008
# The printed result is a list whose single element is named `NULL`, so the
# table is extracted by position rather than by name.
html_table_clean <- html_table[[1]]
html_df <- as.data.frame(html_table_clean)
datatable(html_df)
# XML ----
# Read the XML copy of the books data: parse the document, flatten it into a
# data frame with text kept as character, then show it as an interactive table.
file_path <- "file:///C:/Users/ahwang/Documents/books.xml"
xml <- xmlParse(file = file_path)
xml_df <- xmlToDataFrame(doc = xml, stringsAsFactors = FALSE)
datatable(data = xml_df)
# JSON ----
# Read the JSON copy of the books data. The first element of the parsed result
# is treated as a collection of records: each record becomes a one-row data
# frame (strings kept as character) and the rows are stacked with rbind().
file_path <- "file:///C:/Users/ahwang/Documents/books.json"
json <- fromJSON(file_path)
json_df <- do.call(
  what = rbind,
  args = lapply(X = json[[1]], FUN = data.frame, stringsAsFactors = FALSE)
)
datatable(data = json_df)

# Check whether the three data frames are equal.

# Pairwise comparison of the three data frames.
# all.equal() describes each difference it finds as a character string; the
# `## ` lines below each call are the output recorded from an earlier run.

# HTML vs XML: in the recorded run every html_df column was a factor while the
# matching xml_df column was not.
all.equal(html_df, xml_df)
## [1] "Component \"Title\": 'current' is not a factor"         
## [2] "Component \"Author\": 'current' is not a factor"        
## [3] "Component \"Genre\": 'current' is not a factor"         
## [4] "Component \"Country_Origin\": 'current' is not a factor"
## [5] "Component \"Year_Published\": 'current' is not a factor"
# HTML vs JSON: same factor-versus-non-factor mismatch on every column.
all.equal(html_df, json_df)
## [1] "Component \"Title\": 'current' is not a factor"         
## [2] "Component \"Author\": 'current' is not a factor"        
## [3] "Component \"Genre\": 'current' is not a factor"         
## [4] "Component \"Country_Origin\": 'current' is not a factor"
## [5] "Component \"Year_Published\": 'current' is not a factor"
# XML vs JSON: only Year_Published differs -- character in xml_df, numeric in
# json_df.
all.equal(xml_df, json_df)
## [1] "Component \"Year_Published\": Modes: character, numeric"              
## [2] "Component \"Year_Published\": target is character, current is numeric"
# identical() is the strict check and returns a single TRUE/FALSE per pair.
identical(html_df, xml_df)
## [1] FALSE
identical(html_df, json_df)
## [1] FALSE
identical(xml_df, json_df)
## [1] FALSE

# They are not identical data frames: the recorded output above shows that the
# column types differ between the sources.