# Data files can be downloaded from:
# https://github.com/wheremagichappens/an.dy/blob/master/DATA607/Assignment7/books.html
# https://github.com/wheremagichappens/an.dy/blob/master/DATA607/Assignment7/books.xml
# https://github.com/wheremagichappens/an.dy/blob/master/DATA607/Assignment7/books.json
# HTML ----
# Parse the local HTML file, then read the tables it contains into a list of
# data frames and print that list.
file_path <- "file:///C:/Users/ahwang/Documents/books.html"
html <- htmlParse(file_path)
# The argument must be spelled `stringsAsFactors`. The earlier spelling
# `stringAsFactors` had no effect: the recorded comparison output further down
# shows the html_df columns still came back as factors. With the correct
# spelling the text columns are kept as character vectors, so that recorded
# factor-mismatch output will differ when the script is re-run.
html_table <- readHTMLTable(html, stringsAsFactors = FALSE)
html_table
## $`NULL`
## Title
## 1 The Death of Ivan Ilyich
## 2 Harry Potter and the Philosopher's Stone
## 3 Let it Snow
## Author Genre Country_Origin
## 1 Leo Tolstoy Fiction Russia
## 2 J. K. Rowling Fantasy United Kingdom
## 3 John Green, Maureen Johnson, Lauren Myracle Romance United States
## Year_Published
## 1 1886
## 2 1997
## 3 2008
# The printed list above shows its table under the name `NULL`; take the first
# list element so we work with the table itself rather than the wrapping list,
# coerce it to a data frame, and render it as an interactive table.
html_table_clean <- html_table[[1L]]
html_df <- as.data.frame(x = html_table_clean)
datatable(html_df)
# XML ----
# Parse the local XML file, flatten it into a data frame with text kept as
# character (not factor), and render it as an interactive table.
file_path <- "file:///C:/Users/ahwang/Documents/books.xml"
xml <- xmlParse(file_path)
xml_df <- xmlToDataFrame(
  xml,
  stringsAsFactors = FALSE
)
datatable(xml_df)
# JSON ----
# Read the local JSON file, turn each entry of its first element into a
# one-row data frame (text kept as character), and stack those rows into a
# single data frame before rendering it as an interactive table.
file_path <- "file:///C:/Users/ahwang/Documents/books.json"
json <- fromJSON(file_path)
json_df <- do.call(
  what = rbind,
  args = lapply(X = json[[1]], FUN = data.frame, stringsAsFactors = FALSE)
)
datatable(json_df)
# Check whether the three data frames are equal.
# Pairwise comparison of the three data frames. all.equal() describes the
# differences it finds; the lines starting with `##` are output recorded from
# a previous run, not code.
# HTML vs XML: every column of html_df is a factor while xml_df's is not.
all.equal(html_df, xml_df)
## [1] "Component \"Title\": 'current' is not a factor"
## [2] "Component \"Author\": 'current' is not a factor"
## [3] "Component \"Genre\": 'current' is not a factor"
## [4] "Component \"Country_Origin\": 'current' is not a factor"
## [5] "Component \"Year_Published\": 'current' is not a factor"
# HTML vs JSON: the same factor mismatch on every column.
all.equal(html_df, json_df)
## [1] "Component \"Title\": 'current' is not a factor"
## [2] "Component \"Author\": 'current' is not a factor"
## [3] "Component \"Genre\": 'current' is not a factor"
## [4] "Component \"Country_Origin\": 'current' is not a factor"
## [5] "Component \"Year_Published\": 'current' is not a factor"
# XML vs JSON: only Year_Published differs (character vs numeric).
all.equal(xml_df, json_df)
## [1] "Component \"Year_Published\": Modes: character, numeric"
## [2] "Component \"Year_Published\": target is character, current is numeric"
# identical() gives a single TRUE/FALSE answer for exact equality of each pair.
identical(html_df, xml_df)
## [1] FALSE
identical(html_df, json_df)
## [1] FALSE
identical(xml_df, json_df)
## [1] FALSE
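# They are not identical data frames. Per the recorded output, the HTML data
# frame stores its columns as factors while the XML and JSON data frames do
# not, and Year_Published is character in the XML data frame but numeric in
# the JSON data frame.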