The assignment was to do the following:
My approach will be as follows:
##########
# html
##########
# Read books.html
books_html <- read_html("https://raw.githubusercontent.com/mmippolito/cuny/main/data607/assignment7/books.html", encoding = "UTF-8")
df_html <- html_table(books_html, fill = TRUE)[[1]]
##########
# json
##########
# Read books.json
rawjson <- fromJSON("https://raw.githubusercontent.com/mmippolito/cuny/main/data607/assignment7/books.json", simplifyVector = TRUE)
# Extract variable names from json, remove pagemap since it's a nested list that we don't care about
vars <- names(rawjson[["books"]][[1]][["book"]])
# Create data frame
df_json <- data.frame(matrix(ncol = 6, nrow = 3))
colnames(df_json) <- vars
# Extract each book as a separate elemnt
tmpbook <- lapply(rawjson[["books"]], "[[", "book")
# Walk each variable, assigning each one to a column of the empty data frame
for (v in vars) {
# Extract this column to a temporary variable
tmpelement <- sapply(tmpbook, "[[", v)
# If this element is a list, collapse it into a single value
if (typeof(tmpelement) == "list") {
df_json[v] <- sapply(tmpelement, paste, collapse = ", ")
}
else {
df_json[v] <- tmpelement
}
}
##########
# xml
##########
# Retrieve and read xml doc
rawxml <- getURL("https://raw.githubusercontent.com/mmippolito/cuny/main/data607/assignment7/books.xml")
# Parse the xml doc
xmldoc <- xmlParse(rawxml)
# Do an in initial pass to convert to a data frame; the authors column will require further handling
df_xml <- xmlToDataFrame(nodes = getNodeSet(xmldoc, "//book"))
# Handle authors, since there can be multiple authors; first convert the whole xml doc to a list
tmplist <- xmlToList(xmldoc, "//book")
# Extract each book into a separate list
tmpbook <- lapply(rawjson[["books"]], "[[", "book")
# Extract the authors field into its own list
tmpauth <- sapply(tmpbook, "[[", "authors")
# Collapse the authors field into a vector and add it to the data frame
df_xml[v] <- sapply(tmpauth, paste, collapse = ", ")
# html
kable(df_html, caption =
"<i><font color=#000000><b>Table 1.</b> Data frame from HTML document</font></i>") %>%
kable_styling(latex_options = "striped")
| id | authors | title | publish_year | pages | notable_feature |
|---|---|---|---|---|---|
| book01 | Carlos Baker | Ernest Hemingway: A Life Story | 1969 | 697 | Often characterized as the definitive Hemingway biography. |
| book02 | Nancy R. Comley, Robert Scholes | Hemingway’s Genders: Rereading the Hemingway Text | 1996 | 168 | Attempts to dispel the notion that Hemingway’s works were abjectly heterosexual and masculine and without any nuance in terms of sexuality. |
| book03 | Kenneth S. Lynn | Hemingway | 1995 | 712 | Reveals the troubled life of a man previously portrayed as stolidness. |
# json
kable(df_html, caption =
"<i><font color=#000000><b>Table 2.</b> Data frame from JSON document</font></i>") %>%
kable_styling(latex_options = "striped")
| id | authors | title | publish_year | pages | notable_feature |
|---|---|---|---|---|---|
| book01 | Carlos Baker | Ernest Hemingway: A Life Story | 1969 | 697 | Often characterized as the definitive Hemingway biography. |
| book02 | Nancy R. Comley, Robert Scholes | Hemingway’s Genders: Rereading the Hemingway Text | 1996 | 168 | Attempts to dispel the notion that Hemingway’s works were abjectly heterosexual and masculine and without any nuance in terms of sexuality. |
| book03 | Kenneth S. Lynn | Hemingway | 1995 | 712 | Reveals the troubled life of a man previously portrayed as stolidness. |
# xml
kable(df_html, caption =
"<i><font color=#000000><b>Table 3.</b> Data frame from XML document</font></i>") %>%
kable_styling(latex_options = "striped")
| id | authors | title | publish_year | pages | notable_feature |
|---|---|---|---|---|---|
| book01 | Carlos Baker | Ernest Hemingway: A Life Story | 1969 | 697 | Often characterized as the definitive Hemingway biography. |
| book02 | Nancy R. Comley, Robert Scholes | Hemingway’s Genders: Rereading the Hemingway Text | 1996 | 168 | Attempts to dispel the notion that Hemingway’s works were abjectly heterosexual and masculine and without any nuance in terms of sexuality. |
| book03 | Kenneth S. Lynn | Hemingway | 1995 | 712 | Reveals the troubled life of a man previously portrayed as stolidness. |
# Compare, using the html data frame as the basis for comparison
vars = colnames(df_html)
for (v in vars) {
# Convert each variable into a vector
m_html <- as.matrix(df_html[v])
m_json <- as.matrix(df_json[v])
m_xml <- as.matrix(df_json[v])
# Walk each element in this vector, using the html vector as a basis
print(paste("Comparing variable ", v))
for (i in 1:length(m_html)) {
# Print row #
print(paste(" Row #", i))
# Compare the html vector to the corresponding json vector
if (m_html[i] == m_json[i]) {
msg <- "equivalent"
}
else {
msg <- "WARNING! NOT EQUIVALENT!"
}
print(paste(" HTML -> JSON: ", msg))
# Compare the html vector to the corresponding xml vector
if (m_html[i] == m_xml[i]) {
msg <- "equivalent"
}
else {
msg <- "WARNING! NOT EQUIVALENT!"
}
print(paste(" HTML -> XML: ", msg))
}
}
## [1] "Comparing variable id"
## [1] " Row # 1"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] " Row # 2"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] " Row # 3"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] "Comparing variable authors"
## [1] " Row # 1"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] " Row # 2"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] " Row # 3"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] "Comparing variable title"
## [1] " Row # 1"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] " Row # 2"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] " Row # 3"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] "Comparing variable publish_year"
## [1] " Row # 1"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] " Row # 2"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] " Row # 3"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] "Comparing variable pages"
## [1] " Row # 1"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] " Row # 2"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] " Row # 3"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] "Comparing variable notable_feature"
## [1] " Row # 1"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] " Row # 2"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
## [1] " Row # 3"
## [1] " HTML -> JSON: equivalent"
## [1] " HTML -> XML: equivalent"
There were some differences in how the parsers initially imported the data. This was mainly due to the fact that the authors field could contain multiple values. This required more wrangling for the code that parsed the JSON and XML files.
The end result was three identical data frames having the same format, verified to be identical programmatically.