This assignment loads book information from three different file formats (HTML, XML, and JSON) into R data frames and compares them.
# Session-wide text settings: make UTF-8 the default encoding option and
# switch every locale category to en_US.UTF-8 before any text is read.
options(encoding = "UTF-8")
Sys.setlocale(category = "LC_ALL", locale = "en_US.UTF-8")
## [1] "en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8"
library(XML)
library(jsonlite)
library(RCurl)
Three books on interdisciplinary thinking and consciousness:
# ---- HTML source ----
# Fetch the raw HTML text from GitHub and parse it as a UTF-8 document.
html_url <- "https://raw.githubusercontent.com/Vaene/Cuny-607-Assignments/main/assignment7/books.html"
html_content <- getURL(html_url)
html_doc <- htmlParse(html_content, encoding = "UTF-8")
# Keep only the first table found in the document, with text left as character.
html_df <- readHTMLTable(html_doc, stringsAsFactors = FALSE)[[1]]
# Impose the column names shared by all three data frames.
names(html_df) <- c("Title", "Authors", "Year", "Pages", "Award")
# The table cells arrive as text; turn both numeric columns into integers.
html_df[c("Year", "Pages")] <- lapply(html_df[c("Year", "Pages")], as.integer)
print("HTML Data Frame:")
## [1] "HTML Data Frame:"
print(html_df)
## Title
## 1 Gödel, Escher, Bach: An Eternal Golden Braid
## 2 The Mind's I
## 3 The Black Swan
## Authors Year Pages Award
## 1 Douglas Hofstadter 1979 777 Pulitzer Prize
## 2 Douglas Hofstadter, Daniel Dennett 1981 501 None
## 3 Nassim Nicholas Taleb 2007 366 None
str(html_df)
## 'data.frame': 3 obs. of 5 variables:
## $ Title : chr "Gödel, Escher, Bach: An Eternal Golden Braid" "The Mind's I" "The Black Swan"
## $ Authors: chr "Douglas Hofstadter" "Douglas Hofstadter, Daniel Dennett" "Nassim Nicholas Taleb"
## $ Year : int 1979 1981 2007
## $ Pages : int 777 501 366
## $ Award : chr "Pulitzer Prize" "None" "None"
# Parse XML file from GitHub
xml_url <- "https://raw.githubusercontent.com/Vaene/Cuny-607-Assignments/main/assignment7/books.xml"
xml_content <- getURL(xml_url)
xml_doc <- xmlParse(xml_content, encoding = "UTF-8")
# One node per <book>; every per-field vector below must line up with these.
book_nodes <- getNodeSet(xml_doc, "//book")
# Extract individual fields (document-wide queries, one value per match)
titles <- xpathSApply(xml_doc, "//book/title", xmlValue)
years <- as.integer(xpathSApply(xml_doc, "//book/year", xmlValue))
pages <- as.integer(xpathSApply(xml_doc, "//book/pages", xmlValue))
awards <- xpathSApply(xml_doc, "//book/award", xmlValue)
# Handle multiple authors - collapse into a single string per book.
# vapply() enforces exactly one character value per book, whereas sapply()
# would silently change its return type on empty or irregular input.
authors <- vapply(book_nodes, function(book) {
  author_nodes <- getNodeSet(book, ".//author")
  paste(vapply(author_nodes, xmlValue, character(1)), collapse = ", ")
}, character(1))
# The fields above are queried independently of each other, so a book that
# lacks one element would shift or shorten that vector and data.frame() could
# recycle it without complaint. Fail loudly instead of building misaligned rows.
stopifnot(
  length(titles) == length(book_nodes),
  length(years) == length(book_nodes),
  length(pages) == length(book_nodes),
  length(awards) == length(book_nodes)
)
# Create data frame
xml_df <- data.frame(
  Title = titles,
  Authors = authors,
  Year = years,
  Pages = pages,
  Award = awards,
  stringsAsFactors = FALSE
)
print("XML Data Frame:")
## [1] "XML Data Frame:"
print(xml_df)
## Title
## 1 Gödel, Escher, Bach: An Eternal Golden Braid
## 2 The Mind's I
## 3 The Black Swan
## Authors Year Pages Award
## 1 Douglas Hofstadter 1979 777 Pulitzer Prize
## 2 Douglas Hofstadter, Daniel Dennett 1981 501 None
## 3 Nassim Nicholas Taleb 2007 366 None
str(xml_df)
## 'data.frame': 3 obs. of 5 variables:
## $ Title : chr "Gödel, Escher, Bach: An Eternal Golden Braid" "The Mind's I" "The Black Swan"
## $ Authors: chr "Douglas Hofstadter" "Douglas Hofstadter, Daniel Dennett" "Nassim Nicholas Taleb"
## $ Year : int 1979 1981 2007
## $ Pages : int 777 501 366
## $ Award : chr "Pulitzer Prize" "None" "None"
# Read JSON file from GitHub
json_url <- "https://raw.githubusercontent.com/Vaene/Cuny-607-Assignments/main/assignment7/books.json"
# simplifyMatrix = FALSE keeps each book's author array as its own vector.
# With the default, author arrays that all have the same length would be
# merged into a matrix and the per-book collapse below would no longer
# produce one value per row.
json_data <- fromJSON(json_url, simplifyMatrix = FALSE)
# Extract books array
json_df <- json_data$books
# Handle authors array - collapse to a single string per book.
# vapply() guarantees one character value per book; sapply() would silently
# return a different type on empty or irregular input.
json_df$Authors <- vapply(
  json_df$authors,
  paste,
  character(1),
  collapse = ", ",
  USE.NAMES = FALSE
)
# Select and reorder columns
json_df <- json_df[, c("title", "Authors", "year", "pages", "award")]
# Rename columns to match other data frames
colnames(json_df) <- c("Title", "Authors", "Year", "Pages", "Award")
print("JSON Data Frame:")
## [1] "JSON Data Frame:"
print(json_df)
## Title
## 1 Gödel, Escher, Bach: An Eternal Golden Braid
## 2 The Mind's I
## 3 The Black Swan
## Authors Year Pages Award
## 1 Douglas Hofstadter 1979 777 Pulitzer Prize
## 2 Douglas Hofstadter, Daniel Dennett 1981 501 None
## 3 Nassim Nicholas Taleb 2007 366 None
str(json_df)
## 'data.frame': 3 obs. of 5 variables:
## $ Title : chr "Gödel, Escher, Bach: An Eternal Golden Braid" "The Mind's I" "The Black Swan"
## $ Authors: chr "Douglas Hofstadter" "Douglas Hofstadter, Daniel Dennett" "Nassim Nicholas Taleb"
## $ Year : int 1979 1981 2007
## $ Pages : int 777 501 366
## $ Award : chr "Pulitzer Prize" "None" "None"
# Test for identical data frames
# Strict check: identical() prints a single TRUE or FALSE for each pair of
# data frames. Each cat() call only prints the question being answered.
cat("Are HTML and XML data frames identical?\n")
## Are HTML and XML data frames identical?
print(identical(html_df, xml_df))
## [1] TRUE
cat("\nAre HTML and JSON data frames identical?\n")
##
## Are HTML and JSON data frames identical?
print(identical(html_df, json_df))
## [1] FALSE
cat("\nAre XML and JSON data frames identical?\n")
##
## Are XML and JSON data frames identical?
print(identical(xml_df, json_df))
## [1] FALSE
# More flexible comparison
# all.equal() prints TRUE when it finds no difference; otherwise it prints a
# character description of what differs, as the output under each call shows.
cat("\n--- More Flexible Comparison with all.equal() ---\n")
##
## --- More Flexible Comparison with all.equal() ---
cat("\nHTML vs XML:\n")
##
## HTML vs XML:
print(all.equal(html_df, xml_df))
## [1] TRUE
cat("\nHTML vs JSON:\n")
##
## HTML vs JSON:
print(all.equal(html_df, json_df))
## [1] "Component \"Title\": 1 string mismatch"
cat("\nXML vs JSON:\n")
##
## XML vs JSON:
print(all.equal(xml_df, json_df))
## [1] "Component \"Title\": 1 string mismatch"
# Display each data frame as a captioned table using kable() from knitr.
library(knitr)
# NOTE(review): the "###" heading text is echoed as console output (see the
# "##"-prefixed lines below) rather than rendered as a heading; presumably the
# chunk is not using results = "asis" - confirm in the .Rmd source.
cat("\n### HTML Data Frame\n")
##
## ### HTML Data Frame
# Table built from the HTML-derived data frame.
kable(html_df, caption = "Data loaded from HTML")
Title | Authors | Year | Pages | Award |
---|---|---|---|---|
Gödel, Escher, Bach: An Eternal Golden Braid | Douglas Hofstadter | 1979 | 777 | Pulitzer Prize |
The Mind’s I | Douglas Hofstadter, Daniel Dennett | 1981 | 501 | None |
The Black Swan | Nassim Nicholas Taleb | 2007 | 366 | None |
cat("\n### XML Data Frame\n")
##
## ### XML Data Frame
# Table built from the XML-derived data frame.
kable(xml_df, caption = "Data loaded from XML")
Title | Authors | Year | Pages | Award |
---|---|---|---|---|
Gödel, Escher, Bach: An Eternal Golden Braid | Douglas Hofstadter | 1979 | 777 | Pulitzer Prize |
The Mind’s I | Douglas Hofstadter, Daniel Dennett | 1981 | 501 | None |
The Black Swan | Nassim Nicholas Taleb | 2007 | 366 | None |
cat("\n### JSON Data Frame\n")
##
## ### JSON Data Frame
# Table built from the JSON-derived data frame.
kable(json_df, caption = "Data loaded from JSON")
Title | Authors | Year | Pages | Award |
---|---|---|---|---|
Gödel, Escher, Bach: An Eternal Golden Braid | Douglas Hofstadter | 1979 | 777 | Pulitzer Prize |
The Mind’s I | Douglas Hofstadter, Daniel Dennett | 1981 | 501 | None |
The Black Swan | Nassim Nicholas Taleb | 2007 | 366 | None |
Are the three data frames identical?
# Final verdict on whether the three data frames are identical.
#
# identical() decides the YES/NO answer. When the answer is NO, the actual
# all.equal() result for every pair is printed, so the explanation always
# reflects the data instead of asserting that the frames are equivalent.
if (identical(html_df, xml_df) && identical(html_df, json_df)) {
  cat("YES - All three data frames are completely identical.\n")
} else {
  cat("NO - The three data frames are NOT identical.\n\n")
  cat("Differences reported by all.equal():\n")
  # local() keeps the helper variables out of the global workspace.
  local({
    results <- list(
      "HTML vs XML" = all.equal(html_df, xml_df),
      "HTML vs JSON" = all.equal(html_df, json_df),
      "XML vs JSON" = all.equal(xml_df, json_df)
    )
    # all.equal() returns TRUE or a character vector describing differences;
    # reduce either form to one line of text per pair.
    summaries <- vapply(
      results,
      function(res) {
        if (isTRUE(res)) "no differences" else paste(res, collapse = "; ")
      },
      character(1)
    )
    cat(paste0("- ", names(summaries), ": ", summaries, "\n"), sep = "")
  })
}
# The output below is the expected output of the revised code, derived from
# the identical() and all.equal() results shown earlier in this report.
## NO - The three data frames are NOT identical.
##
## Differences reported by all.equal():
## - HTML vs XML: no differences
## - HTML vs JSON: Component "Title": 1 string mismatch
## - XML vs JSON: Component "Title": 1 string mismatch
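The three data frames are not all identical. The HTML and XML data frames are strictly identical (identical() returns TRUE), but the JSON data frame differs from both: all.equal() reports one string mismatch in the Title column and no difference in the Authors, Year, Pages, or Award columns. The discrepancy is therefore in the content of a single title string read from the JSON source, not merely in how R stores the data internally. The output above does not show which title differs or why; comparing the Title columns element by element (for example, html_df$Title == json_df$Title) would locate it.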