Introduction

The three books I chose are about anthropology, a subject I am really interested in. The book “Anthropology: The Human Challenge” has more than one author, and I added attributes such as year, pages, and genre.

# Load the book table from the HTML file
# html_table() gives back a list of tables, so take the first one and turn it
# into a plain data.frame; without that conversion identical() returns FALSE
# when this object is compared with the others further down
html <- "https://raw.githubusercontent.com/vincent-usny/607-week-7/refs/heads/main/books1.html" %>%
  read_html() %>%
  html_table(header = TRUE) %>%
  .[[1]] %>%
  as.data.frame()
show(html)
##                                               title
## 1 Cultural Anthropology: A Toolkit for a Global Age
## 2                    The Interpretation of Cultures
## 3                 Anthropology: The Human Challenge
##                                                                authors year
## 1                                                     Kenneth J. Guest 2023
## 2                                                      Clifford Geertz 1973
## 3 William A. Haviland, Harald E. L. Prins, Dana Walrath, Bunny McBride 2021
##   pages                 genre
## 1   640 Cultural Anthropology
## 2   470       Cultural Theory
## 3   528  General Anthropology
# Load the book records from the JSON file
json <- "https://raw.githubusercontent.com/vincent-usny/607-week-7/refs/heads/main/books.json" %>%
  fromJSON()
show(json)
##                                               title
## 1 Cultural Anthropology: A Toolkit for a Global Age
## 2                    The Interpretation of Cultures
## 3                 Anthropology: The Human Challenge
##                                                                authors year
## 1                                                     Kenneth J. Guest 2023
## 2                                                      Clifford Geertz 1973
## 3 William A. Haviland, Harald E. L. Prins, Dana Walrath, Bunny McBride 2021
##   pages                 genre
## 1   640 Cultural Anthropology
## 2   470       Cultural Theory
## 3   528  General Anthropology
# Read xml file
# Parse the document and select every <book> node. The intermediate objects
# get their own names so that `xml` only ever holds the final data frame
# instead of being rebound from document to node set to table.
xml_doc <- read_xml("https://raw.githubusercontent.com/vincent-usny/607-week-7/refs/heads/main/books_xml.xml")
book_nodes <- xml_find_all(xml_doc, "//book")

# Text of the first child element named `tag` for each node in `nodes`.
# xml_find_first() returns one result per input node and xml_text() is
# vectorised, so a whole column is extracted in a single call.
child_text <- function(nodes, tag) {
  xml_text(xml_find_first(nodes, tag))
}

# Build the table column by column rather than creating one single-row data
# frame per book and binding them. Column names, order and types are the same
# as before (year and pages are integers); an empty node set now yields a
# zero-row data frame that still has all five columns.
# NOTE(review): the result is expected to stay identical() to the JSON and
# HTML data frames - re-run the comparison below to confirm.
xml <- data.frame(
  title = child_text(book_nodes, "title"),
  authors = child_text(book_nodes, "authors"),
  year = as.integer(child_text(book_nodes, "year")),
  pages = as.integer(child_text(book_nodes, "pages")),
  genre = child_text(book_nodes, "genre")
)
show(xml)
##                                               title
## 1 Cultural Anthropology: A Toolkit for a Global Age
## 2                    The Interpretation of Cultures
## 3                 Anthropology: The Human Challenge
##                                                                authors year
## 1                                                     Kenneth J. Guest 2023
## 2                                                      Clifford Geertz 1973
## 3 William A. Haviland, Harald E. L. Prins, Dana Walrath, Bunny McBride 2021
##   pages                 genre
## 1   640 Cultural Anthropology
## 2   470       Cultural Theory
## 3   528  General Anthropology
# Compare the data frames built from the three file formats
# identical() is an exact comparison, so TRUE means the two objects match exactly
identical(html, json)
## [1] TRUE
identical(json, xml)
## [1] TRUE
# Both comparisons return TRUE, so all three objects are identical
# class() confirms that each one is a plain data.frame
class(html)
## [1] "data.frame"
class(xml)
## [1] "data.frame"
class(json)
## [1] "data.frame"