The three books I chose are about anthropology, a subject I am really interested in. The book “Anthropology: The Human Challenge” has more than one author, and I added attributes such as year, pages, and genre.
# Read the HTML file: fetch the page, parse its tables, and keep the first one
html <- read_html(
  "https://raw.githubusercontent.com/vincent-usny/607-week-7/refs/heads/main/books1.html"
)
html <- html_table(html, header = TRUE)[[1]]
# Convert to a plain data.frame; without this step identical() against the
# other data frames returns FALSE
html <- as.data.frame(html)
show(html)
## title
## 1 Cultural Anthropology: A Toolkit for a Global Age
## 2 The Interpretation of Cultures
## 3 Anthropology: The Human Challenge
## authors year
## 1 Kenneth J. Guest 2023
## 2 Clifford Geertz 1973
## 3 William A. Haviland, Harald E. L. Prins, Dana Walrath, Bunny McBride 2021
## pages genre
## 1 640 Cultural Anthropology
## 2 470 Cultural Theory
## 3 528 General Anthropology
# Read the JSON file straight from its URL into `json`
json <- fromJSON(
  txt = "https://raw.githubusercontent.com/vincent-usny/607-week-7/refs/heads/main/books.json"
)
show(json)
## title
## 1 Cultural Anthropology: A Toolkit for a Global Age
## 2 The Interpretation of Cultures
## 3 Anthropology: The Human Challenge
## authors year
## 1 Kenneth J. Guest 2023
## 2 Clifford Geertz 1973
## 3 William A. Haviland, Harald E. L. Prins, Dana Walrath, Bunny McBride 2021
## pages genre
## 1 640 Cultural Anthropology
## 2 470 Cultural Theory
## 3 528 General Anthropology
# Read the XML file, select every <book> node, and build one data-frame row
# per book; the per-book rows are combined by map_df()
xml <- read_xml("https://raw.githubusercontent.com/vincent-usny/607-week-7/refs/heads/main/books_xml.xml") %>%
  xml_find_all("//book") %>%
  map_df(function(book) {
    # Text of the first child element of `book` matching `tag`
    field <- function(tag) xml_text(xml_find_first(book, tag))
    data.frame(
      title = field("title"),
      authors = field("authors"),
      # year and pages are stored as text in the XML; convert them to integer
      year = as.integer(field("year")),
      pages = as.integer(field("pages")),
      genre = field("genre")
    )
  })
show(xml)
## title
## 1 Cultural Anthropology: A Toolkit for a Global Age
## 2 The Interpretation of Cultures
## 3 Anthropology: The Human Challenge
## authors year
## 1 Kenneth J. Guest 2023
## 2 Clifford Geertz 1973
## 3 William A. Haviland, Harald E. L. Prins, Dana Walrath, Bunny McBride 2021
## pages genre
## 1 640 Cultural Anthropology
## 2 470 Cultural Theory
## 3 528 General Anthropology
# Compare the data frames built from the three file types for exact equality
identical(html, json)
## [1] TRUE
identical(json, xml)
## [1] TRUE
# Both comparisons now return TRUE; check the class of each of the three objects
class(html)
## [1] "data.frame"
class(xml)
## [1] "data.frame"
class(json)
## [1] "data.frame"