Load book data from three different file formats: HTML, XML, and JSON.
# Install rvest only when it is missing, then attach it.
# requireNamespace() checks for the package without attaching it, and leaving
# `dependencies` at its default lets install.packages() also fetch the
# packages rvest itself imports, so library() can succeed on a fresh machine.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest", repos = "https://cran.rstudio.com/")
}
library(rvest)
# Parse books.html and turn its first <table> element into a data frame.
html_data <- read_html("books.html")
html_tables <- html_elements(html_data, "table")
if (length(html_tables) == 0) {
  stop("No <table> element found in books.html", call. = FALSE)
}
# Only the first table is converted. Current rvest fills ragged rows on its
# own, so the deprecated `fill` argument is not passed.
html_df <- html_table(html_tables[[1]])
print(html_df)
## # A tibble: 3 × 5
## Title Authors Year Pages Rating
## <chr> <chr> <int> <int> <dbl>
## 1 Cosmos Carl Sagan 1980 384 4.8
## 2 Astrophysics for People in a Hurry Neil deGrasse… 2017 224 4.5
## 3 Space Atlas: Mapping the Universe and Beyond James Trefil,… 2019 352 4.7
# Install xml2 only when it is missing, then attach it.
# requireNamespace() checks for the package without attaching it, and leaving
# `dependencies` at its default lets install.packages() also fetch the
# packages xml2 itself imports, so library() can succeed on a fresh machine.
if (!requireNamespace("xml2", quietly = TRUE)) {
  install.packages("xml2", repos = "https://cran.rstudio.com/")
}
library(xml2)
# Read books.xml and build a data frame with one row per <book> element.
xml_data <- read_xml("books.xml")
book_nodes <- xml_find_all(xml_data, "//book")

# Text of the first descendant <tag> of every node in `nodes`.
# xml_find_first() is vectorised over a node set and returns one result per
# input node (a missing node, whose text is NA, when the tag is absent), so
# each column is extracted in one call instead of growing vectors in a loop.
book_field <- function(nodes, tag) {
  xml_text(xml_find_first(nodes, paste0(".//", tag)))
}

titles <- book_field(book_nodes, "title")
authors_list <- book_field(book_nodes, "authors")
years <- as.numeric(book_field(book_nodes, "year"))
pages <- as.numeric(book_field(book_nodes, "pages"))
ratings <- as.numeric(book_field(book_nodes, "rating"))

xml_df <- data.frame(
  Title = titles,
  Authors = authors_list,
  Year = years,
  Pages = pages,
  Rating = ratings
)
print(xml_df)
## Title
## 1 Cosmos
## 2 Astrophysics for People in a Hurry
## 3 Space Atlas: Mapping the Universe and Beyond
## Authors Year Pages Rating
## 1 Carl Sagan 1980 384 4.8
## 2 Neil deGrasse Tyson 2017 224 4.5
## 3 James Trefil, National Geographic 2019 352 4.7
# Install jsonlite only when it is missing, then attach it.
# requireNamespace() checks for the package without attaching it, and leaving
# `dependencies` at its default lets install.packages() also fetch anything
# jsonlite itself imports, so library() can succeed on a fresh machine.
if (!requireNamespace("jsonlite", quietly = TRUE)) {
  install.packages("jsonlite", repos = "https://cran.rstudio.com/")
}
library(jsonlite)
# Parse books.json; the book records are read from its top-level "books" key.
json_data <- fromJSON("books.json")
# `[[` matches the key exactly, whereas `$` would also accept a partial match.
json_df <- json_data[["books"]]
# Fail here rather than letting a missing or malformed "books" entry flow
# into the comparisons further down.
if (!is.data.frame(json_df)) {
  stop("books.json does not contain a tabular \"books\" entry", call. = FALSE)
}
print(json_df)
## title
## 1 Cosmos
## 2 Astrophysics for People in a Hurry
## 3 Space Atlas: Mapping the Universe and Beyond
## authors year pages rating
## 1 Carl Sagan 1980 384 4.8
## 2 Neil deGrasse Tyson 2017 224 4.5
## 3 James Trefil, National Geographic 2019 352 4.7
# Column names of each source, joined into one comma-separated string.
cat("Column names comparison:\n")
## Column names comparison:
html_columns <- toString(colnames(html_df))
cat("HTML columns: ", html_columns, "\n")
## HTML columns: Title, Authors, Year, Pages, Rating
xml_columns <- toString(colnames(xml_df))
cat("XML columns: ", xml_columns, "\n")
## XML columns: Title, Authors, Year, Pages, Rating
json_columns <- toString(colnames(json_df))
cat("JSON columns: ", json_columns, "\n")
## JSON columns: title, authors, year, pages, rating

# Shape of each source, formatted as "<rows> x <columns>".
cat("Dimensions comparison:\n")
## Dimensions comparison:
html_shape <- paste(dim(html_df), collapse = " x ")
cat("HTML dimensions: ", html_shape, "\n")
## HTML dimensions: 3 x 5
xml_shape <- paste(dim(xml_df), collapse = " x ")
cat("XML dimensions: ", xml_shape, "\n")
## XML dimensions: 3 x 5
json_shape <- paste(dim(json_df), collapse = " x ")
cat("JSON dimensions: ", json_shape, "\n")
## JSON dimensions: 3 x 5
# Class of every column of `df`, as a named character vector.
# vapply() always returns a character vector; sapply() would silently switch
# to a list if any column carried more than one class, so such columns are
# collapsed into a single "a/b" label instead.
column_classes <- function(df) {
  vapply(df, function(col) paste(class(col), collapse = "/"), character(1))
}

cat("Data types comparison:\n")
## Data types comparison:
cat("HTML data types:\n")
## HTML data types:
print(column_classes(html_df))
## Title Authors Year Pages Rating
## "character" "character" "integer" "integer" "numeric"
cat("XML data types:\n")
## XML data types:
print(column_classes(xml_df))
## Title Authors Year Pages Rating
## "character" "character" "numeric" "numeric" "numeric"
cat("JSON data types:\n")
## JSON data types:
print(column_classes(json_df))
## title authors year pages rating
## "character" "character" "integer" "integer" "numeric"
# Rebuild a data frame with every column converted to character, so that
# integer-versus-double storage differences between the sources do not
# affect the comparison.
as_text_frame <- function(df) {
  text_columns <- lapply(df, as.character)
  as.data.frame(text_columns, stringsAsFactors = FALSE)
}
data_html <- as_text_frame(html_df)
data_xml <- as_text_frame(xml_df)
data_json <- as_text_frame(json_df)

# Pairwise exact comparison. identical() also compares column names, so a
# difference in name case between two sources is enough to yield FALSE.
identical_html_xml <- identical(data_html, data_xml)
identical_html_json <- identical(data_html, data_json)
identical_xml_json <- identical(data_xml, data_json)

cat("\nAre the data frames identical in content?\n")
##
## Are the data frames identical in content?
cat("HTML and XML identical: ", identical_html_xml, "\n")
## HTML and XML identical: TRUE
cat("HTML and JSON identical: ", identical_html_json, "\n")
## HTML and JSON identical: FALSE
cat("XML and JSON identical: ", identical_xml_json, "\n")
## XML and JSON identical: FALSE