Space Books Analysis

Objective

Load book data from three different file formats: HTML, XML, and JSON.

Loading - HTML

# Make sure rvest is available before it is attached below. require() returns
# FALSE instead of erroring when the package is missing, which is what drives
# the install-on-demand branch.
if (!require("rvest")) {
  # Rely on install.packages()' default dependency handling so the packages
  # rvest imports are installed as well; with dependencies = FALSE a fresh
  # library can end up with an rvest that fails to load.
  install.packages("rvest", repos = "https://cran.rstudio.com/")
}

## Loading required package: rvest

library(rvest)

# Parse the local HTML file.
html_data <- read_html("books.html")

# Convert every <table> element into a data frame. html_elements() is the
# current name for html_nodes(), and html_table() no longer needs `fill`
# (the argument is deprecated since rvest 1.0.0 because filling is automatic).
html_tables <- html_data %>%
  html_elements("table") %>%
  html_table()

# Fail with a clear message instead of "subscript out of bounds" when the
# page contains no table at all.
if (length(html_tables) == 0) {
  stop("No <table> element found in books.html", call. = FALSE)
}

# The book list is the first table on the page.
html_df <- html_tables[[1]]

print(html_df)

## # A tibble: 3 × 5
##   Title                                        Authors         Year Pages Rating
##   <chr>                                        <chr>          <int> <int>  <dbl>
## 1 Cosmos                                       Carl Sagan      1980   384    4.8
## 2 Astrophysics for People in a Hurry           Neil deGrasse…  2017   224    4.5
## 3 Space Atlas: Mapping the Universe and Beyond James Trefil,…  2019   352    4.7

Loading - XML

# Make sure xml2 is available before it is attached below. require() returns
# FALSE instead of erroring when the package is missing, which is what drives
# the install-on-demand branch.
if (!require("xml2")) {
  # Rely on install.packages()' default dependency handling so the packages
  # xml2 imports are installed as well; dependencies = FALSE would skip them.
  install.packages("xml2", repos = "https://cran.rstudio.com/")
}

## Loading required package: xml2

library(xml2)

# Parse the local XML file and collect every <book> element.
xml_data <- read_xml("books.xml")

book_nodes <- xml_find_all(xml_data, "//book")

# Text of the first descendant <tag> of each book, as a character vector with
# one entry per book. xml_find_first() applied to a nodeset returns a result
# of the same length (a missing node where nothing matches), and xml_text()
# turns a missing node into NA, so no explicit loop or vector growing is
# needed.
book_field <- function(tag) {
  xml_text(xml_find_first(book_nodes, paste0(".//", tag)))
}

titles <- book_field("title")
authors_list <- book_field("authors")
# Year, Pages and Rating are stored as text in the XML; convert to numeric.
years <- as.numeric(book_field("year"))
pages <- as.numeric(book_field("pages"))
ratings <- as.numeric(book_field("rating"))

# Assemble the columns under the same headers the HTML table uses.
xml_df <- data.frame(
  Title = titles,
  Authors = authors_list,
  Year = years,
  Pages = pages,
  Rating = ratings
)

print(xml_df)

##                                          Title
## 1                                       Cosmos
## 2           Astrophysics for People in a Hurry
## 3 Space Atlas: Mapping the Universe and Beyond
##                             Authors Year Pages Rating
## 1                        Carl Sagan 1980   384    4.8
## 2               Neil deGrasse Tyson 2017   224    4.5
## 3 James Trefil, National Geographic 2019   352    4.7

Loading - JSON

# Make sure jsonlite is available before it is attached below. require()
# returns FALSE instead of erroring when the package is missing, which is what
# drives the install-on-demand branch.
if (!require("jsonlite")) {
  # Use install.packages()' default dependency handling, consistent with the
  # rvest and xml2 guards, rather than suppressing dependencies.
  install.packages("jsonlite", repos = "https://cran.rstudio.com/")
}

## Loading required package: jsonlite

library(jsonlite)

# Read the JSON document from disk.
json_data <- jsonlite::fromJSON(txt = "books.json")

# The book records are stored under the top-level "books" entry.
json_df <- json_data$books

print(x = json_df)

##                                          title
## 1                                       Cosmos
## 2           Astrophysics for People in a Hurry
## 3 Space Atlas: Mapping the Universe and Beyond
##                             authors year pages rating
## 1                        Carl Sagan 1980   384    4.8
## 2               Neil deGrasse Tyson 2017   224    4.5
## 3 James Trefil, National Geographic 2019   352    4.7

Analysis

# Report the column headers each loader produced, one source per line.
cat("Column names comparison:\n")

## Column names comparison:

cat("HTML columns: ", toString(names(html_df)), "\n")

## HTML columns:  Title, Authors, Year, Pages, Rating

cat("XML columns: ", toString(names(xml_df)), "\n")

## XML columns:  Title, Authors, Year, Pages, Rating

# The second newline leaves a blank line before the next part of the report.
cat("JSON columns: ", toString(names(json_df)), "\n\n")

## JSON columns:  title, authors, year, pages, rating

# Report the size of each loaded table as "rows x columns".
cat("Dimensions comparison:\n")

## Dimensions comparison:

cat(
  "HTML dimensions: ",
  paste(nrow(html_df), ncol(html_df), sep = " x "),
  "\n"
)

## HTML dimensions:  3 x 5

cat(
  "XML dimensions: ",
  paste(nrow(xml_df), ncol(xml_df), sep = " x "),
  "\n"
)

## XML dimensions:  3 x 5

# The second newline leaves a blank line before the next part of the report.
cat(
  "JSON dimensions: ",
  paste(nrow(json_df), ncol(json_df), sep = " x "),
  "\n\n"
)

## JSON dimensions:  3 x 5

# Report the class of every column, starting with the HTML table.
cat("Data types comparison:\n")

## Data types comparison:

cat("HTML data types:\n")

## HTML data types:

# vapply() always yields a named character vector. sapply(df, class) would
# silently return a list if any column carried more than one class, so
# multi-class columns are collapsed into a single "a/b" label instead.
print(vapply(
  html_df,
  function(column) paste(class(column), collapse = "/"),
  character(1)
))

##       Title     Authors        Year       Pages      Rating 
## "character" "character"   "integer"   "integer"   "numeric"

cat("XML data types:\n")

## XML data types:

# vapply() always yields a named character vector. sapply(df, class) would
# silently return a list if any column carried more than one class, so
# multi-class columns are collapsed into a single "a/b" label instead.
print(vapply(
  xml_df,
  function(column) paste(class(column), collapse = "/"),
  character(1)
))

##       Title     Authors        Year       Pages      Rating 
## "character" "character"   "numeric"   "numeric"   "numeric"

cat("JSON data types:\n")

## JSON data types:

# vapply() always yields a named character vector. sapply(df, class) would
# silently return a list if any column carried more than one class, so
# multi-class columns are collapsed into a single "a/b" label instead.
print(vapply(
  json_df,
  function(column) paste(class(column), collapse = "/"),
  character(1)
))

##       title     authors        year       pages      rating 
## "character" "character"   "integer"   "integer"   "numeric"

# Coerce every column to character so that storage-type differences (integer
# vs. double for Year and Pages) cannot make otherwise equal tables differ.
as_character_frame <- function(df) {
  as.data.frame(lapply(df, as.character), stringsAsFactors = FALSE)
}

data_html <- as_character_frame(html_df)
data_xml <- as_character_frame(xml_df)
data_json <- as_character_frame(json_df)

cat("\nAre the data frames identical in content?\n")

## 
## Are the data frames identical in content?

# identical() also compares column names, and the JSON loader yields
# lower-case headers (title, authors, ...) while the HTML and XML loaders
# yield capitalised ones. Fold the header case before comparing so the answer
# reflects the cell contents, which is what the question above asks about.
same_content <- function(a, b) {
  names(a) <- tolower(names(a))
  names(b) <- tolower(names(b))
  identical(a, b)
}

identical_html_xml <- same_content(data_html, data_xml)
identical_html_json <- same_content(data_html, data_json)
identical_xml_json <- same_content(data_xml, data_json)

cat("HTML and XML identical: ", identical_html_xml, "\n")

## HTML and XML identical:  TRUE

cat("HTML and JSON identical: ", identical_html_json, "\n")

## HTML and JSON identical:  TRUE

cat("XML and JSON identical: ", identical_xml_json, "\n")

## XML and JSON identical:  TRUE

Space Books Analysis

Stefan Huber

March 16, 2025

Objective

Loading - HTML

Loading - XML

Loading - JSON

Analysis