R Markdown

#install.packages("rvest")
#install.packages("xml2")
#install.packages("jsonlite")
library(rvest)
## Warning: package 'rvest' was built under R version 4.3.3
library(xml2)
## Warning: package 'xml2' was built under R version 4.3.3
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

html

# Location of the HTML copy of the book list.
html_file_path <- "./data/books/books.html"
books_html_data <- read_html(html_file_path)

# html_table() returns one table per <table> element in the document.
# The `fill` argument is deprecated since rvest 1.0.0 (short rows are always
# padded), so it is no longer passed.
html_tables <- html_table(books_html_data, header = TRUE)
if (length(html_tables) == 0) {
  stop("No <table> element found in ", html_file_path, call. = FALSE)
}

# Only the first table on the page is used.
books_df_html <- html_tables[[1]]
print(books_df_html)
## # A tibble: 3 × 6
##   Title                                   Authors  Year Genre `Page Count` ISBN 
##   <chr>                                   <chr>   <int> <chr>        <int> <chr>
## 1 The Joy of X: A Guided Tour of Math, f… Steven…  2023 Educ…          336 978-…
## 2 The Calculus Lifesaver: All the Tools … Adrian…  2023 Acad…          752 978-…
## 3 Sacred Mathematics: Japanese Temple Ge… Fukaga…  2023 Hist…          392 978-…

xml

# Location of the XML copy of the book list.
xml_file_path <- "./data/books/books.xml"

# Parse the file into an xml2 document.
books_xml_data <- xml_file_path |>
  read_xml()

# Show the parsed document.
print(books_xml_data)
## {xml_document}
## <books>
## [1] <book>\n  <title>The Joy of X: A Guided Tour of Math, from One to Infinit ...
## [2] <book>\n  <title>The Calculus Lifesaver: All the Tools You Need to Excel  ...
## [3] <book>\n  <title>Sacred Mathematics: Japanese Temple Geometry</title>\n   ...
# Collect every <book> element in the parsed document.
book_nodes <- xml_find_all(books_xml_data, ".//book")

# Return the text of the first `field` child of every book, as a character
# vector with one entry per book. xml_find_first() is vectorised over a
# nodeset and yields a missing node (text NA) when a book lacks the field,
# so all columns keep the same length and row order.
book_field <- function(field) {
  xml_text(xml_find_first(book_nodes, field))
}

# Build all columns in one step instead of growing the data frame with
# rbind() inside a loop. Every column stays character, as xml_text() returns.
books_df_xml <- data.frame(
  title = book_field("title"),
  authors = book_field("authors"),
  year = book_field("year"),
  genre = book_field("genre"),
  pageCount = book_field("pageCount"),
  isbn = book_field("isbn"),
  stringsAsFactors = FALSE
)

print(books_df_xml)
##                                                                 title
## 1           The Joy of X: A Guided Tour of Math, from One to Infinity
## 2 The Calculus Lifesaver: All the Tools You Need to Excel at Calculus
## 3                        Sacred Mathematics: Japanese Temple Geometry
##                            authors year               genre pageCount
## 1                  Steven Strogatz 2023           Education       336
## 2                    Adrian Banner 2023            Academic       752
## 3 Fukagawa Hidetoshi, Tony Rothman 2023 History/Mathematics       392
##             isbn
## 1 978-0544105850
## 2 978-0691130880
## 3 978-0691127453

json

# Location of the JSON copy of the book list.
json_file_path <- "./data/books/books.json"

# Parse the file; flatten = TRUE asks jsonlite to flatten any nested data
# frames into a single non-nested data frame.
books_df_json <- json_file_path |>
  fromJSON(flatten = TRUE)

print(books_df_json)
##                                                                 title
## 1           The Joy of X: A Guided Tour of Math, from One to Infinity
## 2 The Calculus Lifesaver: All the Tools You Need to Excel at Calculus
## 3                        Sacred Mathematics: Japanese Temple Geometry
##                            authors year               genre pageCount
## 1                  Steven Strogatz 2023           Education       336
## 2                    Adrian Banner 2023            Academic       752
## 3 Fukagawa Hidetoshi, Tony Rothman 2023 History/Mathematics       392
##             isbn
## 1 978-0544105850
## 2 978-0691130880
## 3 978-0691127453

The three data frames contain the same data. I first had to tweak the loaded XML data in order to get the results I wanted.