#install.packages("rvest")
#install.packages("xml2")
#install.packages("jsonlite")
library(rvest)
## Warning: package 'rvest' was built under R version 4.3.3
library(xml2)
## Warning: package 'xml2' was built under R version 4.3.3
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# HTML ----
# Read the HTML file and turn its first <table> into a data frame.
html_file_path <- "./data/books/books.html"
books_html_data <- read_html(html_file_path)
# html_table() on a document returns one table per <table> element.
# The `fill` argument is deprecated in rvest >= 1.0.0 (short rows are always
# padded with NA), so it is no longer passed.
books_html_tables <- html_table(books_html_data, header = TRUE)
# Fail with a readable message instead of "subscript out of bounds" when the
# page contains no table at all.
if (length(books_html_tables) == 0) {
  stop("No <table> element found in ", html_file_path, call. = FALSE)
}
books_df_html <- books_html_tables[[1]]
print(books_df_html)
## # A tibble: 3 × 6
## Title Authors Year Genre `Page Count` ISBN
## <chr> <chr> <int> <chr> <int> <chr>
## 1 The Joy of X: A Guided Tour of Math, f… Steven… 2023 Educ… 336 978-…
## 2 The Calculus Lifesaver: All the Tools … Adrian… 2023 Acad… 752 978-…
## 3 Sacred Mathematics: Japanese Temple Ge… Fukaga… 2023 Hist… 392 978-…
# XML ----
# Parse the XML book list from disk and show the parsed document.
xml_file_path <- "./data/books/books.xml"
books_xml_data <- xml2::read_xml(x = xml_file_path)
print(books_xml_data)
## {xml_document}
## <books>
## [1] <book>\n <title>The Joy of X: A Guided Tour of Math, from One to Infinit ...
## [2] <book>\n <title>The Calculus Lifesaver: All the Tools You Need to Excel ...
## [3] <book>\n <title>Sacred Mathematics: Japanese Temple Geometry</title>\n ...
# Convert every <book> element of the parsed XML into one data-frame row.
#
# xml_find_first() and xml_text() are vectorised over a node set, so each
# column is extracted for all books in a single call. This replaces growing
# the data frame with rbind() inside a loop, which copies the whole frame on
# every iteration.
book_nodes <- xml_find_all(books_xml_data, ".//book")

# Text of the first child element called `name` for each book; a book that
# lacks that child yields NA. An empty node set yields character(0), so the
# result below is then a zero-row data frame with the same six columns.
book_field <- function(name) {
  xml_text(xml_find_first(book_nodes, name))
}

# All columns are kept as character, exactly as read from the XML text.
books_df_xml <- data.frame(
  title = book_field("title"),
  authors = book_field("authors"),
  year = book_field("year"),
  genre = book_field("genre"),
  pageCount = book_field("pageCount"),
  isbn = book_field("isbn"),
  stringsAsFactors = FALSE
)
print(books_df_xml)
## title
## 1 The Joy of X: A Guided Tour of Math, from One to Infinity
## 2 The Calculus Lifesaver: All the Tools You Need to Excel at Calculus
## 3 Sacred Mathematics: Japanese Temple Geometry
## authors year genre pageCount
## 1 Steven Strogatz 2023 Education 336
## 2 Adrian Banner 2023 Academic 752
## 3 Fukagawa Hidetoshi, Tony Rothman 2023 History/Mathematics 392
## isbn
## 1 978-0544105850
## 2 978-0691130880
## 3 978-0691127453
# JSON ----
# Load the JSON book list into a data frame and display it.
# flatten = TRUE asks jsonlite to flatten nested data frames into plain columns.
json_file_path <- "./data/books/books.json"
books_df_json <- jsonlite::fromJSON(txt = json_file_path, flatten = TRUE)
print(books_df_json)
## title
## 1 The Joy of X: A Guided Tour of Math, from One to Infinity
## 2 The Calculus Lifesaver: All the Tools You Need to Excel at Calculus
## 3 Sacred Mathematics: Japanese Temple Geometry
## authors year genre pageCount
## 1 Steven Strogatz 2023 Education 336
## 2 Adrian Banner 2023 Academic 752
## 3 Fukagawa Hidetoshi, Tony Rothman 2023 History/Mathematics 392
## isbn
## 1 978-0544105850
## 2 978-0691130880
## 3 978-0691127453
# The three data frames are identical. I first had to tweak the loaded XML data in order to get the results I wanted.