# Libraries for loading the html file and extracting the table
library(rvest)
library(xml2)
# Location of the saved HTML page that holds the book table
file_path <- "/Users/candace/Desktop/CUNY_SEM1_Masters/books.html"
# Echo the chosen path so it shows up in the rendered output
cat(paste("Selected file:", file_path, "\n"))
## Selected file: /Users/candace/Desktop/CUNY_SEM1_Masters/books.html
# Parse the page into an HTML document
html_file <- read_html(file_path)
# Convert every <table> in the document, then keep only the first one
html_tables <- html_table(html_file)
html_data <- html_tables[[1]]
print(html_data)
## # A tibble: 3 × 6
## id title authors copyright page_count publisher
## <int> <chr> <chr> <int> <int> <chr>
## 1 1 Key Person of Influence Daniel Priestly … 2014 201 Rethink …
## 2 2 Million Dolllar Weekend Noah Kagan and T… 2024 218 Penguin …
## 3 3 The 12 Week Work Year Brian P. Morgan … 2013 190 Wiley
library(xml2)
library(xml2)
# Read the XML version of the book list and build one row per <book> element.
xml_book_table <- read_xml("books.xml")
# Select the <book> nodes once; every field is then looked up relative to its
# own book so that values from different books cannot be mixed up.
book_nodes <- xml_find_all(xml_book_table, "//book")
# Text of the child element `field` for each book. xml_find_first() returns
# exactly one result per book, and a missing child becomes NA instead of
# shortening the column (which would make data.frame() fail or misalign rows).
book_field <- function(field) {
  xml_text(xml_find_first(book_nodes, paste0("./", field)))
}
xml_table <- data.frame(
  # Numeric fields are converted so the column types match the tables built
  # from the HTML and JSON versions of the same data.
  id = as.integer(book_field("id")),
  title = book_field("title"),
  authors = book_field("authors"),
  copyright = as.integer(book_field("copyright")),
  page_count = as.integer(book_field("page_count")),
  publisher = book_field("publisher")
)
print(xml_table)
## id title authors copyright
## 1 1 Key Person of Influence Daniel Priestly and Mike Reid 2014
## 2 2 Million Dollar Weekend Noah Kagan and Tahl Raz 2024
## 3 3 The 12 Week Work Year Brian P. Morgan and Michael Lennington 2013
## page_count publisher
## 1 201 Rethink Press
## 2 218 Penguin Random House LLC
## 3 190 Wiley
# NOTE(review): this chunk repeats the XML import that appears earlier in this
# file. It is kept so the script still prints the table twice; consider
# removing one of the two copies.
xml_book_table <- read_xml("books.xml")
# One node per <book>; each field is extracted relative to its own book.
book_nodes <- xml_find_all(xml_book_table, "//book")
field_names <- c(
  "id", "title", "authors", "copyright", "page_count", "publisher"
)
# xml_find_first() gives one result per book, so a book that lacks a field
# contributes NA and all columns keep the same length.
columns <- lapply(field_names, function(field) {
  xml_text(xml_find_first(book_nodes, paste0("./", field)))
})
xml_table <- as.data.frame(setNames(columns, field_names))
# Store the numeric fields as integers rather than text, matching the types
# of the tables read from the HTML and JSON files.
integer_fields <- c("id", "copyright", "page_count")
xml_table[integer_fields] <- lapply(xml_table[integer_fields], as.integer)
print(xml_table)
## id title authors copyright
## 1 1 Key Person of Influence Daniel Priestly and Mike Reid 2014
## 2 2 Million Dollar Weekend Noah Kagan and Tahl Raz 2024
## 3 3 The 12 Week Work Year Brian P. Morgan and Michael Lennington 2013
## page_count publisher
## 1 201 Rethink Press
## 2 218 Penguin Random House LLC
## 3 190 Wiley
library(jsonlite)
library(jsonlite)
# Single definition of the JSON file location, used for both write and read
json_path <- "/Users/candace/Desktop/CUNY_SEM1_Masters/favorite_books.json"
# Build the book list by hand, one vector per column
books_data <- data.frame(
  id = c(1, 2, 3),
  title = c(
    "Key Person of Influence",
    "Million Dollar Weekend",
    "The 12 Week Work Year"
  ),
  authors = c(
    "Daniel Priestly and Mike Reid",
    "Noah Kagan and Tahl Raz",
    "Brian P. Morgan and Michael Lennington"
  ),
  copyright = c(2014, 2024, 2013),
  page_count = c(201, 218, 190),
  publisher = c("Rethink Press", "Penguin Random House LLC", "Wiley")
)
# Save the data frame as indented (pretty-printed) JSON
write_json(x = books_data, path = json_path, pretty = TRUE)
# Load the file again, simplifying the parsed records back into a data frame
json_data <- read_json(path = json_path, simplifyVector = TRUE)
print(json_data)
## id title authors copyright
## 1 1 Key Person of Influence Daniel Priestly and Mike Reid 2014
## 2 2 Million Dollar Weekend Noah Kagan and Tahl Raz 2024
## 3 3 The 12 Week Work Year Brian P. Morgan and Michael Lennington 2013
## page_count publisher
## 1 201 Rethink Press
## 2 218 Penguin Random House LLC
## 3 190 Wiley
The HTML file was the easiest to set up and read into R. In contrast, the XML and JSON files were a bit more challenging to configure and import. When examining the outputs, the HTML result appeared the most organized: it printed as a tibble, which shortens long values so that each row fits on a single line, much like a table. The XML and JSON results printed as ordinary data frames, which show every value in full and therefore wrap the page_count and publisher columns onto a second set of lines. They required more space and were less straightforward to read than the HTML output.