Importing HTML
# Download and parse the HTML page that holds the book records.
raw_data_html <- read_html("https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/book_information.html")
# The page was authored with the book information inside <div> elements,
# so collect every <div> node from the parsed document.
bookinfo_divs <- html_nodes(raw_data_html, "div")
# Pull the title and the four labelled fields out of every book <div>.
#
# Each <div> is expected to hold an <h2> with the title and four <p> elements
# in this order: authors, year, publisher, theme. Every field is built in one
# vectorised pass instead of growing five vectors with c() inside a loop, and
# each <div> is queried for its <p> nodes only once.

# Title: html_node() applied to the whole node set returns one match per
# <div> (a missing <h2> becomes NA), so the result stays row-aligned.
book_names <- bookinfo_divs |>
  html_node("h2") |>
  html_text(trim = TRUE)

# Trimmed text of every <p> inside each <div>, in document order.
paragraph_texts <- lapply(bookinfo_divs, function(div_node) {
  div_node |>
    html_nodes("p") |>
    html_text(trim = TRUE)
})

# Return the text of the `position`-th <p> of every book with its label
# (for example "Year: ") removed. A <div> with fewer than `position` <p>
# elements yields NA for that book.
extract_field <- function(position, label) {
  raw_text <- vapply(
    paragraph_texts,
    function(texts) texts[position],
    character(1)
  )
  # fixed = TRUE: the label is literal text, not a regular expression.
  sub(label, "", raw_text, fixed = TRUE)
}

authors <- extract_field(1, "Authors: ")
publish_years <- extract_field(2, "Year: ")
publishers <- extract_field(3, "Publisher: ")
themes <- extract_field(4, "Theme: ")
# Combine the five parallel vectors into one table with one row per book.
book_df <- as.data.frame(
  list(
    book_name = book_names,
    author = authors,
    publisher = publishers,
    year = publish_years,
    theme = themes
  )
)
# Render the table extracted from the HTML source.
book_df |> datatable()
Importing XML
# Download and parse the XML document that holds the book records.
raw_data_xml <- read_xml("https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/books_information.xml")

# Select every <book> element once, then read each field relative to its own
# book. xml_find_first() returns exactly one result per input node (its text
# is NA when the child is absent), so the five vectors always have one entry
# per book and stay row-aligned even if a book is missing a field.
book_nodes <- xml_find_all(raw_data_xml, "//book")

# Trimmed text of the first direct child named `field`, for every book.
book_field <- function(field) {
  book_nodes |>
    xml_find_first(paste0("./", field)) |>
    xml_text(trim = TRUE)
}

book_names <- book_field("title")
authors <- book_field("authors")
publishers <- book_field("publisher")
publish_years <- book_field("year")
themes <- book_field("theme")
# Combine the vectors read from the XML file into one table, one row per book.
book_df2 <- as.data.frame(
  list(
    book_name = book_names,
    author = authors,
    publisher = publishers,
    year = publish_years,
    theme = themes
  )
)
# Render the table extracted from the XML source.
book_df2 |> datatable()
Importing JSON
# Download the JSON file. As the str() output below shows, fromJSON() returns
# the array of book objects already simplified into a data frame.
raw_data_json <- fromJSON("https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/book_information.json")
# Inspect the structure right after loading.
str(raw_data_json)
## 'data.frame': 3 obs. of 5 variables:
## $ title : chr "The Time of the Hero" "Broad and Alien is the World" "In Search of an Inca: Identity and Utopia in the Andes"
## $ authors :List of 3
## ..$ : chr "Mario Vargas Llosa"
## ..$ : chr "Ciro Alegría"
## ..$ : chr "Alberto Flores Galindo" "Carlos Aguirre"
## $ year : int 1963 1941 2010
## $ publisher: chr "Faber & Faber" "E.P. Dutton & Co." "Cambridge University Press"
## $ theme : chr "Corruption and violence in a Peruvian military academy" "Indigenous struggles and rural life in the Andes" "Historical and cultural analysis of the Incan legacy"
book_df3 <- as.data.frame(raw_data_json)
# `authors` arrives as a list column because one book has two authors.
# Collapse every list column into a comma-separated string so each cell holds
# a single value, like the author column built from the HTML and XML sources.
# Column names and order are left unchanged. Nested data-frame columns are
# skipped because they are not per-row lists of values.
is_list_col <- vapply(
  book_df3,
  function(column) is.list(column) && !is.data.frame(column),
  logical(1)
)
for (col_name in names(book_df3)[is_list_col]) {
  book_df3[[col_name]] <- vapply(
    book_df3[[col_name]],
    function(values) paste(values, collapse = ", "),
    character(1),
    USE.NAMES = FALSE
  )
}
datatable(book_df3)