Importing HTML

# Download and parse the HTML page that holds the book records.
raw_data_html <- read_html("https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/book_information.html")

# Each book was written inside its own <div> when the page was created, so the
# <div> elements are the records to walk over.
bookinfo_divs <- raw_data_html |>
  html_nodes("div")

# Pull the five fields out of one book <div>.
#
# The <h2> holds the title; the <p> elements hold, in order, the labelled
# authors, year, publisher and theme. The <p> text is read once and indexed by
# position. Indexing past the end of a character vector yields NA, so a <div>
# with fewer than four paragraphs still produces a complete five-field record
# instead of silently dropping a value and misaligning the columns.
#
# Returns a named character vector: book_name, author, year, publisher, theme.
extract_html_book <- function(div_node) {
  title <- div_node |>
    html_node("h2") |>
    html_text(trim = TRUE)

  paragraphs <- div_node |>
    html_nodes("p") |>
    html_text(trim = TRUE)

  # Strip the leading labels ("Authors: ", "Year: ", ...) from the text.
  c(
    book_name = title,
    author = sub("Authors: ", "", paragraphs[1]),
    year = sub("Year: ", "", paragraphs[2]),
    publisher = sub("Publisher: ", "", paragraphs[3]),
    theme = sub("Theme: ", "", paragraphs[4])
  )
}

# One record per <div>, built in a single pass rather than by growing
# vectors with c() inside a loop.
html_book_records <- lapply(bookinfo_divs, extract_html_book)

# Collect one field across all records as a character vector.
html_book_field <- function(field) {
  vapply(
    html_book_records,
    function(record) record[[field]],
    character(1),
    USE.NAMES = FALSE
  )
}

book_names <- html_book_field("book_name")
authors <- html_book_field("author")
publishers <- html_book_field("publisher")
publish_years <- html_book_field("year")
themes <- html_book_field("theme")

book_df <- data.frame(
  book_name = book_names,
  author = authors,
  publisher = publishers,
  year = publish_years,
  theme = themes
)


datatable(book_df)

Importing XML

# Download and parse the XML version of the book list.
raw_data_xml <- read_xml("https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/books_information.xml")

# Select every <book> element once, then read each field relative to its own
# book. xml_find_first() returns exactly one result per input node (a missing
# node, whose text is NA, when the child element is absent), so the five
# columns always line up row by row even if a book lacks a field.
xml_book_nodes <- raw_data_xml |>
  xml_find_all("//book")

# Text of the first <field> child of every book, trimmed of whitespace.
xml_book_field <- function(field) {
  xml_book_nodes |>
    xml_find_first(paste0("./", field)) |>
    xml_text(trim = TRUE)
}

# Extract data from the XML
book_names <- xml_book_field("title")
authors <- xml_book_field("authors")
publishers <- xml_book_field("publisher")
publish_years <- xml_book_field("year")
themes <- xml_book_field("theme")


book_df2 <- data.frame(
  book_name = book_names,
  author = authors,
  publisher = publishers,
  year = publish_years,
  theme = themes
)


datatable(book_df2)

Importing JSON

# Fetch the JSON version of the book list and parse it.
raw_data_json <- "https://raw.githubusercontent.com/alinsimon/data607/refs/heads/main/book_information.json" |>
  fromJSON()

# Inspect what the parser produced. The output pasted below shows a
# data frame of 3 rows in which `authors` is a list column (the third book
# has two authors) and `year` is an integer.
raw_data_json |>
  str()
## 'data.frame':    3 obs. of  5 variables:
##  $ title    : chr  "The Time of the Hero" "Broad and Alien is the World" "In Search of an Inca: Identity and Utopia in the Andes"
##  $ authors  :List of 3
##   ..$ : chr "Mario Vargas Llosa"
##   ..$ : chr "Ciro Alegría"
##   ..$ : chr  "Alberto Flores Galindo" "Carlos Aguirre"
##  $ year     : int  1963 1941 2010
##  $ publisher: chr  "Faber & Faber" "E.P. Dutton & Co." "Cambridge University Press"
##  $ theme    : chr  "Corruption and violence in a Peruvian military academy" "Indigenous struggles and rural life in the Andes" "Historical and cultural analysis of the Incan legacy"
# Store the parsed result as the third book table and render it.
book_df3 <- raw_data_json |>
  as.data.frame()

book_df3 |>
  datatable()