Introduction
This document demonstrates how to load data stored in three different
file formats: HTML, XML, and JSON. All files are hosted on GitHub for easy
access.
HTML File:
# Location of the HTML copy of the book data (raw GitHub URL)
url_html <- "https://raw.githubusercontent.com/hbedros/data607_hw7/main/book.html"
# Download and parse the HTML page
html_file <- url_html %>% read_html()
# Text of the page's <title> element
doc_title <- html_text(html_node(html_file, "title"))
# Empty containers reserved for extracted details
all_attributes <- list()
all_authors <- list()
book_titles <- list()
# All nodes matching the ".book" CSS selector
book_divs <- html_nodes(html_file, ".book")
# Will hold one data frame per book node
dfs <- list()
# Build one single-row data frame per book node and store it in `dfs`.
# Each row holds the book title (<h1> text), the authors (<h2> text with the
# "Author:" / "Authors:" label removed), and the heading that precedes the
# first colon of every <p>; multiple values are joined with "; ".
for (i in seq_along(book_divs)) {
  # Title and raw author line of the current book
  book_title <- book_divs[[i]] %>% html_node("h1") %>% html_text()
  author <- book_divs[[i]] %>% html_node("h2") %>% html_text()
  # Strip the label, then split the remainder on commas and on " and "
  author_cleaned <- gsub("Authors?:", "", author, ignore.case = TRUE) %>%
    trimws()
  authors_list <- unlist(str_split(author_cleaned, ",| and ")) %>% trimws()
  # A list ending in ", and <name>" produces an empty piece between the comma
  # and " and "; drop it so it does not show up as "; ;" once joined
  authors_list <- authors_list[nzchar(authors_list)]
  full_attributes <- book_divs[[i]] %>% html_nodes("p") %>% html_text()
  # str_extract() is vectorized: keep the text before the first colon of each
  # paragraph (NA when a paragraph has no colon)
  attributes_list <- str_extract(full_attributes, "^[^:]+(?=:)")
  # Leave out paragraphs without a heading instead of pasting a literal "NA"
  attributes_list <- attributes_list[!is.na(attributes_list)]
  # Join authors and attribute headings using "; "
  concatenated_authors <- paste(authors_list, collapse = "; ")
  concatenated_attributes <- paste(attributes_list, collapse = "; ")
  # One row per book
  dfs[[i]] <- data.frame(
    book_title = book_title,
    author = concatenated_authors,
    attributes = concatenated_attributes,
    row.names = NULL,
    stringsAsFactors = FALSE
  )
}
# Stack the per-book data frames into a single table
df_books <- do.call(what = rbind, args = dfs)
# Render the table with kable() and apply bootstrap styling
kable_styling(
  knitr::kable(
    df_books,
    caption = "Books Information from HTML",
    align = 'c'
  ),
  bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
Books Information from HTML
|
book_title
|
author
|
attributes
|
|
The Alchemist
|
Paulo Coelho
|
Philosophical Journey; Universal Appeal
|
|
Sapiens: A Brief History of Humankind
|
Yuval Noah Harari
|
Broad Overview; Provocative Insights
|
|
OpenIntro Statistics, 4th Edition
|
David M Diez; Christopher D Barr; ; Mine Çetinkaya-Rundel
|
Accessible Content; Open Source; Diverse Applications
|
XML File:
# Location of the XML copy of the book data (raw GitHub URL)
url_xml <- "https://raw.githubusercontent.com/hbedros/data607_hw7/main/book.xml"
# Download and parse the XML document
xml_file <- url_xml %>% read_xml()
# All <book> elements found anywhere below the document root
book_nodes <- xml_find_all(xml_file, ".//book")
# Return the trimmed text of the first node below `node` that matches the
# XPath expression `path`. Falls back to NA when the lookup yields a
# zero-length result.
extract_safe <- function(node, path) {
  first_match <- xml_find_first(node, path)
  text_value <- xml_text(first_match, trim = TRUE)
  if (length(text_value) > 0) {
    return(text_value)
  }
  NA
}
# Build one single-row data frame per <book> node with its title, its
# authors, and the `name` attribute of each attributes/attribute child.
# Multiple authors and attribute names are joined with "; ", the same
# separator used for the HTML and JSON tables.
books_data <- lapply(book_nodes, function(book) {
  title <- extract_safe(book, ".//title")
  # Collect every <author> descendant, not only the first one, so that
  # multi-author books keep all of their authors
  author_names <- xml_text(xml_find_all(book, ".//author"), trim = TRUE)
  author_names <- author_names[nzchar(author_names)]
  author <- if (length(author_names) > 0) {
    paste(author_names, collapse = "; ")
  } else {
    NA_character_
  }
  # xml_attr() is vectorized over a nodeset, so no sapply() is needed
  attributes_nodes <- xml_find_all(book, ".//attributes/attribute")
  attribute_names <- xml_attr(attributes_nodes, "name")
  attributes_combined <- paste(attribute_names, collapse = "; ")
  data.frame(
    book_title = title,
    author = author,
    attributes = attributes_combined,
    stringsAsFactors = FALSE
  )
})
# Stack the per-book data frames into a single table
df_books_xml <- do.call(what = rbind, args = books_data)
# Render the table with kable() and apply bootstrap styling
kable_styling(
  knitr::kable(
    df_books_xml,
    caption = "Books Information from XML",
    align = 'c'
  ),
  bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
Books Information from XML
|
book_title
|
author
|
attributes
|
|
The Alchemist
|
Paulo Coelho
|
Philosophical Journey;Universal Appeal
|
|
Sapiens: A Brief History of Humankind
|
Yuval Noah Harari
|
Broad Overview;Provocative Insights
|
|
OpenIntro Statistics, 4th Edition
|
David M Diez
|
Accessible Content;Open Source;Diverse Applications
|
JSON File:
# Location of the JSON copy of the book data (raw GitHub URL)
url_json <- "https://raw.githubusercontent.com/hbedros/data607_hw7/main/book.json"
# Download and parse the JSON document
json_data <- url_json %>% fromJSON()
# Inspect the first element of the parsed `books` object.
# NOTE(review): the rendered output suggests `[[1]]` selects the vector of
# titles rather than a single book record - confirm this is intended.
json_data$library$books[[1]] %>% str()
## chr [1:3] "The Alchemist" "Sapiens: A Brief History of Humankind" ...
# Flatten the parsed `books` object into a table with one column per field
flattened_json <- json_data$library$books %>% flatten()
# Inspect the flattened result
flattened_json %>% str()
## 'data.frame': 3 obs. of 10 variables:
## $ title : chr "The Alchemist" "Sapiens: A Brief History of Humankind" "OpenIntro Statistics, 4th Edition"
## $ author : chr "Paulo Coelho" "Yuval Noah Harari" NA
## $ authors :List of 3
## ..$ : NULL
## ..$ : NULL
## ..$ : chr "David M Diez" "Christopher D Barr" "Mine Çetinkaya-Rundel"
## $ attributes.Philosophical Journey: chr "Follows Santiago, a shepherd, on his journey to discover a treasure by the Egyptian pyramids. An allegory about"| __truncated__ NA NA
## $ attributes.Universal Appeal : chr "Translated into numerous languages and a global bestseller." NA NA
## $ attributes.Broad Overview : chr NA "Chronicles the evolution of Homo sapiens from ancient times to the modern era, touching upon various aspects li"| __truncated__ NA
## $ attributes.Provocative Insights : chr NA "Harari provides thought-provoking insights on capitalism, religion, and the possible future paths for humanity." NA
## $ attributes.Accessible Content : chr NA NA "Known for its beginner-friendly approach to statistical concepts."
## $ attributes.Open Source : chr NA NA "Available for free online, embodying a commitment to accessible education."
## $ attributes.Diverse Applications : chr NA NA "Features a wide range of real-world data and examples, emphasizing the practical application of statistics."
# Merge the `author` column and the list-valued `authors` column into one
# string column: rows whose `author` is NA take their `authors` entries joined
# with "; ", all other rows keep `author` unchanged.
flattened_json$consolidated_authors <- ifelse(
  is.na(flattened_json$author),
  # vapply() guarantees exactly one string per row, whatever the input shape
  vapply(flattened_json$authors, paste, character(1), collapse = "; "),
  flattened_json$author
)
# Drop the two source columns now that they have been merged
flattened_json$author <- NULL
flattened_json$authors <- NULL
# Summarise one row of the flattened table: return the names of its
# "attributes."-prefixed fields that hold a non-missing value, with the prefix
# stripped and the names joined by "; ".
consolidate_attributes <- function(row) {
  # Restrict the row to the attribute fields (title and authors are skipped)
  is_attribute <- grepl("^attributes\\.", names(row))
  attribute_values <- row[is_attribute]
  # Field names without the "attributes." prefix
  labels <- gsub("^attributes\\.", "", names(attribute_values))
  # Keep only the labels whose value is present
  present_labels <- labels[!is.na(attribute_values)]
  paste(present_labels, collapse = "; ")
}
# Derive the attribute summary for every row of the flattened table
flattened_json$consolidated_attributes <- apply(
  X = flattened_json,
  MARGIN = 1,
  FUN = consolidate_attributes
)
# Keep the three presentation columns and give them display names
final_df <- flattened_json[
  ,
  c("title", "consolidated_authors", "consolidated_attributes"),
  drop = FALSE
]
names(final_df) <- c("Title", "Authors", "Attributes")
# Render the table with kable() and apply bootstrap styling
kable_styling(
  knitr::kable(
    final_df,
    caption = "Books Information from JSON",
    align = 'c'
  ),
  bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
Books Information from JSON
|
Title
|
Authors
|
Attributes
|
|
The Alchemist
|
Paulo Coelho
|
Philosophical Journey; Universal Appeal
|
|
Sapiens: A Brief History of Humankind
|
Yuval Noah Harari
|
Broad Overview; Provocative Insights
|
|
OpenIntro Statistics, 4th Edition
|
David M Diez; Christopher D Barr; Mine Çetinkaya-Rundel
|
Accessible Content; Open Source; Diverse Applications
|