Introduction

This document demonstrates how to load the same book data from three different file formats: HTML, XML, and JSON. All three files are hosted on GitHub for easy access.

Data Source URLs

HTML File:

# Location of the raw HTML file on GitHub
url_html <- "https://raw.githubusercontent.com/hbedros/data607_hw7/main/book.html"

# Download and parse the HTML document
html_file <- read_html(url_html)

# Text of the page's <title> element
doc_title <- html_text(html_node(html_file, "title"))

# Empty accumulators for extracted details
book_titles <- list()
all_authors <- list()
all_attributes <- list()

# Every element carrying the "book" class
book_divs <- html_nodes(html_file, ".book")

# Will hold one data frame per book
dfs <- list()

# Build one single-row data frame per book element and store it in `dfs`.
for (i in seq_along(book_divs)) {
  book_div <- book_divs[[i]]

  # The title is read from the <h1>, the author line from the <h2>
  book_title <- book_div %>% html_node("h1") %>% html_text()
  author <- book_div %>% html_node("h2") %>% html_text()

  # Drop the "Author:" / "Authors:" label
  author_cleaned <- gsub("Authors?:", "", author, ignore.case = TRUE) %>% trimws()

  # Split on commas and on the word "and". The comma alternative also consumes
  # a directly following "and", so an Oxford-comma list ("A, B, and C") no
  # longer leaves an empty element between the comma and the "and".
  authors_list <- unlist(
    str_split(author_cleaned, "\\s*,\\s*(?:and\\s+)?|\\s+and\\s+")
  ) %>% trimws()
  # Drop any remaining blank pieces (e.g. from a trailing comma)
  authors_list <- authors_list[is.na(authors_list) | nzchar(authors_list)]

  full_attributes <- book_div %>% html_nodes("p") %>% html_text()

  # Heading of each paragraph: the text before the first colon.
  # str_extract() is vectorised, so no sapply() wrapper is needed.
  attributes_list <- str_extract(full_attributes, "^[^:]+(?=:)")
  # Paragraphs without a "Heading:" prefix yield NA; skip them rather than
  # pasting the literal string "NA" into the table
  attributes_list <- attributes_list[!is.na(attributes_list)]

  # Join authors and attribute headings with "; "
  concatenated_authors <- paste(authors_list, collapse = "; ")
  concatenated_attributes <- paste(attributes_list, collapse = "; ")

  # One row per book; keep strings as character on every R version
  dfs[[i]] <- data.frame(
    book_title = book_title,
    author = concatenated_authors,
    attributes = concatenated_attributes,
    row.names = NULL,
    stringsAsFactors = FALSE
  )
}

# Stack the per-book rows into a single data frame
df_books <- do.call(rbind, dfs)

# Render the result as a styled, centre-aligned table
kable_styling(
  knitr::kable(df_books, caption = "Books Information from HTML", align = "c"),
  bootstrap_options = c("striped", "hover", "condensed", "responsive")
)

Books Information from HTML
book_title	author	attributes
The Alchemist	Paulo Coelho	Philosophical Journey; Universal Appeal
Sapiens: A Brief History of Humankind	Yuval Noah Harari	Broad Overview; Provocative Insights
OpenIntro Statistics, 4th Edition	David M Diez; Christopher D Barr; ; Mine Çetinkaya-Rundel	Accessible Content; Open Source; Diverse Applications

XML File:

# Location of the raw XML file on GitHub
url_xml <- "https://raw.githubusercontent.com/hbedros/data607_hw7/main/book.xml"

# Download and parse the XML document
xml_file <- read_xml(url_xml)

# Every <book> node anywhere below the document root
book_nodes <- xml_find_all(xml_file, ".//book")

# Return the trimmed text of the first node below `node` that matches the
# XPath expression `path`. Falls back to NA when the lookup yields a
# zero-length result.
extract_safe <- function(node, path) {
  text <- xml_text(xml_find_first(node, path), trim = TRUE)
  if (length(text) > 0) text else NA
}

# Build one single-row data frame per <book> node.
books_data <- lapply(book_nodes, function(book) {
  title <- extract_safe(book, ".//title")

  # Collect every <author> below the book rather than only the first match,
  # so multi-author books keep all names; NA when the book has none
  author_names <- xml_find_all(book, ".//author") %>% xml_text(trim = TRUE)
  author <- if (length(author_names) > 0) {
    paste(author_names, collapse = "; ")
  } else {
    NA_character_
  }

  # Attribute headings come from the 'name' attribute of each <attribute>
  # node; xml_attr() is vectorised over the node set
  attributes_nodes <- xml_find_all(book, ".//attributes/attribute")
  attribute_names <- xml_attr(attributes_nodes, "name")
  # "; " is the same separator the HTML and JSON tables use
  attributes_combined <- paste(attribute_names, collapse = "; ")

  data.frame(
    book_title = title,
    author = author,
    attributes = attributes_combined,
    stringsAsFactors = FALSE
  )
})

# Stack the per-book rows into a single data frame
df_books_xml <- do.call(rbind, books_data)

# Render the result as a styled, centre-aligned table
kable_styling(
  knitr::kable(df_books_xml, caption = "Books Information from XML", align = "c"),
  bootstrap_options = c("striped", "hover", "condensed", "responsive")
)

Books Information from XML
book_title	author	attributes
The Alchemist	Paulo Coelho	Philosophical Journey;Universal Appeal
Sapiens: A Brief History of Humankind	Yuval Noah Harari	Broad Overview;Provocative Insights
OpenIntro Statistics, 4th Edition	David M Diez	Accessible Content;Open Source;Diverse Applications

JSON File:

# Location of the raw JSON file on GitHub
url_json <- "https://raw.githubusercontent.com/hbedros/data607_hw7/main/book.json"

# Download and parse the JSON document
json_data <- fromJSON(url_json)

# Inspect the first element of `books`.
# NOTE: `books` is parsed as a data frame, so `[[1]]` selects its first column
# (the character vector of all titles) rather than a single book record.
str(json_data[["library"]][["books"]][[1]])

##  chr [1:3] "The Alchemist" "Sapiens: A Brief History of Humankind" ...

# Flatten the nested `attributes` data-frame column into top-level
# "attributes.<name>" columns. The call is namespace-qualified because purrr
# also exports a flatten() that masks this one when it is attached later.
flattened_json <- jsonlite::flatten(json_data$library$books)

# Inspect the flattened structure
str(flattened_json)

## 'data.frame':    3 obs. of  10 variables:
##  $ title                           : chr  "The Alchemist" "Sapiens: A Brief History of Humankind" "OpenIntro Statistics, 4th Edition"
##  $ author                          : chr  "Paulo Coelho" "Yuval Noah Harari" NA
##  $ authors                         :List of 3
##   ..$ : NULL
##   ..$ : NULL
##   ..$ : chr  "David M Diez" "Christopher D Barr" "Mine Çetinkaya-Rundel"
##  $ attributes.Philosophical Journey: chr  "Follows Santiago, a shepherd, on his journey to discover a treasure by the Egyptian pyramids. An allegory about"| __truncated__ NA NA
##  $ attributes.Universal Appeal     : chr  "Translated into numerous languages and a global bestseller." NA NA
##  $ attributes.Broad Overview       : chr  NA "Chronicles the evolution of Homo sapiens from ancient times to the modern era, touching upon various aspects li"| __truncated__ NA
##  $ attributes.Provocative Insights : chr  NA "Harari provides thought-provoking insights on capitalism, religion, and the possible future paths for humanity." NA
##  $ attributes.Accessible Content   : chr  NA NA "Known for its beginner-friendly approach to statistical concepts."
##  $ attributes.Open Source          : chr  NA NA "Available for free online, embodying a commitment to accessible education."
##  $ attributes.Diverse Applications : chr  NA NA "Features a wide range of real-world data and examples, emphasizing the practical application of statistics."

# Build a single authors column: keep `author` where it is present, otherwise
# join the entries of the `authors` list column with "; "
flattened_json$consolidated_authors <- ifelse(
  is.na(flattened_json$author),
  vapply(flattened_json$authors, paste, character(1), collapse = "; "),
  flattened_json$author
)

# The two source columns are no longer needed
flattened_json$author <- NULL
flattened_json$authors <- NULL

# Collapse one record's attribute fields into a single string.
#
# `row` is a named vector (or list) for one book. Every element whose name
# starts with "attributes." and whose value is not NA contributes its name,
# minus that prefix, to the result; names are joined with "; ".
consolidate_attributes <- function(row) {
  is_attribute <- grepl("^attributes\\.", names(row))
  attribute_values <- row[is_attribute]

  # Names of the attributes that actually carry a value for this record
  present <- names(attribute_values)[!is.na(attribute_values)]

  paste(gsub("^attributes\\.", "", present), collapse = "; ")
}

# Derive the attribute summary for every row; apply() hands each row to
# consolidate_attributes() as a named vector
flattened_json$consolidated_attributes <- apply(
  flattened_json,
  MARGIN = 1,
  FUN = consolidate_attributes
)

# Keep only the three display columns and give them presentation names
final_df <- flattened_json[, c("title", "consolidated_authors", "consolidated_attributes"), drop = FALSE]
names(final_df) <- c("Title", "Authors", "Attributes")


# Render the result as a styled, centre-aligned table
kable_styling(
  knitr::kable(final_df, caption = "Books Information from JSON", align = "c"),
  bootstrap_options = c("striped", "hover", "condensed", "responsive")
)

Books Information from JSON
Title	Authors	Attributes
The Alchemist	Paulo Coelho	Philosophical Journey; Universal Appeal
Sapiens: A Brief History of Humankind	Yuval Noah Harari	Broad Overview; Provocative Insights
OpenIntro Statistics, 4th Edition	David M Diez; Christopher D Barr; Mine Çetinkaya-Rundel	Accessible Content; Open Source; Diverse Applications

DATA607_HW7

Haig Bedros

2023-10-13

Introduction

Data Source URLs

HTML File:

XML File:

JSON File: