# Fetch the GitHub "blob" page for book_info.html
html <- read_html("https://github.com/wberritt913/CUNY_DATA607/blob/main/book_info.html")
# Collect the text of every <span> found under a <p> under <body>
html_snip <- html |>
  html_elements("body") |>
  html_elements("p") |>
  html_elements("span") |>
  html_text2()
# Pull the literal "<td>...</td>" cells out of each text snippet and drop the
# snippets that contain none
td_pattern <- "<td>(.*?)</td>"
headers <- str_extract_all(html_snip, td_pattern)
filtered_headers <- headers[lengths(headers) > 0]
# The cells are consumed in repeating (title, authors, attributes) order.
# Fail loudly when the count is zero or not a multiple of three: otherwise
# seq() stops with a cryptic "wrong sign in 'by'" error, or tibble() silently
# recycles a length-one column and misaligns the rows.
n_cells <- length(filtered_headers)
if (n_cells == 0 || n_cells %% 3 != 0) {
  stop(
    "Expected a non-zero multiple of 3 <td> cells, found ", n_cells,
    call. = FALSE
  )
}
# Take every third cell for each field, then strip the surrounding <td> tags
title <- filtered_headers[seq(1, n_cells, by = 3)]
title <- str_replace_all(title, td_pattern, "\\1")
authors <- filtered_headers[seq(2, n_cells, by = 3)]
authors <- str_replace_all(authors, td_pattern, "\\1")
attributes <- filtered_headers[seq(3, n_cells, by = 3)]
attributes <- str_replace_all(attributes, td_pattern, "\\1")
# Assemble the three aligned character vectors into one tibble
books_html <- tibble(
  title = title,
  authors = authors,
  attributes = attributes
)
# Display the tibble
books_html
## # A tibble: 3 × 3
## title authors attributes
## <chr> <chr> <chr>
## 1 The Lightning Thief Rick Riordan Fantasy, Greek myth…
## 2 Old Yeller Fred Gipson Classic, Coming-of-…
## 3 The Da Vinci Code Dan Brown, Jeff Harding, Paul Michael Mystery, Thriller, …
## Warning: `str_view_all()` was deprecated in stringr 1.5.0.
## ℹ Please use `str_view()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 3 × 3
## title authors attributes
## <chr> <chr> <chr>
## 1 The Lightning Thief Rick Riordan Fantasy, Greek myth…
## 2 Old Yeller Fred Gipson Classic, Coming-of-…
## 3 The Da Vinci Code Dan Brown, Jeff Harding, Paul Michael Mystery, Thriller, …
# Download and parse the JSON version of the book data
json_data <-
  "https://raw.githubusercontent.com/wberritt913/CUNY_DATA607/main/week7_json.json" |>
  fromJSON()
# Print the parsed object
json_data
## $books
## title authors
## 1 The Lightning Thief Rick Riordan
## 2 Old Yeller Fred Gipson
## 3 The Da Vinci Code Dan Brown, Jeff Harding, Paul Michael
## attributes
## 1 Fantasy, Greek mythology, Adventure
## 2 Classic, Coming-of-age, Adventure
## 3 Mystery, Thriller, Conspiracy
# Write a small HTML document to "books.html" (relative to the working
# directory). It holds one <table> with a header row (title, authors,
# attributes) and three book rows. The string literal opens with a newline,
# so the file's first line is blank.
writeLines('
<html>
<head>
<title>Book Title</title>
</head>
<body>
<table>
<tr>
<th>title</th>
<th>authors</th>
<th>attributes</th>
</tr>
<tr>
<td>The Lightning Thief</td>
<td>Rick Riordan</td>
<td>Fantasy, Greek mythology, Adventure</td>
</tr>
<tr>
<td>Old Yeller</td>
<td>Fred Gipson</td>
<td>Classic, Coming-of-age, Adventure</td>
</tr>
<tr>
<td>The Da Vinci Code</td>
<td>Dan Brown, Jeff Harding, Paul Michael</td>
<td>Mystery, Thriller, Conspiracy</td>
</tr>
</table>
</body>
</html>', "books.html") # second argument is the output file, "books.html"
# Write the same three books as XML to "books.xml" (relative to the working
# directory): a <books> root with one <book> per title, each holding <title>,
# <authors> and <attributes> children. The string literal opens with a
# newline, so the file's first line is blank.
writeLines('
<books>
<book>
<title>The Lightning Thief</title>
<authors>Rick Riordan</authors>
<attributes>Fantasy, Greek mythology, Adventure</attributes>
</book>
<book>
<title>Old Yeller</title>
<authors>Fred Gipson</authors>
<attributes>Classic, Coming-of-age, Adventure</attributes>
</book>
<book>
<title>The Da Vinci Code</title>
<authors>Dan Brown, Jeff Harding, Paul Michael</authors>
<attributes>Mystery, Thriller, Conspiracy</attributes>
</book>
</books>', "books.xml") # second argument is the output file, "books.xml"
# Write the same three books as JSON to "books.json" (relative to the working
# directory): a top-level "books" array of objects. Unlike the HTML and XML
# files, "authors" and "attributes" are stored here as arrays of strings
# rather than as one comma-separated string.
writeLines('{
"books": [
{
"title": "The Lightning Thief",
"authors": ["Rick Riordan"],
"attributes": ["Fantasy", "Greek mythology", "Adventure"]
},
{
"title": "Old Yeller",
"authors": ["Fred Gipson"],
"attributes": ["Classic", "Coming-of-age", "Adventure"]
},
{
"title": "The Da Vinci Code",
"authors": ["Dan Brown", "Jeff Harding", "Paul Michael"],
"attributes": ["Mystery", "Thriller", "Conspiracy"]
}
]
}', "books.json") # second argument is the output file, "books.json"
# Parse the local HTML file
html <- read_html("books.html")
# Locate the first <table> element and convert it to a data frame
table <- html_table(html_element(html, "table"))
# Print the result
table
## # A tibble: 3 × 3
## title authors attributes
## <chr> <chr> <chr>
## 1 The Lightning Thief Rick Riordan Fantasy, Greek myth…
## 2 Old Yeller Fred Gipson Classic, Coming-of-…
## 3 The Da Vinci Code Dan Brown, Jeff Harding, Paul Michael Mystery, Thriller, …
# Parse the local XML file
xml <- read_xml("books.xml")
# Select the <book> elements matched by the relative XPath "book"
book <- xml |> xml_find_all("book")
# Build one row per <book>. xml_find_first() yields exactly one result per
# input node (a missing node, read by xml_text() as NA, when the child is
# absent), so the three columns stay aligned even if a book lacks a field.
# xml_find_all() would instead flatten the matches and shift later rows.
xml_df <- tibble(
  title = book |> xml_find_first("title") |> xml_text(),
  authors = book |> xml_find_first("authors") |> xml_text(),
  attributes = book |> xml_find_first("attributes") |> xml_text()
)
# Display the resulting data frame
xml_df
## # A tibble: 3 × 3
## title authors attributes
## <chr> <chr> <chr>
## 1 The Lightning Thief Rick Riordan Fantasy, Greek myth…
## 2 Old Yeller Fred Gipson Classic, Coming-of-…
## 3 The Da Vinci Code Dan Brown, Jeff Harding, Paul Michael Mystery, Thriller, …
# Parse the local JSON file
json_data <- fromJSON("books.json")
# Flatten to one row per book, joining each book's authors and attributes
# into a single comma-separated string. vapply() with character(1) guarantees
# a character vector (sapply() would return an empty list on empty input).
# NOTE(review): this column is named `titles`, whereas the HTML and XML
# tibbles use `title`; kept as-is in case later code relies on it.
json_df <- tibble(
  titles = json_data$books$title,
  authors = vapply(
    json_data$books$authors, paste, character(1),
    collapse = ", "
  ),
  attributes = vapply(
    json_data$books$attributes, paste, character(1),
    collapse = ", "
  )
)
# Display the original JSON data
json_data
## $books
## title authors
## 1 The Lightning Thief Rick Riordan
## 2 Old Yeller Fred Gipson
## 3 The Da Vinci Code Dan Brown, Jeff Harding, Paul Michael
## attributes
## 1 Fantasy, Greek mythology, Adventure
## 2 Classic, Coming-of-age, Adventure
## 3 Mystery, Thriller, Conspiracy
# Display the resulting data frame
json_df
## # A tibble: 3 × 3
## titles authors attributes
## <chr> <chr> <chr>
## 1 The Lightning Thief Rick Riordan Fantasy, Greek myth…
## 2 Old Yeller Fred Gipson Classic, Coming-of-…
## 3 The Da Vinci Code Dan Brown, Jeff Harding, Paul Michael Mystery, Thriller, …