Working with HTML, JSON, and XML in R

Install and load necessary packages

options(repos = c(CRAN = "https://cran.rstudio.com"))

req_packages <- c("DBI","RMySQL","dplyr","dbplyr","knitr","tidyr", "readr", "stringr","tibble", "rmarkdown", "purrr", "lubridate", "here", "httr2", "RCurl","rvest","xml2","jsonlite","kableExtra")
for (pkg in req_packages) {
  if (!require(pkg, character.only = TRUE)) {
    message(paste("Installing package:", pkg))
    install.packages(pkg, dependencies = TRUE)
  } else {
    message(paste(pkg, " already installed."))
  }
  library(pkg, character.only = TRUE)
}

## Loading required package: DBI

## DBI  already installed.

## Loading required package: RMySQL

## RMySQL  already installed.

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## dplyr  already installed.

## Loading required package: dbplyr

## 
## Attaching package: 'dbplyr'

## The following objects are masked from 'package:dplyr':
## 
##     ident, sql

## dbplyr  already installed.

## Loading required package: knitr

## knitr  already installed.

## Loading required package: tidyr

## tidyr  already installed.

## Loading required package: readr

## readr  already installed.

## Loading required package: stringr

## stringr  already installed.

## Loading required package: tibble

## tibble  already installed.

## Loading required package: rmarkdown

## rmarkdown  already installed.

## Loading required package: purrr

## purrr  already installed.

## Loading required package: lubridate

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

## lubridate  already installed.

## Loading required package: here

## here() starts at /Users/paulabrown/Documents/CUNY SPS- Data 607/Week 7 Assignments

## here  already installed.

## Loading required package: httr2

## httr2  already installed.

## Loading required package: RCurl

## 
## Attaching package: 'RCurl'

## The following object is masked from 'package:tidyr':
## 
##     complete

## RCurl  already installed.

## Loading required package: rvest

## 
## Attaching package: 'rvest'

## The following object is masked from 'package:readr':
## 
##     guess_encoding

## rvest  already installed.

## Loading required package: xml2

## 
## Attaching package: 'xml2'

## The following object is masked from 'package:httr2':
## 
##     url_parse

## xml2  already installed.

## Loading required package: jsonlite

## 
## Attaching package: 'jsonlite'

## The following object is masked from 'package:purrr':
## 
##     flatten

## jsonlite  already installed.

## Loading required package: kableExtra

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

## kableExtra  already installed.

Read in the HTML, JSON, and XML files

# HTML
# You'll need to use library(rvest) if not already configured
books_html <- read_html("https://raw.githubusercontent.com/PaulaB989/Week7/refs/heads/main/Paula's%20Favorite%20Books.html")

books_html_opt2 <- read_html("https://raw.githubusercontent.com/PaulaB989/Week7/refs/heads/main/Paula's%20Favorite%20Books_alt.html")

# JSON
# You'll need to use library(jsonlite) if not already configured
books_json <- fromJSON("https://raw.githubusercontent.com/PaulaB989/Week7/refs/heads/main/Paula's%20Favorite%20Books.json")

# XML
# You'll need to use library(xml2) if not already configured
books_xml <- read_xml("https://raw.githubusercontent.com/PaulaB989/Week7/refs/heads/main/Paula's%20Favorite%20Books.xml")

Create dataframe for HTML file option 1

books_html_df <- books_html %>%
  html_node("table") %>%
  html_table(fill = TRUE)
print(books_html_df)

## # A tibble: 3 × 4
##   Title                     Authors and Credenti…¹ Publisher `Year of Copyright`
##   <chr>                     <chr>                  <chr>                   <int>
## 1 Data Science Programming… John Paul Mueller; Lu… John Wil…                2020
## 2 Investing All-In-One for… Robert S. Griswold, M… John Wil…                2017
## 3 Home-Based Business All-… Paul Edwards; Sarah E… John Wil…                2010
## # ℹ abbreviated name: ¹`Authors and Credentials`

Create dataframe for HTML file option 2

books_html_opt2_df <- books_html_opt2 %>%
  html_node("table") %>%
  html_table(fill = TRUE)
print(books_html_opt2_df)

## # A tibble: 3 × 5
##   Title                        Authors Credentials Publisher `Year of Copyright`
##   <chr>                        <chr>   <chr>       <chr>                   <int>
## 1 Data Science Programming Al… John P… ";GDE"      John Wil…                2020
## 2 Investing All-In-One for Du… Robert… "MBA, MSBA… John Wil…                2017
## 3 Home-Based Business All-In-… Paul E… ""          John Wil…                2010

Create dataframe for JSON file

books_json_df <- books_json$books %>%
  unnest(authors) %>%
  unnest(credentials) %>%
  select("Title"=title, "Author"=name, "Credentials"=credentials, "Publisher"=publisher, "Year of Copyright"=copyright_year)
print(books_json_df)

## # A tibble: 11 × 5
##    Title                        Author Credentials Publisher `Year of Copyright`
##    <chr>                        <chr>  <chr>       <chr>                   <int>
##  1 Data Science Programming Al… John … <NA>        John Wil…                2020
##  2 Data Science Programming Al… Luca … GDE         John Wil…                2020
##  3 Investing All-In-One for Du… Rober… MBA         John Wil…                2017
##  4 Investing All-In-One for Du… Rober… MSBA        John Wil…                2017
##  5 Investing All-In-One for Du… Matt … <NA>        John Wil…                2017
##  6 Investing All-In-One for Du… Paul … <NA>        John Wil…                2017
##  7 Investing All-In-One for Du… Eric … MBA         John Wil…                2017
##  8 Investing All-In-One for Du… Russe… <NA>        John Wil…                2017
##  9 Home-Based Business All-In-… Paul … <NA>        John Wil…                2010
## 10 Home-Based Business All-In-… Sarah… <NA>        John Wil…                2010
## 11 Home-Based Business All-In-… Peter… <NA>        John Wil…                2010

View XML structure to begin creating the dataframe

# xml_structure(books_xml)

# View structure of children nodes for first item in the array
book_nodes <- xml_children(books_xml)
xml_structure(xml_children(book_nodes[[1]]))

## [[1]]
## <title>
##   {text}
## 
## [[2]]
## <authors>
##   {comment}
##   <author>
##     <name>
##       {text}
##     <credentials>
##       <credential>
##   <author>
##     <name>
##       {text}
##     <credentials>
##       <credential>
##         {text}
## 
## [[3]]
## <publisher>
##   {text}
## 
## [[4]]
## <copyrightyear>
##   {text}

Create the dataframe for the XML file

titles <- c()
authors <- c()
credentials <- c()
publishers <- c()
years <- c()

# Find all <book> nodes
book_nodes <- xml_find_all(books_xml, ".//book")

# Loop through each book
for (book in book_nodes) {
  title <- xml_text(xml_find_first(book, "./title"))
  publisher <- xml_text(xml_find_first(book, "./publisher"))
  year <- xml_text(xml_find_first(book, "./copyrightyear"))

# Find all authors for this book
  author_nodes <- xml_find_all(book, ".//author")
  
  for (author in author_nodes) {
    name <- xml_text(xml_find_first(author, "./name"))
    creds <- xml_find_all(author, "./credentials/credential") %>% xml_text()
    
    # If no credentials, store NA
    if (length(creds) == 0 || all(creds == "")) {
      creds <- NA
    }
     # Add one row per credential (or NA)
    for (cred in creds) {
      titles <- c(titles, title)
      authors <- c(authors, name)
      credentials <- c(credentials, cred)
      publishers <- c(publishers, publisher)
      years <- c(years, year)
    }
  }
}

# Combine into a data frame
books_xml_df <- data.frame(
  Title = titles,
  Author = authors,
  Credentials = credentials,
  Publisher = publishers,
  Copyright_Year = years,
  stringsAsFactors = FALSE
)
# View the result
print(books_xml_df)

##                                              Title             Author
## 1  Data Science Programming All-In-One for Dummies  John Paul Mueller
## 2  Data Science Programming All-In-One for Dummies      Luca Massaron
## 3                 Investing All-In-One for Dummies Robert S. Griswold
## 4                 Investing All-In-One for Dummies Robert S. Griswold
## 5                 Investing All-In-One for Dummies        Matt Krantz
## 6                 Investing All-In-One for Dummies   Paul Mladjenovic
## 7                 Investing All-In-One for Dummies         Eric Tyson
## 8                 Investing All-In-One for Dummies       Russell Wild
## 9       Home-Based Business All-In-One for Dummies       Paul Edwards
## 10      Home-Based Business All-In-One for Dummies      Sarah Edwards
## 11      Home-Based Business All-In-One for Dummies      Peter Economy
##    Credentials                Publisher Copyright_Year
## 1         <NA>  John Wiley & Sons, Inc.           2020
## 2          GDE  John Wiley & Sons, Inc.           2020
## 3          MBA  John Wiley & Sons, Inc.           2017
## 4         MSBA  John Wiley & Sons, Inc.           2017
## 5         <NA>  John Wiley & Sons, Inc.           2017
## 6         <NA>  John Wiley & Sons, Inc.           2017
## 7          MBA  John Wiley & Sons, Inc.           2017
## 8         <NA>  John Wiley & Sons, Inc.           2017
## 9         <NA>  John Wiley & Sons, Inc.           2010
## 10        <NA>  John Wiley & Sons, Inc.           2010
## 11        <NA>  John Wiley & Sons, Inc.           2010

#Are the three data frames identical? The three data frames are not identical. The JSON and XML data frames match — both have 11 rows and 5 columns. In contrast, the HTML data frame has only 3 rows and 5 columns, with all authors, and credentials merged into a single cell.