options(repos = c(CRAN = "https://cran.rstudio.com"))
req_packages <- c("DBI","RMySQL","dplyr","dbplyr","knitr","tidyr", "readr", "stringr","tibble", "rmarkdown", "purrr", "lubridate", "here", "httr2", "RCurl","rvest","xml2","jsonlite","kableExtra")
for (pkg in req_packages) {
if (!require(pkg, character.only = TRUE)) {
message(paste("Installing package:", pkg))
install.packages(pkg, dependencies = TRUE)
} else {
message(paste(pkg, " already installed."))
}
library(pkg, character.only = TRUE)
}
## Loading required package: DBI
## DBI already installed.
## Loading required package: RMySQL
## RMySQL already installed.
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## dplyr already installed.
## Loading required package: dbplyr
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
## dbplyr already installed.
## Loading required package: knitr
## knitr already installed.
## Loading required package: tidyr
## tidyr already installed.
## Loading required package: readr
## readr already installed.
## Loading required package: stringr
## stringr already installed.
## Loading required package: tibble
## tibble already installed.
## Loading required package: rmarkdown
## rmarkdown already installed.
## Loading required package: purrr
## purrr already installed.
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## lubridate already installed.
## Loading required package: here
## here() starts at /Users/paulabrown/Documents/CUNY SPS- Data 607/Week 7 Assignments
## here already installed.
## Loading required package: httr2
## httr2 already installed.
## Loading required package: RCurl
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
## RCurl already installed.
## Loading required package: rvest
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
## rvest already installed.
## Loading required package: xml2
##
## Attaching package: 'xml2'
## The following object is masked from 'package:httr2':
##
## url_parse
## xml2 already installed.
## Loading required package: jsonlite
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
## jsonlite already installed.
## Loading required package: kableExtra
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
## kableExtra already installed.
# HTML
# You'll need to use library(rvest) if not already configured
books_html <- read_html("https://raw.githubusercontent.com/PaulaB989/Week7/refs/heads/main/Paula's%20Favorite%20Books.html")
books_html_opt2 <- read_html("https://raw.githubusercontent.com/PaulaB989/Week7/refs/heads/main/Paula's%20Favorite%20Books_alt.html")
# JSON
# You'll need to use library(jsonlite) if not already configured
books_json <- fromJSON("https://raw.githubusercontent.com/PaulaB989/Week7/refs/heads/main/Paula's%20Favorite%20Books.json")
# XML
# You'll need to use library(xml2) if not already configured
books_xml <- read_xml("https://raw.githubusercontent.com/PaulaB989/Week7/refs/heads/main/Paula's%20Favorite%20Books.xml")
books_html_df <- books_html %>%
html_node("table") %>%
html_table(fill = TRUE)
print(books_html_df)
## # A tibble: 3 × 4
## Title Authors and Credenti…¹ Publisher `Year of Copyright`
## <chr> <chr> <chr> <int>
## 1 Data Science Programming… John Paul Mueller; Lu… John Wil… 2020
## 2 Investing All-In-One for… Robert S. Griswold, M… John Wil… 2017
## 3 Home-Based Business All-… Paul Edwards; Sarah E… John Wil… 2010
## # ℹ abbreviated name: ¹`Authors and Credentials`
books_html_opt2_df <- books_html_opt2 %>%
html_node("table") %>%
html_table(fill = TRUE)
print(books_html_opt2_df)
## # A tibble: 3 × 5
## Title Authors Credentials Publisher `Year of Copyright`
## <chr> <chr> <chr> <chr> <int>
## 1 Data Science Programming Al… John P… ";GDE" John Wil… 2020
## 2 Investing All-In-One for Du… Robert… "MBA, MSBA… John Wil… 2017
## 3 Home-Based Business All-In-… Paul E… "" John Wil… 2010
books_json_df <- books_json$books %>%
unnest(authors) %>%
unnest(credentials) %>%
select("Title"=title, "Author"=name, "Credentials"=credentials, "Publisher"=publisher, "Year of Copyright"=copyright_year)
print(books_json_df)
## # A tibble: 11 × 5
## Title Author Credentials Publisher `Year of Copyright`
## <chr> <chr> <chr> <chr> <int>
## 1 Data Science Programming Al… John … <NA> John Wil… 2020
## 2 Data Science Programming Al… Luca … GDE John Wil… 2020
## 3 Investing All-In-One for Du… Rober… MBA John Wil… 2017
## 4 Investing All-In-One for Du… Rober… MSBA John Wil… 2017
## 5 Investing All-In-One for Du… Matt … <NA> John Wil… 2017
## 6 Investing All-In-One for Du… Paul … <NA> John Wil… 2017
## 7 Investing All-In-One for Du… Eric … MBA John Wil… 2017
## 8 Investing All-In-One for Du… Russe… <NA> John Wil… 2017
## 9 Home-Based Business All-In-… Paul … <NA> John Wil… 2010
## 10 Home-Based Business All-In-… Sarah… <NA> John Wil… 2010
## 11 Home-Based Business All-In-… Peter… <NA> John Wil… 2010
# xml_structure(books_xml)
# View structure of children nodes for first item in the array
book_nodes <- xml_children(books_xml)
xml_structure(xml_children(book_nodes[[1]]))
## [[1]]
## <title>
## {text}
##
## [[2]]
## <authors>
## {comment}
## <author>
## <name>
## {text}
## <credentials>
## <credential>
## <author>
## <name>
## {text}
## <credentials>
## <credential>
## {text}
##
## [[3]]
## <publisher>
## {text}
##
## [[4]]
## <copyrightyear>
## {text}
titles <- c()
authors <- c()
credentials <- c()
publishers <- c()
years <- c()
# Find all <book> nodes
book_nodes <- xml_find_all(books_xml, ".//book")
# Loop through each book
for (book in book_nodes) {
title <- xml_text(xml_find_first(book, "./title"))
publisher <- xml_text(xml_find_first(book, "./publisher"))
year <- xml_text(xml_find_first(book, "./copyrightyear"))
# Find all authors for this book
author_nodes <- xml_find_all(book, ".//author")
for (author in author_nodes) {
name <- xml_text(xml_find_first(author, "./name"))
creds <- xml_find_all(author, "./credentials/credential") %>% xml_text()
# If no credentials, store NA
if (length(creds) == 0 || all(creds == "")) {
creds <- NA
}
# Add one row per credential (or NA)
for (cred in creds) {
titles <- c(titles, title)
authors <- c(authors, name)
credentials <- c(credentials, cred)
publishers <- c(publishers, publisher)
years <- c(years, year)
}
}
}
# Combine into a data frame
books_xml_df <- data.frame(
Title = titles,
Author = authors,
Credentials = credentials,
Publisher = publishers,
Copyright_Year = years,
stringsAsFactors = FALSE
)
# View the result
print(books_xml_df)
## Title Author
## 1 Data Science Programming All-In-One for Dummies John Paul Mueller
## 2 Data Science Programming All-In-One for Dummies Luca Massaron
## 3 Investing All-In-One for Dummies Robert S. Griswold
## 4 Investing All-In-One for Dummies Robert S. Griswold
## 5 Investing All-In-One for Dummies Matt Krantz
## 6 Investing All-In-One for Dummies Paul Mladjenovic
## 7 Investing All-In-One for Dummies Eric Tyson
## 8 Investing All-In-One for Dummies Russell Wild
## 9 Home-Based Business All-In-One for Dummies Paul Edwards
## 10 Home-Based Business All-In-One for Dummies Sarah Edwards
## 11 Home-Based Business All-In-One for Dummies Peter Economy
## Credentials Publisher Copyright_Year
## 1 <NA> John Wiley & Sons, Inc. 2020
## 2 GDE John Wiley & Sons, Inc. 2020
## 3 MBA John Wiley & Sons, Inc. 2017
## 4 MSBA John Wiley & Sons, Inc. 2017
## 5 <NA> John Wiley & Sons, Inc. 2017
## 6 <NA> John Wiley & Sons, Inc. 2017
## 7 MBA John Wiley & Sons, Inc. 2017
## 8 <NA> John Wiley & Sons, Inc. 2017
## 9 <NA> John Wiley & Sons, Inc. 2010
## 10 <NA> John Wiley & Sons, Inc. 2010
## 11 <NA> John Wiley & Sons, Inc. 2010
#Are the three data frames identical? The three data frames are not identical. The JSON and XML data frames match — both have 11 rows and 5 columns. In contrast, the HTML data frame has only 3 rows and 5 columns, with all authors, and credentials merged into a single cell.