Week 7 Assignment

Load the needed packages

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(rvest)

## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding

library(jsonlite)

## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten

library(xml2)

Get the HTML file I created, which is hosted on GitHub

# Parse the HTML page from its raw GitHub URL
html_books_file <- read_html(
  "https://raw.githubusercontent.com/autistic96/week7-assignment/main/books.html"
)

# Print the parsed document to check that <head> and <body> were read
html_books_file

## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\r\n    <table>\n<tr>\n<th>Title</th>\r\n    \t\t<th>Author(s)</th> ...

# Inspect the parsed object: an xml_document made of two external pointers
str(html_books_file)

## List of 2
##  $ node:<externalptr> 
##  $ doc :<externalptr> 
##  - attr(*, "class")= chr [1:2] "xml_document" "xml_node"

# Select the <table> element and convert it into a tibble
html_table(html_element(html_books_file, "table"))

## # A tibble: 3 × 4
##   Title                         `Author(s)`                    Edition Pages
##   <chr>                         <chr>                            <int> <int>
## 1 The Algorithm Design Manual   Steven S. Skiena                     3   810
## 2 Cracking the Coding Interview Gayle Laakmann McDowell              6   687
## 3 Algorithms                    Robert Sedgewick & Kevin Wayne       4   976

Get the XML file from my GitHub

# Parse the XML version of the book list from its raw GitHub URL
xml_books_file <- read_xml(
  "https://raw.githubusercontent.com/autistic96/week7-assignment/main/books.xml"
)

# Inspect the structure of the parsed document
str(xml_books_file)

## List of 2
##  $ node:<externalptr> 
##  $ doc :<externalptr> 
##  - attr(*, "class")= chr [1:2] "xml_document" "xml_node"

# Pull each field out of the XML document with an XPath query.
# Title and Author stay as character; Edition and Pages become integers.
titles <- xml_books_file |>
  xml_find_all("//Title") |>
  xml_text()
authors <- xml_books_file |>
  xml_find_all("//Author") |>
  xml_text()
editions <- xml_books_file |>
  xml_find_all("//Edition") |>
  xml_text() |>
  as.integer()
pages <- xml_books_file |>
  xml_find_all("//Pages") |>
  xml_text() |>
  as.integer()

# Assemble the extracted vectors into one data frame, one column per field
xml_to_df <- data.frame(
  Title = titles,
  Author = authors,
  Edition = editions,
  Pages = pages
)

print(xml_to_df)

##                           Title                         Author Edition Pages
## 1   The Algorithm Design Manual               Steven S. Skiena       3   810
## 2 Cracking the Coding Interview        Gayle Laakmann McDowell       6   687
## 3                    Algorithms Robert Sedgewick & Kevin Wayne       4   976

Getting the JSON file from my GitHub

# Download and parse the JSON version of the book list
json_books_file <- fromJSON(
  "https://raw.githubusercontent.com/autistic96/week7-assignment/main/books.json"
)

# Check how the parsed result is structured
str(json_books_file)

## List of 1
##  $ books:'data.frame':   3 obs. of  4 variables:
##   ..$ Title  : chr [1:3] "The Algorithm Design Manual" "Cracking the Coding Interview" "Algorithms"
##   ..$ Author : chr [1:3] "Steven S. Skiena" "Gayle Laakmann McDowell" "Robert Sedgewick & Kevin Wayne"
##   ..$ Edition: int [1:3] 3 6 4
##   ..$ Pages  : int [1:3] 810 687 976

# fromJSON() already returned the records as a data frame (stored in the
# `books` element of the resulting list), so no conversion is needed.
json_books_file

## $books
##                           Title                         Author Edition Pages
## 1   The Algorithm Design Manual               Steven S. Skiena       3   810
## 2 Cracking the Coding Interview        Gayle Laakmann McDowell       6   687
## 3                    Algorithms Robert Sedgewick & Kevin Wayne       4   976

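All three data frames contain identical values in every column. The title and author are both characters. The edition number and pages are both integers. The only differences are that the HTML table is returned as a tibble whose author column is named `Author(s)`, while the XML and JSON results are base data frames whose author column is named `Author`.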

Week 7 Assignment

Kelly Eng

2023-10-15