Load the needed packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(xml2)
Read the HTML file I created, which is hosted on GitHub
# Download and parse the books page from its raw GitHub URL, then print the
# parsed document to confirm it was read.
html_books_file <-
  "https://raw.githubusercontent.com/autistic96/week7-assignment/main/books.html" |>
  read_html()
html_books_file
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\r\n <table>\n<tr>\n<th>Title</th>\r\n \t\t<th>Author(s)</th> ...
# Inspect the internal structure of the parsed HTML object.
html_books_file |>
  str()
## List of 2
## $ node:<externalptr>
## $ doc :<externalptr>
## - attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# Tibble
# Convert the first <table> element on the page into a tibble. The result is
# stored under a name (mirroring `xml_to_df`) so it can be reused and compared
# later instead of being discarded after printing.
html_to_df <- html_books_file |>
  html_element("table") |>
  html_table()
html_to_df
## # A tibble: 3 × 4
## Title `Author(s)` Edition Pages
## <chr> <chr> <int> <int>
## 1 The Algorithm Design Manual Steven S. Skiena 3 810
## 2 Cracking the Coding Interview Gayle Laakmann McDowell 6 687
## 3 Algorithms Robert Sedgewick & Kevin Wayne 4 976
Read the XML file from my GitHub repository
# Download and parse the XML version of the book list, then inspect the
# structure of the resulting object.
xml_books_file <-
  "https://raw.githubusercontent.com/autistic96/week7-assignment/main/books.xml" |>
  read_xml()
xml_books_file |>
  str()
## List of 2
## $ node:<externalptr>
## $ doc :<externalptr>
## - attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# Extraction
# Work record by record rather than with four independent document-wide
# queries. Each record is taken to be the parent element of a <Title>
# (NOTE(review): assumes every book's fields live under the same parent as
# its <Title> -- confirm against books.xml).
# xml_find_first() yields exactly one result per record, NA when a field is
# absent, so the columns stay aligned row by row. Separate `//Field` queries
# would shift values onto the wrong rows when a field is missing, and
# data.frame() would silently recycle the shorter vector whenever the lengths
# happen to divide evenly.
book_nodes <- xml_find_all(xml_books_file, "//Title/..")
titles <- xml_text(xml_find_first(book_nodes, ".//Title"))
authors <- xml_text(xml_find_first(book_nodes, ".//Author"))
editions <- as.integer(xml_text(xml_find_first(book_nodes, ".//Edition")))
pages <- as.integer(xml_text(xml_find_first(book_nodes, ".//Pages")))
# Create the data frame: one row per book, one column per extracted field.
xml_to_df <- data.frame(
  Title = titles,
  Author = authors,
  Edition = editions,
  Pages = pages
)
print(xml_to_df)
## Title Author Edition Pages
## 1 The Algorithm Design Manual Steven S. Skiena 3 810
## 2 Cracking the Coding Interview Gayle Laakmann McDowell 6 687
## 3 Algorithms Robert Sedgewick & Kevin Wayne 4 976
Read the JSON file from my GitHub repository
# Download and parse the JSON version of the book list, then inspect the
# structure of the resulting object.
json_books_file <-
  "https://raw.githubusercontent.com/autistic96/week7-assignment/main/books.json" |>
  fromJSON()
json_books_file |>
  str()
## List of 1
## $ books:'data.frame': 3 obs. of 4 variables:
## ..$ Title : chr [1:3] "The Algorithm Design Manual" "Cracking the Coding Interview" "Algorithms"
## ..$ Author : chr [1:3] "Steven S. Skiena" "Gayle Laakmann McDowell" "Robert Sedgewick & Kevin Wayne"
## ..$ Edition: int [1:3] 3 6 4
## ..$ Pages : int [1:3] 810 687 976
# fromJSON() has already simplified the JSON array of records into a data
# frame, so no manual conversion is needed. Note that the parsed object is a
# list whose single element, `books`, holds that data frame; pull it out under
# its own name (mirroring `xml_to_df`) so it can be used directly.
json_to_df <- json_books_file$books
json_books_file
## $books
## Title Author Edition Pages
## 1 The Algorithm Design Manual Steven S. Skiena 3 810
## 2 Cracking the Coding Interview Gayle Laakmann McDowell 6 687
## 3 Algorithms Robert Sedgewick & Kevin Wayne 4 976
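All three data frames contain identical values in every column; the only difference is that the HTML table's author column is named `Author(s)` rather than `Author`. The title and author columns are character vectors, and the edition and page-count columns are integers.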