## XML, xml2, rjson, and jsonlite will all be used to parse our documents
library(xml2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(XML)
library(rjson)
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## 
## The following objects are masked from 'package:rjson':
## 
##     fromJSON, toJSON
## 
## The following object is masked from 'package:purrr':
## 
##     flatten
library(readr)
library(RCurl)
## 
## Attaching package: 'RCurl'
## 
## The following object is masked from 'package:tidyr':
## 
##     complete

The XML file was difficult to parse directly into a data frame, so I extracted each field from the parsed XML as its own vector and assembled a data frame from those vectors.

# Parse the remote XML document once; every field below is pulled from it.
data_xml <- read_xml("https://raw.githubusercontent.com/sphill12/DATA607/main/books.xml")

# For each field, find every node with that tag anywhere in the document and
# keep only its text content, giving one character vector per column.
xml_book <- data_xml %>%
  xml_find_all(".//Book_Name") %>%
  xml_text()
xml_author <- data_xml %>%
  xml_find_all(".//Authors") %>%
  xml_text()
xml_atr_1 <- data_xml %>%
  xml_find_all(".//Attribute_One") %>%
  xml_text()
xml_atr_2 <- data_xml %>%
  xml_find_all(".//Attribute_Two") %>%
  xml_text()

# Combine the four vectors column-wise; rows are matched purely by position.
df_xml <- tibble(
  book = xml_book,
  author = xml_author,
  attribute_1 = xml_atr_1,
  attribute_2 = xml_atr_2
)

# Drop the first row.
# NOTE(review): presumably the first matched node of each tag is a
# header/placeholder entry rather than a book -- confirm against books.xml.
df_xml <- df_xml[-1, ]
df_xml
## # A tibble: 3 × 4
##   book                 author                            attribute_1 attribute_2
##   <chr>                <chr>                             <chr>       <chr>      
## 1 Kitchen Confidential Anthony Bourdain                  Anthony Bo… Be careful…
## 2 The Way of Kings     Brandon Sanderson                 Brandon Sa… The world …
## 3 R for Data Science   Hadley Wickham, Mine Cetinkaya-R… The chapte… When websc…

The HTML document was much cleaner to work with than the XML, and only required that the column names be added to the resulting data frame.

# Download the raw HTML as a single string.
data_html <- getURL("https://raw.githubusercontent.com/sphill12/DATA607/main/books.html")

# Parse the table(s) out of the HTML text and coerce the result to one data
# frame.
# NOTE(review): `as_html` is not a documented argument of readHTMLTable(); it
# appears to be passed along through `...` -- confirm it can be dropped.
df_html <- data_html %>%
  readHTMLTable(as_html = TRUE, as.data.frame = TRUE) %>%
  as.data.frame()

# Replace the column names produced by the coercion with readable labels.
names(df_html) <- c("Book Name", "Authors", "Attribute One", "Attribute Two")
df_html
##              Book Name                                                  Authors
## 1 Kitchen Confidential                                         Anthony Bourdain
## 2     The Way of Kings                                        Brandon Sanderson
## 3   R for Data Science Hadley Wickham, Mine Cetinkaya-Rundel, Garrett Grolemund
##                                                                                                                                                                    Attribute One
## 1              Anthony Bourdain advises to never order fish on a monday from a restaurant, as this will likely be product that wasn't sold on the weekend and is about to spoil.
## 2                                                         Brandon Sanderson often weaves multiple independant story lines together in his books, which come together at the end.
## 3 The chapter on tidy data helped me to understand it. The characteristics of tidy data are that each variable is a column, each observation is a row, and each value is a cell.
##                                                                                                                                            Attribute Two
## 1 Be careful when ordering well done meals, chefs often save the worst cuts of meat to give to those that order it as it is harder to tell in this case.
## 2                   The world building is one of my favorite things about the books, with an interesting magic system and creatures unique to the world.
## 3 When webscraping, it is important to consider the ethics of what you are doing. Data that is public, non-personal, and factual is generally ok to use.

The JSON document was also converted cleanly into a data frame. Once again, only the column names needed to be added before the dataset was satisfactory.

# Read and parse the remote JSON file; the namespace prefix makes explicit
# that jsonlite's fromJSON() is used rather than the masked rjson version.
data_json <- jsonlite::fromJSON(
  "https://raw.githubusercontent.com/sphill12/DATA607/main/books.json"
)

# Coerce the parsed JSON to a data frame and apply readable column labels in
# one step.
df_json <- setNames(
  as.data.frame(data_json),
  c("Book Name", "Authors", "Attribute One", "Attribute Two")
)
df_json
##              Book Name                                                  Authors
## 1 Kitchen Confidential                                         Anthony Bourdain
## 2     The Way of Kings                                        Brandon Sanderson
## 3   R for Data Science Hadley Wickham, Mine Cetinkaya-Rundel, Garrett Grolemund
##                                                                                                                                                                    Attribute One
## 1              Anthony Bourdain advises to never order fish on a monday from a restaurant, as this will likely be product that wasn't sold on the weekend and is about to spoil.
## 2                                                         Brandon Sanderson often weaves multiple independant story lines together in his books, which come together at the end.
## 3 The chapter on tidy data helped me to understand it. The characteristics of tidy data are that each variable is a column, each observation is a row, and each value is a cell.
##                                                                                                                                            Attribute Two
## 1 Be careful when ordering well done meals, chefs often save the worst cuts of meat to give to those that order it as it is harder to tell in this case.
## 2                   The world building is one of my favorite things about the books, with an interesting magic system and creatures unique to the world.
## 3 When webscraping, it is important to consider the ethics of what you are doing. Data that is public, non-personal, and factual is generally ok to use.

Overall, the end result of the 3 different file types was the same, but they required different functions to parse, and different transformations to make them satisfactory.