The purpose of this assignment is to understand how data from multiple file sources, each in a different format, is structured once it is loaded into R.

Load Packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0      v purrr   1.0.1 
## v tibble  3.1.6      v dplyr   1.0.10
## v tidyr   1.2.0      v stringr 1.5.0 
## v readr   2.1.2      v forcats 0.5.2
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(xml2)
## Warning: package 'xml2' was built under R version 4.0.5
library(rvest)
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten

Setting up multiple data sources in different formats

# Base location of the assignment's resource files. Keeping it in one place
# means a repository or branch change is a single edit instead of three.
resources_url <- "https://raw.githubusercontent.com/eddiexunyc/607_assignment_7/main/Resources"

# The same book list published in three formats
xml_url <- paste0(resources_url, "/fav_book.xml")
json_url <- paste0(resources_url, "/fav_book.json")
html_url <- paste0(resources_url, "/fav_book.html")

Load XML Dataset

# Download the XML file and parse it into an xml_document
xml_data <- xml_url %>% read_xml()

# Print the element tree to see how the rows and their fields are nested
xml_data %>% xml_structure()
## <root>
##   <row>
##     <book_id>
##       {text}
##     <book_title>
##       {text}
##     <book_author>
##       {text}
##     <book_genre>
##       {text}
##     <book_lang>
##       {text}
##     <book_price>
##       {text}
##   <row>
##     <book_id>
##       {text}
##     <book_title>
##       {text}
##     <book_author>
##       {text}
##     <book_genre>
##       {text}
##     <book_lang>
##       {text}
##     <book_price>
##       {text}
##   <row>
##     <book_id>
##       {text}
##     <book_title>
##       {text}
##     <book_author>
##       {text}
##     <book_genre>
##       {text}
##     <book_lang>
##       {text}
##     <book_price>
##       {text}
# Extract the book fields one <row> at a time.
# Looking each field up relative to its row keeps the columns aligned:
# xml_find_first() returns a missing node (text NA) when a row lacks a field,
# whereas a document-wide "//field" search with xml_find_all() would skip it
# and produce a shorter, shifted column.
book_rows <- xml_find_all(xml_data, xpath = "//row")

# Text of the named child element for every row (NA where it is absent)
row_field <- function(field) {
  xml_text(xml_find_first(book_rows, xpath = paste0("./", field)))
}

book_id <- row_field("book_id")
book_title <- row_field("book_title")
book_author <- row_field("book_author")
book_genre <- row_field("book_genre")
book_lang <- row_field("book_lang")
book_price <- row_field("book_price")

# Convert the extracted data into a data frame.
# xml_text() always returns character, so the numeric fields are converted
# explicitly to make them usable in arithmetic and summaries.
fav_book_df <- tibble(
  `Book ID` = as.integer(book_id),
  `Title` = book_title,
  `Author` = book_author,
  `Genre` = book_genre,
  `Language` = book_lang,
  `Price` = as.numeric(book_price)
)

# View the data frame
knitr::kable(fav_book_df)
Book ID Title Author Genre Language Price
1 Pride and Prejudice Jane Austen Romance English 7.99
2 The Da Vinci Code Dan Brown Mystery Thriller English 20.73
3 Relic Douglas Preston & Lincoln Child Horror Fiction English 8.99

Load JSON dataset

# Fetch the JSON file and parse it into an R object
json_data <- json_url %>% fromJSON()

# Coerce the parsed result to a data frame and replace its column names,
# by position, with the display headings
json_df <- json_data %>%
  as.data.frame() %>%
  setNames(c("Book ID", "Title", "Author", "Genre", "Language", "Price"))

# Render the data frame as a table
knitr::kable(json_df)
Book ID Title Author Genre Language Price
1 Pride and Prejudice Jane Austen Romance English 7.99
2 The Da Vinci Code Dan Brown Mystery Thriller English 20.73
3 Relic Douglas Preston & Lincoln Child Horror Fiction English 8.99

Load HTML dataset

# Fetch and parse the HTML page
html_data <- read_html(html_url)

# Convert the first <table> on the page into a data frame, treating its first
# row as the header. html_element() is the current name for the superseded
# html_node(); the `fill` argument is omitted because it is deprecated since
# rvest 1.0.0, where missing cells are always filled with NA.
html_df <- html_data %>%
  html_element("table") %>%
  html_table(header = TRUE)

# View the data frame
knitr::kable(html_df)
book_id book_title book_author book_genre book_lang book_price
1 Pride and Prejudice Jane Austin Romance English 7.99
2 The Da Vinci Code Dan Brown Mystery Thriller English 20.73
3 Relic Douglas Preston & Lincoln Child Horror Fiction English 8.99

Conclusion

The data from the different sources did not come in identically, because the format structure differs between JSON, HTML, and XML. Even though the data sources come in different formats, they can be easily imported with the right libraries. From there, data cleaning and analysis can be done easily.