607_assignment

The purpose of this assignment is to understand how data stored in multiple file formats is structured, and how that structure is loaded into R.

Load Packages

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.4.0      v purrr   1.0.1 
## v tibble  3.1.6      v dplyr   1.0.10
## v tidyr   1.2.0      v stringr 1.5.0 
## v readr   2.1.2      v forcats 0.5.2

## Warning: package 'tibble' was built under R version 4.0.5

## Warning: package 'tidyr' was built under R version 4.0.5

## Warning: package 'readr' was built under R version 4.0.5

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(xml2)

## Warning: package 'xml2' was built under R version 4.0.5

library(rvest)

## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding

library(jsonlite)

## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten

Setting up multiple data sources in different formats

# Raw GitHub URLs of the fav_book resource in each of the three formats
# read below (XML, JSON, HTML).
xml_url <- "https://raw.githubusercontent.com/eddiexunyc/607_assignment_7/main/Resources/fav_book.xml"
json_url <- "https://raw.githubusercontent.com/eddiexunyc/607_assignment_7/main/Resources/fav_book.json"
html_url <- "https://raw.githubusercontent.com/eddiexunyc/607_assignment_7/main/Resources/fav_book.html"

Load XML Dataset

# Fetch the XML file from its URL and parse it into an xml2 document
xml_data <- xml2::read_xml(xml_url)

# Print the node hierarchy to see how the document is laid out
xml2::xml_structure(xml_data)

## <root>
##   <row>
##     <book_id>
##       {text}
##     <book_title>
##       {text}
##     <book_author>
##       {text}
##     <book_genre>
##       {text}
##     <book_lang>
##       {text}
##     <book_price>
##       {text}
##   <row>
##     <book_id>
##       {text}
##     <book_title>
##       {text}
##     <book_author>
##       {text}
##     <book_genre>
##       {text}
##     <book_lang>
##       {text}
##     <book_price>
##       {text}
##   <row>
##     <book_id>
##       {text}
##     <book_title>
##       {text}
##     <book_author>
##       {text}
##     <book_genre>
##       {text}
##     <book_lang>
##       {text}
##     <book_price>
##       {text}

# Extract each field from the XML file, one <row> (book record) at a time.
# xml_find_first() applied to the set of rows returns exactly one result per
# row, so the six vectors always have one entry per book and stay aligned.
# A field that is missing from a row is read as NA instead of silently
# shortening that column (which would make tibble() fail or recycle values).
book_rows <- xml_find_all(xml_data, xpath = "//row")
book_id <- xml_text(xml_find_first(book_rows, xpath = "./book_id"))
book_title <- xml_text(xml_find_first(book_rows, xpath = "./book_title"))
book_author <- xml_text(xml_find_first(book_rows, xpath = "./book_author"))
book_genre <- xml_text(xml_find_first(book_rows, xpath = "./book_genre"))
book_lang <- xml_text(xml_find_first(book_rows, xpath = "./book_lang"))
book_price <- xml_text(xml_find_first(book_rows, xpath = "./book_price"))

# Convert the extracted data into a dataframe. Every column is character
# because xml_text() returns strings.
fav_book_df <- tibble(
  `Book ID` = book_id,
  `Title` = book_title,
  `Author` = book_author,
  `Genre` = book_genre,
  `Language` = book_lang,
  `Price` = book_price
)

# View the dataframe
knitr::kable(fav_book_df)

Book ID	Title	Author	Genre	Language	Price
1	Pride and Prejudice	Jane Austen	Romance	English	7.99
2	The Da Vinci Code	Dan Brown	Mystery Thriller	English	20.73
3	Relic	Douglas Preston & Lincoln Child	Horror Fiction	English	8.99

Load JSON dataset

# Download the JSON file and parse it into an R object
json_data <- fromJSON(json_url)

# Coerce the parsed data to a data frame and, in the same step, replace its
# column names (by position) with the display labels
json_df <- setNames(
  as.data.frame(json_data),
  c("Book ID", "Title", "Author", "Genre", "Language", "Price")
)

# Render the data frame as a table
knitr::kable(json_df)

Book ID	Title	Author	Genre	Language	Price
1	Pride and Prejudice	Jane Austen	Romance	English	7.99
2	The Da Vinci Code	Dan Brown	Mystery Thriller	English	20.73
3	Relic	Douglas Preston & Lincoln Child	Horror Fiction	English	8.99

Load HTML dataset

# Read the HTML dataset: download and parse the page
html_data <- read_html(html_url)

# Convert the first <table> in the page into a dataframe, using its first
# row as the column names.
# html_element() is the current name for the superseded html_node(). The
# `fill` argument is omitted because it is deprecated since rvest 1.0.0,
# where missing cells are always filled with NA.
html_df <- html_data %>%
  html_element("table") %>%
  html_table(header = TRUE)

# View the dataframe
knitr::kable(html_df)

book_id	book_title	book_author	book_genre	book_lang	book_price
1	Pride and Prejudice	Jane Austin	Romance	English	7.99
2	The Da Vinci Code	Dan Brown	Mystery Thriller	English	20.73
3	Relic	Douglas Preston & Lincoln Child	Horror Fiction	English	8.99

Conclusion

The data from the different sources did not come in identically, because the format structure differs between JSON, HTML, and XML. Even though the data sources come in different formats, they can be easily imported with the right libraries. From there, data cleaning and analysis can be done easily.

607_assignment_7

Eddie Xu

2023-03-10