library(htmltools)
library(xml2)
library(rvest)
library(jsonlite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:jsonlite':
##
## flatten
library(knitr)
# Download the HTML page, take the first element matching "table", and
# convert that table to a data frame.
html_df <- html_table(
  html_node(
    read_html("https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.html"),
    "table"
  )
)
# Render the result as a captioned table.
kable(x = html_df, caption = "HTML Data Frame:")
| Title | Authors | Pages | Level | Focus |
|---|---|---|---|---|
| Practical Statistics for Data Scientists | Peter Bruce, Andrew Bruce | 350 | Intermediate | Real-world applications |
| Python for Data Analysis | Wes McKinney | 400 | Beginner-Intermediate | Data wrangling |
| R for Data Science | Hadley Wickham, Garrett Grolemund | 500 | All Levels | Tidyverse workflow |
# Parse the hosted XML document.
xml_data <- read_xml("https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.xml")
# Collect every <book> element in the document.
books_list <- xml_find_all(xml_data, "//book")
# Build a one-row data frame per book, then stack the rows.
xml_df <- bind_rows(map(books_list, function(book) {
  # Text of the first match for `tag`, searched relative to this book node.
  child_text <- function(tag) {
    xml_text(xml_find_first(book, tag))
  }
  data.frame(
    Title = child_text("Title"),
    Authors = child_text("Authors"),
    # Pages is converted with as.numeric(), so it is stored as a double.
    Pages = as.numeric(child_text("Pages")),
    Level = child_text("Level"),
    Focus = child_text("Focus"),
    stringsAsFactors = FALSE
  )
}))
# Render the result as a captioned table.
kable(x = xml_df, caption = "XML Data Frame")
| Title | Authors | Pages | Level | Focus |
|---|---|---|---|---|
| Practical Statistics for Data Scientists | Peter Bruce, Andrew Bruce | 350 | Intermediate | Real-world applications |
| Python for Data Analysis | Wes McKinney | 400 | Beginner-Intermediate | Data wrangling |
| R for Data Science | Hadley Wickham, Garrett Grolemund | 500 | All Levels | Tidyverse workflow |
# Download the JSON file and coerce the parsed result to a data frame.
json_df <- fromJSON("https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.json") %>%
  as.data.frame()
# Fail early if an expected column is missing. This replaces a rename() call
# that mapped every column to its own name: it changed nothing, and its only
# effect was to error when one of these columns was absent.
stopifnot(
  all(c("Title", "Authors", "Pages", "Level", "Focus") %in% names(json_df))
)
# Collapse each Authors entry into a single comma-separated string so the
# column is a plain character vector.
# NOTE(review): Authors presumably arrives as a list column (one vector of
# names per book) -- confirm against the JSON file.
json_df$Authors <- map_chr(
  json_df$Authors,
  function(authors) paste(authors, collapse = ", ")
)
# Render the result as a captioned table.
kable(json_df, caption = "Json Data Frame")
| Title | Authors | Pages | Level | Focus |
|---|---|---|---|---|
| Practical Statistics for Data Scientists | Peter Bruce, Andrew Bruce | 350 | Intermediate | Real-world applications |
| Python for Data Analysis | Wes McKinney | 400 | Beginner-Intermediate | Data wrangling |
| R for Data Science | Hadley Wickham, Garrett Grolemund | 500 | All Levels | Tidyverse workflow |
# 4. Compare data frames
# Whole-object comparison of each pair of data frames.
identical(x = html_df, y = xml_df)
## [1] FALSE
identical(x = json_df, y = xml_df)
## [1] FALSE
identical(x = html_df, y = json_df)
## [1] FALSE
5.1 Check whether each column is identical across the three files
# Column-by-column comparison: JSON against XML.
identical(json_df[["Title"]], xml_df[["Title"]])
## [1] TRUE
identical(json_df[["Authors"]], xml_df[["Authors"]])
## [1] TRUE
identical(json_df[["Level"]], xml_df[["Level"]])
## [1] TRUE
identical(json_df[["Pages"]], xml_df[["Pages"]])
## [1] FALSE
identical(json_df[["Focus"]], xml_df[["Focus"]])
## [1] TRUE
# Column-by-column comparison: JSON against HTML.
identical(json_df[["Title"]], html_df[["Title"]])
## [1] TRUE
identical(json_df[["Authors"]], html_df[["Authors"]])
## [1] TRUE
identical(json_df[["Level"]], html_df[["Level"]])
## [1] TRUE
identical(json_df[["Pages"]], html_df[["Pages"]])
## [1] TRUE
identical(json_df[["Focus"]], html_df[["Focus"]])
## [1] TRUE
Check the data types of all columns
# Get data types for each column
cat("Column Data Types:\n")
## Column Data Types:
# Class of every column of `df`, as a named character vector with one entry
# per column. vapply() guarantees a character result; sapply(df, class)
# would return a list if any column had more than one class. Multi-class
# columns are collapsed into a single "a/b" string.
column_classes <- function(df) {
  vapply(
    df,
    function(column) paste(class(column), collapse = "/"),
    character(1)
  )
}
# One row per column name, one column per source format.
data.frame(
  HTML_Columns = column_classes(html_df),
  XML_Columns = column_classes(xml_df),
  JSON_Columns = column_classes(json_df)
)
## HTML_Columns XML_Columns JSON_Columns
## Title character character character
## Authors character character character
## Pages integer numeric integer
## Level character character character
## Focus character character character
So the XML `Pages` column is stored as numeric (double), whereas in the HTML and JSON data frames it is stored as integer.
# Convert the XML Pages column to integer so its type matches the HTML and
# JSON data frames.
xml_df[["Pages"]] <- as.integer(xml_df[["Pages"]])
# Re-run the column comparisons now that Pages has been converted.
# XML against HTML.
identical(xml_df[["Title"]], html_df[["Title"]])
## [1] TRUE
identical(xml_df[["Authors"]], html_df[["Authors"]])
## [1] TRUE
identical(xml_df[["Level"]], html_df[["Level"]])
## [1] TRUE
identical(xml_df[["Pages"]], html_df[["Pages"]])
## [1] TRUE
identical(xml_df[["Focus"]], html_df[["Focus"]])
## [1] TRUE
# JSON against HTML.
identical(json_df[["Title"]], html_df[["Title"]])
## [1] TRUE
identical(json_df[["Authors"]], html_df[["Authors"]])
## [1] TRUE
identical(json_df[["Level"]], html_df[["Level"]])
## [1] TRUE
identical(json_df[["Pages"]], html_df[["Pages"]])
## [1] TRUE
identical(json_df[["Focus"]], html_df[["Focus"]])
## [1] TRUE