Mehreen Ali Gillani

Step 1: import libraries

library(htmltools)
library(xml2)
library(rvest)
library(jsonlite)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:jsonlite':
## 
##     flatten
library(knitr)

Step 2: Read the HTML file from GitHub, store it in a data frame, and print it

# Download the books HTML page from GitHub and parse its table into a data
# frame. html_element() replaces the superseded html_node(); like html_node(),
# it returns the first match for the selector.
html_df <- read_html("https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.html") %>%
  html_element("table") %>%
  html_table()

# Render the parsed table with a caption.
kable(html_df, caption = "HTML Data Frame:")
HTML Data Frame:
Title Authors Pages Level Focus
Practical Statistics for Data Scientists Peter Bruce, Andrew Bruce 350 Intermediate Real-world applications
Python for Data Analysis Wes McKinney 400 Beginner-Intermediate Data wrangling
R for Data Science Hadley Wickham, Garrett Grolemund 500 All Levels Tidyverse workflow

Step 3: Read the XML file from GitHub, store it in a separate data frame, and print it

# Parse the books XML document hosted on GitHub.
xml_data <- read_xml("https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.xml")

# Collect every <book> element found anywhere in the document.
books_list <- xml_find_all(xml_data, "//book")

# Build a one-row data frame per book and row-bind them into xml_df.
xml_df <- map_dfr(books_list, function(book) {
  # Text of the first `tag` match found relative to this book node.
  field <- function(tag) {
    xml_text(xml_find_first(book, tag))
  }

  data.frame(
    Title = field("Title"),
    Authors = field("Authors"),
    Pages = as.numeric(field("Pages")),  # parsed from text, stored as double
    Level = field("Level"),
    Focus = field("Focus"),
    stringsAsFactors = FALSE
  )
})

# Render the assembled data frame with a caption.
kable(xml_df, caption = "XML Data Frame")
XML Data Frame
Title Authors Pages Level Focus
Practical Statistics for Data Scientists Peter Bruce, Andrew Bruce 350 Intermediate Real-world applications
Python for Data Analysis Wes McKinney 400 Beginner-Intermediate Data wrangling
R for Data Science Hadley Wickham, Garrett Grolemund 500 All Levels Tidyverse workflow

Step 4: Read the JSON file from GitHub, store it in a separate data frame, and print it

# Download the books JSON from GitHub, parse it, and coerce the result to a
# data frame.
json_df <- fromJSON("https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.json") %>%
  as.data.frame()

# Fail early, with an explicit check, if any expected column is missing.
stopifnot(
  all(c("Title", "Authors", "Pages", "Level", "Focus") %in% names(json_df))
)

# Fix the Authors column (convert from list to character): each entry is
# collapsed into a single comma-separated string.
json_df$Authors <- map_chr(
  json_df$Authors,
  function(authors) paste(authors, collapse = ", ")
)

# Render the cleaned data frame with a caption.
kable(json_df, caption = "Json Data Frame")
Json Data Frame
Title Authors Pages Level Focus
Practical Statistics for Data Scientists Peter Bruce, Andrew Bruce 350 Intermediate Real-world applications
Python for Data Analysis Wes McKinney 400 Beginner-Intermediate Data wrangling
R for Data Science Hadley Wickham, Garrett Grolemund 500 All Levels Tidyverse workflow

Step 5: Compare these dataframes to check if they are identical

# Compare the three data frames pairwise. identical() demands an exact match,
# so differences in column types or object attributes also yield FALSE.
identical(html_df, xml_df)
## [1] FALSE
identical(json_df, xml_df)
## [1] FALSE
identical(html_df, json_df)
## [1] FALSE

5.1 Check whether each column is identical across the three data frames

# JSON vs XML, one column at a time.
identical(json_df$Title,xml_df$Title)
## [1] TRUE
identical(json_df$Authors,xml_df$Authors)
## [1] TRUE
identical(json_df$Level,xml_df$Level)
## [1] TRUE
# Pages is the only column that differs between JSON and XML.
identical(json_df$Pages,xml_df$Pages)
## [1] FALSE
identical(json_df$Focus,xml_df$Focus)
## [1] TRUE
# JSON vs HTML, one column at a time.
identical(json_df$Title,html_df$Title)
## [1] TRUE
identical(json_df$Authors,html_df$Authors)
## [1] TRUE
identical(json_df$Level,html_df$Level)
## [1] TRUE
identical(json_df$Pages,html_df$Pages)
## [1] TRUE
identical(json_df$Focus,html_df$Focus)
## [1] TRUE

Check Data types for all columns

# Get data types for each column
# Report the class of every column in each of the three data frames.
cat("Column Data Types:\n")
## Column Data Types:

# One class label per column of `df`. vapply() always yields a character
# vector, whereas sapply(df, class) returns a list as soon as any column has
# more than one class; such columns are shown here as "class1/class2".
col_classes <- function(df) {
  vapply(df, function(col) paste(class(col), collapse = "/"), character(1))
}

data.frame(
  HTML_Columns = col_classes(html_df),
  XML_Columns = col_classes(xml_df),
  JSON_Columns = col_classes(json_df)
)
##         HTML_Columns XML_Columns JSON_Columns
## Title      character   character    character
## Authors    character   character    character
## Pages        integer     numeric      integer
## Level      character   character    character
## Focus      character   character    character

So the XML Pages column is stored as numeric (double), whereas the HTML and JSON Pages columns are stored as integer.

# Coerce the XML Pages column to integer so its type matches the HTML and
# JSON Pages columns.
xml_df$Pages <- as.integer(xml_df$Pages)
# Re-run the column comparisons now that the Pages types agree.

# XML vs HTML, one column at a time.
identical(xml_df$Title,html_df$Title)
## [1] TRUE
identical(xml_df$Authors,html_df$Authors)
## [1] TRUE
identical(xml_df$Level,html_df$Level)
## [1] TRUE
identical(xml_df$Pages,html_df$Pages)
## [1] TRUE
identical(xml_df$Focus,html_df$Focus)
## [1] TRUE
# JSON vs HTML, one column at a time.
identical(json_df$Title,html_df$Title)
## [1] TRUE
identical(json_df$Authors,html_df$Authors)
## [1] TRUE
identical(json_df$Level,html_df$Level)
## [1] TRUE
identical(json_df$Pages,html_df$Pages)
## [1] TRUE
identical(json_df$Focus,html_df$Focus)
## [1] TRUE