Mehreen Ali Gillani

Step 1: import libraries

library(htmltools)
library(xml2)
library(rvest)
library(jsonlite)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:jsonlite':
## 
##     flatten
library(knitr)

Step 2: Read the HTML file from GitHub, store it in a data frame, and print it

# Download and parse the HTML page that holds the books table.
html_page <- read_html("https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.html")

# Select the first <table> element on the page and convert it to a data frame.
html_table_node <- html_node(html_page, "table")
html_df <- html_table(html_table_node)

# Render the result as a formatted table.
kable(html_df, caption = "HTML Data Frame:")
HTML Data Frame:
Title Authors Pages Level Focus
Practical Statistics for Data Scientists Peter Bruce, Andrew Bruce 350 Intermediate Real-world applications
Python for Data Analysis Wes McKinney 400 Beginner-Intermediate Data wrangling
R for Data Science Hadley Wickham, Garrett Grolemund 500 All levels Tidyverse workflow

Step 3: Read the XML file from GitHub, store it in a separate data frame, and print it

# Download and parse the XML document.
xml_data <- read_xml("https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.xml")

# Collect every <book> element found anywhere in the document.
books_list <- xml_find_all(xml_data, "//book")

# Text content of the first child element named `tag` under `node`.
child_text <- function(node, tag) {
  xml_text(xml_find_first(node, tag))
}

# Build a one-row data frame per book; map_dfr() row-binds them into xml_df.
xml_df <- map_dfr(books_list, function(node) {
  data.frame(
    Title = child_text(node, "title"),
    Authors = child_text(node, "authors"),
    # XML text is always character, so the page count is converted explicitly.
    Pages = as.numeric(child_text(node, "pages")),
    Level = child_text(node, "level"),
    Focus = child_text(node, "focus"),
    stringsAsFactors = FALSE
  )
})

# Render the result as a formatted table.
kable(xml_df, caption = "XML Data Frame")
XML Data Frame
Title Authors Pages Level Focus
Practical Statistics for Data Scientists Peter Bruce, Andrew Bruce 350 Intermediate Real-world applications
Python for Data Analysis Wes McKinney 400 Beginner-Intermediate Data wrangling
R for Data Science Hadley Wickham, Garrett Grolemund 500 All levels Tidyverse workflow

Step 4: Read the JSON file from GitHub, store it in a separate data frame, and print it

# Download and parse the JSON file, then coerce the parsed result to a
# data frame.
json_raw <- as.data.frame(
  fromJSON("https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.json")
)

# Capitalise the column names (new_name = old_name).
json_df <- rename(
  json_raw,
  Title = title,
  Authors = authors,
  Pages = pages,
  Level = level,
  Focus = focus
)

# Collapse each entry of the Authors column into a single comma-separated
# string, turning the list column into a plain character column.
json_df$Authors <- map_chr(
  json_df$Authors,
  function(author_set) paste(author_set, collapse = ", ")
)

# Render the result as a formatted table.
kable(json_df, caption = "Json Data Frame")
Json Data Frame
Title Authors Pages Level Focus
Practical Statistics for Data Scientists Peter Bruce, Andrew Bruce 350 Intermediate Real-world applications
Python for Data Analysis Wes McKinney 400 Beginner-Intermediate Data wrangling
R for Data Science Hadley Wickham, Garrett Grolemund 500 All levels Tidyverse workflow

Step 5: Compare the data frames to check whether they are identical

# Step 5: compare the three data frames.
#
# identical() is an all-or-nothing test: besides the cell values it also
# compares the object class, the attributes and the storage type of every
# column (integer vs. double). It can therefore report FALSE for tables whose
# printed contents match, and it gives no hint about what differs.
identical(html_df, xml_df)
## [1] FALSE
identical(html_df, json_df)
## [1] FALSE

# The remaining pair, so that every combination of sources is checked.
identical(xml_df, json_df)

# Value-level comparison. as.data.frame() puts all three objects on the same
# class, and all.equal() compares numeric columns with a tolerance, so an
# integer Pages column can match a double one. Each call returns TRUE or a
# character description of the differences it found.
all.equal(as.data.frame(html_df), as.data.frame(xml_df))
all.equal(as.data.frame(html_df), as.data.frame(json_df))
all.equal(as.data.frame(xml_df), as.data.frame(json_df))