Assignment

Mehreen Ali Gillani

Step 1: import libraries

library(htmltools)
library(xml2)
library(rvest)
library(jsonlite)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(purrr)

## 
## Attaching package: 'purrr'

## The following object is masked from 'package:jsonlite':
## 
##     flatten

library(knitr)

Step 2: Read HTML file from Github, store it in a dataframe and print

# Download the HTML page, select its <table> node and parse that node into
# a data frame.
html_df <- html_table(
  html_node(
    read_html("https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.html"),
    "table"
  )
)

# Render the parsed table with a caption.
kable(html_df, caption = "HTML Data Frame:")

HTML Data Frame:
Title	Authors	Pages	Level	Focus
Practical Statistics for Data Scientists	Peter Bruce, Andrew Bruce	350	Intermediate	Real-world applications
Python for Data Analysis	Wes McKinney	400	Beginner-Intermediate	Data wrangling
R for Data Science	Hadley Wickham, Garrett Grolemund	500	All levels	Tidyverse workflow

Step 3: Read XML file from Github, store it in a separate data frame and print

# Download and parse the XML document.
xml_data <- read_xml("https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.xml")

# Select every <book> element in the document.
books_list <- xml_find_all(xml_data, "//book")

# Build the data frame column-wise instead of creating one single-row data
# frame per book and row-binding them. xml_find_first() applied to a nodeset
# returns exactly one result per input node (a missing node when the child is
# absent, which xml_text() turns into NA), so every column stays aligned with
# its book. With no <book> nodes this still yields a 0-row frame that keeps
# all five columns.
xml_df <- data.frame(
  Title = xml_text(xml_find_first(books_list, "title")),
  Authors = xml_text(xml_find_first(books_list, "authors")),
  # Page counts arrive as text; store them as numbers.
  Pages = as.numeric(xml_text(xml_find_first(books_list, "pages"))),
  Level = xml_text(xml_find_first(books_list, "level")),
  Focus = xml_text(xml_find_first(books_list, "focus")),
  stringsAsFactors = FALSE
)

# Render the assembled table with a caption.
kable(xml_df, caption = "XML Data Frame")

XML Data Frame
Title	Authors	Pages	Level	Focus
Practical Statistics for Data Scientists	Peter Bruce, Andrew Bruce	350	Intermediate	Real-world applications
Python for Data Analysis	Wes McKinney	400	Beginner-Intermediate	Data wrangling
R for Data Science	Hadley Wickham, Garrett Grolemund	500	All levels	Tidyverse workflow

Step 4: Read JSON file from Github, store it in a separate data frame and print

# Download and parse the JSON, coerce the result to a data frame, and give
# the columns capitalised names.
json_df <- rename(
  as.data.frame(
    fromJSON("https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.json")
  ),
  Title = title,
  Authors = authors,
  Pages = pages,
  Level = level,
  Focus = focus
)

# Authors is parsed as a list; collapse each entry into a single
# comma-separated string so the column becomes a character vector.
json_df[["Authors"]] <- map_chr(
  json_df[["Authors"]],
  function(author_names) paste(author_names, collapse = ", ")
)

# Render the cleaned table with a caption.
kable(json_df, caption = "Json Data Frame")

Json Data Frame
Title	Authors	Pages	Level	Focus
Practical Statistics for Data Scientists	Peter Bruce, Andrew Bruce	350	Intermediate	Real-world applications
Python for Data Analysis	Wes McKinney	400	Beginner-Intermediate	Data wrangling
R for Data Science	Hadley Wickham, Garrett Grolemund	500	All levels	Tidyverse workflow

Step 5: Compare these dataframes to check if they are identical

# Step 5: compare the HTML and XML data frames.
# identical() is an exact test: besides the cell values it also compares the
# objects' class and each column's storage type.
# NOTE(review): html_df comes from html_table() while xml_df is built with
# data.frame() and has Pages passed through as.numeric(), so a FALSE result may
# reflect a representation difference rather than differing values -- confirm
# with str(html_df) and str(xml_df).
identical(html_df, xml_df)

## [1] FALSE

# Exact-equality test between the HTML and JSON data frames. identical() also
# compares class and column storage types, not just the cell values.
# NOTE(review): a FALSE result may come from how each source was parsed rather
# than from differing contents -- confirm with str(html_df) and str(json_df).
identical(html_df, json_df)

## [1] FALSE

Assignment_7

2025-10-10

Mehreen Ali Gillani

Step 1: import libraries

Step 2: Read HTML file from Github, store it in a dataframe and print

Step 3: Read XML file from Github, store it in a separate data frame and print

Step 4: Read JSON file from Github, store it in a separate data frame and print

Step 5: Compare these dataframes to check if they are identical