Mehreen Ali Gillani
Step 1: import libraries
library(htmltools)
library(xml2)
library(rvest)
library(jsonlite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:jsonlite':
##
## flatten
library(knitr)
Step 2: Read the HTML file from GitHub, store it in a data frame, and
print it
# Fetch the HTML page, select its first <table> element, and convert that
# table into a data frame before rendering it with kable().
html_url <- "https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.html"
html_page <- read_html(html_url)
html_df <- html_table(html_node(html_page, "table"))
kable(html_df, caption = "HTML Data Frame:")
HTML Data Frame:
Practical Statistics for Data Scientists |
Peter Bruce, Andrew Bruce |
350 |
Intermediate |
Real-world applications |
Python for Data Analysis |
Wes McKinney |
400 |
Beginner-Intermediate |
Data wrangling |
R for Data Science |
Hadley Wickham, Garrett Grolemund |
500 |
All levels |
Tidyverse workflow |
Step 3: Read the XML file from GitHub, store it in a separate data frame,
and print it
# Parse the XML document and collect every <book> element in it.
xml_url <- "https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.xml"
xml_data <- read_xml(xml_url)
books_list <- xml_find_all(xml_data, "//book")

# Build a one-row data frame per <book>; map_dfr() row-binds the results.
xml_df <- map_dfr(books_list, function(book) {
  # Text of the first element matching `tag` relative to this book node.
  field <- function(tag) xml_text(xml_find_first(book, tag))

  data.frame(
    Title = field("title"),
    Authors = field("authors"),
    Pages = as.numeric(field("pages")),  # stored as double, not integer
    Level = field("level"),
    Focus = field("focus"),
    stringsAsFactors = FALSE
  )
})
kable(xml_df, caption = "XML Data Frame")
XML Data Frame
| Title | Authors | Pages | Level | Focus |
|:------|:--------|------:|:------|:------|
| Practical Statistics for Data Scientists | Peter Bruce, Andrew Bruce | 350 | Intermediate | Real-world applications |
| Python for Data Analysis | Wes McKinney | 400 | Beginner-Intermediate | Data wrangling |
| R for Data Science | Hadley Wickham, Garrett Grolemund | 500 | All levels | Tidyverse workflow |
Step 4: Read the JSON file from GitHub, store it in a separate data frame,
and print it
# Download and parse the JSON file, then coerce the result to a data frame.
json_url <- "https://raw.githubusercontent.com/mehreengillani/DATA607-Assignment7/refs/heads/main/books.json"
json_raw <- as.data.frame(fromJSON(json_url))

# Capitalise the column names (title -> Title, authors -> Authors, ...).
json_df <- rename(
  json_raw,
  Title = title,
  Authors = authors,
  Pages = pages,
  Level = level,
  Focus = focus
)

# Collapse each book's authors into one comma-separated string so the column
# becomes a plain character vector (presumably it is parsed as a list column
# holding one character vector per book).
json_df$Authors <- map_chr(
  json_df$Authors,
  function(author_names) paste(author_names, collapse = ", ")
)
kable(json_df, caption = "Json Data Frame")
Json Data Frame
| Title | Authors | Pages | Level | Focus |
|:------|:--------|------:|:------|:------|
| Practical Statistics for Data Scientists | Peter Bruce, Andrew Bruce | 350 | Intermediate | Real-world applications |
| Python for Data Analysis | Wes McKinney | 400 | Beginner-Intermediate | Data wrangling |
| R for Data Science | Hadley Wickham, Garrett Grolemund | 500 | All levels | Tidyverse workflow |
Step 5: Compare these data frames to check whether they are identical
# 5. Compare data frames.
# identical() is an exact test: two objects must agree in class, attributes
# and column types as well as in values, so tables that print the same can
# still compare as FALSE. For example, xml_df builds Pages with as.numeric()
# (a double), which cannot be identical to an integer Pages column.
# NOTE(review): html_table() presumably returns a tibble, whereas xml_df and
# json_df are plain data frames -- confirm with class() / str() on each.
# NOTE(review): the xml_df vs json_df pair is not compared here.
identical(html_df, xml_df)
## [1] FALSE
identical(html_df, json_df)
## [1] FALSE