Load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(xml2)
## Warning: package 'xml2' was built under R version 4.3.3
library(rjson)
library(XML)
## Warning: package 'XML' was built under R version 4.3.2
library(DT)
## Warning: package 'DT' was built under R version 4.3.3
library(rvest)
## Warning: package 'rvest' was built under R version 4.3.3
library(tibble)
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:rjson':
##
## fromJSON, toJSON
# Fetch and parse the HTML copy of the book data.
book_info_html <-
  "https://raw.githubusercontent.com/topkelama/lfsStorage/main/info_books.html" %>%
  read_html()
Convert to dataframe
# Extract every <table> in the page, then keep the first one as a data frame.
# data.frame() is used on purpose: it applies check.names, which is what turns
# the "Release Year" header into the syntactic name Release.Year.
tables1 <- book_info_html %>%
  html_table()
book_df_html <- tables1[[1]] %>%
  data.frame(stringsAsFactors = FALSE)
book_df_html
## Title Authors Pages Release.Year Price
## 1 Cloud Essentials+ Study Guide Quentin Docter 368 2020 35.56
## 2 Cloud Essentials+ Study Guide Cory Fuchs 368 2020 35.56
## 3 Data Science Using Python and R Chantal D. Larose 247 2019 90.15
## 4 Data Science Using Python and R Daniel T. Larose 247 2019 90.15
## 5 Kubernetes Up and Running Brendan Burns 277 2019 47.82
## 6 Kubernetes Up and Running Joe Beda 277 2019 47.82
## 7 Kubernetes Up and Running Kelsey Hightower 277 2019 47.82
## ISBN
## 1 978-1-119-64222-0
## 2 978-1-119-64222-0
## 3 9781119526834
## 4 9781119526834
## 5 978-1-492-04653-0
## 6 978-1-492-04653-0
## 7 978-1-492-04653-0
# Fetch and parse the XML copy of the book data with xml2.
book_info_xml <-
  "https://raw.githubusercontent.com/topkelama/lfsStorage/main/info_books.xml" %>%
  read_xml()
Convert to dataframe
# Re-parse the xml2 document with the XML package so xmlToDataFrame() can
# flatten each book record into one row.
bookInfo_parse <- xmlParse(book_info_xml)
book_xml_df <- xmlToDataFrame(bookInfo_parse, stringsAsFactors = FALSE)

# xmlToDataFrame() pastes the text of nested child elements together with no
# separator, which fuses several authors into one string (e.g.
# "Quentin DocterCory Fuchs"). Rebuild Authors from the individual child
# elements, joined with ", ".
author_nodes <- getNodeSet(bookInfo_parse, "/*/*/Authors")
# Only overwrite the column when there is exactly one Authors node per row;
# otherwise the values could not be aligned safely and the column is left as is.
if (length(author_nodes) == nrow(book_xml_df)) {
  book_xml_df$Authors <- vapply(
    author_nodes,
    function(node) {
      author_names <- vapply(getNodeSet(node, "./*"), xmlValue, character(1))
      if (length(author_names) == 0L) {
        # No nested elements: the node holds plain text, keep it unchanged.
        xmlValue(node)
      } else {
        paste(trimws(author_names), collapse = ", ")
      }
    },
    character(1)
  )
}
book_xml_df
## Title Authors Pages
## 1 Cloud Essentials+ Study Guide Quentin DocterCory Fuchs 368
## 2 Data Science Using Python and R Chantal D. LaroseDaniel T.Larose 247
## 3 Kubernetes Up and Running Brendan BurnsJoe BedaKelsey Hightower 277
## Release_Year Price ISBN
## 1 2020 35.56 978-1-119-64222-0
## 2 2019 90.15 9781119526834
## 3 2019 47.82 978-1-492-04653-0
# Download the JSON copy of the book data into the working directory.
json_file <- "https://raw.githubusercontent.com/topkelama/lfsStorage/main/info_books.json"
download.file(json_file, destfile = "info_books.json", quiet = TRUE)
# Read and parse the JSON file. Both rjson and jsonlite are attached and both
# export fromJSON(); naming the package keeps the result independent of the
# order in which the libraries were loaded.
book_info_json <- jsonlite::fromJSON("info_books.json")
# Expand the Authors list-column into one row per author. The list must not be
# collapsed with toString() first: that turns each book's authors into a single
# comma-separated string and leaves unnest() with nothing to expand.
book_info_json <- book_info_json %>%
  unnest(cols = Authors)
book_info_json
## # A tibble: 3 × 6
## Title Authors Pages `Release Year` Price ISBN
## <chr> <chr> <int> <int> <dbl> <chr>
## 1 Cloud Essentials+ Study Guide Quentin Doct… 368 2020 35.7 978-…
## 2 Data Science Using Python and R Chantal D. L… 247 2019 90.2 9781…
## 3 Kubernetes Up and Running Brendan Burn… 277 2019 47.8 978-…
File links:- XML File JSON File HTML File
Conclusion:- In the results shown above, the HTML file had no issue with multi-valued fields such as the Authors column, where a single title has several authors; it was read directly in long format, with one row per author. The XML and JSON results, however, put all of a title's authors in a single row under the Authors column. The three data frames are similar but not identical.