Import Libraries:
library(rvest)
## Warning: package 'rvest' was built under R version 4.1.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## v readr 1.4.0
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
library(DT)
## Warning: package 'DT' was built under R version 4.1.3
Reading HTML file
# URL of the raw HTML version of the books table hosted on GitHub
github_link <- "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week7/books_data.html"
github_link
## [1] "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week7/books_data.html"
# Fetch the page and parse it into an HTML document object with rvest
html_data <- read_html(github_link)
html_data
## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n <table border="1">\n<tr>\n<th>Name</th>\n <th> ...
Converting the HTML data into dataframe
# Extract the table(s) found in the parsed page, then combine the result into
# a single base data frame.
# NOTE(review): the ISBN column prints as `ISBN.13` here but as `isbn-13` in
# the JSON/XML frames -- presumably as.data.frame() sanitising the header;
# confirm.
data <- html_table(html_data)
html_df <- as.data.frame(data)
html_df
## Name Author Publisher ISBN.13 Year
## 1 R for Data Science Hadley Wickham O Reilly 978-1491910399 2017
## 2 The Language of SQL Larry Rockoff Pearson Education Inc 978-0134658254 2016
## 3 R for Everyone Jared P Lander Pearson Education Inc 978-0134546926 2017
Load the JSON Package
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 4.1.3
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
Read a JSON file into a dataframe
# Point github_link at the JSON version of the same books data
github_link <- "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week7/books_data.json"
# simplifyVector = TRUE lets jsonlite collapse the array of book records into
# a data frame instead of returning a nested list
json_df <- read_json(github_link, simplifyVector = TRUE)
json_df
## name author publisher isbn-13 year
## 1 R for Data Science Hadley Wickham O Reilly 978-1491910399 2017
## 2 The Language of SQL Larry Rockoff Pearson Education Inc 978-0134658254 2016
## 3 R for Everyone Jared P Lander Pearson Education Inc 978-0134546926 2017
loading the library and other important packages
library("XML")
## Warning: package 'XML' was built under R version 4.1.2
library("methods")
the contents of books_data.xml are parsed
# Point github_link at the XML version of the same books data
github_link <- "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week7/books_data.XML"
github_link
## [1] "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week7/books_data.XML"
library(httr)
## Warning: package 'httr' was built under R version 4.1.3
library(XML)
# Download the XML file; stop on an HTTP error rather than attempting to parse
# an error page.
r <- GET(github_link)
stop_for_status(r)
# Give the parser the response body as text instead of the httr response
# object, and spell out useInternalNodes rather than relying on partial
# argument matching ("useInternal").
xml_text <- content(r, as = "text", encoding = "UTF-8")
xml_data <- xmlTreeParse(xml_text, asText = TRUE, useInternalNodes = TRUE)
print(xml_data)
## <?xml version="1.0" encoding="UTF-8"?>
## <books_data>
## <book>
## <name>R for Data Science</name>
## <author>Hadley Wickham</author>
## <publisher>O Reilly</publisher>
## <isbn-13>978-1491910399</isbn-13>
## <year>2017</year>
## </book>
## <book>
## <name>The Language of SQL</name>
## <author>Larry Rockoff</author>
## <publisher>Pearson Education Inc</publisher>
## <isbn-13>978-0134658254</isbn-13>
## <year>2016</year>
## </book>
## <book>
## <name>R for Everyone</name>
## <author>Jared P Lander</author>
## <publisher>Pearson Education Inc</publisher>
## <isbn-13>978-0134546926</isbn-13>
## <year>2017</year>
## </book>
## </books_data>
##
# Flatten the parsed XML document into a data frame: each <book> node becomes
# a row and each of its child elements (name, author, ...) becomes a column.
xml_df <- xmlToDataFrame(xml_data)
xml_df
## name author publisher isbn-13 year
## 1 R for Data Science Hadley Wickham O Reilly 978-1491910399 2017
## 2 The Language of SQL Larry Rockoff Pearson Education Inc 978-0134658254 2016
## 3 R for Everyone Jared P Lander Pearson Education Inc 978-0134546926 2017
To compare the HTML and JSON data frames, pass them to the comparedf() function from the arsenal package.
The conclusion is at the very end of the file.
library(arsenal)
## Warning: package 'arsenal' was built under R version 4.1.3
# Compare the HTML-derived frame (x) against the JSON-derived frame (y);
# variables are paired up by column name.
comparedf(x = html_df, y = json_df)
## Compare Object
##
## Function Call:
## comparedf(x = html_df, y = json_df)
##
## Shared: 0 non-by variables and 3 observations.
## Not shared: 10 variables and 0 observations.
##
## Differences found in 0/0 variables compared.
## 0 variables compared have non-identical attributes.
Use summary() to get a more detailed summary
# Detailed report for the HTML vs JSON comparison, including which variables
# were not shared between the two frames.
summary(comparedf(x = html_df, y = json_df))
##
##
## Table: Summary of data.frames
##
## version arg ncol nrow
## -------- -------- ----- -----
## x html_df 5 3
## y json_df 5 3
##
##
##
## Table: Summary of overall comparison
##
## statistic value
## ------------------------------------------------------------ ------
## Number of by-variables 0
## Number of non-by variables in common 0
## Number of variables compared 0
## Number of variables in x but not y 5
## Number of variables in y but not x 5
## Number of variables compared with some values unequal 0
## Number of variables compared with all values equal 0
## Number of observations in common 3
## Number of observations in x but not y 0
## Number of observations in y but not x 0
## Number of observations with some compared variables unequal 0
## Number of observations with all compared variables equal 3
## Number of values unequal 0
##
##
##
## Table: Variables not shared
##
## version variable position class
## -------- ---------- --------- ----------
## x Name 1 character
## x Author 2 character
## x Publisher 3 character
## x ISBN.13 4 character
## x Year 5 integer
## y name 1 character
## y author 2 character
## y publisher 3 character
## y isbn-13 4 character
## y year 5 character
##
##
##
## Table: Other variables not compared
##
##
## --------------------------------
## No other variables not compared
## --------------------------------
##
##
##
## Table: Observations not shared
##
##
## ---------------------------
## No observations not shared
## ---------------------------
##
##
##
## Table: Differences detected by variable
##
##
## ------------------------------------
## No differences detected by variable
## ------------------------------------
##
##
##
## Table: Differences detected
##
##
## ------------------------
## No differences detected
## ------------------------
##
##
##
## Table: Non-identical attributes
##
##
## ----------------------------
## No non-identical attributes
## ----------------------------
Now compare html and xml dataframes
# Compare the HTML-derived frame (x) against the XML-derived frame (y);
# variables are paired up by column name.
comparedf(x = html_df, y = xml_df)
## Compare Object
##
## Function Call:
## comparedf(x = html_df, y = xml_df)
##
## Shared: 0 non-by variables and 3 observations.
## Not shared: 10 variables and 0 observations.
##
## Differences found in 0/0 variables compared.
## 0 variables compared have non-identical attributes.
Use summary() to get a more detailed summary
# Detailed report for the HTML vs XML comparison, including which variables
# were not shared between the two frames.
summary(comparedf(x = html_df, y = xml_df))
##
##
## Table: Summary of data.frames
##
## version arg ncol nrow
## -------- -------- ----- -----
## x html_df 5 3
## y xml_df 5 3
##
##
##
## Table: Summary of overall comparison
##
## statistic value
## ------------------------------------------------------------ ------
## Number of by-variables 0
## Number of non-by variables in common 0
## Number of variables compared 0
## Number of variables in x but not y 5
## Number of variables in y but not x 5
## Number of variables compared with some values unequal 0
## Number of variables compared with all values equal 0
## Number of observations in common 3
## Number of observations in x but not y 0
## Number of observations in y but not x 0
## Number of observations with some compared variables unequal 0
## Number of observations with all compared variables equal 3
## Number of values unequal 0
##
##
##
## Table: Variables not shared
##
## version variable position class
## -------- ---------- --------- ----------
## x Name 1 character
## x Author 2 character
## x Publisher 3 character
## x ISBN.13 4 character
## x Year 5 integer
## y name 1 character
## y author 2 character
## y publisher 3 character
## y isbn-13 4 character
## y year 5 character
##
##
##
## Table: Other variables not compared
##
##
## --------------------------------
## No other variables not compared
## --------------------------------
##
##
##
## Table: Observations not shared
##
##
## ---------------------------
## No observations not shared
## ---------------------------
##
##
##
## Table: Differences detected by variable
##
##
## ------------------------------------
## No differences detected by variable
## ------------------------------------
##
##
##
## Table: Differences detected
##
##
## ------------------------
## No differences detected
## ------------------------
##
##
##
## Table: Non-identical attributes
##
##
## ----------------------------
## No non-identical attributes
## ----------------------------
Now compare json and xml dataframes
# Compare the JSON-derived frame (x) against the XML-derived frame (y);
# variables are paired up by column name.
comparedf(x = json_df, y = xml_df)
## Compare Object
##
## Function Call:
## comparedf(x = json_df, y = xml_df)
##
## Shared: 5 non-by variables and 3 observations.
## Not shared: 0 variables and 0 observations.
##
## Differences found in 0/5 variables compared.
## 0 variables compared have non-identical attributes.
Use summary() to get a more detailed summary
# Detailed report for the JSON vs XML comparison, with per-variable
# difference counts.
summary(comparedf(x = json_df, y = xml_df))
##
##
## Table: Summary of data.frames
##
## version arg ncol nrow
## -------- -------- ----- -----
## x json_df 5 3
## y xml_df 5 3
##
##
##
## Table: Summary of overall comparison
##
## statistic value
## ------------------------------------------------------------ ------
## Number of by-variables 0
## Number of non-by variables in common 5
## Number of variables compared 5
## Number of variables in x but not y 0
## Number of variables in y but not x 0
## Number of variables compared with some values unequal 0
## Number of variables compared with all values equal 5
## Number of observations in common 3
## Number of observations in x but not y 0
## Number of observations in y but not x 0
## Number of observations with some compared variables unequal 0
## Number of observations with all compared variables equal 3
## Number of values unequal 0
##
##
##
## Table: Variables not shared
##
##
## ------------------------
## No variables not shared
## ------------------------
##
##
##
## Table: Other variables not compared
##
##
## --------------------------------
## No other variables not compared
## --------------------------------
##
##
##
## Table: Observations not shared
##
##
## ---------------------------
## No observations not shared
## ---------------------------
##
##
##
## Table: Differences detected by variable
##
## var.x var.y n NAs
## ---------- ---------- --- ----
## name name 0 0
## author author 0 0
## publisher publisher 0 0
## isbn-13 isbn-13 0 0
## year year 0 0
##
##
##
## Table: Differences detected
##
##
## ------------------------
## No differences detected
## ------------------------
##
##
##
## Table: Non-identical attributes
##
##
## ----------------------------
## No non-identical attributes
## ----------------------------
Conclusion:
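Comparing all the data frames, I conclude that they hold identical values. The HTML-derived data frame differs from the other two only in its column names (for example `Name` vs `name` and `ISBN.13` vs `isbn-13`) and in storing `Year` as integer rather than character.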
# The data frames from the HTML and XML route hold the same values.
# `==` checks cell by cell, by position: column names (Name vs name) and
# column classes (Year is integer here, character in xml_df) are not compared.
html_df == xml_df
## Name Author Publisher ISBN.13 Year
## [1,] TRUE TRUE TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE TRUE TRUE
## [3,] TRUE TRUE TRUE TRUE TRUE
# The data frames from the HTML and JSON route hold the same values.
# `==` checks cell by cell, by position: column names (Name vs name) and
# column classes (Year is integer here, character in json_df) are not compared.
html_df == json_df
## Name Author Publisher ISBN.13 Year
## [1,] TRUE TRUE TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE TRUE TRUE
## [3,] TRUE TRUE TRUE TRUE TRUE
# The data frames from the XML and JSON route hold the same values
# (cell-by-cell comparison by position).
xml_df == json_df
## name author publisher isbn-13 year
## [1,] TRUE TRUE TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE TRUE TRUE
## [3,] TRUE TRUE TRUE TRUE TRUE