Import Libraries:

library(rvest)

## Warning: package 'rvest' was built under R version 4.1.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.3

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   1.4.0

## Warning: package 'ggplot2' was built under R version 4.1.2

## Warning: package 'stringr' was built under R version 4.1.2

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()

library(DT)

## Warning: package 'DT' was built under R version 4.1.3

Reading HTML file

# Raw GitHub URL of the HTML copy of the books table
github_link <- "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week7/books_data.html"

github_link

## [1] "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week7/books_data.html"

# Download the page behind github_link and parse it with read_html();
# printing the result shows the document's <head> and <body> nodes.

html_data <- read_html(github_link)

html_data

## {html_document}
## <html lang="en">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n    <table border="1">\n<tr>\n<th>Name</th>\n                <th> ...

Converting the HTML data into dataframe

# Pull the table content out of the parsed page, then convert the result to a
# base data frame. The printed header shows the ISBN column as `ISBN.13`.
data <- html_table(html_data)
html_df <- as.data.frame(data)

html_df

##                  Name         Author             Publisher        ISBN.13 Year
## 1  R for Data Science Hadley Wickham              O Reilly 978-1491910399 2017
## 2 The Language of SQL  Larry Rockoff Pearson Education Inc 978-0134658254 2016
## 3      R for Everyone Jared P Lander Pearson Education Inc 978-0134546926 2017

Load the JSON Package

library(jsonlite)

## Warning: package 'jsonlite' was built under R version 4.1.3

## 
## Attaching package: 'jsonlite'

## The following object is masked from 'package:purrr':
## 
##     flatten

Read a JSON file into a dataframe

# Raw GitHub URL of the JSON copy of the same books data.
# Uses `<-` for assignment, consistent with the rest of the file.
github_link <- "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week7/books_data.json"

# With simplifyVector = TRUE the parsed JSON prints as a data frame
# (one row per book), as the output below shows.
json_df <- read_json(github_link, simplifyVector = TRUE)

json_df

##                  name         author             publisher        isbn-13 year
## 1  R for Data Science Hadley Wickham              O Reilly 978-1491910399 2017
## 2 The Language of SQL  Larry Rockoff Pearson Education Inc 978-0134658254 2016
## 3      R for Everyone Jared P Lander Pearson Education Inc 978-0134546926 2017

Load the XML library and other required packages

library("XML")

## Warning: package 'XML' was built under R version 4.1.2

library("methods")

The contents of books_data.XML are parsed

# Raw GitHub URL of the XML copy of the books data.
# Uses `<-` for assignment, consistent with the rest of the file.
github_link <- "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week7/books_data.XML"

github_link

## [1] "https://raw.githubusercontent.com/uzmabb182/CUNY-SPS-Assignments/main/data_607/week7/books_data.XML"

library(httr)

## Warning: package 'httr' was built under R version 4.1.3

library(XML)

# Download the XML file and stop with an informative error on an HTTP
# failure (4xx/5xx) instead of trying to parse an error page.
r <- GET(github_link)
stop_for_status(r)

# Parse the response *body* rather than the whole response object.
# `useInternalNodes` is spelled out in full: the previous `useInternal`
# only worked through partial argument matching.
xml_text <- content(r, as = "text", encoding = "UTF-8")
xml_data <- xmlTreeParse(xml_text, useInternalNodes = TRUE, asText = TRUE)

print(xml_data)

## <?xml version="1.0" encoding="UTF-8"?>
## <books_data>
##   <book>
##     <name>R for Data Science</name>
##     <author>Hadley Wickham</author>
##     <publisher>O Reilly</publisher>
##     <isbn-13>978-1491910399</isbn-13>
##     <year>2017</year>
##   </book>
##   <book>
##     <name>The Language of SQL</name>
##     <author>Larry Rockoff</author>
##     <publisher>Pearson Education Inc</publisher>
##     <isbn-13>978-0134658254</isbn-13>
##     <year>2016</year>
##   </book>
##   <book>
##     <name>R for Everyone</name>
##     <author>Jared P Lander</author>
##     <publisher>Pearson Education Inc</publisher>
##     <isbn-13>978-0134546926</isbn-13>
##     <year>2017</year>
##   </book>
## </books_data>
##

# Flatten the parsed document into a data frame: the output below has one row
# per <book> element and one column per child tag (name, author, ...).
xml_df <- xmlToDataFrame(xml_data)
xml_df

##                  name         author             publisher        isbn-13 year
## 1  R for Data Science Hadley Wickham              O Reilly 978-1491910399 2017
## 2 The Language of SQL  Larry Rockoff Pearson Education Inc 978-0134658254 2016
## 3      R for Everyone Jared P Lander Pearson Education Inc 978-0134546926 2017

Now to compare these html and json dataframes, simply pass them to the comparedf() function:

Please find the conclusion at the very end of the file.

library(arsenal)

## Warning: package 'arsenal' was built under R version 4.1.3

# html_df's columns are named Name/Author/.../ISBN.13 while json_df's are
# name/author/.../isbn-13, so no columns pair up and the output below reports
# 0 variables compared (the 3 rows are still matched).
# NOTE(review): arsenal's comparedf.control(tol.vars = ...) may be able to
# relax name matching -- confirm against the arsenal docs before relying on it.
comparedf(html_df, json_df)

## Compare Object
## 
## Function Call: 
## comparedf(x = html_df, y = json_df)
## 
## Shared: 0 non-by variables and 3 observations.
## Not shared: 10 variables and 0 observations.
## 
## Differences found in 0/0 variables compared.
## 0 variables compared have non-identical attributes.

Use summary() to get a more detailed summary

# The "Variables not shared" table in this summary lists every unmatched
# column with its class; note Year is integer in html_df but year is
# character in json_df.
summary(comparedf(html_df, json_df))

## 
## 
## Table: Summary of data.frames
## 
## version   arg        ncol   nrow
## --------  --------  -----  -----
## x         html_df       5      3
## y         json_df       5      3
## 
## 
## 
## Table: Summary of overall comparison
## 
## statistic                                                      value
## ------------------------------------------------------------  ------
## Number of by-variables                                             0
## Number of non-by variables in common                               0
## Number of variables compared                                       0
## Number of variables in x but not y                                 5
## Number of variables in y but not x                                 5
## Number of variables compared with some values unequal              0
## Number of variables compared with all values equal                 0
## Number of observations in common                                   3
## Number of observations in x but not y                              0
## Number of observations in y but not x                              0
## Number of observations with some compared variables unequal        0
## Number of observations with all compared variables equal           3
## Number of values unequal                                           0
## 
## 
## 
## Table: Variables not shared
## 
## version   variable     position  class     
## --------  ----------  ---------  ----------
## x         Name                1  character 
## x         Author              2  character 
## x         Publisher           3  character 
## x         ISBN.13             4  character 
## x         Year                5  integer   
## y         name                1  character 
## y         author              2  character 
## y         publisher           3  character 
## y         isbn-13             4  character 
## y         year                5  character 
## 
## 
## 
## Table: Other variables not compared
## 
##                                  
##  --------------------------------
##  No other variables not compared 
##  --------------------------------
## 
## 
## 
## Table: Observations not shared
## 
##                             
##  ---------------------------
##  No observations not shared 
##  ---------------------------
## 
## 
## 
## Table: Differences detected by variable
## 
##                                      
##  ------------------------------------
##  No differences detected by variable 
##  ------------------------------------
## 
## 
## 
## Table: Differences detected
## 
##                          
##  ------------------------
##  No differences detected 
##  ------------------------
## 
## 
## 
## Table: Non-identical attributes
## 
##                              
##  ----------------------------
##  No non-identical attributes 
##  ----------------------------

Now compare html and xml dataframes

# As with the JSON comparison, html_df's capitalised column names do not
# match xml_df's lower-case ones, so the output reports 0 variables compared.
comparedf(html_df, xml_df)

## Compare Object
## 
## Function Call: 
## comparedf(x = html_df, y = xml_df)
## 
## Shared: 0 non-by variables and 3 observations.
## Not shared: 10 variables and 0 observations.
## 
## Differences found in 0/0 variables compared.
## 0 variables compared have non-identical attributes.

Use summary() to get a more detailed summary

# The "Variables not shared" table lists the ten unmatched columns
# (five per data frame) together with their classes.
summary(comparedf(html_df, xml_df))

## 
## 
## Table: Summary of data.frames
## 
## version   arg        ncol   nrow
## --------  --------  -----  -----
## x         html_df       5      3
## y         xml_df        5      3
## 
## 
## 
## Table: Summary of overall comparison
## 
## statistic                                                      value
## ------------------------------------------------------------  ------
## Number of by-variables                                             0
## Number of non-by variables in common                               0
## Number of variables compared                                       0
## Number of variables in x but not y                                 5
## Number of variables in y but not x                                 5
## Number of variables compared with some values unequal              0
## Number of variables compared with all values equal                 0
## Number of observations in common                                   3
## Number of observations in x but not y                              0
## Number of observations in y but not x                              0
## Number of observations with some compared variables unequal        0
## Number of observations with all compared variables equal           3
## Number of values unequal                                           0
## 
## 
## 
## Table: Variables not shared
## 
## version   variable     position  class     
## --------  ----------  ---------  ----------
## x         Name                1  character 
## x         Author              2  character 
## x         Publisher           3  character 
## x         ISBN.13             4  character 
## x         Year                5  integer   
## y         name                1  character 
## y         author              2  character 
## y         publisher           3  character 
## y         isbn-13             4  character 
## y         year                5  character 
## 
## 
## 
## Table: Other variables not compared
## 
##                                  
##  --------------------------------
##  No other variables not compared 
##  --------------------------------
## 
## 
## 
## Table: Observations not shared
## 
##                             
##  ---------------------------
##  No observations not shared 
##  ---------------------------
## 
## 
## 
## Table: Differences detected by variable
## 
##                                      
##  ------------------------------------
##  No differences detected by variable 
##  ------------------------------------
## 
## 
## 
## Table: Differences detected
## 
##                          
##  ------------------------
##  No differences detected 
##  ------------------------
## 
## 
## 
## Table: Non-identical attributes
## 
##                              
##  ----------------------------
##  No non-identical attributes 
##  ----------------------------

Now compare json and xml dataframes

# json_df and xml_df share all five column names, so every variable is
# compared; the output below reports differences in 0 of the 5.
comparedf(json_df, xml_df)

## Compare Object
## 
## Function Call: 
## comparedf(x = json_df, y = xml_df)
## 
## Shared: 5 non-by variables and 3 observations.
## Not shared: 0 variables and 0 observations.
## 
## Differences found in 0/5 variables compared.
## 0 variables compared have non-identical attributes.

Use summary() to get a more detailed summary

# The "Differences detected by variable" table shows n = 0 for each of the
# five shared columns.
summary(comparedf(json_df, xml_df))

## 
## 
## Table: Summary of data.frames
## 
## version   arg        ncol   nrow
## --------  --------  -----  -----
## x         json_df       5      3
## y         xml_df        5      3
## 
## 
## 
## Table: Summary of overall comparison
## 
## statistic                                                      value
## ------------------------------------------------------------  ------
## Number of by-variables                                             0
## Number of non-by variables in common                               5
## Number of variables compared                                       5
## Number of variables in x but not y                                 0
## Number of variables in y but not x                                 0
## Number of variables compared with some values unequal              0
## Number of variables compared with all values equal                 5
## Number of observations in common                                   3
## Number of observations in x but not y                              0
## Number of observations in y but not x                              0
## Number of observations with some compared variables unequal        0
## Number of observations with all compared variables equal           3
## Number of values unequal                                           0
## 
## 
## 
## Table: Variables not shared
## 
##                          
##  ------------------------
##  No variables not shared 
##  ------------------------
## 
## 
## 
## Table: Other variables not compared
## 
##                                  
##  --------------------------------
##  No other variables not compared 
##  --------------------------------
## 
## 
## 
## Table: Observations not shared
## 
##                             
##  ---------------------------
##  No observations not shared 
##  ---------------------------
## 
## 
## 
## Table: Differences detected by variable
## 
## var.x       var.y         n   NAs
## ----------  ----------  ---  ----
## name        name          0     0
## author      author        0     0
## publisher   publisher     0     0
## isbn-13     isbn-13       0     0
## year        year          0     0
## 
## 
## 
## Table: Differences detected
## 
##                          
##  ------------------------
##  No differences detected 
##  ------------------------
## 
## 
## 
## Table: Non-identical attributes
## 
##                              
##  ----------------------------
##  No non-identical attributes 
##  ----------------------------

Conclusion:

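Comparing all the dataframes, I conclude that they all contain the same values. The HTML dataframe differs from the other two only in its column names (`Name`, `ISBN.13`, … versus `name`, `isbn-13`, …) and in storing `Year` as an integer rather than as a character string.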

# The data frames from the HTML and XML route hold the same values.
# `==` compares cell by cell by position, so the differing column names are
# not checked; the printed header shows html_df's names. The integer Year
# column still compares equal to xml_df's character year column.
html_df == xml_df

##      Name Author Publisher ISBN.13 Year
## [1,] TRUE   TRUE      TRUE    TRUE TRUE
## [2,] TRUE   TRUE      TRUE    TRUE TRUE
## [3,] TRUE   TRUE      TRUE    TRUE TRUE

# The data frames from the HTML and JSON route hold the same values.
# As above, `==` is positional: column names are not checked, and the
# printed header shows html_df's names.
html_df == json_df

##      Name Author Publisher ISBN.13 Year
## [1,] TRUE   TRUE      TRUE    TRUE TRUE
## [2,] TRUE   TRUE      TRUE    TRUE TRUE
## [3,] TRUE   TRUE      TRUE    TRUE TRUE

# The data frames from the XML and JSON route are the same
xml_df == json_df

##      name author publisher isbn-13 year
## [1,] TRUE   TRUE      TRUE    TRUE TRUE
## [2,] TRUE   TRUE      TRUE    TRUE TRUE
## [3,] TRUE   TRUE      TRUE    TRUE TRUE

Week 7 - Working with HTML, JSON, and XML file

Import Libraries:

Reading HTML file

Converting the HTML data into dataframe

Load the JSON Package

Read a JSON file into a dataframe

Load the XML library and other required packages

The contents of books_data.XML are parsed

Now to compare these html and json dataframes, simply pass them to the comparedf() function:

Please find the conclusion at the very end of the file.

Use summary() to get a more detailed summary

Now compare html and xml dataframes

Use summary() to get a more detailed summary

Now compare json and xml dataframes

Use summary() to get a more detailed summary

Conclusion:

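Comparing all the dataframes, I conclude that they all contain the same values. The HTML dataframe differs from the other two only in its column names (`Name`, `ISBN.13`, … versus `name`, `isbn-13`, …) and in storing `Year` as an integer rather than as a character string.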