Data 607 Assignment 7

library(XML)
library(rjson)
library(httr)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Introduction

I prepared three files with the same data in the three specified formats: HTML, JSON and XML. These files are hosted on GitHub.
I captured the following attributes of books:

* Title
* Authors
* Publisher
* Year of Publication

Each of the files was written by hand and then uploaded to GitHub so that the analysis can be reproduced.

Data Loading

XML Data

# Download the XML file and parse it into an XML document.
# The HTTP status is checked first so that a failed download stops here with a
# clear error instead of surfacing later as a confusing parse failure.
xml_response <- GET("https://raw.githubusercontent.com/nathtrish334/Data-607/main/bookdata.xml")
stop_for_status(xml_response)
# Hand the response body to xmlParse() explicitly as text rather than passing
# the httr response object itself and relying on implicit coercion.
# NOTE(review): assumes the hosted file is UTF-8 encoded - confirm.
xml_data_raw <- xmlParse(
  content(xml_response, as = "text", encoding = "UTF-8"),
  asText = TRUE
)

HTML Data

# Read the HTML file line by line.
# NOTE(review): the extraction below assumes each <td> cell sits on its own
# line of the file - confirm against the hosted HTML.
html_data_raw <- readLines("https://raw.githubusercontent.com/nathtrish334/Data-607/main/bookdata.html")
# Pattern for a <td> cell whose text consists of letters, digits, '.', ',',
# space, parentheses, apostrophe and hyphen. The hyphen is written last in the
# bracket expression so that it is a literal '-' and not a range operator.
regex_td <- "<td>([0-9A-Za-z., ()'-]*)</td>"
# Keep only the lines that match the pattern
regx_lines <- grep(regex_td, html_data_raw, value = TRUE)
# Replace each matched cell by its captured text and strip the indentation
# that remains around it
html_data_raw <- trimws(sub(regex_td, "\\1", regx_lines))
#str(html_data_raw)

JSON Data

# Fetch the JSON file and parse it with rjson
json_data_raw <- rjson::fromJSON(
  file = "https://raw.githubusercontent.com/nathtrish334/Data-607/main/bookdata.json"
)

Data Frames

XML Data Frame

# Convert the parsed XML document into a data frame and display it
xml_data_df <- XML::xmlToDataFrame(doc = xml_data_raw)
xml_data_df

##                              Title                  Authors    Publisher Year
## 1      Introduction to Probability   Grinstead, C. Snell, J          AMS 1997
## 2 A First Course in Linear Algebra         Robert A. Beezer    Cambridge 2008
## 3               R for Data Science G. Grolemund, Wickham H.     O'Reilly 2017
## 4        Linear Regression using R           David J. Lilja Minest. Uni. 2016

HTML Data Frame

# Column names of the book table, in the order the cells appear in the HTML
html_columns <- c("Title", "Authors", "Publisher", "Year")
n_columns <- length(html_columns)
# The extracted cells are reshaped below into one row per book; stop early if
# nothing was extracted or the cells do not fill complete rows, because the
# reshaping would otherwise silently misalign the values.
stopifnot(
  length(html_data_raw) > 0,
  length(html_data_raw) %% n_columns == 0
)
# Label every cell with its column position (1, 2, ..., n_columns, repeated
# once per book), split the cells by that label and bind the resulting
# vectors together as the columns of a data frame.
html_data_df <- cbind.data.frame(
  split(
    html_data_raw,
    rep(seq_len(n_columns), times = length(html_data_raw) %/% n_columns)
  ),
  stringsAsFactors = FALSE
)
names(html_data_df) <- html_columns
html_data_df

##                              Title                  Authors    Publisher Year
## 1      Introduction to Probability   Grinstead, C. Snell, J          AMS 1997
## 2 A First Course in Linear Algebra         Robert A. Beezer    Cambridge 2008
## 3               R for Data Science G. Grolemund, Wickham H.     O'Reilly 2017
## 4        Linear Regression using R           David J. Lilja Minest. Uni. 2016

#str(html_data_df)

JSON Data Frame

# Stack the parsed JSON records row-wise into a data frame of list columns,
# flatten those list columns with unnest(), and convert the result back to a
# plain data frame
json_data_df <- data.frame(do.call(rbind, json_data_raw)) %>%
  unnest(cols = c(Title, Authors, Publisher, Year)) %>%
  data.frame()
json_data_df

##                              Title                  Authors    Publisher Year
## 1      Introduction to Probability   Grinstead, C. Snell, J          AMS 1997
## 2 A First Course in Linear Algebra         Robert A. Beezer    Cambridge 2008
## 3               R for Data Science G. Grolemund, Wickham H.     O'Reilly 2017
## 4        Linear Regression using R           David J. Lilja Minest. Uni. 2016

Comparison of the Data Frames

# diffobj supplies diffObj(), used further down to show where the frames differ
library(diffobj)
# Strict equality check between the XML- and HTML-derived data frames
identical(x = xml_data_df, y = html_data_df)

## [1] TRUE

# Strict equality check between the XML- and JSON-derived data frames
identical(x = xml_data_df, y = json_data_df)

## [1] FALSE

# Strict equality check between the JSON- and HTML-derived data frames
identical(x = json_data_df, y = html_data_df)

## [1] FALSE

# Show the differences between the XML and JSON data frames
diffobj::diffObj(xml_data_df, json_data_df)

## < str(xml_data_df)                       > str(json_data_df)                    
## @@ 3,3 @@                                @@ 3,3 @@                              
##    $ Authors  : chr  "Grinstead, C. Sne     $ Authors  : chr  "Grinstead, C. Sne
##   ll, J" "Robert A. Beezer" "G. Grolemu    ll, J" "Robert A. Beezer" "G. Grolemu
##   nd, Wickham H." "David J. Lilja"         nd, Wickham H." "David J. Lilja"     
##    $ Publisher: chr  "AMS" "Cambridge"      $ Publisher: chr  "AMS" "Cambridge" 
##   "O'Reilly" "Minest. Uni."                "O'Reilly" "Minest. Uni."            
## <  $ Year     : chr  "1997" "2008" "201  >  $ Year     : num  1997 2008 2017 201
## : 7" "2016"                              : 6

# Show the differences between the JSON and HTML data frames
diffobj::diffObj(json_data_df, html_data_df)

## < str(json_data_df)                      > str(html_data_df)                    
## @@ 3,3 @@                                @@ 3,3 @@                              
##    $ Authors  : chr  "Grinstead, C. Sne     $ Authors  : chr  "Grinstead, C. Sne
##   ll, J" "Robert A. Beezer" "G. Grolemu    ll, J" "Robert A. Beezer" "G. Grolemu
##   nd, Wickham H." "David J. Lilja"         nd, Wickham H." "David J. Lilja"     
##    $ Publisher: chr  "AMS" "Cambridge"      $ Publisher: chr  "AMS" "Cambridge" 
##   "O'Reilly" "Minest. Uni."                "O'Reilly" "Minest. Uni."            
## <  $ Year     : num  1997 2008 2017 201  >  $ Year     : chr  "1997" "2008" "201
## : 6                                      : 7" "2016"

The only notable difference between the data frames is the ‘Year’ column: it has type chr in the HTML and XML data frames, whereas its type in the JSON data frame is num.