library(XML)
library(rjson)
library(httr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
I prepared three files with the same data in the three specified formats: HTML, JSON and XML. These files are hosted on GitHub.
I captured the following attributes of books:
* Title
* Authors
* Publisher
* Year of Publication
Each of the files was written by hand and then uploaded to GitHub for reference and reproducibility.
# Download the XML copy of the book list. Stop with an informative error if
# the request failed, so an HTTP error page is never handed to the parser.
xml_response <- GET("https://raw.githubusercontent.com/nathtrish334/Data-607/main/bookdata.xml")
stop_for_status(xml_response)
# Parse the decoded response body explicitly as XML text
xml_data_raw <- xmlParse(
  content(xml_response, as = "text", encoding = "UTF-8"),
  asText = TRUE
)
# Read the HTML copy of the book list as a character vector, one element per
# line of the file.
html_lines <- readLines("https://raw.githubusercontent.com/nathtrish334/Data-607/main/bookdata.html")
# Pattern for a single <td> cell. The hyphen is placed last inside the bracket
# expression so it is a literal "-" and cannot form a character range; the
# other characters need no backslash escapes inside the brackets.
regex_td <- "<td>([0-9A-Za-z., ()'-]*)</td>"
# Keep only the lines that contain a <td> cell
regx_lines <- grep(regex_td, html_lines, value = TRUE)
# Replace each matched cell with its captured text and trim the surrounding
# whitespace. sub() rewrites only the first match on a line, so this assumes
# the file holds at most one <td> cell per line.
html_data_raw <- trimws(sub(regex_td, "\\1", regx_lines))
#str(html_data_raw)
# Parse the JSON copy of the book list directly from its GitHub URL
json_url <- "https://raw.githubusercontent.com/nathtrish334/Data-607/main/bookdata.json"
json_data_raw <- fromJSON(file = json_url)
# Convert the parsed XML document into a data frame and display it
xml_data_df <- xml_data_raw %>%
  xmlToDataFrame()
xml_data_df
## Title Authors Publisher Year
## 1 Introduction to Probability Grinstead, C. Snell, J AMS 1997
## 2 A First Course in Linear Algebra Robert A. Beezer Cambridge 2008
## 3 R for Data Science G. Grolemund, Wickham H. O'Reilly 2017
## 4 Linear Regression using R David J. Lilja Minest. Uni. 2016
# Column names of the book table, in the order the cells occur within a row
book_fields <- c("Title", "Authors", "Publisher", "Year")
# The flat cell vector must be non-empty and hold one value per field for
# every book; otherwise the cells would be assigned to the wrong columns.
stopifnot(
  length(html_data_raw) > 0,
  length(html_data_raw) %% length(book_fields) == 0
)
# Cycle the column index over the cells so that split() gathers every
# fourth value into the same column, then bind the columns into a data frame.
column_index <- rep(
  seq_along(book_fields),
  times = length(html_data_raw) / length(book_fields)
)
html_data_df <- cbind.data.frame(
  split(html_data_raw, column_index),
  stringsAsFactors = FALSE
)
names(html_data_df) <- book_fields
html_data_df
## Title Authors Publisher Year
## 1 Introduction to Probability Grinstead, C. Snell, J AMS 1997
## 2 A First Course in Linear Algebra Robert A. Beezer Cambridge 2008
## 3 R for Data Science G. Grolemund, Wickham H. O'Reilly 2017
## 4 Linear Regression using R David J. Lilja Minest. Uni. 2016
#str(html_data_df)
# Stack the parsed JSON records row-wise and wrap the result in a data frame
json_records <- do.call("rbind", json_data_raw)
json_nested_df <- data.frame(json_records)
# Unnest the four columns and turn the result back into a plain data frame
json_data_df <- json_nested_df %>%
  unnest(cols = c(Title, Authors, Publisher, Year)) %>%
  data.frame()
json_data_df
## Title Authors Publisher Year
## 1 Introduction to Probability Grinstead, C. Snell, J AMS 1997
## 2 A First Course in Linear Algebra Robert A. Beezer Cambridge 2008
## 3 R for Data Science G. Grolemund, Wickham H. O'Reilly 2017
## 4 Linear Regression using R David J. Lilja Minest. Uni. 2016
library(diffobj)
# Exact comparison of the XML-derived and HTML-derived data frames
identical(x = xml_data_df, y = html_data_df)
## [1] TRUE
# Exact comparison of the XML-derived and JSON-derived data frames
identical(x = xml_data_df, y = json_data_df)
## [1] FALSE
# Exact comparison of the JSON-derived and HTML-derived data frames
identical(x = json_data_df, y = html_data_df)
## [1] FALSE
# Show where the XML-derived and JSON-derived data frames differ
diffObj(xml_data_df, json_data_df)
## < str(xml_data_df) > str(json_data_df)
## @@ 3,3 @@ @@ 3,3 @@
## $ Authors : chr "Grinstead, C. Sne $ Authors : chr "Grinstead, C. Sne
## ll, J" "Robert A. Beezer" "G. Grolemu ll, J" "Robert A. Beezer" "G. Grolemu
## nd, Wickham H." "David J. Lilja" nd, Wickham H." "David J. Lilja"
## $ Publisher: chr "AMS" "Cambridge" $ Publisher: chr "AMS" "Cambridge"
## "O'Reilly" "Minest. Uni." "O'Reilly" "Minest. Uni."
## < $ Year : chr "1997" "2008" "201 > $ Year : num 1997 2008 2017 201
## : 7" "2016" : 6
# Show where the JSON-derived and HTML-derived data frames differ
diffObj(json_data_df, html_data_df)
## < str(json_data_df) > str(html_data_df)
## @@ 3,3 @@ @@ 3,3 @@
## $ Authors : chr "Grinstead, C. Sne $ Authors : chr "Grinstead, C. Sne
## ll, J" "Robert A. Beezer" "G. Grolemu ll, J" "Robert A. Beezer" "G. Grolemu
## nd, Wickham H." "David J. Lilja" nd, Wickham H." "David J. Lilja"
## $ Publisher: chr "AMS" "Cambridge" $ Publisher: chr "AMS" "Cambridge"
## "O'Reilly" "Minest. Uni." "O'Reilly" "Minest. Uni."
## < $ Year : num 1997 2008 2017 201 > $ Year : chr "1997" "2008" "201
## : 6 : 7" "2016"
The only notable difference between the data frames is the ‘Year’ column: it has type chr in the HTML and XML data frames, whereas its type in the JSON data frame is num.