Introduction
In this assignment, we load book data from two different formats: HTML and JSON, and compare the resulting data frames in R.
# LOAD LIBRARY
library (rvest)
library (jsonlite)
library (dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# READ HTML
# Parse books.html, keep the first table found in the document, and store
# `year` as an integer column before rendering the result with kable().
html_raw <- read_html("books.html")
df_html <- mutate(
  html_table(html_raw)[[1]],
  year = as.integer(year)
)
knitr::kable(df_html, caption = "Data Frame from HTML")
Data Frame from HTML
The Notebook
Nicholas Sparks
1996
Pride and Prejudice
Jane Austen
1813
Wild Roses
Nora Roberts
2004
# READ JSON
# Parse books.json into a data frame, flatten `authors` to one string per
# row when it arrives as a list column, and store `year` as an integer.
df_json <- fromJSON("books.json")
if (is.list(df_json$authors)) {
  # vapply() guarantees a character vector with exactly one string per row
  # (sapply() would return list() for an empty input and could silently
  # change shape); paste(collapse = ) always yields a single string.
  df_json$authors <- vapply(
    df_json$authors,
    paste,
    FUN.VALUE = character(1),
    collapse = ", "
  )
}
df_json <- df_json %>% mutate(year = as.integer(year))
knitr::kable(df_json, caption = "Data Frame from JSON")
Data Frame from JSON
The Notebook
Nicholas Sparks
1996
Pride and Prejudice
Jane Austen
1813
Wild Roses
Nora Roberts
2004
# Comparison tables
# all.equal() returns TRUE when the two data frames match; otherwise it
# returns a character vector describing each difference it found.
identical_check <- all.equal(df_html, df_json)
print(paste(
  "Are the data frames identical?",
  if (isTRUE(identical_check)) "Yes!" else "No, there are small differences."
))
if (!isTRUE(identical_check)) {
  # Show the actual differences rather than only stating that some exist,
  # so the reader can see whether they concern the data or just attributes
  # such as the object's class.
  print(identical_check)
}
[1] "Are the data frames identical? No, there are small differences."
[1] "Attributes: < Component \"class\": Lengths (3, 1) differ (string compare on first 1) >"
[2] "Attributes: < Component \"class\": 1 string mismatch >"