library(RCurl)
library(XML)
library(jsonlite)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::complete() masks RCurl::complete()
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag() masks stats::lag()
library(rlist)
library(DT)
# Loading file ----
# getURL() returns the body of the page as one string (so, despite its name,
# html_url holds HTML content, not a URL); readHTMLTable() parses that string.
# TLS certificate verification is kept on: turning it off would let any
# intermediary substitute the downloaded content unnoticed.
html_url <- getURL(
  "https://raw.githubusercontent.com/rossboehme/DATA607/main/assignment5/html-books.html",
  .opts = list(ssl.verifypeer = TRUE)
)
htmlbooks <- readHTMLTable(html_url, header = TRUE)
# Drop NULL entries from the list of parsed tables, then bind what is left
# into a single data frame.
htmlbooks <- as.data.frame(list.clean(htmlbooks, fun = is.null, recursive = FALSE))
# Transpose the table and re-apply the original column names; t() goes through
# a matrix, so the resulting columns need their types restored afterwards.
html_t <- as.data.frame(t(htmlbooks))
names(html_t) <- names(htmlbooks)
# seq_len() stays correct for a zero-row table, where 1:nrow() would give c(1, 0).
rownames(html_t) <- seq_len(nrow(html_t))
htmlbooks <- html_t

# Cleaning file ----
# Strip the "RossBooks." prefix that as.data.frame() put in front of each name.
names(htmlbooks) <- str_replace_all(string = names(htmlbooks), pattern = "^RossBooks.", replacement = "")
# Page.Count is the only numeric field; convert it before the names are changed.
htmlbooks$Page.Count <- as.numeric(htmlbooks$Page.Count)
# Dots in the syntactic names become spaces ...
names(htmlbooks) <- str_replace_all(string = names(htmlbooks), pattern = "\\.", replacement = " ")
# ... and a lone single character between whitespace becomes "(s)", which is
# what produces the "Author(s)" and "Genre(s)" headings.
names(htmlbooks) <- str_replace_all(string = names(htmlbooks), pattern = "\\s\\w\\s", replacement = "(s)")
# Show the cleaned HTML-derived data frame (column types are reported by the
# summary() call further down).
print(htmlbooks)
## Book Title
## 1 Naked Economics: Undressing the Dismal Science
## 2 Principles for Dealing with the Changing World Order: Why Nations Succeed and Fail
## 3 Basic Economics
## 4 The Second Machine Age: Work, Progress, and Prosperity in a Time of Brilliant Technologies
## 5 Money: The True Story of a Made-Up Thing
## Author(s) Genre(s) Review Page Count
## 1 Charles Wheelan Economics 4/5 400
## 2 Ray Dalio Economics; History 3.5/5 576
## 3 Thomas Sowell Economics 5/5 448
## 4 Erik Brynjolfsson; Andrew McAfee Technology; Economics 4.5/5 336
## 5 Jacob Goldstein Economics; History 4/5 272
# Per-column summary: class/mode for the character columns, quartiles and mean
# for the numeric Page Count column.
summary(htmlbooks)
## Book Title Author(s) Genre(s) Review
## Length:5 Length:5 Length:5 Length:5
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Page Count
## Min. :272.0
## 1st Qu.:336.0
## Median :400.0
## Mean :406.4
## 3rd Qu.:448.0
## Max. :576.0
# Loading file ----
# Download to a temporary file so the script leaves nothing behind in the
# working directory; mode = "wb" writes the bytes unchanged on every platform.
xml_path <- tempfile(fileext = ".xml")
download.file(
  "https://raw.githubusercontent.com/rossboehme/DATA607/main/assignment5/xml-books.xml",
  destfile = xml_path,
  mode = "wb"
)
# Keep the parsed document under its own name instead of reusing `xmlbooks`
# for two different kinds of object.
xml_doc <- xmlParse(xml_path)
xmlbooks <- xmlToDataFrame(xml_doc)
# The document is fully parsed at this point, so the download can be removed.
unlink(xml_path)

# Cleaning file ----
xmlbooks$page_count <- as.numeric(xmlbooks$page_count)
# Positional rename: assumes the XML fields come in the same order as the
# columns of htmlbooks -- verify if the source files change.
names(xmlbooks) <- names(htmlbooks)
# Show the XML-derived data frame (column types are reported by the summary()
# call further down).
xmlbooks
## Book Title
## 1 Naked Economics: Undressing the Dismal Science
## 2 Principles for Dealing with the Changing World Order: Why Nations Succeed and Fail
## 3 Basic Economics
## 4 The Second Machine Age: Work, Progress, and Prosperity in a Time of Brilliant Technologies
## 5 Money: The True Story of a Made-Up Thing
## Author(s) Genre(s) Review Page Count
## 1 Charles Wheelan Economics 4/5 400
## 2 Ray Dalio Economics; History 3.5/5 576
## 3 Thomas Sowell Economics 5/5 448
## 4 Erik Brynjolfsson; Andrew McAfee Technology; Economics 4.5/5 336
## 5 Jacob Goldstein Economics; History 4/5 272
# Per-column summary: class/mode for the character columns, quartiles and mean
# for the numeric Page Count column.
summary(xmlbooks)
## Book Title Author(s) Genre(s) Review
## Length:5 Length:5 Length:5 Length:5
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Page Count
## Min. :272.0
## 1st Qu.:336.0
## Median :400.0
## Mean :406.4
## 3rd Qu.:448.0
## Max. :576.0
# Loading file ----
# fromJSON() fetches and parses the document straight from the URL; the result
# is coerced to a plain data frame.
jsonbooks <- as.data.frame(
  fromJSON("https://raw.githubusercontent.com/rossboehme/DATA607/main/assignment5/json-books.json")
)

# Cleaning file ----
# Take over the display names of htmlbooks; the assignment is positional.
names(jsonbooks) <- names(htmlbooks)
# Show the JSON-derived data frame (column types are reported by the summary()
# call further down).
jsonbooks
## Book Title
## 1 Naked Economics: Undressing the Dismal Science
## 2 Principles for Dealing with the Changing World Order: Why Nations Succeed and Fail
## 3 Basic Economics
## 4 The Second Machine Age: Work, Progress, and Prosperity in a Time of Brilliant Technologies
## 5 Money: The True Story of a Made-Up Thing
## Author(s) Genre(s) Review Page Count
## 1 Charles Wheelan Economics 4/5 400
## 2 Ray Dalio Economics; History 3.5/5 576
## 3 Thomas Sowell Economics 5/5 448
## 4 Erik Brynjolfsson; Andrew McAfee Technology; Economics 4.5/5 336
## 5 Jacob Goldstein Economics; History 4/5 272
# Per-column summary: class/mode for the character columns, quartiles and mean
# for the numeric Page Count column.
summary(jsonbooks)
## Book Title Author(s) Genre(s) Review
## Length:5 Length:5 Length:5 Length:5
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Page Count
## Min. :272.0
## 1st Qu.:336.0
## Median :400.0
## Mean :406.4
## 3rd Qu.:448.0
## Max. :576.0
The three data frames are identical, as the outputs of the chunk below show.
# Cell-by-cell comparison: a cell is TRUE only where the HTML value equals both
# the XML value and the JSON value.
htmlbooks == xmlbooks & htmlbooks == jsonbooks
## Book Title Author(s) Genre(s) Review Page Count
## 1 TRUE TRUE TRUE TRUE TRUE
## 2 TRUE TRUE TRUE TRUE TRUE
## 3 TRUE TRUE TRUE TRUE TRUE
## 4 TRUE TRUE TRUE TRUE TRUE
## 5 TRUE TRUE TRUE TRUE TRUE
# Compare the summary tables cell by cell. The character columns have fewer
# summary rows than Page Count, so their remaining cells are NA; NA == NA is NA
# rather than TRUE, which is why the last three rows print NA for those columns.
summary(xmlbooks) == summary(htmlbooks) & summary(htmlbooks) == summary(jsonbooks)
## Book Title Author(s) Genre(s) Review Page Count
## TRUE TRUE TRUE TRUE TRUE
## TRUE TRUE TRUE TRUE TRUE
## TRUE TRUE TRUE TRUE TRUE
## NA NA NA NA TRUE
## NA NA NA NA TRUE
## NA NA NA NA TRUE
# Column names match by construction: the names of xmlbooks and jsonbooks were
# assigned from names(htmlbooks) during cleaning.
names(htmlbooks) == names(xmlbooks) & names(htmlbooks) == names(jsonbooks)
## [1] TRUE TRUE TRUE TRUE TRUE
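This assignment demonstrated that structured JSON and XML data are easier to work with than HTML data: the HTML table had to be cleaned of empty entries, transposed, and have its column names repaired in several steps, whereas the XML and JSON files each loaded into a data frame in one or two calls.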