Assignment 5

Loading Libraries

library(RCurl)
library(XML)
library(jsonlite)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::complete() masks RCurl::complete()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ purrr::flatten()  masks jsonlite::flatten()
## ✖ dplyr::lag()      masks stats::lag()
library(rlist)
library(DT)

Book Files to Read

I created the three source files by hand in VS Code, one per format: HTML, XML, and JSON.

Loading HTML Book File

# Loading File
# Fetch the raw HTML text over HTTPS. Certificate verification is left at
# RCurl's default (enabled) instead of being switched off, so a tampered or
# intercepted response is rejected rather than silently parsed.
html_url <- getURL("https://raw.githubusercontent.com/rossboehme/DATA607/main/assignment5/html-books.html")
# Parse the tables out of the HTML, drop NULL entries from the result, and
# flatten what is left into one data frame.
htmlbooks <- readHTMLTable(html_url, header = TRUE)
htmlbooks <- as.data.frame(list.clean(htmlbooks, fun = is.null, recursive = FALSE))
# Transpose so the data frame has the orientation used in the rest of the
# report. t() returns a character matrix, so every column is character here.
html_t <- as.data.frame(t(htmlbooks))
# NOTE(review): reusing the pre-transpose names only lines up when the table
# has as many rows as columns (5 x 5 here) - confirm before adding books.
names(html_t) <- names(htmlbooks)
# seq_len() is safe even for a zero-row table, unlike 1:nrow().
rownames(html_t) <- seq_len(nrow(html_t))
htmlbooks <- html_t
# Cleaning File
# Strip the "RossBooks." prefix; the dot is escaped so it matches only a
# literal dot.
names(htmlbooks) <- str_replace_all(string = names(htmlbooks), pattern = "^RossBooks\\.", replacement = "")
# Page counts arrive as text after the transpose; make them numeric.
htmlbooks$Page.Count <- as.numeric(htmlbooks$Page.Count)
# Dots in the names become spaces ("Author.s." -> "Author s ").
names(htmlbooks) <- str_replace_all(string = names(htmlbooks), pattern = "\\.", replacement = " ")
# Restore the "(s)" suffix. The pattern is anchored to a trailing " s " so it
# cannot rewrite an unrelated single-letter word in the middle of a name.
names(htmlbooks) <- str_replace_all(string = names(htmlbooks), pattern = "\\ss\\s$", replacement = "(s)")
# Showing file and data types
print(htmlbooks)
##                                                                                   Book Title
## 1                                             Naked Economics: Undressing the Dismal Science
## 2         Principles for Dealing with the Changing World Order: Why Nations Succeed and Fail
## 3                                                                            Basic Economics
## 4 The Second Machine Age: Work, Progress, and Prosperity in a Time of Brilliant Technologies
## 5                                                   Money: The True Story of a Made-Up Thing
##                          Author(s)              Genre(s) Review Page Count
## 1                  Charles Wheelan             Economics    4/5        400
## 2                        Ray Dalio    Economics; History  3.5/5        576
## 3                    Thomas Sowell             Economics    5/5        448
## 4 Erik Brynjolfsson; Andrew McAfee Technology; Economics  4.5/5        336
## 5                  Jacob Goldstein    Economics; History    4/5        272
# Per-column summary of the HTML-derived data frame, used to check each
# column's type (character vs. numeric).
summary(object = htmlbooks)
##   Book Title         Author(s)           Genre(s)            Review         
##  Length:5           Length:5           Length:5           Length:5          
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Page Count   
##  Min.   :272.0  
##  1st Qu.:336.0  
##  Median :400.0  
##  Mean   :406.4  
##  3rd Qu.:448.0  
##  Max.   :576.0

Loading XML Book File

# Loading File
# Save the XML to a local file first, then parse that file and convert the
# parsed document to a data frame in one step.
download.file(
  url = "https://raw.githubusercontent.com/rossboehme/DATA607/main/assignment5/xml-books.xml",
  destfile = "xmlbooks.xml"
)
xmlbooks <- xmlToDataFrame(xmlParse("xmlbooks.xml"))
# Cleaning File
# Convert the page counts to numeric.
xmlbooks[["page_count"]] <- as.numeric(xmlbooks[["page_count"]])
# Take the display column names from htmlbooks (assigned by position).
colnames(xmlbooks) <- colnames(htmlbooks)
# Showing File and Data Types
print(xmlbooks)
##                                                                                   Book Title
## 1                                             Naked Economics: Undressing the Dismal Science
## 2         Principles for Dealing with the Changing World Order: Why Nations Succeed and Fail
## 3                                                                            Basic Economics
## 4 The Second Machine Age: Work, Progress, and Prosperity in a Time of Brilliant Technologies
## 5                                                   Money: The True Story of a Made-Up Thing
##                          Author(s)              Genre(s) Review Page Count
## 1                  Charles Wheelan             Economics    4/5        400
## 2                        Ray Dalio    Economics; History  3.5/5        576
## 3                    Thomas Sowell             Economics    5/5        448
## 4 Erik Brynjolfsson; Andrew McAfee Technology; Economics  4.5/5        336
## 5                  Jacob Goldstein    Economics; History    4/5        272
# Per-column summary of the XML-derived data frame, used to check each
# column's type (character vs. numeric).
summary(object = xmlbooks)
##   Book Title         Author(s)           Genre(s)            Review         
##  Length:5           Length:5           Length:5           Length:5          
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Page Count   
##  Min.   :272.0  
##  1st Qu.:336.0  
##  Median :400.0  
##  Mean   :406.4  
##  3rd Qu.:448.0  
##  Max.   :576.0

Loading JSON Book File

# Loading File
# Read the JSON straight from its URL and coerce the parsed result to a
# data frame.
jsonbooks <- as.data.frame(
  fromJSON("https://raw.githubusercontent.com/rossboehme/DATA607/main/assignment5/json-books.json")
)
# Cleaning File
# Take the display column names from htmlbooks (assigned by position).
colnames(jsonbooks) <- colnames(htmlbooks)
# Showing file and data types
print(jsonbooks)
##                                                                                   Book Title
## 1                                             Naked Economics: Undressing the Dismal Science
## 2         Principles for Dealing with the Changing World Order: Why Nations Succeed and Fail
## 3                                                                            Basic Economics
## 4 The Second Machine Age: Work, Progress, and Prosperity in a Time of Brilliant Technologies
## 5                                                   Money: The True Story of a Made-Up Thing
##                          Author(s)              Genre(s) Review Page Count
## 1                  Charles Wheelan             Economics    4/5        400
## 2                        Ray Dalio    Economics; History  3.5/5        576
## 3                    Thomas Sowell             Economics    5/5        448
## 4 Erik Brynjolfsson; Andrew McAfee Technology; Economics  4.5/5        336
## 5                  Jacob Goldstein    Economics; History    4/5        272
# Per-column summary of the JSON-derived data frame, used to check each
# column's type (character vs. numeric).
summary(object = jsonbooks)
##   Book Title         Author(s)           Genre(s)            Review         
##  Length:5           Length:5           Length:5           Length:5          
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    Page Count   
##  Min.   :272.0  
##  1st Qu.:336.0  
##  Median :400.0  
##  Mean   :406.4  
##  3rd Qu.:448.0  
##  Max.   :576.0

Comparing Dataframes

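All three data frames are identical, as the outputs of the chunk below show. The NA cells in the summary comparison are not mismatches: the summaries of the character columns have only three filled rows, so the remaining cells are NA, and comparing NA with NA yields NA rather than TRUE.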

# Cell-by-cell check: TRUE where the HTML value matches both the XML and the
# JSON value.
(htmlbooks == xmlbooks) & (htmlbooks == jsonbooks)
##   Book Title Author(s) Genre(s) Review Page Count
## 1       TRUE      TRUE     TRUE   TRUE       TRUE
## 2       TRUE      TRUE     TRUE   TRUE       TRUE
## 3       TRUE      TRUE     TRUE   TRUE       TRUE
## 4       TRUE      TRUE     TRUE   TRUE       TRUE
## 5       TRUE      TRUE     TRUE   TRUE       TRUE
# Same check on the three summary tables. Cells that are NA in the summaries
# compare as NA rather than TRUE.
(summary(xmlbooks) == summary(htmlbooks)) & (summary(htmlbooks) == summary(jsonbooks))
##   Book Title  Author(s)   Genre(s)    Review   Page Count
##         TRUE       TRUE       TRUE      TRUE         TRUE
##         TRUE       TRUE       TRUE      TRUE         TRUE
##         TRUE       TRUE       TRUE      TRUE         TRUE
##           NA         NA         NA        NA         TRUE
##           NA         NA         NA        NA         TRUE
##           NA         NA         NA        NA         TRUE
# Column-name check: TRUE where all three data frames share the same name.
(names(htmlbooks) == names(xmlbooks)) & (names(htmlbooks) == names(jsonbooks))
## [1] TRUE TRUE TRUE TRUE TRUE

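This assignment demonstrated that structured JSON or XML data are easier to work with than HTML data: the JSON and XML files each loaded into a data frame in one or two calls, whereas the HTML table had to be extracted, transposed, and have its column names repaired.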