The Files
Reading from JSON
library(tidyjson)
##
## Attaching package: 'tidyjson'
## The following object is masked from 'package:stats':
##
## filter
# Location of the JSON version of the book data.
link <- "https://raw.githubusercontent.com/st3vejobs/Data-607-Working-with-JSON-in-R-/main/books.json"

# Parse the JSON straight from the URL and wrap the result in a data frame.
jsonbooks <- data.frame(jsonlite::fromJSON(link))

# Print the result.
jsonbooks
## title pages year author
## 1 Astrophysics for People in a Hurry 144 2017 Neil DeGrasse Tyson
Reading from XML
library(xml2)
library(XML)
library(RCurl)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::complete() masks RCurl::complete()
## x dplyr::filter() masks tidyjson::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
# Location of the XML version of the book data.
link <- "https://raw.githubusercontent.com/st3vejobs/Data-607-Working-with-JSON-in-R-/main/books.xml"

# Download the document as text, parse it, and flatten the parsed tree into
# a single named vector.
xml_text <- getURL(link)
book_fields <- unlist(xmlToList(xmlParse(xml_text)))

# Only the first four entries are kept; they become the four columns.
book_fields <- book_fields[seq_len(4)]

# Going through a one-column data frame and transposing it turns the entry
# names into column names and the values into a single row.
field_column <- data.frame(book_fields)
xmlbooks <- data.frame(t(field_column))

# The transpose leaves the old column name behind as a row name; reset it.
rownames(xmlbooks) <- NULL

# Print the result.
xmlbooks
## title pages year author
## 1 On The Shoulders of Giants 1280 2003 Stephen Hawking
Reading from HTML
library(textreadr)
##
## Attaching package: 'textreadr'
## The following objects are masked from 'package:xml2':
##
## read_html, read_xml
library(stringr)
# Location of the HTML version of the book data.
link <- "https://raw.githubusercontent.com/st3vejobs/Data-607-Working-with-JSON-in-R-/main/books.html"

# read_html is namespace-qualified because textreadr and xml2 both export a
# function with this name; without the prefix the result depends on which
# package happened to be attached last.
rawHTML <- textreadr::read_html(link)
booksraw <- unlist(rawHTML)

# Only the first four text entries are used, one per output column.
fields <- booksraw[1:4]

# The first run of letters in each entry is taken as the column name.
# stringr functions are vectorised, so all four entries are handled at once.
header <- str_extract(fields, "[A-Za-z]+")

# Remove that leading label, then strip every punctuation character from
# what remains to obtain the cell values.
books <- str_remove_all(str_remove(fields, "[A-Za-z]+"), "[[:punct:]]")

# Build the single-row data frame directly from the values (kept as
# character) instead of rbind-ing onto an empty data frame.
bookshtml <- data.frame(t(books), stringsAsFactors = FALSE)
colnames(bookshtml) <- header

# Print the result.
bookshtml
## title year pages author
## 1 The Backyard Astronomers Guide 1991 416 Terence Dickinson Alan Dyer
Comments
The three files did not load in the same manner, and each required a different technique to tidy it into a comparable data frame. JSON was the simplest format to turn into a data frame. In the end, the final data frames share the same four columns (title, pages, year, author), although the HTML version lists year before pages.