Working with JSON in R

The Files

Reading from JSON

library(tidyjson)

## 
## Attaching package: 'tidyjson'

## The following object is masked from 'package:stats':
## 
##     filter

# Location of the raw JSON file describing the book.
link <- "https://raw.githubusercontent.com/st3vejobs/Data-607-Working-with-JSON-in-R-/main/books.json"

# Parse the remote JSON with jsonlite and coerce the result to a data frame
# in a single step, then display it.
jsonbooks <- data.frame(jsonlite::fromJSON(link))
jsonbooks

##                                title pages year              author
## 1 Astrophysics for People in a Hurry   144 2017 Neil DeGrasse Tyson

Reading from XML

library(xml2)
library(XML)
library(RCurl)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::complete() masks RCurl::complete()
## x dplyr::filter()   masks tidyjson::filter(), stats::filter()
## x dplyr::lag()      masks stats::lag()

# Download the raw XML document as text.
link <- "https://raw.githubusercontent.com/st3vejobs/Data-607-Working-with-JSON-in-R-/main/books.xml"
data <- getURL(link)

# Parse the text, convert the tree to a nested list, and flatten it into a
# named vector; only the first four entries are kept.
xmllist <- xmlToList(xmlParse(data))
xmlunlist <- unlist(xmllist)[1:4]

# A one-column data frame transposed gives a single row with one column per
# kept entry; the leftover row name is then cleared before display.
xmlbooks <- data.frame(t(data.frame(xmlunlist)))
rownames(xmlbooks) <- NULL
xmlbooks

##                        title pages year          author
## 1 On The Shoulders of Giants  1280 2003 Stephen Hawking

Reading from HTML

library(textreadr)

## 
## Attaching package: 'textreadr'

## The following objects are masked from 'package:xml2':
## 
##     read_html, read_xml

library(stringr)
# Read the HTML page as text. read_html() here is the textreadr version,
# which masks xml2::read_html() once textreadr is attached.
link <- "https://raw.githubusercontent.com/st3vejobs/Data-607-Working-with-JSON-in-R-/main/books.html"
rawHTML <- read_html(link)
booksraw <- unlist(rawHTML)

# Only the first four text elements are used, one per book attribute.
fields <- booksraw[1:4]

# The first run of letters in each element is taken as the column name.
# stringr functions are vectorized, so all four elements are handled at once.
header <- str_extract(fields, "[A-Za-z]+")

# Remove that leading label, strip punctuation, and trim the whitespace that
# is left behind so the values carry no leading/trailing blanks.
# NOTE(review): stripping every punctuation character also removes
# apostrophes and separators inside the values themselves (the rendered
# output shows "Astronomers" and the author names run together) -- confirm
# that this is acceptable.
books <- str_remove(fields, "[A-Za-z]+")
books <- str_remove_all(books, "[[:punct:]]")
books <- str_trim(books)

# Transposing the vector yields a one-row matrix, which becomes a one-row
# data frame; the extracted labels are then applied as column names.
bookshtml <- data.frame(t(books))
colnames(bookshtml) <- header
bookshtml

##                             title  year pages                       author
## 1  The Backyard Astronomers Guide  1991   416  Terence Dickinson Alan Dyer

Comments

The three source files did not load in the same manner, and each required a different technique to tidy it into a matching data frame. JSON was the simplest format to turn into a data frame. In the end, all three data frames contain the same four fields (title, pages, year, and author), although the HTML version lists year before pages.