loading packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(XML)
## Warning: package 'XML' was built under R version 4.0.4
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten

functions used

# function to transform data in list format to data frame 

list_to_df = function(list,format){
  len = length(list)
  
  #create empty df
  
  books_df = data.frame(matrix(ncol = length(format)))
  names(books_df) = format
  
  #transform items in list to rows
  
  for (i in seq(len)){
    books = data.frame(list[[i]])
    if (length(books) > length(format)){
      books = books %>%
        #hard coded for author column(s)
        pivot_longer(c(2,3),values_to = 'author')
      books = books[format]
    }
    books_df = union(books_df,books)
  }
  return(na.omit(books_df))
}

Looking at NoSQL data types

data

json = 'https://raw.githubusercontent.com/schoolkidrich/R/main/DATA%20607/week7/books.json'

# xml and html files fail to load through github so we have to download then get them locally

html_url = 'https://raw.githubusercontent.com/schoolkidrich/R/main/DATA%20607/week7/books.html'
download.file(html_url,destfile = "books.html")
html = 'books.html'

xml_url = 'https://raw.githubusercontent.com/schoolkidrich/R/main/DATA%20607/week7/books.xml'
download.file(xml_url,destfile = "books.xml")
xml = 'books.xml'

looking to transform data from various (xml, html, json) sources to R data frames

xml

# formatting my table
format = c('title','author','year','cost')

#loading xml results as a list
xml_list = xmlToList(xml)

#transforming list to df
books_xml = list_to_df(xml_list,format)
head(books_xml)
## # A tibble: 4 x 4
##   title                          author              year  cost 
##   <chr>                          <chr>               <chr> <chr>
## 1 Don Quioxte (Penguin Classics) Miguel de Cervantes 2003  0.99 
## 2 Don Quioxte (Penguin Classics) John Rutherford     2003  0.99 
## 3 Oh, the Places You'll Go!      Dr. Suess           1990  9.99 
## 4 The Cat in The Hat             Dr. Suess           1957  7.99

json

# loading json as a list
json_list = fromJSON(json,simplifyVector = FALSE)$books

# transforming list to df
books_json = list_to_df(json_list,format)
head(books_json)
## # A tibble: 4 x 4
##   title                          author              year  cost 
##   <chr>                          <chr>               <chr> <chr>
## 1 Don Quioxte (Penguin Classics) Miguel de Cervantes 2003  0.99 
## 2 Don Quioxte (Penguin Classics) John Rutherford     2003  0.99 
## 3 Oh, the Places You'll Go!      Dr. Suess           1990  9.99 
## 4 The Cat in The Hat             Dr. Suess           1957  7.99

html

#loading HTML file
books_html = readHTMLTable(html)$'NULL'

#HTML already loads nicely however there are two authors for the first entry
head(books_html)
##                            title                              author year cost
## 1 Don Quioxte (Penguin Classics) Miguel de Cervantes,John Rutherford 2003 0.99
## 2      Oh, the Places You'll Go!                           Dr. Suess 1990 9.99
## 3             The Cat in The Hat                           Dr. Suess 1957 7.99
#splitting authors by comma
authors = books_html$author %>%
  strsplit(',')

title = c()
year = c()
cost = c()
author = unlist(authors)

for (i in seq(length(authors))){
   len = length(authors[[i]])
   title = c(title,rep(books_html$'title'[i],len))
   year = c(year,rep(books_html$'year'[i],len))
   cost = c(cost,rep(books_html$'cost'[i],len))
}

#new df with author split
books_html2 = data.frame(title,author,year,cost)
head(books_html2)
##                            title              author year cost
## 1 Don Quioxte (Penguin Classics) Miguel de Cervantes 2003 0.99
## 2 Don Quioxte (Penguin Classics)     John Rutherford 2003 0.99
## 3      Oh, the Places You'll Go!           Dr. Suess 1990 9.99
## 4             The Cat in The Hat           Dr. Suess 1957 7.99

Comparing tables

At face value all of the tables transformed from various source (json, xml, and html) look the same.

# json df vs xml df
books_json == books_xml
##      title author year cost
## [1,]  TRUE   TRUE TRUE TRUE
## [2,]  TRUE   TRUE TRUE TRUE
## [3,]  TRUE   TRUE TRUE TRUE
## [4,]  TRUE   TRUE TRUE TRUE
# json df vs html df
books_json == books_html2
##      title author year cost
## [1,]  TRUE   TRUE TRUE TRUE
## [2,]  TRUE   TRUE TRUE TRUE
## [3,]  TRUE   TRUE TRUE TRUE
## [4,]  TRUE   TRUE TRUE TRUE
# html df vs xml df
books_html2 == books_xml
##      title author year cost
## [1,]  TRUE   TRUE TRUE TRUE
## [2,]  TRUE   TRUE TRUE TRUE
## [3,]  TRUE   TRUE TRUE TRUE
## [4,]  TRUE   TRUE TRUE TRUE

As you can see, all the dataframes are the same