loading packages

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(XML)

## Warning: package 'XML' was built under R version 4.0.4

library(jsonlite)

## 
## Attaching package: 'jsonlite'

## The following object is masked from 'package:purrr':
## 
##     flatten

functions used

# function to transform data in list format to data frame 

list_to_df = function(list,format){
  len = length(list)
  
  #create empty df
  
  books_df = data.frame(matrix(ncol = length(format)))
  names(books_df) = format
  
  #transform items in list to rows
  
  for (i in seq(len)){
    books = data.frame(list[[i]])
    if (length(books) > length(format)){
      books = books %>%
        #hard coded for author column(s)
        pivot_longer(c(2,3),values_to = 'author')
      books = books[format]
    }
    books_df = union(books_df,books)
  }
  return(na.omit(books_df))
}

Looking at NoSQL data types

data

json = 'https://raw.githubusercontent.com/schoolkidrich/R/main/DATA%20607/week7/books.json'

# xml and html files fail to load through github so we have to download then get them locally

html_url = 'https://raw.githubusercontent.com/schoolkidrich/R/main/DATA%20607/week7/books.html'
download.file(html_url,destfile = "books.html")
html = 'books.html'

xml_url = 'https://raw.githubusercontent.com/schoolkidrich/R/main/DATA%20607/week7/books.xml'
download.file(xml_url,destfile = "books.xml")
xml = 'books.xml'

looking to transform data from various (xml, html, json) sources to R data frames

xml

# formatting my table
format = c('title','author','year','cost')

#loading xml results as a list
xml_list = xmlToList(xml)

#transforming list to df
books_xml = list_to_df(xml_list,format)
head(books_xml)

## # A tibble: 4 x 4
##   title                          author              year  cost 
##   <chr>                          <chr>               <chr> <chr>
## 1 Don Quioxte (Penguin Classics) Miguel de Cervantes 2003  0.99 
## 2 Don Quioxte (Penguin Classics) John Rutherford     2003  0.99 
## 3 Oh, the Places You'll Go!      Dr. Suess           1990  9.99 
## 4 The Cat in The Hat             Dr. Suess           1957  7.99

json

# loading json as a list
json_list = fromJSON(json,simplifyVector = FALSE)$books

# transforming list to df
books_json = list_to_df(json_list,format)
head(books_json)

## # A tibble: 4 x 4
##   title                          author              year  cost 
##   <chr>                          <chr>               <chr> <chr>
## 1 Don Quioxte (Penguin Classics) Miguel de Cervantes 2003  0.99 
## 2 Don Quioxte (Penguin Classics) John Rutherford     2003  0.99 
## 3 Oh, the Places You'll Go!      Dr. Suess           1990  9.99 
## 4 The Cat in The Hat             Dr. Suess           1957  7.99

html

#loading HTML file
books_html = readHTMLTable(html)$'NULL'

#HTML already loads nicely however there are two authors for the first entry
head(books_html)

##                            title                              author year cost
## 1 Don Quioxte (Penguin Classics) Miguel de Cervantes,John Rutherford 2003 0.99
## 2      Oh, the Places You'll Go!                           Dr. Suess 1990 9.99
## 3             The Cat in The Hat                           Dr. Suess 1957 7.99

#splitting authors by comma
authors = books_html$author %>%
  strsplit(',')

title = c()
year = c()
cost = c()
author = unlist(authors)

for (i in seq(length(authors))){
   len = length(authors[[i]])
   title = c(title,rep(books_html$'title'[i],len))
   year = c(year,rep(books_html$'year'[i],len))
   cost = c(cost,rep(books_html$'cost'[i],len))
}

#new df with author split
books_html2 = data.frame(title,author,year,cost)
head(books_html2)

##                            title              author year cost
## 1 Don Quioxte (Penguin Classics) Miguel de Cervantes 2003 0.99
## 2 Don Quioxte (Penguin Classics)     John Rutherford 2003 0.99
## 3      Oh, the Places You'll Go!           Dr. Suess 1990 9.99
## 4             The Cat in The Hat           Dr. Suess 1957 7.99

Comparing tables

At face value all of the tables transformed from various source (json, xml, and html) look the same.

# json df vs xml df
books_json == books_xml

##      title author year cost
## [1,]  TRUE   TRUE TRUE TRUE
## [2,]  TRUE   TRUE TRUE TRUE
## [3,]  TRUE   TRUE TRUE TRUE
## [4,]  TRUE   TRUE TRUE TRUE

# json df vs html df
books_json == books_html2

##      title author year cost
## [1,]  TRUE   TRUE TRUE TRUE
## [2,]  TRUE   TRUE TRUE TRUE
## [3,]  TRUE   TRUE TRUE TRUE
## [4,]  TRUE   TRUE TRUE TRUE

# html df vs xml df
books_html2 == books_xml

##      title author year cost
## [1,]  TRUE   TRUE TRUE TRUE
## [2,]  TRUE   TRUE TRUE TRUE
## [3,]  TRUE   TRUE TRUE TRUE
## [4,]  TRUE   TRUE TRUE TRUE

data607week7

Richard

3/19/2021

loading packages

functions used

Looking at NoSQL data types

data

looking to transform data from various (xml, html, json) sources to R data frames

xml

json

html

Comparing tables

At face value all of the tables transformed from various source (json, xml, and html) look the same.

As you can see, all the dataframes are the same