loading packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(XML)
## Warning: package 'XML' was built under R version 4.0.4
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
functions used
# function to transform data in list format to data frame
list_to_df = function(list,format){
len = length(list)
#create empty df
books_df = data.frame(matrix(ncol = length(format)))
names(books_df) = format
#transform items in list to rows
for (i in seq(len)){
books = data.frame(list[[i]])
if (length(books) > length(format)){
books = books %>%
#hard coded for author column(s)
pivot_longer(c(2,3),values_to = 'author')
books = books[format]
}
books_df = union(books_df,books)
}
return(na.omit(books_df))
}
Looking at NoSQL data types
data
json = 'https://raw.githubusercontent.com/schoolkidrich/R/main/DATA%20607/week7/books.json'
# xml and html files fail to load through github so we have to download then get them locally
html_url = 'https://raw.githubusercontent.com/schoolkidrich/R/main/DATA%20607/week7/books.html'
download.file(html_url,destfile = "books.html")
html = 'books.html'
xml_url = 'https://raw.githubusercontent.com/schoolkidrich/R/main/DATA%20607/week7/books.xml'
download.file(xml_url,destfile = "books.xml")
xml = 'books.xml'
looking to transform data from various (xml, html, json) sources to R data frames
xml
# formatting my table
format = c('title','author','year','cost')
#loading xml results as a list
xml_list = xmlToList(xml)
#transforming list to df
books_xml = list_to_df(xml_list,format)
head(books_xml)
## # A tibble: 4 x 4
## title author year cost
## <chr> <chr> <chr> <chr>
## 1 Don Quioxte (Penguin Classics) Miguel de Cervantes 2003 0.99
## 2 Don Quioxte (Penguin Classics) John Rutherford 2003 0.99
## 3 Oh, the Places You'll Go! Dr. Suess 1990 9.99
## 4 The Cat in The Hat Dr. Suess 1957 7.99
json
# loading json as a list
json_list = fromJSON(json,simplifyVector = FALSE)$books
# transforming list to df
books_json = list_to_df(json_list,format)
head(books_json)
## # A tibble: 4 x 4
## title author year cost
## <chr> <chr> <chr> <chr>
## 1 Don Quioxte (Penguin Classics) Miguel de Cervantes 2003 0.99
## 2 Don Quioxte (Penguin Classics) John Rutherford 2003 0.99
## 3 Oh, the Places You'll Go! Dr. Suess 1990 9.99
## 4 The Cat in The Hat Dr. Suess 1957 7.99
html
#loading HTML file
books_html = readHTMLTable(html)$'NULL'
#HTML already loads nicely however there are two authors for the first entry
head(books_html)
## title author year cost
## 1 Don Quioxte (Penguin Classics) Miguel de Cervantes,John Rutherford 2003 0.99
## 2 Oh, the Places You'll Go! Dr. Suess 1990 9.99
## 3 The Cat in The Hat Dr. Suess 1957 7.99
#splitting authors by comma
authors = books_html$author %>%
strsplit(',')
title = c()
year = c()
cost = c()
author = unlist(authors)
for (i in seq(length(authors))){
len = length(authors[[i]])
title = c(title,rep(books_html$'title'[i],len))
year = c(year,rep(books_html$'year'[i],len))
cost = c(cost,rep(books_html$'cost'[i],len))
}
#new df with author split
books_html2 = data.frame(title,author,year,cost)
head(books_html2)
## title author year cost
## 1 Don Quioxte (Penguin Classics) Miguel de Cervantes 2003 0.99
## 2 Don Quioxte (Penguin Classics) John Rutherford 2003 0.99
## 3 Oh, the Places You'll Go! Dr. Suess 1990 9.99
## 4 The Cat in The Hat Dr. Suess 1957 7.99
Comparing tables
As you can see, all the dataframes are the same