Shana Green

DATA 607 - Homework 7

Due Date: 10/10/2020

library(jsonlite)
library(rvest)
## Loading required package: xml2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(RCurl)
library(stringr)

Loading books from an HTML file

# Read the books page from GitHub, extract its table content with
# html_table(), and convert the result into a data frame.
book_a <- read_html(
  "https://raw.githubusercontent.com/sagreen131/DATA-607-HW-7/main/books.html"
) %>%
  html_table(fill = TRUE) %>%
  as.data.frame()

# Label the columns by position, then select them in the reporting order.
htmldf <- book_a
colnames(htmldf) <- c("Title", "Authors", "ISBN", "Edition", "Topics")
htmldf <- htmldf %>%
  select(Title, Authors, ISBN, Edition, Topics)

# Display the HTML-derived data frame.
htmldf

Loading books from an XML file

library(XML)
## 
## Attaching package: 'XML'
## The following object is masked from 'package:rvest':
## 
##     xml
# Read the books XML document from GitHub.
book_b <- read_xml(
  "https://raw.githubusercontent.com/sagreen131/DATA-607-HW-7/main/books.xml"
)

# Re-parse the document with xmlParse() so that xmlToDataFrame() can
# turn its records into rows.
book_b_2 <- xmlParse(book_b)
xmldf <- xmlToDataFrame(book_b_2)

# Label the columns by position so they match the other data frames.
colnames(xmldf) <- c("Title", "Authors", "ISBN", "Edition", "Topics")

# Display the XML-derived data frame.
xmldf

Loading books from a JSON file

# Parse the books JSON file from GitHub.
book_c <- fromJSON(
  "https://raw.githubusercontent.com/sagreen131/DATA-607-HW-7/main/books.json"
)

# Coerce the parsed result to a data frame and label the columns by
# position so they match the other data frames.
jsondf <- as.data.frame(book_c)
colnames(jsondf) <- c("Title", "Authors", "ISBN", "Edition", "Topics")

# Display the JSON-derived data frame.
jsondf

Summary

The three data frames are very similar. The only difference I saw was that the HTML data frame displayed the ISBN as a double in scientific notation rather than as a character string. I thought it would make sense to assign column names in HTML and XML.

Github here

Rpubs here