library(tidyverse)
library(wrapr)
library(httr)
library(XML)
library(jsonlite)
library(RCurl)
# Download books.html and turn its table into a data frame.
html_url <- "https://raw.githubusercontent.com/myvioletrose/school_of_professional_studies/master/607.%20Data%20Acquisition%20and%20Management/Assignments/Week%207_20190313/data/books.html"
html <- GET(html_url) %>%
  # Fail loudly on a 4xx/5xx response instead of parsing an error page.
  stop_for_status() %>%
  # Decode the body explicitly as UTF-8 rather than relying on rawToChar().
  httr::content(as = "text", encoding = "UTF-8") %>%
  # The input is markup text, not a file path.
  htmlParse(asText = TRUE) %>%
  # `which = 1` yields the first table as a data frame directly, so the list
  # returned by readHTMLTable() no longer has to be flattened by data.frame().
  readHTMLTable(which = 1, stringsAsFactors = FALSE)
# Use the same column names as the XML and JSON versions of the data.
names(html) <- c("Title", "Author", "Genre", "Year", "ISBN")
html
## Title Author
## 1 On the Origin of Species Charles Darwin
## 2 The Communist Manifesto Karl Marx, Friedrich Engels
## 3 Civilization and Its Discontents Sigmund Freud
## Genre Year ISBN
## 1 Anti-Religious Study 1859 9781515383284
## 2 Political Thriller, Satire 1848 9780140447576
## 3 Clinical History of the West 1930 9781453833896
# Download books.xml and convert its records into a data frame.
xml_url <- "https://raw.githubusercontent.com/myvioletrose/school_of_professional_studies/master/607.%20Data%20Acquisition%20and%20Management/Assignments/Week%207_20190313/data/books.xml"
xml <- GET(xml_url) %>%
  # Fail loudly on a 4xx/5xx response instead of parsing an error page.
  stop_for_status() %>%
  # Decode the body explicitly as UTF-8 rather than relying on rawToChar().
  httr::content(as = "text", encoding = "UTF-8") %>%
  # The input is markup text, not a file path.
  xmlParse(asText = TRUE) %>%
  # Each child of the root node becomes one row; keep values as character.
  xmlToDataFrame(stringsAsFactors = FALSE)
xml
## Title Author
## 1 On the Origin of Species Charles Darwin
## 2 The Communist Manifesto Karl Marx, Friedrich Engels
## 3 Civilization and Its Discontents Sigmund Freud
## Genre Year ISBN
## 1 Anti-Religious Study 1859 9781515383284
## 2 Political Thriller, Satire 1848 9780140447576
## 3 Clinical History of the West 1930 9781453833896
# Download books.json and convert it into a data frame.
json_url <- "https://raw.githubusercontent.com/myvioletrose/school_of_professional_studies/master/607.%20Data%20Acquisition%20and%20Management/Assignments/Week%207_20190313/data/books.json"
json <- GET(json_url) %>%
  # Fail loudly on a 4xx/5xx response instead of parsing an error page.
  stop_for_status() %>%
  # Decode the body explicitly as UTF-8 rather than relying on rawToChar().
  httr::content(as = "text", encoding = "UTF-8") %>%
  fromJSON() %>%
  # Coerce whatever structure fromJSON() simplified to into a data frame,
  # keeping text columns as character.
  data.frame(stringsAsFactors = FALSE)
json
## Title Author
## 1 On the Origin of Species Charles Darwin
## 2 The Communist Manifesto Karl Marx, Friedrich Engels
## 3 Civilization and Its Discontents Sigmund Freud
## Genre Year ISBN
## 1 Anti-Religious Study 1859 9781515383284
## 2 Political Thriller, Satire 1848 9780140447576
## 3 Clinical History of the West 1930 9781453833896
# Cell-by-cell comparison of the three data frames, pair by pair.
# Each `==` yields a logical matrix; all() is TRUE only when every cell of
# every pairwise comparison is TRUE.
all(
  html == xml,
  xml == json,
  json == html
)
## [1] TRUE
All three input files (books.html, books.xml, and books.json) produce identical output.