#Pick three of your favorite books on one of your favorite subjects. At least one of the books should have more than one author. For each book, include the title, authors, and two or three other attributes that you find interesting.
#Take the information that you’ve selected about these three books, and separately create three files which store the book’s information in HTML (using an html table), XML, and JSON formats (e.g. “books.html”, “books.xml”, and “books.json”). To help you better understand the different file structures, I’d prefer that you create each of these files “by hand” unless you’re already very comfortable with the file formats.
#Write R code, using your packages of choice, to load the information from each of the three sources into separate R data frames. Are the three data frames identical?
#Your deliverable is the three source files and the R code. If you can, package your assignment solution up into an .Rmd file and publish to rpubs.com. [This will also require finding a way to make your three text files accessible from the web].
#First I loaded the needed libraries
library(jsonlite)
library(knitr)
library(RJSONIO)
##
## Attaching package: 'RJSONIO'
## The following objects are masked from 'package:jsonlite':
##
## fromJSON, toJSON
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.1 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ RJSONIO::fromJSON() masks jsonlite::fromJSON()
## ✖ dplyr::lag() masks stats::lag()
## ✖ RJSONIO::toJSON() masks jsonlite::toJSON()
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library(dplyr)
library(XML)
# Run a garbage collection to release unused memory; gc() also prints the
# memory usage summary shown below.
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 821639 43.9 1442291 77.1 1168576 62.5
## Vcells 1246248 9.6 2060183 15.8 1926584 14.7
#I have manually created the three files html, xml and json.
# Download the JSON file from GitHub and parse the local copy.
json_url <- "https://raw.githubusercontent.com/VioletaStoyanova/DATA607_Assignment7/master/books.json"
# Use a single variable for the local copy so that the file being read is
# exactly the file that was just downloaded. path.expand() resolves "~" up
# front so the parser receives a plain file-system path.
path <- path.expand("~/books.json")
download.file(json_url, destfile = path)
# Both jsonlite and RJSONIO export fromJSON(); name the package explicitly so
# the result does not depend on the order in which the libraries were attached.
booksJSON <- RJSONIO::fromJSON(content = path)
# class() returns a character vector naming the classes the object inherits from.
class(booksJSON)
## [1] "list"
glimpse(booksJSON)
## List of 1
## $ books:List of 3
## ..$ :List of 5
## .. ..$ BookTitle: chr "A Brief History of Time"
## .. ..$ AuthorS : chr "Stephen Hawking"
## .. ..$ Genre : chr "Non Fiction"
## .. ..$ Year : num 1988
## .. ..$ Language : chr "English"
## ..$ :List of 5
## .. ..$ BookTitle: chr "The Little Prince"
## .. ..$ AuthorS : chr "Antoine de Saint-Exupery"
## .. ..$ Genre : chr "Fiction"
## .. ..$ Year : num 1943
## .. ..$ Language : chr "French"
## ..$ :List of 5
## .. ..$ BookTitle: chr "Astronomy:A Beginner's Guide to the Universe"
## .. ..$ AuthorS : chr "Eric Chaisson, Steve McMillan"
## .. ..$ Genre : chr "Non Fiction"
## .. ..$ Year : num 2012
## .. ..$ Language : chr "English"
#creating a data frame
#json_df<- ldply(booksJSON, data.frame)
#tidying the data to remove the id column
#json_df<- json_df %>% dplyr::select(2:6)
#json_df
# booksJSON is list(books = list(<book 1>, <book 2>, <book 3>)), where each book
# is itself a named list of fields. Calling as.data.frame() on the outer list
# spreads every field of every book across a single wide row, so instead build
# a one-row data frame per book and stack them.
book_rows <- lapply(
  booksJSON[["books"]],
  function(book) as.data.frame(book, stringsAsFactors = FALSE)
)
booksJSON_df <- do.call(rbind, book_rows)
booksJSON_df
# Expected shape: 3 rows (one per book) x 5 columns
# (BookTitle, AuthorS, Genre, Year, Language), with Year numeric and the
# remaining columns character.
# RJSONIO parsed the JSON into a single nested list; it is reshaped above into
# one row per book.
# Download the HTML file from GitHub.
html_url <- "https://raw.githubusercontent.com/VioletaStoyanova/DATA607_Assignment7/master/books.html"
# Use a single variable for the local copy so that the file being parsed is
# exactly the file that was just downloaded.
path1 <- path.expand("~/books.html")
download.file(html_url, destfile = path1)
# Parse the local copy into an HTML document object.
booksHtml <- htmlParse(path1)
# Read every <table> in the document into a list of data frames. The argument
# must be spelled stringsAsFactors; a misspelled name is silently ignored and
# the text columns are then returned as factors.
html_tb <- readHTMLTable(booksHtml, stringsAsFactors = FALSE)
# The page holds a single table; as_tibble() replaces the deprecated tbl_df().
df.html <- html_tb[[1]] %>% as_tibble()
df.html
## # A tibble: 3 x 5
## BookTitle AuthorS Genre Year Language
## <chr> <chr> <chr> <chr> <chr>
## 1 A Brief History of Time Stephen Hawking Non Fic… 1988 English
## 2 The Little Prince Antoine de Saint-… Fiction 1943 French
## 3 Astronomy: A Beginner's Guid… Eric Chaisson, St… Non Fic… 2012 English
# Download the XML file from GitHub and parse the local copy.
xml_url <- "https://raw.githubusercontent.com/VioletaStoyanova/DATA607_Assignment7/master/books.xml"
# Use a single variable for the local copy so that the file being parsed is
# exactly the file that was just downloaded.
path <- path.expand("~/books.xml")
download.file(xml_url, destfile = path)
booksXML <- xmlParse(path)
# Take the top-level node, show its name, and list its <book> children.
root <- xmlRoot(booksXML)
xmlName(root)
## [1] "books"
root["book"]
## $book
## <book id="1">
## <BookTitle>A Brief History of Time</BookTitle>
## <AuthorS>Stephen Hawking</AuthorS>
## <Genre>Non Fiction</Genre>
## <Year>1988</Year>
## <Language>English</Language>
## </book>
##
## $book
## <book id="2">
## <BookTitle>The Little Prince</BookTitle>
## <AuthorS>Antoine de Saint-Exupery</AuthorS>
## <Genre>Fiction</Genre>
## <Year>1943</Year>
## <Language>French</Language>
## </book>
##
## $book
## <book id="3">
## <BookTitle>Astronomy:A Beginner's Guide to the Universe</BookTitle>
## <AuthorS>Eric Chaisson,Steve McMillan</AuthorS>
## <Genre>Non Fiction</Genre>
## <Year>2012</Year>
## <Language>English</Language>
## </book>
##
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"
# Turn each <book> child of the root into one row. FALSE is spelled out because
# the shorthand F is an ordinary variable that can be reassigned; as_tibble()
# replaces the deprecated tbl_df().
df.xml <- xmlToDataFrame(root, stringsAsFactors = FALSE) %>% as_tibble()
df.xml
## # A tibble: 3 x 5
## BookTitle AuthorS Genre Year Language
## <chr> <chr> <chr> <chr> <chr>
## 1 A Brief History of Time Stephen Hawking Non Fic… 1988 English
## 2 The Little Prince Antoine de Saint-… Fiction 1943 French
## 3 Astronomy:A Beginner's Guide… Eric Chaisson,Ste… Non Fic… 2012 English
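#The three data frames are not identical. RJSONIO returns the JSON as a nested list rather than a table, so it has to be reshaped into one row per book before it is comparable with the others. XML and HTML give the same 3 x 5 layout, but the column types depend on the reader: xmlToDataFrame() returns character columns here, while readHTMLTable() returns factors unless stringsAsFactors = FALSE is actually applied.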