# Libraries
library(RCurl)
library(xml2)
library(dplyr)
library(tidyr)
library(jsonlite)
library(rvest)
# Get all three book files from GitHub
# Raw GitHub locations of the three book files, one per format.
xmlFile <- "https://raw.githubusercontent.com/ltcancel/Homework7_Data607_F20/main/Book1.xml"
htmlFile <- "https://raw.githubusercontent.com/ltcancel/Homework7_Data607_F20/main/Book2.html"
jsonFile <- "https://raw.githubusercontent.com/ltcancel/Homework7_Data607_F20/main/Book3.json"

# Download and parse each file with the reader for its format.
book1 <- xmlFile %>% read_xml()   # Book 1 - XML file
book2 <- htmlFile %>% read_html() # Book 2 - HTML file
book3 <- jsonFile %>% fromJSON()  # Book 3 - JSON file
# Original file from GitHub
book1
## {xml_document}
## <note>
## [1] <title>FIRE CIDER!</title>
## [2] <author>Rosemary Gladstar</author>
## [3] <pages>204 pages</pages>
## [4] <ISBN-13>9781635861808</ISBN-13>
## [5] <publisher>Storey Publishing, LLC</publisher>
# Get each part of the book's information from the XML file. Each XPath
# matches the element by tag name anywhere in the document, so every vector
# holds one entry per matching element.
book1_title <- xml_text(xml_find_all(book1, "//title"))
book1_author <- xml_text(xml_find_all(book1, "//author"))
book1_pages <- xml_text(xml_find_all(book1, "//pages"))
book1_isbn <- xml_text(xml_find_all(book1, "//ISBN-13"))
book1_pub <- xml_text(xml_find_all(book1, "//publisher"))
# Combine the fields into a data frame directly instead of going through
# cbind() and a character matrix: cbind() silently ignores a zero-length
# vector, so a field missing from the XML would vanish from the result, while
# data.frame() stops with an error when one field is empty and the others are
# not. Column names are taken from the variable names, as before, and
# stringsAsFactors = FALSE keeps the fields as character columns.
book1_df <- data.frame(
  book1_title, book1_author, book1_pages, book1_isbn, book1_pub,
  stringsAsFactors = FALSE
)
# final dataframe
book1_df
# original file from GitHub
book2
## {html_document}
## <html>
## [1] <body><table><tbody>\n<tr>\n<th>Title:</th>\n\t\t\t<td>Run Fast. Coo ...
# Get book information from the HTML file: parse every <table> node, giving a
# list with one entry per table. The `fill` argument is left out because it is
# deprecated in rvest 1.0.0 and later (missing cells are always filled there),
# and the table echoed below has two cells in every row, so it is not needed.
book2_df <- book2 %>%
  html_nodes("table") %>%
  html_table()
book2_df
## [[1]]
## X1
## 1 Title:
## 2 Author:
## 3 Pages:
## 4 ISBN:
## 5 Publisher:
## X2
## 1 Run Fast. Cook Fast. Eat Slow: Quick-Fix Recipes for Hangry Athletes: A Cookbook
## 2 Shalane Flanagan & Elyse Kopecky
## 3 256 pages
## 4 9781635651911
## 5 Potter/Ten Speed/Harmony/Rodale
# Convert the list to a data frame. The document holds a single table, so the
# first list entry is selected explicitly rather than coercing the whole list.
book2_df <- as.data.frame(book2_df[[1]])
book2_df
# Reshape so each label in X1 becomes a column holding its X2 value.
# pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps the
# alphabetical column order spread() produced, and as.data.frame() keeps the
# result a plain data frame instead of a tibble.
book2_df <- book2_df %>%
  pivot_wider(names_from = X1, values_from = X2, names_sort = TRUE) %>%
  as.data.frame()
# final dataframe
book2_df
# original file
book3
## $title
## [1] "Master Recipes from the Herbal Apothecary"
##
## $author
## [1] "JJ Pursell"
##
## $page
## [1] "288"
##
## $isbn
## [1] "9781604698527"
##
## $publisher
## [1] "Timber Press, Incorporated"
# Get book information from the JSON file: stack the list elements as rows,
# which yields a one-column character matrix with the JSON keys as row names.
book3_df <- do.call("rbind", book3)
book3_df
## [,1]
## title "Master Recipes from the Herbal Apothecary"
## author "JJ Pursell"
## page "288"
## isbn "9781604698527"
## publisher "Timber Press, Incorporated"
# Convert the matrix to a data frame; the unnamed matrix column becomes V1.
book3_df <- as.data.frame(book3_df)
book3_df
# The matrix used the JSON keys as row names, so move the row names into a
# regular column called "Book" that the reshape below can use.
book3_df <- tibble::rownames_to_column(book3_df, "Book")
book3_df
# Reshape so each JSON key becomes a column holding its value.
# pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps the
# alphabetical column order spread() produced, and as.data.frame() keeps the
# result a plain data frame instead of a tibble.
book3_df <- book3_df %>%
  pivot_wider(names_from = Book, values_from = V1, names_sort = TRUE) %>%
  as.data.frame()
# final dataframe
book3_df