Libraries

library(RCurl)
library(xml2)
library(dplyr)
library(tidyr)
library(jsonlite)
library(rvest)

Import Data

Get all three book files from GitHub.

# Raw GitHub locations of the three book files, one per format
xmlFile <- "https://raw.githubusercontent.com/ltcancel/Homework7_Data607_F20/main/Book1.xml"
htmlFile <- "https://raw.githubusercontent.com/ltcancel/Homework7_Data607_F20/main/Book2.html"
jsonFile <- "https://raw.githubusercontent.com/ltcancel/Homework7_Data607_F20/main/Book3.json"

# Parse each file with the reader matching its format:
# read_xml() for Book 1, read_html() for Book 2, fromJSON() for Book 3
book1 <- xmlFile %>% read_xml()
book2 <- htmlFile %>% read_html()
book3 <- jsonFile %>% fromJSON()

Create dataframes

Book 1 - XML File

# Print the parsed XML document as downloaded from GitHub to inspect its tags
book1

## {xml_document}
## <note>
## [1] <title>FIRE CIDER!</title>
## [2] <author>Rosemary Gladstar</author>
## [3] <pages>204 pages</pages>
## [4] <ISBN-13>9781635861808</ISBN-13>
## [5] <publisher>Storey Publishing, LLC</publisher>

# Extract the text of each field by looking up its tag anywhere in the document
book1_title <- book1 %>% xml_find_all("//title") %>% xml_text()
book1_author <- book1 %>% xml_find_all("//author") %>% xml_text()
book1_pages <- book1 %>% xml_find_all("//pages") %>% xml_text()
book1_isbn <- book1 %>% xml_find_all("//ISBN-13") %>% xml_text()
book1_pub <- book1 %>% xml_find_all("//publisher") %>% xml_text()

# Bind the fields side by side (cbind names each column after its variable)
# and convert the resulting matrix to a data frame in one step
book1_df <- as.data.frame(
  cbind(book1_title, book1_author, book1_pages, book1_isbn, book1_pub)
)

# Display the finished data frame
book1_df

Book 2 - HTML File

# Print the parsed HTML document as downloaded from GitHub to inspect its structure
book2

## {html_document}
## <html>
## [1] <body><table><tbody>\n<tr>\n<th>Title:</th>\n\t\t\t<td>Run Fast. Coo ...

# Select every <table> node in the page and parse it; the result is a list
# with one entry per table found
book2_df <- html_table(html_nodes(book2, "table"), fill = TRUE)

# Show the parsed list
book2_df

## [[1]]
##           X1
## 1     Title:
## 2    Author:
## 3     Pages:
## 4      ISBN:
## 5 Publisher:
##                                                                                 X2
## 1 Run Fast. Cook Fast. Eat Slow: Quick-Fix Recipes for Hangry Athletes: A Cookbook
## 2                                                 Shalane Flanagan & Elyse Kopecky
## 3                                                                        256 pages
## 4                                                                    9781635651911
## 5                                                  Potter/Ten Speed/Harmony/Rodale

# The parsed result is a list of tables; convert it to a data frame whose
# columns X1 (field label) and X2 (field value) hold one row per field
book2_df <- as.data.frame(book2_df)

book2_df

# Reshape so each field label becomes its own column. pivot_wider() replaces
# the superseded spread() and keeps the fields in the order they appear in the
# source table instead of sorting them alphabetically; as.data.frame() keeps
# the result a plain data frame
book2_df <- book2_df %>%
  pivot_wider(names_from = X1, values_from = X2) %>%
  as.data.frame()

# final dataframe
book2_df

Book 3 - JSON File

# Print the parsed JSON as downloaded from GitHub; it is a named list of fields
book3

## $title
## [1] "Master Recipes from the Herbal Apothecary"
## 
## $author
## [1] "JJ Pursell"
## 
## $page
## [1] "288"
## 
## $isbn
## [1] "9781604698527"
## 
## $publisher
## [1] "Timber Press, Incorporated"

# Stack the parsed JSON fields as rows of a one-column character matrix,
# with the field names as row names
book3_df <- do.call(rbind, book3)
book3_df

##           [,1]                                       
## title     "Master Recipes from the Herbal Apothecary"
## author    "JJ Pursell"                               
## page      "288"                                      
## isbn      "9781604698527"                            
## publisher "Timber Press, Incorporated"

# Convert the one-column matrix to a data frame; the value column is named V1
# and the JSON field names are carried over as row names
book3_df <- as.data.frame(book3_df)
book3_df

# Move the field names out of the row names into a regular column ("Book")
# so they can serve as column headers when reshaping
book3_df <- tibble::rownames_to_column(book3_df, "Book")
book3_df

# Reshape so each field becomes its own column. pivot_wider() replaces the
# superseded spread() and keeps the fields in their original JSON order
# instead of sorting them alphabetically; as.data.frame() keeps the result a
# plain data frame
book3_df <- book3_df %>%
  pivot_wider(names_from = Book, values_from = V1) %>%
  as.data.frame()

# final dataframe
book3_df

Homework7

LeTicia Cancel

10/10/2020

Import Data

Create dataframes

Book 1 - XML File

Book 2 - HTML File

Book 3 - JSON File