library(knitr)
library(XML)
library(dplyr)
library(tidyr)
library(xml2)
library(rjson)
library(jsonlite)
Pick three of your favorite books on one of your favorite subjects. At least one of the books should have more than one author. For each book, include the title, authors, and two or three other attributes that you find interesting. Take the information that you’ve selected about these three books, and separately create three files which store the book’s information in HTML (using an html table), XML, and JSON formats (e.g. “books.html”, “books.xml”, and “books.json”). To help you better understand the different file structures, I’d prefer that you create each of these files “by hand” unless you’re already very comfortable with the file formats. Write R code, using your packages of choice, to load the information from each of the three sources into separate R data frames. Are the three data frames identical?
# Load the hand-written XML file into an R-level XML tree.
xml_novel <- xmlTreeParse(file = "C:/Data 607/Week7 HW/xml_JaneAustin.xml")
# Sanity check: confirm the parsed object carries the XML document classes.
class(xml_novel)
## [1] "XMLDocument" "XMLAbstractDocument"
# Pull out the root node of the tree and print it to inspect the structure.
top_node_novel <- xmlRoot(x = xml_novel)
top_node_novel
## <Novels>
## <Novel_1>
## <Title>Pride and Prejudice</Title>
## <Authors>
## <Author1>Jane Austen</Author1>
## <Author2>|</Author2>
## </Authors>
## <Year>1813</Year>
## <Pages>423</Pages>
## <Genre>Romance</Genre>
## <Main_Characters>
## <Character1>Elizabeth Bennet|</Character1>
## <Character2>Mr. Darcy</Character2>
## </Main_Characters>
## </Novel_1>
## <Novel_2>
## <Title>Sense and Sensibility</Title>
## <Authors>
## <Author1>Originally Published Anonymously|</Author1>
## <Author2>Jane Austen</Author2>
## </Authors>
## <Year>1811</Year>
## <Pages>352</Pages>
## <Genre>Romance</Genre>
## <Main_Characters>
## <Character1>Elinor Dashwood|</Character1>
## <Character2>Marianne Dashwood</Character2>
## </Main_Characters>
## </Novel_2>
## <Novel_3>
## <Title>Emma</Title>
## <Author>Jane Austen</Author>
## <Year>1815</Year>
## <Pages>544</Pages>
## <Genre>Romance</Genre>
## <Main_Characters>
## <Character1>Emma Woodhouse|</Character1>
## <Character2>Mr. George Knightley</Character2>
## </Main_Characters>
## </Novel_3>
## </Novels>
# Flatten the tree to text: for each child of the root (one per novel), take
# the text value of each of its own child elements. The result replaces
# top_node_novel and is printed for inspection; as the printed output shows,
# the text of nested elements (authors, characters) comes back concatenated,
# with the "|" typed into the file acting as the delimiter.
top_node_novel <- xmlSApply(
  X = top_node_novel,
  FUN = function(novel) xmlSApply(X = novel, FUN = xmlValue)
)
top_node_novel
## Novel_1
## Title "Pride and Prejudice"
## Authors "Jane Austen|"
## Year "1813"
## Pages "423"
## Genre "Romance"
## Main_Characters "Elizabeth Bennet|Mr. Darcy"
## Novel_2
## Title "Sense and Sensibility"
## Authors "Originally Published Anonymously|Jane Austen"
## Year "1811"
## Pages "352"
## Genre "Romance"
## Main_Characters "Elinor Dashwood|Marianne Dashwood"
## Novel_3
## Title "Emma"
## Authors "Jane Austen"
## Year "1815"
## Pages "544"
## Genre "Romance"
## Main_Characters "Emma Woodhouse|Mr. George Knightley"
# Build the XML data frame.
# 1. Transpose the extracted values so each novel becomes a row and each
#    attribute a column; row.names = NULL drops the node names as row labels.
# 2. Split the "|"-delimited Authors and Main_Characters columns into two
#    columns each. extra = "merge" keeps any text after the first "|" together
#    in the second column; fill = "right" states explicitly that a value with
#    no "|" (a single author) leaves the second column NA, instead of relying
#    on separate()'s default padding, which also raises a warning.
# 3. Replace the remaining NA cells with empty strings for display.
xml_novel_df <- data.frame(t(top_node_novel), row.names = NULL) %>%
  separate(
    Authors, c("Author_1", "Author_2"),
    sep = "\\|", extra = "merge", fill = "right"
  ) %>%
  separate(
    Main_Characters, c("Character_1", "Character_2"),
    sep = "\\|", extra = "merge", fill = "right"
  ) %>%
  replace(., is.na(.), "")
# Render the data frame as a captioned table.
kable(xml_novel_df, caption = "XML Jane Austin Dataframe")
| Title | Author_1 | Author_2 | Year | Pages | Genre | Character_1 | Character_2 |
|---|---|---|---|---|---|---|---|
| Pride and Prejudice | Jane Austen | | 1813 | 423 | Romance | Elizabeth Bennet | Mr. Darcy |
| Sense and Sensibility | Originally Published Anonymously | Jane Austen | 1811 | 352 | Romance | Elinor Dashwood | Marianne Dashwood |
| Emma | Jane Austen | | 1815 | 544 | Romance | Emma Woodhouse | Mr. George Knightley |
# Parse the hand-written HTML file and print the resulting document to check
# its contents and formatting.
# The error callback has an empty body, so parser diagnostics passed to it are
# discarded rather than reported.
html_novel <- htmlTreeParse(
  file = "C:/Data 607/Week7 HW/html_JaneAustin_v2.html",
  error = function(...) {},
  useInternalNodes = TRUE
)
html_novel
## <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
## <html><body>
## <table style="width:100%">
## <tr>
## <th>Title</th>
## <th colspan="2">Author(s)</th>
## <th>Year Released</th>
## <th>Page #</th>
## <th>Genre</th>
## <th colspan="2">Main Character(s)</th>
## </tr>
## <tr>
## <td>Pride and Prejudice</td>
## <td>Jane Austen</td>
## <td>-</td>
## <td>1813</td>
## <td>432</td>
## <td>Romance</td>
## <td>Elizabeth Bennet</td>
## <td> Mr. Darcy</td>
## </tr>
## <tr>
## <td>Sense and Sensibility</td>
## <td>Originally Published Anonymous</td>
## <td>Jane Austen</td>
## <td>1811</td>
## <td>352</td>
## <td>Romance</td>
## <td> Elinor Dashwood</td>
## <td> Marianne Dashwood</td>
## </tr>
## <tr>
## <td>Emma</td>
## <td>Jane Austen</td>
## <td>-</td>
## <td>1815</td>
## <td>544</td>
## <td>Romance</td>
## <td> Emma Woodhouse</td>
## <td> Mr. George Knightley </td>
## </tr>
## </table>
## </body></html>
##
# Convert the parsed HTML table into a data frame.
# which = 1 asks readHTMLTable() for the first table directly, as a data
# frame, instead of a list of tables. Wrapping the list in data.frame()
# prefixed every column with the (missing) table name, giving fragile names
# such as "NULL.V1" that would change if the table gained an id or a second
# table were added to the page.
# Any factor columns are converted to character, then the positional V1..V8
# columns get descriptive names.
# NOTE(review): "Year_Relased" is misspelt but kept as-is, because the column
# name is visible to any downstream code and to the rendered table.
html_novel_df <- readHTMLTable(html_novel, which = 1) %>%
  mutate_if(is.factor, as.character) %>%
  rename(
    Novel = V1, Author_1 = V2, Author_2 = V3, Year_Relased = V4,
    Pages = V5, Genre = V6, Main_Character1 = V7, Main_Character2 = V8
  )
# Render the data frame as a captioned table.
kable(html_novel_df, caption = "HTML Jane Austin Dataframe")
| Novel | Author_1 | Author_2 | Year_Relased | Pages | Genre | Main_Character1 | Main_Character2 |
|---|---|---|---|---|---|---|---|
| Pride and Prejudice | Jane Austen | - | 1813 | 432 | Romance | Elizabeth Bennet | Mr. Darcy |
| Sense and Sensibility | Originally Published Anonymous | Jane Austen | 1811 | 352 | Romance | Elinor Dashwood | Marianne Dashwood |
| Emma | Jane Austen | - | 1815 | 544 | Romance | Emma Woodhouse | Mr. George Knightley |
# Read the JSON file and print the parsed object to view its contents and
# formatting.
# Both rjson and jsonlite are attached and each exports fromJSON(); the call
# is namespace-qualified so it no longer depends on which package was loaded
# last (an unqualified call resolves to whichever masks the other).
json_novel_file <- "C:/Data 607/Week7 HW/json_JaneAustin.json.txt"
json_data <- jsonlite::fromJSON(json_novel_file)
json_data
## $Title
## [1] "Pride and Prejudice"   "Sense and Sensibility" "Emma"
##
## $Author1
## [1] "Jane Austin"                      "Originally Published Anonymously"
## [3] "Jane Austin"
##
## $Author2
## [1] ""            "Jane Austin" ""
##
## $`Year Released`
## [1] 1813 1811 1815
##
## $`Page #`
## [1] 432 352 544
##
## $Genre
## [1] "Romance" "Romance" "Romance"
##
## $`Main Character1`
## [1] "Elizabeth Bennet" "Elinor Dashwood"  "Emma Woodhouse"
##
## $`Main Character2`
## [1] "Mr. Darcy"            "Marianne Dashwood"    "Mr. George Knightley"
# Convert the JSON file into a data frame.
# fromJSON() is namespace-qualified because rjson and jsonlite are both
# attached and both export it; flatten = TRUE is a jsonlite argument, so the
# call must not depend on which package happens to mask the other.
# json_novel_file first holds the file path and is then overwritten with the
# resulting data frame. data.frame() makes the JSON keys syntactically valid
# column names (e.g. "Year Released" becomes "Year.Released"), and any factor
# columns are converted to character.
json_novel_file <- "C:/Data 607/Week7 HW/json_JaneAustin.json.txt"
json_novel_file <- jsonlite::fromJSON(json_novel_file, flatten = TRUE) %>%
  data.frame() %>%
  mutate_if(is.factor, as.character)
# Render the data frame as a captioned table.
kable(json_novel_file, caption = "JSON Jane Austin Dataframe")
| Title | Author1 | Author2 | Year.Released | Page.. | Genre | Main.Character1 | Main.Character2 |
|---|---|---|---|---|---|---|---|
| Pride and Prejudice | Jane Austin | | 1813 | 432 | Romance | Elizabeth Bennet | Mr. Darcy |
| Sense and Sensibility | Originally Published Anonymously | Jane Austin | 1811 | 352 | Romance | Elinor Dashwood | Marianne Dashwood |
| Emma | Jane Austin | | 1815 | 544 | Romance | Emma Woodhouse | Mr. George Knightley |
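My Findings: While the three files store the same information, loading them into R required different amounts of effort. The resulting data frames are not identical: the column names differ (e.g., Title vs. Novel, Author_1 vs. Author1), a missing second author appears as an empty string in the XML and JSON data frames but as "-" in the HTML one, and a few hand-typed values disagree (e.g., 423 vs. 432 pages for Pride and Prejudice, and "Austen" vs. "Austin"). For me, JSON was the easiest and most straightforward format to ingest.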