Week 7 Assignment

library(knitr)
library(XML)
library(dplyr)
library(tidyr)
library(xml2)
library(rjson)
library(jsonlite)

R Markdown

Pick three of your favorite books on one of your favorite subjects. At least one of the books should have more than one author. For each book, include the title, authors, and two or three other attributes that you find interesting. Take the information that you’ve selected about these three books, and separately create three files which store the book’s information in HTML (using an html table), XML, and JSON formats (e.g. “books.html”, “books.xml”, and “books.json”). To help you better understand the different file structures, I’d prefer that you create each of these files “by hand” unless you’re already very comfortable with the file formats. Write R code, using your packages of choice, to load the information from each of the three sources into separate R data frames. Are the three data frames identical?

XML Version- Jane Austen Novels

# Read the hand-written XML file into an XML document object.
xml_novel <- xmlTreeParse(file = "C:/Data 607/Week7 HW/xml_JaneAustin.xml")

# Confirm that the parsed object carries the XML document classes.
class(xml_novel)

## [1] "XMLDocument"         "XMLAbstractDocument"

# Grab the root element of the parsed document and print it to inspect the
# contents and nesting of the file.
top_node_novel <- xmlRoot(x = xml_novel)
top_node_novel

## <Novels>
##  <Novel_1>
##   <Title>Pride and Prejudice</Title>
##   <Authors>
##    <Author1>Jane Austen</Author1>
##    <Author2>|</Author2>
##   </Authors>
##   <Year>1813</Year>
##   <Pages>423</Pages>
##   <Genre>Romance</Genre>
##   <Main_Characters>
##    <Character1>Elizabeth Bennet|</Character1>
##    <Character2>Mr. Darcy</Character2>
##   </Main_Characters>
##  </Novel_1>
##  <Novel_2>
##   <Title>Sense and Sensibility</Title>
##   <Authors>
##    <Author1>Originally Published Anonymously|</Author1>
##    <Author2>Jane Austen</Author2>
##   </Authors>
##   <Year>1811</Year>
##   <Pages>352</Pages>
##   <Genre>Romance</Genre>
##   <Main_Characters>
##    <Character1>Elinor Dashwood|</Character1>
##    <Character2>Marianne Dashwood</Character2>
##   </Main_Characters>
##  </Novel_2>
##  <Novel_3>
##   <Title>Emma</Title>
##   <Author>Jane Austen</Author>
##   <Year>1815</Year>
##   <Pages>544</Pages>
##   <Genre>Romance</Genre>
##   <Main_Characters>
##    <Character1>Emma Woodhouse|</Character1>
##    <Character2>Mr. George Knightley</Character2>
##   </Main_Characters>
##  </Novel_3>
## </Novels>

# Pull the text out of every field of every novel: the outer xmlSApply() walks
# the children of the root node, the inner one walks each novel's fields and
# applies xmlValue() to them.
# NOTE: this overwrites the root node held in top_node_novel with the
# extracted values.
top_node_novel <- xmlSApply(
  top_node_novel,
  function(novel) xmlSApply(novel, xmlValue)
)
top_node_novel

##                 Novel_1                     
## Title           "Pride and Prejudice"       
## Authors         "Jane Austen|"              
## Year            "1813"                      
## Pages           "423"                       
## Genre           "Romance"                   
## Main_Characters "Elizabeth Bennet|Mr. Darcy"
##                 Novel_2                                       
## Title           "Sense and Sensibility"                       
## Authors         "Originally Published Anonymously|Jane Austen"
## Year            "1811"                                        
## Pages           "352"                                         
## Genre           "Romance"                                     
## Main_Characters "Elinor Dashwood|Marianne Dashwood"           
##                 Novel_3                              
## Title           "Emma"                               
## Authors         "Jane Austen"                        
## Year            "1815"                               
## Pages           "544"                                
## Genre           "Romance"                            
## Main_Characters "Emma Woodhouse|Mr. George Knightley"

# Turn the extracted values into a data frame with one row per novel:
# transpose so the novels become rows, then use tidyr::separate() to split the
# "|"-delimited Authors and Main_Characters fields into two columns each.
xml_novel_df <- data.frame(t(top_node_novel), row.names = NULL) %>%
  # An Authors value may hold a single name with no "|" delimiter;
  # fill = "right" leaves Author_2 missing for such rows instead of relying on
  # the default fill = "warn", which raises a "missing pieces" warning.
  separate(Authors, c("Author_1", "Author_2"),
           sep = "\\|", extra = "merge", fill = "right") %>%
  separate(Main_Characters, c("Character_1", "Character_2"),
           sep = "\\|", extra = "merge") %>%
  # Show missing pieces as empty strings rather than NA in the printed table.
  replace(., is.na(.), "")
kable(xml_novel_df,  caption = "XML Jane Austin Dataframe")

XML Jane Austin Dataframe
Title	Author_1	Author_2	Year	Pages	Genre	Character_1	Character_2
Pride and Prejudice	Jane Austen		1813	423	Romance	Elizabeth Bennet	Mr. Darcy
Sense and Sensibility	Originally Published Anonymously	Jane Austen	1811	352	Romance	Elinor Dashwood	Marianne Dashwood
Emma	Jane Austen		1815	544	Romance	Emma Woodhouse	Mr. George Knightley

HTML Novels-Jane Austen

# Parse the hand-written HTML file (using internal nodes) and print the
# document to inspect its contents and formatting. The no-op `error` handler
# ignores whatever the parser reports to it.
html_novel <- htmlTreeParse(
  "C:/Data 607/Week7 HW/html_JaneAustin_v2.html",
  error = function(...) {},
  useInternalNodes = TRUE
)
html_novel

## <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
## <html><body>
## <table style="width:100%">
## <tr>
## <th>Title</th>
##     <th colspan="2">Author(s)</th>
##     <th>Year Released</th>
##     <th>Page #</th>
##     <th>Genre</th>
##     <th colspan="2">Main Character(s)</th>
##   </tr>
## <tr>
## <td>Pride and Prejudice</td>
##     <td>Jane Austen</td>
##     <td>-</td>
##     <td>1813</td>
##     <td>432</td>
##     <td>Romance</td>
##     <td>Elizabeth Bennet</td>
##     <td> Mr. Darcy</td>
##   </tr>
## <tr>
## <td>Sense and Sensibility</td>
##     <td>Originally Published Anonymous</td>
##     <td>Jane Austen</td>
##     <td>1811</td>
##     <td>352</td>
##     <td>Romance</td>
##     <td> Elinor Dashwood</td>
##     <td> Marianne Dashwood</td>
##   </tr>
## <tr>
## <td>Emma</td>
##     <td>Jane Austen</td>
##     <td>-</td>
##     <td>1815</td>
##     <td>544</td>
##     <td>Romance</td>
##     <td> Emma Woodhouse</td>
##     <td> Mr. George Knightley </td>
##   </tr>
## </table>
## </body></html>
##

# Build a data frame from the table in the parsed HTML document, convert any
# factor columns to character, and replace the auto-generated
# NULL.V1 ... NULL.V8 column names with descriptive ones (dplyr verbs).
html_novel_df <- html_novel %>%
  readHTMLTable() %>%
  data.frame() %>%
  mutate_if(is.factor, as.character) %>%
  rename(
    Novel = NULL.V1,
    Author_1 = NULL.V2,
    Author_2 = NULL.V3,
    Year_Relased = NULL.V4,
    Pages = NULL.V5,
    Genre = NULL.V6,
    Main_Character1 = NULL.V7,
    Main_Character2 = NULL.V8
  )

kable(html_novel_df,  caption = "HTML Jane Austin Dataframe")

HTML Jane Austin Dataframe
Novel	Author_1	Author_2	Year_Relased	Pages	Genre	Main_Character1	Main_Character2
Pride and Prejudice	Jane Austen	-	1813	432	Romance	Elizabeth Bennet	Mr. Darcy
Sense and Sensibility	Originally Published Anonymous	Jane Austen	1811	352	Romance	Elinor Dashwood	Marianne Dashwood
Emma	Jane Austen	-	1815	544	Romance	Emma Woodhouse	Mr. George Knightley

JSON Novels-Jane Austen

# Read the hand-written JSON file and print the parsed object to inspect its
# contents and formatting.
# Both rjson and jsonlite are attached and both export fromJSON(); the call is
# namespace-qualified so that reading from a file path does not depend on the
# order of the library() calls.
json_novel_file <- "C:/Data 607/Week7 HW/json_JaneAustin.json.txt"
json_data <- jsonlite::fromJSON(json_novel_file)
json_data

$Title [1] “Pride and Prejudice” “Sense and Sensibility” “Emma”

$Author1 [1] “Jane Austin” “Originally Published Anonymously” [3] “Jane Austin”

$Author2 [1] “” “Jane Austin” “”

$Year Released [1] 1813 1811 1815

$Page # [1] 432 352 544

$Genre [1] “Romance” “Romance” “Romance”

$Main Character1 [1] “Elizabeth Bennet” “Elinor Dashwood” “Emma Woodhouse”

$Main Character2 [1] “Mr. Darcy” “Marianne Dashwood” “Mr. George Knightley”

# Convert the JSON file into a data frame and turn any factor columns into
# character columns.
# fromJSON() is namespace-qualified because rjson also exports a fromJSON()
# and `flatten` is a jsonlite argument.
# NOTE: json_novel_file first holds the file path and is then overwritten with
# the resulting data frame.
json_novel_file <- "C:/Data 607/Week7 HW/json_JaneAustin.json.txt"
json_novel_file <- json_novel_file %>%
  jsonlite::fromJSON(flatten = TRUE) %>%
  data.frame() %>%
  mutate_if(is.factor, as.character)

kable(json_novel_file,  caption = "JSON Jane Austin Dataframe")

JSON Jane Austin Dataframe
Title	Author1	Author2	Year.Released	Page..	Genre	Main.Character1	Main.Character2
Pride and Prejudice	Jane Austin		1813	432	Romance	Elizabeth Bennet	Mr. Darcy
Sense and Sensibility	Originally Published Anonymously	Jane Austin	1811	352	Romance	Elinor Dashwood	Marianne Dashwood
Emma	Jane Austin		1815	544	Romance	Emma Woodhouse	Mr. George Knightley

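My Findings: The three files are meant to store the same information, but ingesting each one into R required a different amount of effort. For me, JSON was the easiest and most straightforward. The three resulting data frames are not identical: the column names differ (for example Title vs. Novel, and Year vs. Year_Relased vs. Year.Released), a missing second author appears as an empty string in the XML and JSON versions but as "-" in the HTML version, and a few hand-typed values differ between the files (for example, 423 vs. 432 pages for Pride and Prejudice).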

Week 7 Assignment

Meaghan Burke

March 17, 2018

R Markdown

XML Version- Jane Austen Novels

HTML Novels-Jane Austen

JSON Novels-Jane Austen