library(knitr)
library(XML)
library(dplyr)
library(tidyr)
library(xml2)
library(rjson)
library(jsonlite)

R Markdown

Pick three of your favorite books on one of your favorite subjects. At least one of the books should have more than one author. For each book, include the title, authors, and two or three other attributes that you find interesting. Take the information that you’ve selected about these three books, and separately create three files which store the book’s information in HTML (using an html table), XML, and JSON formats (e.g. “books.html”, “books.xml”, and “books.json”). To help you better understand the different file structures, I’d prefer that you create each of these files “by hand” unless you’re already very comfortable with the file formats. Write R code, using your packages of choice, to load the information from each of the three sources into separate R data frames. Are the three data frames identical?

XML Version - Jane Austen Novels

# Parse the XML file
xml_novel_path <- "C:/Data 607/Week7 HW/xml_JaneAustin.xml"
# Fail fast with a clear message when the file is missing; otherwise the XML
# package may try to parse the path string itself as XML text and report a
# confusing syntax error instead of a "file not found" error.
if (!file.exists(xml_novel_path)) {
  stop("XML input file not found: ", xml_novel_path, call. = FALSE)
}
xml_novel <- xmlTreeParse(xml_novel_path)

# Check that R recognises the parsed object as an XML document
class(xml_novel)
## [1] "XMLDocument"         "XMLAbstractDocument"
# Access the top (root) node and print it to view the contents & formatting
top_node_novel <- xmlRoot(xml_novel)
top_node_novel
## <Novels>
##  <Novel_1>
##   <Title>Pride and Prejudice</Title>
##   <Authors>
##    <Author1>Jane Austen</Author1>
##    <Author2>|</Author2>
##   </Authors>
##   <Year>1813</Year>
##   <Pages>423</Pages>
##   <Genre>Romance</Genre>
##   <Main_Characters>
##    <Character1>Elizabeth Bennet|</Character1>
##    <Character2>Mr. Darcy</Character2>
##   </Main_Characters>
##  </Novel_1>
##  <Novel_2>
##   <Title>Sense and Sensibility</Title>
##   <Authors>
##    <Author1>Originally Published Anonymously|</Author1>
##    <Author2>Jane Austen</Author2>
##   </Authors>
##   <Year>1811</Year>
##   <Pages>352</Pages>
##   <Genre>Romance</Genre>
##   <Main_Characters>
##    <Character1>Elinor Dashwood|</Character1>
##    <Character2>Marianne Dashwood</Character2>
##   </Main_Characters>
##  </Novel_2>
##  <Novel_3>
##   <Title>Emma</Title>
##   <Author>Jane Austen</Author>
##   <Year>1815</Year>
##   <Pages>544</Pages>
##   <Genre>Romance</Genre>
##   <Main_Characters>
##    <Character1>Emma Woodhouse|</Character1>
##    <Character2>Mr. George Knightley</Character2>
##   </Main_Characters>
##  </Novel_3>
## </Novels>
# Flatten the XML tree so it can be put in a data frame: the outer xmlSApply()
# visits each novel node under the root, and the inner one extracts the text
# (xmlValue) of every field inside that novel. Fields with nested children,
# such as Authors, come back as the concatenated text of those children.
# Note that the result replaces the root node stored in top_node_novel.
top_node_novel <- xmlSApply(
  X = top_node_novel,
  FUN = function(novel) {
    xmlSApply(X = novel, FUN = xmlValue)
  }
)
top_node_novel
##                 Novel_1                     
## Title           "Pride and Prejudice"       
## Authors         "Jane Austen|"              
## Year            "1813"                      
## Pages           "423"                       
## Genre           "Romance"                   
## Main_Characters "Elizabeth Bennet|Mr. Darcy"
##                 Novel_2                                       
## Title           "Sense and Sensibility"                       
## Authors         "Originally Published Anonymously|Jane Austen"
## Year            "1811"                                        
## Pages           "352"                                         
## Genre           "Romance"                                     
## Main_Characters "Elinor Dashwood|Marianne Dashwood"           
##                 Novel_3                              
## Title           "Emma"                               
## Authors         "Jane Austen"                        
## Year            "1815"                               
## Pages           "544"                                
## Genre           "Romance"                            
## Main_Characters "Emma Woodhouse|Mr. George Knightley"
# Convert the extracted values into a data frame: transpose so that each novel
# becomes a row, then split the pipe-delimited Authors and Main_Characters
# fields into two columns each with tidyr::separate() (chained with %>%).
xml_novel_df <- data.frame(t(top_node_novel), row.names = NULL) %>%
  # extra = "merge" keeps any additional "|" inside the second piece;
  # fill = "right" pads a missing second piece with NA without emitting a
  # warning (needed for values that contain no "|" at all).
  separate(Authors, c("Author_1", "Author_2"), sep = "\\|",
           extra = "merge", fill = "right") %>%
  separate(Main_Characters, c("Character_1", "Character_2"), sep = "\\|",
           extra = "merge", fill = "right") %>%
  # Display missing pieces as empty strings rather than NA
  replace(., is.na(.), "")
kable(xml_novel_df, caption = "XML Jane Austin Dataframe")
XML Jane Austin Dataframe
Title Author_1 Author_2 Year Pages Genre Character_1 Character_2
Pride and Prejudice Jane Austen 1813 423 Romance Elizabeth Bennet Mr. Darcy
Sense and Sensibility Originally Published Anonymously Jane Austen 1811 352 Romance Elinor Dashwood Marianne Dashwood
Emma Jane Austen 1815 544 Romance Emma Woodhouse Mr. George Knightley

HTML Novels - Jane Austen

# Parse the HTML file into an internal-node document and print it to view the
# contents & formatting. The error handler is a no-op function, so anything
# the parser reports through it is discarded.
html_novel <- htmlTreeParse(
  file = "C:/Data 607/Week7 HW/html_JaneAustin_v2.html",
  useInternalNodes = TRUE,
  error = function(...) {}
)
html_novel
## <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
## <html><body>
## <table style="width:100%">
## <tr>
## <th>Title</th>
##     <th colspan="2">Author(s)</th>
##     <th>Year Released</th>
##     <th>Page #</th>
##     <th>Genre</th>
##     <th colspan="2">Main Character(s)</th>
##   </tr>
## <tr>
## <td>Pride and Prejudice</td>
##     <td>Jane Austen</td>
##     <td>-</td>
##     <td>1813</td>
##     <td>432</td>
##     <td>Romance</td>
##     <td>Elizabeth Bennet</td>
##     <td> Mr. Darcy</td>
##   </tr>
## <tr>
## <td>Sense and Sensibility</td>
##     <td>Originally Published Anonymous</td>
##     <td>Jane Austen</td>
##     <td>1811</td>
##     <td>352</td>
##     <td>Romance</td>
##     <td> Elinor Dashwood</td>
##     <td> Marianne Dashwood</td>
##   </tr>
## <tr>
## <td>Emma</td>
##     <td>Jane Austen</td>
##     <td>-</td>
##     <td>1815</td>
##     <td>544</td>
##     <td>Romance</td>
##     <td> Emma Woodhouse</td>
##     <td> Mr. George Knightley </td>
##   </tr>
## </table>
## </body></html>
## 
# Turn the parsed HTML document into a data frame. readHTMLTable() is given the
# whole document and its result is wrapped in data.frame(), which produces the
# columns NULL.V1 .. NULL.V8 that are renamed below. Factor columns are
# converted to character first (mutate_if() and rename() come from dplyr).
html_novel_df <- html_novel %>%
  readHTMLTable() %>%
  data.frame() %>%
  mutate_if(is.factor, as.character) %>%
  rename(
    Novel = NULL.V1,
    Author_1 = NULL.V2,
    Author_2 = NULL.V3,
    Year_Relased = NULL.V4,
    Pages = NULL.V5,
    Genre = NULL.V6,
    Main_Character1 = NULL.V7,
    Main_Character2 = NULL.V8
  )

kable(html_novel_df, caption = "HTML Jane Austin Dataframe")
HTML Jane Austin Dataframe
Novel Author_1 Author_2 Year_Relased Pages Genre Main_Character1 Main_Character2
Pride and Prejudice Jane Austen - 1813 432 Romance Elizabeth Bennet Mr. Darcy
Sense and Sensibility Originally Published Anonymous Jane Austen 1811 352 Romance Elinor Dashwood Marianne Dashwood
Emma Jane Austen - 1815 544 Romance Emma Woodhouse Mr. George Knightley

JSON Novels - Jane Austen

# View the contents/formatting of the JSON object.
# Both rjson and jsonlite are attached and both export fromJSON(); the call is
# namespace-qualified so the result does not depend on the order of the
# library() calls. jsonlite's fromJSON() accepts a file path directly.
json_novel_file <- "C:/Data 607/Week7 HW/json_JaneAustin.json.txt"
json_data <- jsonlite::fromJSON(json_novel_file)
json_data

$Title [1] “Pride and Prejudice” “Sense and Sensibility” “Emma”

$Author1 [1] “Jane Austin” “Originally Published Anonymously” [3] “Jane Austin”

$Author2 [1] “” “Jane Austin” “”

$Year Released [1] 1813 1811 1815

$Page # [1] 432 352 544

$Genre [1] “Romance” “Romance” “Romance”

$Main Character1 [1] “Elizabeth Bennet” “Elinor Dashwood” “Emma Woodhouse”

$Main Character2 [1] “Mr. Darcy” “Marianne Dashwood” “Mr. George Knightley”

# Convert the JSON content into a data frame.
# fromJSON() is namespace-qualified because rjson and jsonlite both export a
# function of that name; `flatten` is a jsonlite argument, so jsonlite is the
# intended implementation regardless of library() order.
# Note: json_novel_file first holds the file path and is then overwritten with
# the resulting data frame.
json_novel_file <- "C:/Data 607/Week7 HW/json_JaneAustin.json.txt"
json_novel_file <- data.frame(
  jsonlite::fromJSON(json_novel_file, flatten = TRUE)
) %>%
  # Convert any factor columns to plain character columns
  mutate_if(is.factor, as.character)

kable(json_novel_file, caption = "JSON Jane Austin Dataframe")
JSON Jane Austin Dataframe
Title Author1 Author2 Year.Released Page.. Genre Main.Character1 Main.Character2
Pride and Prejudice Jane Austin 1813 432 Romance Elizabeth Bennet Mr. Darcy
Sense and Sensibility Originally Published Anonymously Jane Austin 1811 352 Romance Elinor Dashwood Marianne Dashwood
Emma Jane Austin 1815 544 Romance Emma Woodhouse Mr. George Knightley

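My findings: While the three files store the same information, loading each of them into R required a different amount of effort. For me, JSON was the easiest and most straightforward. The three resulting data frames are not identical: their column names differ, and so do some of the values (for example, Pride and Prejudice has 423 pages in the XML file but 432 in the HTML and JSON files, the author is spelled "Jane Austin" in the JSON file but "Jane Austen" in the other two, and the HTML table marks a missing second author with "-" rather than an empty string).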