Read the XML file from GitHub. In this file multiple authors are listed as a comma-separated string.
# Raw GitHub URL of the XML file to load.
xml_url <- "https://raw.githubusercontent.com/jjohn81/Data607/master/Week6/Books.xml"
# Fetch the document with read_xml(), then hand the result to xmlParse().
xml_data <- xml_url %>%
  read_xml() %>%
  xmlParse()
xml_data
## <?xml version="1.0" encoding="UTF-8"?>
## <Books>
## <Book>
## <name>An Absolutely Remarkable Thing: A Novel</name>
## <author>Hank Green</author>
## <genres>Fiction, Fantasy</genres>
## <pages> 352</pages>
## </Book>
## <Book>
## <name>Transcription: A Novel</name>
## <author>Kate Atkinson</author>
## <genres>Thriller,Military</genres>
## <pages>352</pages>
## </Book>
## <Book>
## <name>The Talisman</name>
## <author>Stephen King,Peter Straub</author>
## <genres>Suspense, Horror</genres>
## <pages>921</pages>
## </Book>
## </Books>
##
# Convert the parsed XML document to a data frame and display it.
xml_df <- xmlToDataFrame(xml_data)
xml_df
## name author
## 1 An Absolutely Remarkable Thing: A Novel Hank Green
## 2 Transcription: A Novel Kate Atkinson
## 3 The Talisman Stephen King,Peter Straub
## genres pages
## 1 Fiction, Fantasy 352
## 2 Thriller,Military 352
## 3 Suspense, Horror 921
Read the XML file from GitHub. In this file each author is listed as a separate element. ‘xmlToDataFrame’ can't parse this XML.
# Raw GitHub URL of the XML file in which each author is its own element.
xml_url <- "https://raw.githubusercontent.com/jjohn81/Data607/master/Week6/Books_muli_authors_element.xml"
# Fetch the document with read_xml(), then hand the result to xmlParse().
xml_data <- xml_url %>%
  read_xml() %>%
  xmlParse()
xml_data
## <?xml version="1.0" encoding="UTF-8"?>
## <Books>
## <Book>
## <name>An Absolutely Remarkable Thing: A Novel</name>
## <author>Hank Green</author>
## <genre>Fiction, Fantasy</genre>
## <pages> 352</pages>
## </Book>
## <Book>
## <name>Transcription: A Novel</name>
## <author>Kate Atkinson</author>
## <genre>Thriller,Military</genre>
## <pages>352</pages>
## </Book>
## <Book>
## <name>The Talisman</name>
## <author>Stephen King</author>
## <author>Peter Straub</author>
## <genre>Suspense, Horror</genre>
## <pages>921</pages>
## </Book>
## </Books>
##
# Using the plyr method: convert the parsed XML to a list with xmlToList(),
# then apply data.frame to each element via ldply(). The printed result has
# two author columns (author and author.1).
xml_df_plyr <- xml_data %>%
  xmlToList() %>%
  ldply(data.frame)
xml_df_plyr
## .id name author
## 1 Book An Absolutely Remarkable Thing: A Novel Hank Green
## 2 Book Transcription: A Novel Kate Atkinson
## 3 Book The Talisman Stephen King
## genre pages author.1
## 1 Fiction, Fantasy 352 <NA>
## 2 Thriller,Military 352 <NA>
## 3 Suspense, Horror 921 Peter Straub
Read the XML file from GitHub. In this file authors are listed inside an “Authors” node. The data frame structure depends on the method used.
# Raw GitHub URL of the XML file in which authors are wrapped in an Authors node.
xml_url <- "https://raw.githubusercontent.com/jjohn81/Data607/master/Week6/Books_authors_node.xml"
# Fetch the document with read_xml(), then hand the result to xmlParse().
xml_data <- xml_url %>%
  read_xml() %>%
  xmlParse()
xml_data
## <?xml version="1.0" encoding="UTF-8"?>
## <Books>
## <Book>
## <name>An Absolutely Remarkable Thing: A Novel</name>
## <Authors>
## <author>Hank Green</author>
## </Authors>
## <genre>Fiction, Fantasy</genre>
## <pages> 352</pages>
## </Book>
## <Book>
## <name>Transcription: A Novel</name>
## <Authors>
## <author>Kate Atkinson</author>
## </Authors>
## <genre>Thriller,Military</genre>
## <pages>352</pages>
## </Book>
## <Book>
## <name>The Talisman</name>
## <Authors>
## <author>Stephen King</author>
## <author>Peter Straub</author>
## </Authors>
## <genre>Suspense, Horror</genre>
## <pages>921</pages>
## </Book>
## </Books>
##
# With xmlToDataFrame(), multiple authors end up in a single "Authors" column;
# in the printed result their names appear concatenated without a separator.
xml_df <- xmlToDataFrame(xml_data)
xml_df
## name Authors
## 1 An Absolutely Remarkable Thing: A Novel Hank Green
## 2 Transcription: A Novel Kate Atkinson
## 3 The Talisman Stephen KingPeter Straub
## genre pages
## 1 Fiction, Fantasy 352
## 2 Thriller,Military 352
## 3 Suspense, Horror 921
# With the plyr method the authors are spread over several columns: in the
# printed result single-author books land in `author`, while the two-author
# book fills `Authors.author` and `Authors.author.1`.
xml_df <- xml_data %>%
  xmlToList() %>%
  ldply(data.frame)
xml_df
## .id name author
## 1 Book An Absolutely Remarkable Thing: A Novel Hank Green
## 2 Book Transcription: A Novel Kate Atkinson
## 3 Book The Talisman <NA>
## genre pages Authors.author Authors.author.1
## 1 Fiction, Fantasy 352 <NA> <NA>
## 2 Thriller,Military 352 <NA> <NA>
## 3 Suspense, Horror 921 Stephen King Peter Straub
Read the HTML file from GitHub.
# Raw GitHub URL of the HTML page that contains the books table.
html_url <- "https://raw.githubusercontent.com/jjohn81/Data607/master/Week6/Books.html"
# Read the page and select its "table" node.
html_data <- html_url %>%
  read_html() %>%
  html_node("table")
html_data
## {xml_node}
## <table style="width:100%">
## [1] <tr>\n<th>Name</th>\n <th>Author</th> \n <th>Genres</th>\n ...
## [2] <tr>\n<td>An Absolutely Remarkable Thing: A Novel</td>\n <td>Hank ...
## [3] <tr>\n<td>Transcription: A Novel</td>\n <td>Kate Atkinson</td>\n ...
## [4] <tr>\n<td>The Talisman</td>\n <td>Stephen King, Peter Straub</td> ...
# Parse the table node with html_table() and coerce the result to a data frame.
html_df <- html_data %>%
  html_table(fill = TRUE) %>%
  as.data.frame()
html_df
## Name Author
## 1 An Absolutely Remarkable Thing: A Novel Hank Green
## 2 Transcription: A Novel Kate Atkinson
## 3 The Talisman Stephen King, Peter Straub
## Genres Pages
## 1 Fiction, Fantasy 352
## 2 Thriller, Military, War 352
## 3 Suspense,Horro,Fantasy 944944
Read the JSON file from GitHub. In this file multiple authors are listed as a comma-separated string.
# Raw GitHub URL of the JSON file to load.
json_url <- "https://raw.githubusercontent.com/jjohn81/Data607/master/Week6/Books.json"
# read_json() returns the content as a nested list, as the printed result shows.
json_data <- read_json(json_url)
json_data
## $Books
## $Books$Book
## $Books$Book[[1]]
## $Books$Book[[1]]$name
## [1] "An Absolutely Remarkable Thing: A Novel"
##
## $Books$Book[[1]]$author
## [1] "Hank Green"
##
## $Books$Book[[1]]$genres
## [1] "Fiction, Fantasy"
##
## $Books$Book[[1]]$pages
## [1] "352"
##
##
## $Books$Book[[2]]
## $Books$Book[[2]]$name
## [1] "Transcription: A Novel"
##
## $Books$Book[[2]]$author
## [1] "Kate Atkinson"
##
## $Books$Book[[2]]$genres
## [1] "Thriller,Military"
##
## $Books$Book[[2]]$pages
## [1] "352"
##
##
## $Books$Book[[3]]
## $Books$Book[[3]]$name
## [1] "The Talisman"
##
## $Books$Book[[3]]$author
## [1] "Stephen King,Peter Straub"
##
## $Books$Book[[3]]$genres
## [1] "Suspense, Horror"
##
## $Books$Book[[3]]$pages
## [1] "921"
# fromJSON() on the same URL: in the printed result the book records appear
# as a data frame nested under $Books$Book.
json_df <- fromJSON(json_url)
json_df
## $Books
## $Books$Book
## name author
## 1 An Absolutely Remarkable Thing: A Novel Hank Green
## 2 Transcription: A Novel Kate Atkinson
## 3 The Talisman Stephen King,Peter Straub
## genres pages
## 1 Fiction, Fantasy 352
## 2 Thriller,Military 352
## 3 Suspense, Horror 921
Read the JSON file from GitHub. In this file multiple authors are listed as separate entries inside a nested “Authors” object.
# Raw GitHub URL of the JSON file in which one book has a nested Authors object.
json_url <- "https://raw.githubusercontent.com/jjohn81/Data607/master/Week6/books_authors.json"
# read_json() returns the content as a nested list, as the printed result shows.
json_data <- read_json(json_url)
json_data
## $Books
## $Books$Book
## $Books$Book[[1]]
## $Books$Book[[1]]$name
## [1] "An Absolutely Remarkable Thing: A Novel"
##
## $Books$Book[[1]]$author
## [1] "Hank Green"
##
## $Books$Book[[1]]$genre
## [1] "Fiction, Fantasy"
##
## $Books$Book[[1]]$pages
## [1] " 352"
##
##
## $Books$Book[[2]]
## $Books$Book[[2]]$name
## [1] "Transcription: A Novel"
##
## $Books$Book[[2]]$author
## [1] "Kate Atkinson"
##
## $Books$Book[[2]]$genre
## [1] "Thriller,Military"
##
## $Books$Book[[2]]$pages
## [1] "352"
##
##
## $Books$Book[[3]]
## $Books$Book[[3]]$name
## [1] "The Talisman"
##
## $Books$Book[[3]]$Authors
## $Books$Book[[3]]$Authors$author
## $Books$Book[[3]]$Authors$author[[1]]
## [1] "Stephen King"
##
## $Books$Book[[3]]$Authors$author[[2]]
## [1] "Peter Straub"
##
##
##
## $Books$Book[[3]]$genre
## [1] "Suspense, Horror"
##
## $Books$Book[[3]]$pages
## [1] "921"
# fromJSON() on this file: the printed result shows two author columns, one
# holding the single authors and one holding the nested pair of authors.
json_df <- fromJSON(json_url)
json_df
## $Books
## $Books$Book
## name author genre
## 1 An Absolutely Remarkable Thing: A Novel Hank Green Fiction, Fantasy
## 2 Transcription: A Novel Kate Atkinson Thriller,Military
## 3 The Talisman <NA> Suspense, Horror
## pages author
## 1 352 NULL
## 2 352 NULL
## 3 921 Stephen King, Peter Straub
HTML tables are for displaying data, while the XML and JSON formats are used for storing and transporting data. JSON is lightweight and easy to parse, whereas XML is verbose and more difficult to parse. The resulting data frame structure depends on the parser/method used. For example, a nested authors node might be stored as a single data frame column (with the names comma-separated or concatenated) or as multiple columns.