Homework 7

I created three tables containing information on books in three formats: HTML, XML and JSON. Our goal is to load the files into R and parse them, creating dataframes.

Load Libraries

Load required libraries

library(kableExtra)
library(readxl)
library(XML)

A helper function for displaying tables

# Render `data` as a kable table captioned with `title`, styled with the
# striped / hover / condensed Bootstrap options and the "scale_down" LaTeX
# option.
showtable <- function(data, title) {
  bootstrap_opts <- c("striped", "hover", "condensed")
  base_table <- kable(data, caption = title)
  kable_styling(
    base_table,
    bootstrap_options = bootstrap_opts,
    latex_options = "scale_down"
  )
}

HTML

Reading html from url as text

# Raw GitHub location of the HTML file that holds the books table
url <- "https://raw.githubusercontent.com/Vthomps000/DATA607_VT/master/books.html"
# Download the file as a character vector, one element per line
html_text <- readLines(con = url)

Parse the HTML document and keep only the body section.

# Handler list for the parser: the handler registered for "head" nodes
# returns NULL, which removes the <head> section from the resulting tree.
h1 <- list(head = function(x) NULL)
# Parse the downloaded lines into an R-level tree using that handler
parsed_books_html <- htmlTreeParse(
  html_text,
  handlers = h1,
  asTree = TRUE
)

Navigate to the table child and inspect it

# Top-level node of the parsed document
root <- xmlRoot(parsed_books_html)
# Children of the root node
books_child <- xmlChildren(root)
# Children of the root's first child; the printed result shows this is a
# node list whose only element is the <table> node.
# NOTE(review): the name `table` masks base::table() for the rest of the
# session; it is kept because the next chunk reads it.
table <- xmlChildren(books_child[[1]])
# Print the node list to inspect the table markup
table

## $table
## <table>
##  <tr>
##   <th>title</th>
##   <th>author</th>
##   <th>yearpub</th>
##   <th>pages</th>
##   <th>price</th>
##  </tr>
##  <tr>
##   <td>The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life</td>
##   <td>Mark Mason</td>
##   <td>2016</td>
##   <td>224</td>
##   <td>23.95</td>
##  </tr>
##  <tr>
##   <td>The Art of Seduction</td>
##   <td>Robert Greene</td>
##   <td>2001</td>
##   <td>468</td>
##   <td>24.49</td>
##  </tr>
##  <tr>
##   <td>Disarming the Narcissist: Surviving and Thriving with the Self-Absorbed</td>
##   <td>Wendy T. Behary, Daniel J. Siegel</td>
##   <td>2013</td>
##   <td>249</td>
##   <td>20.99</td>
##  </tr>
## </table>
## 
## attr(,"class")
## [1] "XMLNodeList"

Get the column headers from the first tr element and the values from the others

# The <table> node: its first <tr> holds the <th> column names and every
# following <tr> holds the <td> cells of one book.
table_node <- table[[1]]
# number of <tr> rows in the table, header row included
n <- xmlSize(table_node)
stopifnot(n >= 1)
# get the column names stored in the first row
headers <- unname(xmlSApply(table_node[[1]], xmlValue))
# collect the cell text of every row after the header; seq_len(n)[-1] is
# empty when the table has no book rows, where seq(2, n, 1) would error
book_rows <- lapply(seq_len(n)[-1], function(i) {
  unname(xmlSApply(table_node[[i]], xmlValue))
})
# every book row must supply exactly one cell per header
stopifnot(all(lengths(book_rows) == length(headers)))
# arrange the cells as a character matrix, one row per book
cells <- matrix(
  as.character(unlist(book_rows)),
  ncol = length(headers),
  byrow = TRUE
)
# build the data frame in one step instead of growing it row by row.
# Assigning character vectors into the rows of a typed, empty data frame
# turned every column into character; converting the columns explicitly
# keeps the intended character / character / integer / integer / double
# layout.
books_df_html <- data.frame(
  cells[, 1],
  cells[, 2],
  as.integer(cells[, 3]),
  as.integer(cells[, 4]),
  as.numeric(cells[, 5]),
  stringsAsFactors = FALSE
)
# name the column headers
colnames(books_df_html) <- headers

Table

# Display the data frame built from the HTML tree as a captioned table
showtable(books_df_html, "Books Extracted from HTML")

Books Extracted from HTML
title	author	yearpub	pages	price
The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life	Mark Mason	2016	224	23.95
The Art of Seduction	Robert Greene	2001	468	24.49
Disarming the Narcissist: Surviving and Thriving with the Self-Absorbed	Wendy T. Behary, Daniel J. Siegel	2013	249	20.99

XPath

Creating the HTML dataframe via xpath

# Same raw GitHub HTML file as in the previous section
url <- "https://raw.githubusercontent.com/Vthomps000/DATA607_VT/master/books.html"
# Download it again as a character vector, one element per line
html_text <- readLines(con = url)

Parse the HTML document. Note that we are not using htmlTreeParse() this time.

# Parse the downloaded HTML text; the XPath queries in the next chunk run
# against this parsed document
parsed_books_html <- htmlParse(html_text)

Get the column headers from the path to /th and the values from the path to /td

# get the column headers
headers <- xpathSApply(doc = parsed_books_html, path = "/html/body/table/tr/th", fun = xmlValue)
# initialize data frame
books_df_html2 <- data.frame(character(), character(), integer(), integer(), double(), stringsAsFactors = FALSE)
# name the column headers
colnames(books_df_html2) <- headers
# extract the values as a list
values <- xpathSApply(doc = parsed_books_html, path = "/html/body/table/tr/td", fun = xmlValue)
# insert the relevant list items into the data frame
for (i in seq(0, 2, 1)) {
  books_df_html2[nrow(books_df_html2) + 1,] <- values[(1+5*i):(5+5*i)]
}

Check the data is loaded correctly

# Display the data frame built via XPath as a captioned table
showtable(books_df_html2, "Books Extracted from HTML via XPath")

Books Extracted from HTML via XPath
title	author	yearpub	pages	price
The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life	Mark Mason	2016	224	23.95
The Art of Seduction	Robert Greene	2001	468	24.49
Disarming the Narcissist: Surviving and Thriving with the Self-Absorbed	Wendy T. Behary, Daniel J. Siegel	2013	249	20.99

XML

Read and parse the XML document and inspect the contents of the root node

# Raw GitHub location of the XML file. The github.com ".../blob/..." address
# serves an HTML page rather than the XML itself, and it was never used:
# the parse read a local "books.xml" instead. Reading from the raw URL makes
# this section reproducible in the same way as the HTML and JSON sections.
url <- "https://raw.githubusercontent.com/Vthomps000/DATA607_VT/master/books.xml"
# Download the document as text, then parse it from the string
xml_text <- paste(readLines(con = url), collapse = "\n")
parsed_books_xml <- xmlTreeParse(xml_text, asText = TRUE)
# Top-level <books> node; print it to inspect the contents
root <- xmlRoot(parsed_books_xml)
root

## <books>
##  <book>
##   <title>The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life</title>
##   <author>Mark Manson</author>
##   <yearpub>2016</yearpub>
##   <pages>224</pages>
##   <price>23.95</price>
##  </book>
##  <book>
##   <title>The Art of Seduction</title>
##   <author>Robert Greene</author>
##   <yearpub>2001</yearpub>
##   <pages>468</pages>
##   <price>24.49</price>
##  </book>
##  <book>
##   <title>Disarming the Narcissist: Surviving and Thriving with the Self-Absorbed</title>
##   <author>
##    <first>Wendy T. Behary</first>
##    <second>Daniel J. Siegel</second>
##   </author>
##   <yearpub>2013</yearpub>
##   <pages>249</pages>
##   <price>20.99</price>
##  </book>
## </books>

Extract data from nodes. The names of the elements are retained.

# navigate to the books container
books <- xmlChildren(root)
# stack the book values together and create a data frame
df <- rbind(xmlSApply(books[[1]], xmlValue), xmlSApply(books[[2]], xmlValue), xmlSApply(books[[3]], xmlValue))
books_df_xml <- data.frame(df)

Table

# Display the data frame built from the XML document as a captioned table
showtable(books_df_xml, "Books Extracted from XML")

Books Extracted from XML
title	author	yearpub	pages	price
The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life	Mark Manson	2016	224	23.95
The Art of Seduction	Robert Greene	2001	468	24.49
Disarming the Narcissist: Surviving and Thriving with the Self-Absorbed	Wendy T. BeharyDaniel J. Siegel	2013	249	20.99

JSON

library(jsonlite)

# Raw GitHub location of the JSON file
url <- "https://raw.githubusercontent.com/Vthomps000/DATA607_VT/master/books.json"
# Read and parse the JSON straight from the URL, simplifying arrays to
# vectors / data frames where possible
books_jsonlite <- jsonlite::fromJSON(url, simplifyVector = TRUE)
# Flatten the parsed result into a data frame; the displayed column names
# carry a "books." prefix taken from the top-level element
books_df_json <- data.frame(books_jsonlite)

# Display the JSON-derived data frame as a captioned table
showtable(books_df_json, "Books Extracted from JSON")

Books Extracted from JSON
books.title	books.author	books.published	books.pages	books.cost
The Subtle Art of Not Giving a F*ck: A Counterintuitive Approach to Living a Good Life	Mark Manson	2016	224	23.95
Disarming the Narcissist: Surviving and Thriving with the Self-Absorbed	c(“Wendy T. Behary”, “Daniel J. Siegel”)	NA	249	20.99
The Art of Seduction	Robert Greene	2001	468	24.49

Conclusion

The data frames are not all identical. The XML table is missing the comma between the names in the case where there are two authors, and the JSON table shows a vector containing the two author names as well as column headers that carry the “books.” prefix.

DATA607- HW 7

Vanita Thompson

3/13/2020

Homework 7

Load Libraries

HTML

Table

XPath

XML

Table

JSON

Conclusion