# Raw GitHub locations of the same three-book data set in each format.
books_html_url <- "https://raw.githubusercontent.com/murphystout/data-607/master/books.html"
books_xml_url <- "https://raw.githubusercontent.com/murphystout/data-607/master/books.xml"
books_json_url <- "https://raw.githubusercontent.com/murphystout/data-607/master/books.json"

# Fetch and parse the HTML page, then show the parsed document.
books_html_data <- read_html(x = books_html_url)
books_html_data
## {xml_document}
## <html>
## [1] <body>\r\n<table>\n<tr>\n<th>Book Title</th>\n<th>Author 1</th>\n<th ...
# Download the XML document as a single character string.
books_xml_data <- getURL(books_xml_url)
# The argument holds the XML content itself, not a path. Saying so with
# asText = TRUE stops xmlParse() from having to guess, so an unexpected
# response body is reported as a parse problem rather than a missing file.
books_xml <- xmlParse(file = books_xml_data, asText = TRUE)
books_xml
## <?xml version="1.0"?>
## <books>
## <book>
## <title>Data Science for Business</title>
## <author>Foster Provost, Tom Fawcett</author>
## <publisher>O'reilly</publisher>
## <subtitle>What you need to know about data mining and data-analytic thinking.</subtitle>
## <isbn>978-1-449-36132-7</isbn>
## </book>
## <book>
## <title>Data Smart</title>
## <author>John W. Forman</author>
## <publisher>Wiley</publisher>
## <subtitle>Using data science to transform information into insights.</subtitle>
## <isbn>978-1-118-66146-8</isbn>
## </book>
## <book>
## <title>R for Data Science</title>
## <author>Hadley Wickham, Garret Grolemund</author>
## <publisher>Oreilly</publisher>
## <subtitle>Import, Tidy, Transform, Visualize and Model Data</subtitle>
## <isbn>978-1-491-91039-9</isbn>
## </book>
## </books>
##
# Download the raw JSON text, then parse it into R objects and show them.
books_json_data <- getURL(url = books_json_url)
books_json <- fromJSON(txt = books_json_data)
books_json
## $books
## title authors publisher
## 1 Data Science for Business Foster Provost, Tom Fawcett Oreilly
## 2 Data Smart John W. Forman Wiley
## 3 R for Data Science Hadley Wickham, Garret Grolemund Oreilly
## subtitle
## 1 What you need to know about data mining and data-analytic thinking.
## 2 Using data science to transform information into insights.
## 3 Import, Tidy, Transform, Visualize and Model Data
## isbn
## 1 978-1-449-36132-7
## 2 978-1-118-66146-8
## 3 978-1-491-91039-9
# Select every <table> node in the parsed page and convert the selection
# with html_table() in one step.
books_table <- html_table(html_nodes(books_html_data, "table"))

# Coerce the converted table(s) to a plain data frame and display it.
table_df <- as.data.frame(books_table)
table_df
## Book.Title Author.1 Author.2 Publisher
## 1 Data Science for Business Foster Provost Tom Fawcett O'reilly
## 2 Data Smart John W. Forman Wiley
## 3 R for Data Science Hadley Wickham Garret Grolemund O'reilly
## Subtitle
## 1 What you need to know about data mining and data-analytic thinking.
## 2 Using data science to transform information into insights.
## 3 Import, Tidy, Transform, Visualize and Model Data
## ISBN
## 1 978-1-449-36132-7
## 2 978-1-118-66146-8
## 3 978-1-491-91039-9
# Take the root node of the parsed document and build a data frame from it,
# then display the result.
books_xml_root <- xmlRoot(x = books_xml)
books_xml_df <- xmlToDataFrame(doc = books_xml_root)
books_xml_df
## title author publisher
## 1 Data Science for Business Foster Provost, Tom Fawcett O'reilly
## 2 Data Smart John W. Forman Wiley
## 3 R for Data Science Hadley Wickham, Garret Grolemund Oreilly
## subtitle
## 1 What you need to know about data mining and data-analytic thinking.
## 2 Using data science to transform information into insights.
## 3 Import, Tidy, Transform, Visualize and Model Data
## isbn
## 1 978-1-449-36132-7
## 2 978-1-118-66146-8
## 3 978-1-491-91039-9
We see that the authors are contained in one column; let’s split them out:
# Split the comma-separated author string into one matrix column per author.
authors <- str_split(as.character(books_xml_df$author), ",", simplify = TRUE)

# Trim each piece: splitting on "," alone leaves the space that follows the
# comma at the front of the second author's name.
author.1 <- str_trim(authors[, 1])

# When no book has a second author the split matrix has only one column, so
# fall back to empty strings instead of indexing a column that is not there.
if (ncol(authors) >= 2) {
  author.2 <- str_trim(authors[, 2])
} else {
  author.2 <- rep("", nrow(authors))
}

# Replace the combined column with the two per-author columns.
books_xml_df$author.1 <- author.1
books_xml_df$author.2 <- author.2
books_xml_df$author <- NULL
books_xml_df
## title publisher
## 1 Data Science for Business O'reilly
## 2 Data Smart Wiley
## 3 R for Data Science Oreilly
## subtitle
## 1 What you need to know about data mining and data-analytic thinking.
## 2 Using data science to transform information into insights.
## 3 Import, Tidy, Transform, Visualize and Model Data
## isbn author.1 author.2
## 1 978-1-449-36132-7 Foster Provost Tom Fawcett
## 2 978-1-118-66146-8 John W. Forman
## 3 978-1-491-91039-9 Hadley Wickham Garret Grolemund
# Keep only the `books` element of the parsed JSON, as a data frame.
books_json <- as.data.frame(books_json[["books"]])
books_json
## title authors publisher
## 1 Data Science for Business Foster Provost, Tom Fawcett Oreilly
## 2 Data Smart John W. Forman Wiley
## 3 R for Data Science Hadley Wickham, Garret Grolemund Oreilly
## subtitle
## 1 What you need to know about data mining and data-analytic thinking.
## 2 Using data science to transform information into insights.
## 3 Import, Tidy, Transform, Visualize and Model Data
## isbn
## 1 978-1-449-36132-7
## 2 978-1-118-66146-8
## 3 978-1-491-91039-9
The authors are listed in one column; let’s split them out:
# Convert the authors column to character and break each entry on commas,
# returning a character matrix with one row per book.
authors <- str_split(
  string = as.character(books_json[["authors"]]),
  pattern = ",",
  simplify = TRUE
)
authors
## [,1] [,2]
## [1,] "c(\"Foster Provost\"" " \"Tom Fawcett\")"
## [2,] "John W. Forman" ""
## [3,] "c(\"Hadley Wickham\"" " \"Garret Grolemund\")"
# Coercing the column to character left unwanted characters behind: a
# leading c( , double quotes, and a closing ) . Remove them one pattern at a
# time, in that order, keeping the matrix shape.
authors <- Reduce(
  function(text, pattern) gsub(pattern, "", text),
  c("c\\(", "\"", "\\)"),
  authors
)
authors
## [,1] [,2]
## [1,] "Foster Provost" " Tom Fawcett"
## [2,] "John W. Forman" ""
## [3,] "Hadley Wickham" " Garret Grolemund"
# First matrix column is the first author; the second column still carries
# the space that followed the comma, so trim it.
author.1 <- authors[, 1]
author.2 <- str_trim(authors[, 2])

# Swap the combined authors column for the two per-author columns.
books_json[["author.1"]] <- author.1
books_json[["author.2"]] <- author.2
books_json[["authors"]] <- NULL
books_json
## title publisher
## 1 Data Science for Business Oreilly
## 2 Data Smart Wiley
## 3 R for Data Science Oreilly
## subtitle
## 1 What you need to know about data mining and data-analytic thinking.
## 2 Using data science to transform information into insights.
## 3 Import, Tidy, Transform, Visualize and Model Data
## isbn author.1 author.2
## 1 978-1-449-36132-7 Foster Provost Tom Fawcett
## 2 978-1-118-66146-8 John W. Forman
## 3 978-1-491-91039-9 Hadley Wickham Garret Grolemund
No, the data frames are not identical: we can see there are several differences amongst them.
The HTML table was written by hand, and since data frames are tabular, the result is essentially just a copy of the HTML table in an R data frame.
For this reason the Authors each have their own column from the outset.
For XML, the data is largely the same, but the authors are listed as a single string, separated by a comma. This needed to be split using string manipulation functions.
For the JSON, authors were listed as their own unnamed list, and when this was coerced to a data frame it resulted in a list column, with multiple authors as members of a given row. This was handled with string manipulation functions as well, although somewhat differently than the XML.