We will read HTML, XML, and JSON data and turn each source into a data frame. Let us start with the HTML data.
library(XML)
library(rjson)
library(knitr)
# Download the HTML sample only when no local copy exists, then parse it
# into a document tree and echo the parsed markup.
# NOTE(review): the URL carries a `?token=` query parameter; presumably a
# time-limited access token -- confirm it is still valid before re-running.
if (!file.exists("books.html")) {
  url <- "https://raw.githubusercontent.com/sbellows1/607/master/Week7/books.html?token=ALKCMBIMTDFUMQW6TCJWWVC6NTZPE"
  download.file(url = url, destfile = "books.html")
}
htmlbooks <- htmlParse(file = "books.html")
print(htmlbooks)
## <!DOCTYPE html>
## <html>
## <head><title>Chess Books</title></head>
## <body>
## <p>
## </p>
## <dl>
## <dt>Title</dt>
## <dd>Dvoretsky's Endgame Manual</dd>
## <dt>Author</dt>
## <dd>Mark Dvoretsky</dd>
## <dt>Topic</dt>
## <dd>Endgame</dd>
## <dt>Difficulty</dt>
## <dd>Hard</dd>
## <dt>Rating</dt>
## <dd>5/5</dd>
## </dl>
## <p>
## </p>
## <dl>
## <dt>Title</dt>
## <dd>Chess Openings for Black, Explained</dd>
## <dt>Author</dt>
## <dd>Lev Alburt</dd>
## <dd>Roman Dzindzichashvili</dd>
## <dd>Eugene Perelshteyn</dd>
## <dt>Topic</dt>
## <dd>Openings</dd>
## <dt>Difficulty</dt>
## <dd>Medium</dd>
## <dt>Rating</dt>
## <dd>3/5</dd>
## </dl>
## <p>
## </p>
## <dl>
## <dt>Title</dt>
## <dd>Logical Chess: Move by Move</dd>
## <dt>Author</dt>
## <dd>Irving Chernev</dd>
## <dt>Topic</dt>
## <dd>Middlegame</dd>
## <dt>Difficulty</dt>
## <dd>Easy</dd>
## <dt>Rating</dt>
## <dd>5/5</dd>
## </dl>
## </body>
## </html>
##
# Collect the text of every <dd> node (field values) and every <dt> node
# (field labels) from the parsed HTML document.
values <- xpathSApply(htmlbooks, "//dd", xmlValue)
cols <- xpathSApply(htmlbooks, "//dt", xmlValue)
# Build a three-row data frame (rows `book1`, `book2`, `book3`) from a flat
# vector of field values.
#
# cols:   field labels; only the first five are used as column names.
# values: flat vector of field values laid out as 5 values for the first
#         book, 7 for the second (positions 7:9 hold its three authors) and
#         5 for the third. These positions are hard-coded.
#
# Returns a data frame whose columns are list columns; the second book's
# author cell holds the three authors as a single character vector.
makeDF <- function(cols, values) {
  header <- cols[1:5]

  first_book <- values[1:5]
  third_book <- values[13:17]

  # Fold the second book's three author entries into one list element so
  # that this row also ends up with exactly five fields.
  second_book <- values[6:12]
  second_book[2] <- list(second_book[2:4])
  second_book <- second_book[-(3:4)]

  book_rows <- list(
    book1 = first_book,
    book2 = second_book,
    book3 = third_book
  )
  df <- as.data.frame(do.call(rbind, book_rows))
  names(df) <- header
  df
}
# Assemble the HTML-derived data frame and render it as a table.
htmldf <- makeDF(cols = cols, values = values)
kable(htmldf)
| | Title | Author | Topic | Difficulty | Rating |
|---|---|---|---|---|---|
| book1 | Dvoretsky’s Endgame Manual | Mark Dvoretsky | Endgame | Hard | 5/5 |
| book2 | Chess Openings for Black, Explained | c(“Lev Alburt”, “Roman Dzindzichashvili”, “Eugene Perelshteyn”) | Openings | Medium | 3/5 |
| book3 | Logical Chess: Move by Move | Irving Chernev | Middlegame | Easy | 5/5 |
Above we see the unstructured and the final structured data.
Next on this list is XML data.
# Download the XML sample only when no local copy exists, then parse it
# and echo the parsed document.
# NOTE(review): the URL carries a `?token=` query parameter; presumably a
# time-limited access token -- confirm it is still valid before re-running.
if (!file.exists("books.xml")) {
  url <- "https://raw.githubusercontent.com/sbellows1/607/master/Week7/books.xml?token=ALKCMBMAOWXTSUQDGALR42K6NTZAG"
  download.file(url = url, destfile = "books.xml")
}
xmlbooks <- xmlParse(file = "books.xml")
print(xmlbooks)
## <?xml version="1.0"?>
## <chess_books>
## <book id="1">
## <title>Dvoretsky's Endgame Manual</title>
## <author>Mark Dvoretsky</author>
## <topic>Endgame</topic>
## <difficulty>Hard</difficulty>
## <rating>5/5</rating>
## </book>
## <book id="2">
## <title>Chess Openings for Black, Explained</title>
## <author id="1">Lev Alburt</author>
## <author id="2">Roman Dzindzichashvili</author>
## <author id="3">Eugene Perelshteyn</author>
## <topic>Openings</topic>
## <difficulty>Medium</difficulty>
## <rating>3/5</rating>
## </book>
## <book id="3">
## <title>Logical Chess: Move by Move</title>
## <author>Irving Chernev</author>
## <topic>Middlegame</topic>
## <difficulty>Easy</difficulty>
## <rating>5/5</rating>
## </book>
## </chess_books>
##
# Flatten the XML document into two parallel vectors: the tag name and the
# text value of every child of every <book> element, in document order.
nodes <- xpathSApply(doc = xmlbooks, path = "//book", fun = xmlChildren)
cols <- c()
vals <- c()
for (i in seq_len(xmlSize(nodes))) {
  for (j in seq_len(xmlSize(nodes[[i]]))) {
    # The tag name must come from the same node as the value, nodes[[i]][[j]].
    # The previous index, nodes[[1]][[i]], repeated the first book's i-th tag
    # for every child, so all five column headers came out as "title".
    cols <- append(cols, xmlName(nodes[[i]][[j]]))
    vals <- append(vals, xmlValue(nodes[[i]][[j]]))
  }
}
# makeDF() keeps only the first five labels, i.e. the first book's tags.
xmlDF <- makeDF(cols, vals)
kable(xmlDF)
| | title | title | title | title | title |
|---|---|---|---|---|---|
| book1 | Dvoretsky’s Endgame Manual | Mark Dvoretsky | Endgame | Hard | 5/5 |
| book2 | Chess Openings for Black, Explained | c(“Lev Alburt”, “Roman Dzindzichashvili”, “Eugene Perelshteyn”) | Openings | Medium | 3/5 |
| book3 | Logical Chess: Move by Move | Irving Chernev | Middlegame | Easy | 5/5 |
Once again, you can compare the raw XML data to the tidied dataframe.
The last data type is JSON.
# Download the JSON sample only when no local copy exists, then read it
# into a nested list and echo that list.
# NOTE(review): the URL carries a `?token=` query parameter; presumably a
# time-limited access token -- confirm it is still valid before re-running.
if (!file.exists("books.json")) {
  url <- "https://raw.githubusercontent.com/sbellows1/607/master/Week7/books.json?token=ALKCMBM4ANKRSWNBCDHGOJK6NTZFM"
  download.file(url = url, destfile = "books.json")
}
jsonbooks <- fromJSON(file = "books.json")
print(jsonbooks)
## $`chess books`
## $`chess books`[[1]]
## $`chess books`[[1]]$title
## [1] "Dvoretsky's Endgame Manual"
##
## $`chess books`[[1]]$author
## [1] "Mark Dvoretsky"
##
## $`chess books`[[1]]$topic
## [1] "Endgame"
##
## $`chess books`[[1]]$difficulty
## [1] "Hard"
##
## $`chess books`[[1]]$rating
## [1] "5/5"
##
##
## $`chess books`[[2]]
## $`chess books`[[2]]$title
## [1] "Chess Openings for Black, Explained"
##
## $`chess books`[[2]]$author
## [1] "Lev Alburt" "Roman Dzindzichashvili" "Eugene Perelshteyn"
##
## $`chess books`[[2]]$topic
## [1] "Openings"
##
## $`chess books`[[2]]$difficulty
## [1] "Medium"
##
## $`chess books`[[2]]$rating
## [1] "3/5"
##
##
## $`chess books`[[3]]
## $`chess books`[[3]]$title
## [1] "Logical Chess: Move by Move"
##
## $`chess books`[[3]]$author
## [1] "Irving Chernev"
##
## $`chess books`[[3]]$topic
## [1] "Middlegame"
##
## $`chess books`[[3]]$difficulty
## [1] "Easy"
##
## $`chess books`[[3]]$rating
## [1] "5/5"
# Flatten the nested JSON list into one wide data frame. The code that
# follows treats columns 1:5, 6:10 and 11:15 as the three books' fields.
# NOTE(review): the second book's three authors presumably force three rows,
# with the single-valued fields recycled to match -- confirm with str(jsondf).
jsondf <- as.data.frame(jsonbooks)
library(tidyverse)
## -- Attaching packages -------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Each book occupies five consecutive columns of the wide frame. Slice out
# one block per book and drop the repeated rows within it.
jsondf1 <- distinct(jsondf[, 1:5])
jsondf2 <- distinct(jsondf[, 6:10])
jsondf3 <- distinct(jsondf[, 11:15])

# Give every block the first book's column names so the pieces can be
# stacked into one long data frame.
colnames <- names(jsondf1)
jsondf2 <- setNames(jsondf2, colnames)
jsondf3 <- setNames(jsondf3, colnames)

jsondf <- rbind(jsondf1, jsondf2, jsondf3)
kable(jsondf)
| chess.books.title | chess.books.author | chess.books.topic | chess.books.difficulty | chess.books.rating |
|---|---|---|---|---|
| Dvoretsky’s Endgame Manual | Mark Dvoretsky | Endgame | Hard | 5/5 |
| Chess Openings for Black, Explained | Lev Alburt | Openings | Medium | 3/5 |
| Chess Openings for Black, Explained | Roman Dzindzichashvili | Openings | Medium | 3/5 |
| Chess Openings for Black, Explained | Eugene Perelshteyn | Openings | Medium | 3/5 |
| Logical Chess: Move by Move | Irving Chernev | Middlegame | Easy | 5/5 |
You can see the raw JSON data and the tidied data frame. The JSON data frame differs slightly from the HTML and XML data frames: HTML and XML are closely related in structure and therefore went through very similar transformations, whereas the JSON data ends up with one row per author rather than one row per book. I could have worked further on the JSON data to make it match the HTML result exactly, but I considered this satisfactory, and the JSON data frame is still clean.