Web Data Intake

HTML

We will read HTML, XML, and JSON data and turn each source into a data frame. Let us start with the HTML data.

library(XML)
library(rjson)
library(knitr)


# Fetch the HTML sample only when no local copy exists yet, so repeated runs
# reuse the file already on disk.
if (!file.exists('books.html')) {
  url <- 'https://raw.githubusercontent.com/sbellows1/607/master/Week7/books.html?token=ALKCMBIMTDFUMQW6TCJWWVC6NTZPE'
  download.file(url = url, destfile = 'books.html')
}

# Parse the local file into a document that the XPath helpers can query.
htmlbooks <- htmlParse('books.html')

# Show the parsed markup before anything is extracted from it.
print(htmlbooks)
## <!DOCTYPE html>
## <html>
## <head><title>Chess Books</title></head>
## <body>
##    <p>
##     </p>
## <dl>
## <dt>Title</dt>
##      <dd>Dvoretsky's Endgame Manual</dd>
##      <dt>Author</dt> 
##      <dd>Mark Dvoretsky</dd>
##      <dt>Topic</dt>
##      <dd>Endgame</dd>
##      <dt>Difficulty</dt>
##      <dd>Hard</dd>
##      <dt>Rating</dt>
##      <dd>5/5</dd>
##     </dl>
## <p>
##     </p>
## <dl>
## <dt>Title</dt>
##      <dd>Chess Openings for Black, Explained</dd>
##      <dt>Author</dt> 
##      <dd>Lev Alburt</dd>
##      <dd>Roman Dzindzichashvili</dd>
##      <dd>Eugene Perelshteyn</dd>
##      <dt>Topic</dt>
##      <dd>Openings</dd>
##      <dt>Difficulty</dt>
##      <dd>Medium</dd>
##      <dt>Rating</dt>
##      <dd>3/5</dd>
##     </dl>
## <p>
##     </p>
## <dl>
## <dt>Title</dt>
##      <dd>Logical Chess: Move by Move</dd>
##      <dt>Author</dt> 
##      <dd>Irving Chernev</dd>
##      <dt>Topic</dt>
##      <dd>Middlegame</dd>
##      <dt>Difficulty</dt>
##      <dd>Easy</dd>
##      <dt>Rating</dt>
##      <dd>5/5</dd>
##     </dl>
## </body>
## </html>
## 
# In this document every <dt> holds a field label and every <dd> holds an
# entry for the preceding label; pull both out as flat character vectors.
cols <- xpathSApply(htmlbooks, '//dt', xmlValue)
values <- xpathSApply(htmlbooks, "//dd", xmlValue)



#' Assemble the three-book table from flat vectors of labels and values.
#'
#' The layout is fixed: `values` is read as five entries for book 1, seven
#' entries for book 2 (its author field spans three entries) and five
#' entries for book 3.
#'
#' @param cols Vector of field labels; only the first five are used.
#' @param values Vector of field values in document order (positions 1-17).
#' @return A data frame with rows `book1`, `book2` and `book3` and five
#'   columns named by `cols[1:5]`. Book 2's second cell holds all three
#'   author entries together.
makeDF <- function(cols, values){
  header <- cols[1:5]

  # Book 2 occupies seven slots because its author field has three entries.
  second <- values[6:12]
  # Storing a list in one slot turns the whole row into a list, which lets
  # the three authors share a single cell.
  second[2] <- list(second[2:4])
  # Slots 3 and 4 only repeat authors that now live inside slot 2.
  second <- second[-c(3, 4)]

  rows <- list(
    book1 = values[1:5],
    book2 = second,
    book3 = values[13:17]
  )
  # The list names become the row names of the bound matrix.
  df <- as.data.frame(do.call(rbind, rows))
  setNames(df, header)
}
# Turn the extracted labels and values into the book table and render it.
htmldf <- makeDF(cols = cols, values = values)
kable(x = htmldf)
Title Author Topic Difficulty Rating
book1 Dvoretsky’s Endgame Manual Mark Dvoretsky Endgame Hard 5/5
book2 Chess Openings for Black, Explained c(“Lev Alburt”, “Roman Dzindzichashvili”, “Eugene Perelshteyn”) Openings Medium 3/5
book3 Logical Chess: Move by Move Irving Chernev Middlegame Easy 5/5

Above we see the raw, unstructured HTML and the final structured data frame.

XML

Next on the list is the XML data.

# Fetch the XML sample only when no local copy exists yet, so repeated runs
# reuse the file already on disk.
if (!file.exists('books.xml')) {
  url <- 'https://raw.githubusercontent.com/sbellows1/607/master/Week7/books.xml?token=ALKCMBMAOWXTSUQDGALR42K6NTZAG'
  download.file(url = url, destfile = 'books.xml')
}

# Parse the local file into a document that the XPath helpers can query.
xmlbooks <- xmlParse('books.xml')

# Show the parsed document before anything is extracted from it.
print(xmlbooks)
## <?xml version="1.0"?>
## <chess_books>
##   <book id="1">
##     <title>Dvoretsky's Endgame Manual</title>
##     <author>Mark Dvoretsky</author>
##     <topic>Endgame</topic>
##     <difficulty>Hard</difficulty>
##     <rating>5/5</rating>
##   </book>
##   <book id="2">
##     <title>Chess Openings for Black, Explained</title>
##     <author id="1">Lev Alburt</author>
##     <author id="2">Roman Dzindzichashvili</author>
##     <author id="3">Eugene Perelshteyn</author>
##     <topic>Openings</topic>
##     <difficulty>Medium</difficulty>
##     <rating>3/5</rating>
##   </book>
##   <book id="3">
##     <title>Logical Chess: Move by Move</title>
##     <author>Irving Chernev</author>
##     <topic>Middlegame</topic>
##     <difficulty>Easy</difficulty>
##     <rating>5/5</rating>
##   </book>
## </chess_books>
## 
# Collect the child nodes of every <book>; `nodes` has one entry per book.
nodes <- xpathSApply(doc = xmlbooks, path = "//book", fun = xmlChildren)
cols <- c()
vals <- c()
# seq_len() skips a loop entirely when there is nothing to visit, whereas
# 1:0 would iterate over c(1, 0).
for (i in seq_len(xmlSize(nodes))){
  for (j in seq_len(xmlSize(nodes[[i]]))){
    # Read the tag name and the text from the same child (book i, field j)
    # so that cols and vals stay aligned position by position.
    cols <- append(cols, xmlName(nodes[[i]][[j]]))
    vals <- append(vals, xmlValue(nodes[[i]][[j]]))
  }
}
# makeDF() labels the columns with the first five names, which are the
# field names of the first book.
xmlDF <- makeDF(cols, vals)
kable(xmlDF)
title title title title title
book1 Dvoretsky’s Endgame Manual Mark Dvoretsky Endgame Hard 5/5
book2 Chess Openings for Black, Explained c(“Lev Alburt”, “Roman Dzindzichashvili”, “Eugene Perelshteyn”) Openings Medium 3/5
book3 Logical Chess: Move by Move Irving Chernev Middlegame Easy 5/5

Once again, you can compare the raw XML data to the tidied dataframe.

JSON

The last data type is JSON.

# Fetch the JSON sample only when no local copy exists yet, so repeated runs
# reuse the file already on disk.
if (!file.exists('books.json')) {
  url <- 'https://raw.githubusercontent.com/sbellows1/607/master/Week7/books.json?token=ALKCMBM4ANKRSWNBCDHGOJK6NTZFM'
  download.file(url = url, destfile = 'books.json')
}

# Read the local file into a nested R list.
jsonbooks <- fromJSON(file = 'books.json')

# Show the parsed structure before reshaping it.
print(jsonbooks)
## $`chess books`
## $`chess books`[[1]]
## $`chess books`[[1]]$title
## [1] "Dvoretsky's Endgame Manual"
## 
## $`chess books`[[1]]$author
## [1] "Mark Dvoretsky"
## 
## $`chess books`[[1]]$topic
## [1] "Endgame"
## 
## $`chess books`[[1]]$difficulty
## [1] "Hard"
## 
## $`chess books`[[1]]$rating
## [1] "5/5"
## 
## 
## $`chess books`[[2]]
## $`chess books`[[2]]$title
## [1] "Chess Openings for Black, Explained"
## 
## $`chess books`[[2]]$author
## [1] "Lev Alburt"             "Roman Dzindzichashvili" "Eugene Perelshteyn"    
## 
## $`chess books`[[2]]$topic
## [1] "Openings"
## 
## $`chess books`[[2]]$difficulty
## [1] "Medium"
## 
## $`chess books`[[2]]$rating
## [1] "3/5"
## 
## 
## $`chess books`[[3]]
## $`chess books`[[3]]$title
## [1] "Logical Chess: Move by Move"
## 
## $`chess books`[[3]]$author
## [1] "Irving Chernev"
## 
## $`chess books`[[3]]$topic
## [1] "Middlegame"
## 
## $`chess books`[[3]]$difficulty
## [1] "Easy"
## 
## $`chess books`[[3]]$rating
## [1] "5/5"
# Flatten the nested list into a single wide data frame.
jsondf <- as.data.frame(jsonbooks)
library(tidyverse)
## -- Attaching packages -------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ----------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Columns 1-5, 6-10 and 11-15 are the five fields of books 1, 2 and 3.
# distinct() drops the repeated rows of each slice, leaving one row per
# author of that book.
jsondf1 <- distinct(jsondf[,1:5])
jsondf2 <- distinct(jsondf[,6:10])
jsondf3 <- distinct(jsondf[,11:15])

# Give every slice the first slice's column names so the rows can be stacked.
colnames <- names(jsondf1)
jsondf2 <- setNames(jsondf2, colnames)
jsondf3 <- setNames(jsondf3, colnames)

jsondf <- rbind(jsondf1, jsondf2, jsondf3)
kable(jsondf)
chess.books.title chess.books.author chess.books.topic chess.books.difficulty chess.books.rating
Dvoretsky’s Endgame Manual Mark Dvoretsky Endgame Hard 5/5
Chess Openings for Black, Explained Lev Alburt Openings Medium 3/5
Chess Openings for Black, Explained Roman Dzindzichashvili Openings Medium 3/5
Chess Openings for Black, Explained Eugene Perelshteyn Openings Medium 3/5
Logical Chess: Move by Move Irving Chernev Middlegame Easy 5/5

You can see the raw JSON data and the tidied data frame. The JSON data frame differs slightly from the HTML and XML data frames: it has one row per author, so the three-author book spans three rows, whereas the HTML and XML data frames keep all three authors in a single cell. HTML and XML are closely related in structure and therefore went through very similar transformations. I could have worked further with the JSON data to make it match the HTML data frame exactly, but I found this result satisfactory, and the JSON data frame is still clean.