Web Data Intake

HTML

We will read HTML, XML, and JSON data and convert each source into a data frame. Let us start with the HTML data.

library(XML)
library(rjson)
library(knitr)


# Download the HTML book list only when no local copy exists, then parse it.
if (!file.exists("books.html")) {
  url <- "https://raw.githubusercontent.com/sbellows1/607/master/Week7/books.html?token=ALKCMBIMTDFUMQW6TCJWWVC6NTZPE"
  download.file(url = url, destfile = "books.html")
}
htmlbooks <- htmlParse(file = "books.html")

# Show the parsed document before any tidying is applied.
print(htmlbooks)

## <!DOCTYPE html>
## <html>
## <head><title>Chess Books</title></head>
## <body>
##    <p>
##     </p>
## <dl>
## <dt>Title</dt>
##      <dd>Dvoretsky's Endgame Manual</dd>
##      <dt>Author</dt> 
##      <dd>Mark Dvoretsky</dd>
##      <dt>Topic</dt>
##      <dd>Endgame</dd>
##      <dt>Difficulty</dt>
##      <dd>Hard</dd>
##      <dt>Rating</dt>
##      <dd>5/5</dd>
##     </dl>
## <p>
##     </p>
## <dl>
## <dt>Title</dt>
##      <dd>Chess Openings for Black, Explained</dd>
##      <dt>Author</dt> 
##      <dd>Lev Alburt</dd>
##      <dd>Roman Dzindzichashvili</dd>
##      <dd>Eugene Perelshteyn</dd>
##      <dt>Topic</dt>
##      <dd>Openings</dd>
##      <dt>Difficulty</dt>
##      <dd>Medium</dd>
##      <dt>Rating</dt>
##      <dd>3/5</dd>
##     </dl>
## <p>
##     </p>
## <dl>
## <dt>Title</dt>
##      <dd>Logical Chess: Move by Move</dd>
##      <dt>Author</dt> 
##      <dd>Irving Chernev</dd>
##      <dt>Topic</dt>
##      <dd>Middlegame</dd>
##      <dt>Difficulty</dt>
##      <dd>Easy</dd>
##      <dt>Rating</dt>
##      <dd>5/5</dd>
##     </dl>
## </body>
## </html>
##

# Collect the text of every <dt> node (field labels) and every <dd> node
# (field values) from the parsed HTML.
cols <- xpathSApply(htmlbooks, "//dt", xmlValue)
values <- xpathSApply(htmlbooks, "//dd", xmlValue)



# Assemble the three-book table from flat label and value vectors.
#
# The positions are fixed: values 1-5 describe book 1, values 6-12 describe
# book 2 (whose author field spans three entries), and values 13-17 describe
# book 3. Only the first five labels are used as column names.
#
# cols   - vector of field labels; elements 1-5 become the column names.
# values - vector of 17 field values laid out as described above.
#
# Returns a data frame with rows "book1", "book2", "book3" and five columns;
# the author cell of book 2 holds all three author names together.
makeDF <- function(cols, values) {
  # Fold the three author entries of book 2 into a single element so that
  # every book ends up with exactly five fields.
  second <- as.list(values[6:12])
  second[[2]] <- values[7:9]
  second <- second[-c(3, 4)]

  # The explicit argument names become the row names of the result.
  books <- rbind(
    book1 = values[1:5],
    book2 = second,
    book3 = values[13:17]
  )

  out <- as.data.frame(books)
  names(out) <- cols[1:5]
  out
}
# Build the tidy table from the scraped labels and values, then render it.
htmldf <- makeDF(cols = cols, values = values)
kable(htmldf)

	Title	Author	Topic	Difficulty	Rating
book1	Dvoretsky’s Endgame Manual	Mark Dvoretsky	Endgame	Hard	5/5
book2	Chess Openings for Black, Explained	c(“Lev Alburt”, “Roman Dzindzichashvili”, “Eugene Perelshteyn”)	Openings	Medium	3/5
book3	Logical Chess: Move by Move	Irving Chernev	Middlegame	Easy	5/5

Above we see the raw, unstructured HTML and the final structured data frame.

XML

Next on the list is the XML data.

# Download the XML book list only when no local copy exists, then parse it.
if (!file.exists("books.xml")) {
  url <- "https://raw.githubusercontent.com/sbellows1/607/master/Week7/books.xml?token=ALKCMBMAOWXTSUQDGALR42K6NTZAG"
  download.file(url = url, destfile = "books.xml")
}
xmlbooks <- xmlParse(file = "books.xml")

# Show the parsed document before any tidying is applied.
print(xmlbooks)

## <?xml version="1.0"?>
## <chess_books>
##   <book id="1">
##     <title>Dvoretsky's Endgame Manual</title>
##     <author>Mark Dvoretsky</author>
##     <topic>Endgame</topic>
##     <difficulty>Hard</difficulty>
##     <rating>5/5</rating>
##   </book>
##   <book id="2">
##     <title>Chess Openings for Black, Explained</title>
##     <author id="1">Lev Alburt</author>
##     <author id="2">Roman Dzindzichashvili</author>
##     <author id="3">Eugene Perelshteyn</author>
##     <topic>Openings</topic>
##     <difficulty>Medium</difficulty>
##     <rating>3/5</rating>
##   </book>
##   <book id="3">
##     <title>Logical Chess: Move by Move</title>
##     <author>Irving Chernev</author>
##     <topic>Middlegame</topic>
##     <difficulty>Easy</difficulty>
##     <rating>5/5</rating>
##   </book>
## </chess_books>
##

# Gather the child nodes of every <book> element.
nodes <- xpathSApply(doc = xmlbooks, path = "//book", fun = xmlChildren)

# Walk every child of every book and record the tag name and the text value
# of the SAME node, so labels and values stay aligned position by position.
# (Reading the name from nodes[[1]][[i]] instead repeats one tag of the first
# book for a whole inner loop and yields five "title" column names.)
# seq_len() keeps both loops from running when there is nothing to visit,
# which 1:0 would not.
cols <- c()
vals <- c()
for (i in seq_len(xmlSize(nodes))) {
  for (j in seq_len(xmlSize(nodes[[i]]))) {
    cols <- append(cols, xmlName(nodes[[i]][[j]]))
    vals <- append(vals, xmlValue(nodes[[i]][[j]]))
  }
}

# makeDF uses the first five labels as column names and the fixed 5/7/5
# value layout of the three books.
xmlDF <- makeDF(cols, vals)
kable(xmlDF)

	title	title	title	title	title
book1	Dvoretsky’s Endgame Manual	Mark Dvoretsky	Endgame	Hard	5/5
book2	Chess Openings for Black, Explained	c(“Lev Alburt”, “Roman Dzindzichashvili”, “Eugene Perelshteyn”)	Openings	Medium	3/5
book3	Logical Chess: Move by Move	Irving Chernev	Middlegame	Easy	5/5

Once again, you can compare the raw XML data to the tidied dataframe.

JSON

The last data type is JSON.

# Download the JSON book list only when no local copy exists, then parse it.
if (!file.exists("books.json")) {
  url <- "https://raw.githubusercontent.com/sbellows1/607/master/Week7/books.json?token=ALKCMBM4ANKRSWNBCDHGOJK6NTZFM"
  download.file(url = url, destfile = "books.json")
}
jsonbooks <- fromJSON(file = "books.json")

# Show the parsed structure before any tidying is applied.
print(jsonbooks)

## $`chess books`
## $`chess books`[[1]]
## $`chess books`[[1]]$title
## [1] "Dvoretsky's Endgame Manual"
## 
## $`chess books`[[1]]$author
## [1] "Mark Dvoretsky"
## 
## $`chess books`[[1]]$topic
## [1] "Endgame"
## 
## $`chess books`[[1]]$difficulty
## [1] "Hard"
## 
## $`chess books`[[1]]$rating
## [1] "5/5"
## 
## 
## $`chess books`[[2]]
## $`chess books`[[2]]$title
## [1] "Chess Openings for Black, Explained"
## 
## $`chess books`[[2]]$author
## [1] "Lev Alburt"             "Roman Dzindzichashvili" "Eugene Perelshteyn"    
## 
## $`chess books`[[2]]$topic
## [1] "Openings"
## 
## $`chess books`[[2]]$difficulty
## [1] "Medium"
## 
## $`chess books`[[2]]$rating
## [1] "3/5"
## 
## 
## $`chess books`[[3]]
## $`chess books`[[3]]$title
## [1] "Logical Chess: Move by Move"
## 
## $`chess books`[[3]]$author
## [1] "Irving Chernev"
## 
## $`chess books`[[3]]$topic
## [1] "Middlegame"
## 
## $`chess books`[[3]]$difficulty
## [1] "Easy"
## 
## $`chess books`[[3]]$rating
## [1] "5/5"

# Flatten the parsed JSON list into one wide data frame. The steps that follow
# slice it into three five-column groups (columns 1:5, 6:10, 11:15), one per
# book. NOTE(review): single-valued fields are presumably repeated to match the
# three-author book, which is why distinct() is applied afterwards -- confirm.
jsondf <- as.data.frame(jsonbooks)
library(tidyverse)

## -- Attaching packages -------------------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ----------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Each book occupies five consecutive columns of the wide frame: slice the
# three groups apart and drop the repeated rows inside each slice.
jsondf1 <- distinct(jsondf[, 1:5])
jsondf2 <- distinct(jsondf[, 6:10])
jsondf3 <- distinct(jsondf[, 11:15])

# Give every slice the column names of the first one so the rows can be
# stacked into a single table.
colnames <- names(jsondf1)
jsondf2 <- setNames(jsondf2, colnames)
jsondf3 <- setNames(jsondf3, colnames)

jsondf <- rbind(jsondf1, jsondf2, jsondf3)
kable(jsondf)

chess.books.title	chess.books.author	chess.books.topic	chess.books.difficulty	chess.books.rating
Dvoretsky’s Endgame Manual	Mark Dvoretsky	Endgame	Hard	5/5
Chess Openings for Black, Explained	Lev Alburt	Openings	Medium	3/5
Chess Openings for Black, Explained	Roman Dzindzichashvili	Openings	Medium	3/5
Chess Openings for Black, Explained	Eugene Perelshteyn	Openings	Medium	3/5
Logical Chess: Move by Move	Irving Chernev	Middlegame	Easy	5/5

You can see the raw JSON data and the tidied data frame. The JSON data frame differs slightly from the HTML and XML data frames: HTML and XML are closely related in structure and therefore went through very similar transformations, whereas the JSON result keeps one row per author instead of collapsing the three authors of the second book into a single cell. I could have worked more with the JSON data to make it match the HTML result exactly, but I thought that this was satisfactory and that the JSON data frame is still clean.

HW7

Web Data Intake

HTML

XML

JSON