Set up environment and load libraries

# Deliberately no `rm(list = ls())` here: it only removes objects from the
# global environment (attached packages and options are left untouched), so it
# does not provide a clean session, and it would silently wipe the workspace of
# anyone who sources this script interactively.

# Packages used below. The attach order is kept as-is because it determines
# which package's function wins when two packages export the same name.
library(kableExtra)
library(dplyr)
library(class)
library(knitr)
library(RCurl)
library(XML)
library(jsonlite)

HTML

# Download the raw HTML document. Despite its name, `urlHTML` holds the value
# returned by getURL() (the document text), not a URL.
urlHTML <- getURL("https://raw.githubusercontent.com/stipton/CUNY-SPS/master/DATA%20607/Week%207/books.html")

# Parse the downloaded text and print the resulting document.
# asText = TRUE states explicitly that the first argument is document content
# rather than a file name or URL, instead of leaving the parser to guess.
htmlParse(urlHTML, asText = TRUE)
## <!DOCTYPE html>
## <html>
## <head><title>Books!</title></head>
## <body>
##         <h1>My Favorite Books</h1>
##      <table style="width:100%">
## <tr>
## <th>Title</th>
##              <th>Author1</th>
##              <th>Author2</th>
##              <th>Publisher</th>
##              <th>Publication Date</th>
##              <th>Number of Pages</th>
##          </tr>
## <tr>
## <td>War and Peace</td>
##              <td>Leo Tolstoy</td> 
##              <td></td>
##              <td>Penguin Books</td>
##              <td>2005</td>
##              <td>1408</td>
##          </tr>
## <tr>
## <td>The Talented Mr. Ripley</td>
##              <td>Patricia Highsmith</td> 
##              <td></td>
##              <td>1955</td>
##              <td>Everyman's Library</td>
##              <td>290</td>
##          </tr>
## <tr>
## <td>The Whole World Was Watching</td>
##              <td>Patrick Hinds</td> 
##              <td>Romaine Patterson</td>
##              <td>Advocate Books</td>
##              <td>2005</td>
##              <td>286</td>
##          </tr>
## </table>
## </body>
## </html>
## 
# Read the tables out of the downloaded HTML text; the result is a list with
# one entry per table found, which is then printed.
tables <- readHTMLTable(doc = urlHTML)
print(tables)
## $`NULL`
##                          Title            Author1           Author2
## 1                War and Peace        Leo Tolstoy                  
## 2      The Talented Mr. Ripley Patricia Highsmith                  
## 3 The Whole World Was Watching      Patrick Hinds Romaine Patterson
##        Publisher   Publication Date Number of Pages
## 1  Penguin Books               2005            1408
## 2           1955 Everyman's Library             290
## 3 Advocate Books               2005             286

JSON

# Fetch and parse the remote JSON file. Despite its name, `urlJSON` is the
# parsed R object returned by fromJSON(), not a URL.
urlJSON <- fromJSON("https://raw.githubusercontent.com/stipton/CUNY-SPS/master/DATA%20607/Week%207/books.json")

# The first element of the parsed object holds the table of books.
booksJSON <- urlJSON[[1]]

# `authors` comes back as a list column when a book has more than one author,
# and kable() would render such an entry as the deparsed vector
# c("...", "..."). Collapse every entry into one comma-separated string so the
# rendered table shows plain author names.
if (is.list(booksJSON$authors)) {
  booksJSON$authors <- vapply(
    booksJSON$authors,
    function(a) paste(a, collapse = ", "),
    character(1)
  )
}

# Render the books as a striped HTML table.
booksJSON %>%
  kable("html", caption = "JSON File") %>%
  kable_styling(bootstrap_options = c("striped"))
JSON File
title authors publisher publication_date number_pages
War and Peace Leo Tolstoy Penguin Books 2005 1408
The Talented Mr. Ripley Patricia Highsmith Everyman’s Library 1955 290
The Whole World Was Watching c(“Patrick Hinds”, “Romaine Patterson”) Advocate Books 2005 286

XML

# Download the raw XML document. Despite its name, `urlXML` holds the value
# returned by getURL() (the document text), not a URL.
urlXML <- getURL("https://raw.githubusercontent.com/stipton/CUNY-SPS/master/DATA%20607/Week%207/books.xml")

# Parse the downloaded text. asText = TRUE states explicitly that the first
# argument is document content rather than a file name or URL, instead of
# leaving the parser to guess.
x.parsed <- xmlParse(urlXML, asText = TRUE)

# Grab the top-level element of the parsed document and show its tag name.
x.root <- xmlRoot(x.parsed)
xmlName(x.root)
## [1] "books"
# Count the child nodes directly under the root element.
xmlSize(x.root)
## [1] 3
# First child of the root's first child (here, the first book's <title> node).
x.root[[1]][[1]]
## <title>War and Peace</title>
# Convert the children of the root element into a data frame, then render it
# as a striped HTML table captioned "XML File".
x.df <- xmlToDataFrame(x.root)
kable_styling(
  kable(x.df, format = "html", caption = "XML File"),
  bootstrap_options = "striped"
)
XML File
title author publisher publication_date number_pages
War and Peace Leo Tolstoy Penguin Books 2005 1408
The Talented Mr. Ripley Patricia Highsmith Everyman’s Library 1955 290
The Whole World Was Watching Patrick Hinds, Romaine Patterson Advocate Books 2005 286