Summary

I have used the following libraries in this assignment to read HTML, XML and JSON format files.

XML

rjson

All the files are stored in the GitHub repository for DATA 607 Week 7 Assignment - Working with XML and JSON in R:

books.html - HTML format file

books.xml - XML format file

books.json - JSON format file

Please note that the step-by-step conversion to a data frame is shown only for the HTML file (the XML file is converted directly with xmlToDataFrame); the same type of conversion can also be done for the JSON data.

Load Libraries

# Clear the console (writes a form-feed character)
cat("\014")

# For each required package: attach it if it is already installed,
# otherwise install it first and then attach it
for (pkg in c("XML", "rjson")) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}
## Loading required package: XML
## Loading required package: rjson

Read HTML

# Read the books.html file from the GitHub repository, one element per line
booksHTMLFile <- readLines("https://raw.githubusercontent.com/kalyanparthasarathy/DATA607/master/books.html")

# Pattern for a table cell "<td> .... </td>" whose text consists only of
# letters, digits, spaces and the punctuation . , ( ) ' -
# The hyphen is placed last in the bracket expression so that it is taken
# literally; a hyphen between two other characters is read as a range operator.
tdPattern <- "<td>([0-9A-Za-z., ()'-]*)</td>"

# Keep only the lines that contain such a cell
matchingLines <- grep(tdPattern, booksHTMLFile, value = TRUE)

# Extract only the information that we need from the <td> tag: replace each
# matched cell by its captured text, then trim the surrounding whitespace
booksInformation <- trimws(sub(tdPattern, "\\1", matchingLines))

# Books HTML data as it is - Before formatting
booksInformation
##  [1] "Book Title"                        
##  [2] "Author(s)"                         
##  [3] "Year Published"                    
##  [4] "Publisher"                         
##  [5] "ISBN"                              
##  [6] "Who Moved My Cheese"               
##  [7] "Spencer Johnson"                   
##  [8] "1998"                              
##  [9] "G.P. Putnam's Sons"                
## [10] "0-399-14446-3"                     
## [11] "The One Minute Manager"            
## [12] "Kenneth Blanchard, Spencer Johnson"
## [13] "1983"                              
## [14] "Berkley Trade"                     
## [15] "0-425-09847-8"                     
## [16] "Rich Dad Poor Dad"                 
## [17] "Robert Kiyosaki, Sharon L. Lechter"
## [18] "2000"                              
## [19] "Warner Books Ed"                   
## [20] "0-446-67745-0"

R Structure of HTML

# Structure of the extracted HTML cell text (type, length and first elements)
str(booksInformation)
##  chr [1:20] "Book Title" "Author(s)" "Year Published" "Publisher" ...

Conversion to Dataframe

# Books information as data frame
# booksInformation holds the table cells in row order, five cells per table
# row. Cycling the labels 1..5 over the vector and splitting on them collects
# the values of each column together.
# Fail early if the cells do not form at least one complete row of five.
stopifnot(
  length(booksInformation) > 0,
  length(booksInformation) %% 5 == 0
)
booksInformationDF <- cbind.data.frame(
  split(booksInformation, rep(1:5, times = length(booksInformation) / 5)),
  stringsAsFactors = FALSE
)
names(booksInformationDF) <- c("Title", "Author", "Publication_Year", "Publisher", "ISBN")

# Drop the first row, which holds the header cells of the HTML table
booksInformationDF <- booksInformationDF[-1, ]

# HTML Contents as Dataframe
booksInformationDF
##                    Title                             Author
## 2    Who Moved My Cheese                    Spencer Johnson
## 3 The One Minute Manager Kenneth Blanchard, Spencer Johnson
## 4      Rich Dad Poor Dad Robert Kiyosaki, Sharon L. Lechter
##   Publication_Year          Publisher          ISBN
## 2             1998 G.P. Putnam's Sons 0-399-14446-3
## 3             1983      Berkley Trade 0-425-09847-8
## 4             2000    Warner Books Ed 0-446-67745-0
# Structure of HTML Books: number of rows, and the name and type of each column
str(booksInformationDF)
## 'data.frame':    3 obs. of  5 variables:
##  $ Title           : chr  "Who Moved My Cheese" "The One Minute Manager" "Rich Dad Poor Dad"
##  $ Author          : chr  "Spencer Johnson" "Kenneth Blanchard, Spencer Johnson" "Robert Kiyosaki, Sharon L. Lechter"
##  $ Publication_Year: chr  "1998" "1983" "2000"
##  $ Publisher       : chr  "G.P. Putnam's Sons" "Berkley Trade" "Warner Books Ed"
##  $ ISBN            : chr  "0-399-14446-3" "0-425-09847-8" "0-446-67745-0"

Read XML

# Reading Books XML File
# The file is first fetched from the GitHub repository and saved under a
# relative path (i.e. in the working directory); the saved copy is then parsed
booksXMLUrl <- "https://raw.githubusercontent.com/kalyanparthasarathy/DATA607/master/books.xml"
booksXMLPath <- "Books_XML_File.xml"
download.file(url = booksXMLUrl, destfile = booksXMLPath)

# Parse the local copy, then convert the parsed document to a data frame
booksXMLFile <- xmlParse(booksXMLPath)
booksXMLDF <- xmlToDataFrame(booksXMLFile)

# XML Contents
booksXMLDF
##                               author                  title price
## 1                    Spencer Johnson    Who Moved My Cheese 44.95
## 2 Kenneth Blanchard, Spencer Johnson The One Minute Manager 44.95
## 3 Robert Kiyosaki, Sharon L. Lechter      Rich Dad Poor Dad 44.95
##   published_year          publisher          isbn
## 1           1998 G.P. Putnam's Sons 0-399-14446-3
## 2           1983      Berkley Trade 0-425-09847-8
## 3           2000    Warner Books Ed 0-446-67745-0

R Structure of XML

# Structure of XML Books: number of rows, and the name and type of each column
str(booksXMLDF)
## 'data.frame':    3 obs. of  6 variables:
##  $ author        : Factor w/ 3 levels "Kenneth Blanchard, Spencer Johnson",..: 3 1 2
##  $ title         : Factor w/ 3 levels "Rich Dad Poor Dad",..: 3 2 1
##  $ price         : Factor w/ 1 level "44.95": 1 1 1
##  $ published_year: Factor w/ 3 levels "1983","1998",..: 2 1 3
##  $ publisher     : Factor w/ 3 levels "Berkley Trade",..: 2 1 3
##  $ isbn          : Factor w/ 3 levels "0-399-14446-3",..: 1 2 3

Read JSON

# Reading Books JSON File directly from its URL in the GitHub repository
booksJSONUrl <- "https://raw.githubusercontent.com/kalyanparthasarathy/DATA607/master/books.json"
jsonData <- rjson::fromJSON(file = booksJSONUrl)

R Structure of JSON

# Structure of JSON data: nested layout of the object returned by fromJSON
str(jsonData)
## List of 3
##  $ :List of 6
##   ..$ title         : chr "Who Moved My Cheese"
##   ..$ author        : chr "Spencer Johnson"
##   ..$ published_year: num 1988
##   ..$ publisher     : chr "G.P. Putnam's Sons"
##   ..$ isbn          : chr "0-399-14446-3"
##   ..$ price         : num 9.6
##  $ :List of 6
##   ..$ title         : chr "The One Minute Manager"
##   ..$ author        : chr "Kenneth Blanchard, Spencer Johnson"
##   ..$ published_year: num 1983
##   ..$ publisher     : chr "Berkley Trade"
##   ..$ isbn          : chr "0-425-09847-8"
##   ..$ price         : num 16.3
##  $ :List of 6
##   ..$ title         : chr "Rich Dad Poor Dad"
##   ..$ author        : chr "Robert Kiyosaki, Sharon L. Lechter"
##   ..$ published_year: num 2000
##   ..$ publisher     : chr "Warner Books Ed"
##   ..$ isbn          : chr "0-446-67745-0"
##   ..$ price         : num 15