I have used the following libraries in this assignment to read HTML, XML and JSON format files.
XML
rjson
All the files are stored in the GitHub repository for DATA 607 Week 7 Assignment - Working with XML and JSON in R:
books.html - HTML format file
books.xml - XML format file
books.json - JSON format file
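Please note that the HTML file is converted to a data frame by hand (regular expressions plus reshaping) and the XML file is converted with xmlToDataFrame(). The JSON data is left as a nested list, although the same type of conversion can be done for it.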
# Clear the console (emits a form-feed character)
cat("\014")
# Load the XML package; require() reports FALSE when it is missing,
# in which case the package is installed and then attached
if (!require("XML")) {
  install.packages("XML")
  library("XML")
}
## Loading required package: XML
# Load the rjson package; require() reports FALSE when it is missing,
# in which case the package is installed and then attached
if (!require("rjson")) {
  install.packages("rjson")
  library("rjson")
}
## Loading required package: rjson
# Read the books.html file, one element per line of markup
booksHTMLFile <- readLines("https://raw.githubusercontent.com/kalyanparthasarathy/DATA607/master/books.html")
# Pattern for a table cell "<td> .... </td>" whose text consists of digits,
# letters, periods, commas, spaces, parentheses, apostrophes and hyphens.
# The hyphen is listed last inside the brackets so that it is a literal
# character and cannot be read as a range operator between its neighbours;
# the other characters need no backslash escapes inside a bracket expression.
tdPattern <- "<td>([0-9A-Za-z., ()'-]*)</td>"
# Keep only the lines that contain such a cell
matchingLines <- grep(tdPattern, booksHTMLFile, value = TRUE)
# Replace each <td> element by its captured text, reusing the same pattern so
# that the filter and the extraction cannot drift apart, then strip the
# indentation left over from the markup
booksInformation <- trimws(sub(tdPattern, "\\1", matchingLines))
# Books HTML data as it is - Before formatting
booksInformation
## [1] "Book Title"
## [2] "Author(s)"
## [3] "Year Published"
## [4] "Publisher"
## [5] "ISBN"
## [6] "Who Moved My Cheese"
## [7] "Spencer Johnson"
## [8] "1998"
## [9] "G.P. Putnam's Sons"
## [10] "0-399-14446-3"
## [11] "The One Minute Manager"
## [12] "Kenneth Blanchard, Spencer Johnson"
## [13] "1983"
## [14] "Berkley Trade"
## [15] "0-425-09847-8"
## [16] "Rich Dad Poor Dad"
## [17] "Robert Kiyosaki, Sharon L. Lechter"
## [18] "2000"
## [19] "Warner Books Ed"
## [20] "0-446-67745-0"
# Inspect the extracted cells: a flat character vector that still contains
# the table's header cells as its first five elements
str(booksInformation)
## chr [1:20] "Book Title" "Author(s)" "Year Published" "Publisher" ...
# Books information as data frame
# The cells arrive row by row, five per table row (title, author, year,
# publisher, ISBN). Fail loudly if the count is not a multiple of five,
# because the reshape below would otherwise misalign the columns silently.
stopifnot(length(booksInformation) %% 5 == 0)
# Lay the cells out as a five-column matrix filled by row, then convert it to
# a data frame whose columns stay character
booksInformationDF <- as.data.frame(
  matrix(booksInformation, ncol = 5, byrow = TRUE),
  stringsAsFactors = FALSE
)
names(booksInformationDF) <- c("Title", "Author", "Publication_Year", "Publisher", "ISBN")
# The first row holds the HTML table's own header cells, so drop it
booksInformationDF <- booksInformationDF[-1, ]
# HTML Contents as Dataframe
booksInformationDF
## Title Author
## 2 Who Moved My Cheese Spencer Johnson
## 3 The One Minute Manager Kenneth Blanchard, Spencer Johnson
## 4 Rich Dad Poor Dad Robert Kiyosaki, Sharon L. Lechter
## Publication_Year Publisher ISBN
## 2 1998 G.P. Putnam's Sons 0-399-14446-3
## 3 1983 Berkley Trade 0-425-09847-8
## 4 2000 Warner Books Ed 0-446-67745-0
# Structure of HTML Books
# Every column, including Publication_Year, is held as character because the
# data frame is built from the character vector of extracted cells
str(booksInformationDF)
## 'data.frame': 3 obs. of 5 variables:
## $ Title : chr "Who Moved My Cheese" "The One Minute Manager" "Rich Dad Poor Dad"
## $ Author : chr "Spencer Johnson" "Kenneth Blanchard, Spencer Johnson" "Robert Kiyosaki, Sharon L. Lechter"
## $ Publication_Year: chr "1998" "1983" "2000"
## $ Publisher : chr "G.P. Putnam's Sons" "Berkley Trade" "Warner Books Ed"
## $ ISBN : chr "0-399-14446-3" "0-425-09847-8" "0-446-67745-0"
# Reading Books XML File
# The document is fetched from the GitHub repository into a file in the
# working directory, and the parser then reads that local copy
download.file(
  url = "https://raw.githubusercontent.com/kalyanparthasarathy/DATA607/master/books.xml",
  destfile = "Books_XML_File.xml"
)
booksXMLFile <- xmlParse("Books_XML_File.xml")
# Flatten the parsed document into a data frame
booksXMLDF <- xmlToDataFrame(booksXMLFile)
# XML Contents
booksXMLDF
## author title price
## 1 Spencer Johnson Who Moved My Cheese 44.95
## 2 Kenneth Blanchard, Spencer Johnson The One Minute Manager 44.95
## 3 Robert Kiyosaki, Sharon L. Lechter Rich Dad Poor Dad 44.95
## published_year publisher isbn
## 1 1998 G.P. Putnam's Sons 0-399-14446-3
## 2 1983 Berkley Trade 0-425-09847-8
## 3 2000 Warner Books Ed 0-446-67745-0
# Structure of XML Books
# NOTE(review): in the recorded output every column, including price and
# published_year, is a factor; convert them before using them as numbers
str(booksXMLDF)
## 'data.frame': 3 obs. of 6 variables:
## $ author : Factor w/ 3 levels "Kenneth Blanchard, Spencer Johnson",..: 3 1 2
## $ title : Factor w/ 3 levels "Rich Dad Poor Dad",..: 3 2 1
## $ price : Factor w/ 1 level "44.95": 1 1 1
## $ published_year: Factor w/ 3 levels "1983","1998",..: 2 1 3
## $ publisher : Factor w/ 3 levels "Berkley Trade",..: 2 1 3
## $ isbn : Factor w/ 3 levels "0-399-14446-3",..: 1 2 3
# Reading Books JSON File
# rjson::fromJSON() is pointed straight at the raw file URL; the result is
# kept as the parsed list rather than converted to a data frame
jsonData <- rjson::fromJSON(
  file = "https://raw.githubusercontent.com/kalyanparthasarathy/DATA607/master/books.json"
)
# Structure of JSON data
str(jsonData)
## List of 3
## $ :List of 6
## ..$ title : chr "Who Moved My Cheese"
## ..$ author : chr "Spencer Johnson"
## ..$ published_year: num 1988
## ..$ publisher : chr "G.P. Putnam's Sons"
## ..$ isbn : chr "0-399-14446-3"
## ..$ price : num 9.6
## $ :List of 6
## ..$ title : chr "The One Minute Manager"
## ..$ author : chr "Kenneth Blanchard, Spencer Johnson"
## ..$ published_year: num 1983
## ..$ publisher : chr "Berkley Trade"
## ..$ isbn : chr "0-425-09847-8"
## ..$ price : num 16.3
## $ :List of 6
## ..$ title : chr "Rich Dad Poor Dad"
## ..$ author : chr "Robert Kiyosaki, Sharon L. Lechter"
## ..$ published_year: num 2000
## ..$ publisher : chr "Warner Books Ed"
## ..$ isbn : chr "0-446-67745-0"
## ..$ price : num 15