Working with XML and JSON in R

Load Packages

# Chunk defaults for the whole document: hide warnings and messages, leave
# code untidied, and render figures as PNG through the cairo device.
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  tidy = FALSE,  # spelled out: F is an ordinary variable that can be reassigned
  # comment = "",
  dev = "png",
  dev.args = list(type = "cairo")
)
# Attach the packages used below. The order matters: rjson, jsonlite and
# RJSONIO each export fromJSON() and toJSON(), and (per the masking messages
# recorded below) each later attach masks the earlier ones, so an unqualified
# fromJSON()/toJSON() call resolves to RJSONIO, the package attached last.
library(XML)
library(rjson)
# NOTE(review): no RCurl function is called by name in the visible code.
library(RCurl)
## Loading required package: bitops
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 3.4.2
## 
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:rjson':
## 
##     fromJSON, toJSON
library(RJSONIO)
## 
## Attaching package: 'RJSONIO'
## The following objects are masked from 'package:jsonlite':
## 
##     fromJSON, toJSON
## The following objects are masked from 'package:rjson':
## 
##     fromJSON, toJSON
# Attach each parsing package once, silencing both start-up messages and
# warnings. The order is kept (XML, rjson, jsonlite, RJSONIO) because the
# three JSON packages export the same function names and the last one attached
# masks the others.
invisible(lapply(
  c("XML", "rjson", "jsonlite", "RJSONIO"),
  function(pkg) {
    suppressWarnings(suppressMessages(library(pkg, character.only = TRUE)))
  }
))

Reading an HTML file

# Read the raw HTML source of the book table, one element per line.
books.html <- readLines("https://raw.githubusercontent.com/doradu8030/Data607/master/books.html")

# A table cell whose opening tag, text and closing tag sit on one line. The
# cell text is captured as everything up to the next "<".
# The earlier bracket expression contained ")-\\'", where the unescaped "-"
# formed an unintended character range; a title containing ":" matched even
# though ":" was never listed. Capturing "anything but <" states the intent
# directly and does not silently drop cells with other punctuation.
html.pattern <- "<td>([^<]*)</td>"

# Tidying: keep only the lines that hold a cell, then reduce each line to its
# cell text and strip the surrounding indentation.
b.content <- grep(html.pattern, books.html, value = TRUE)
b.info <- trimws(sub(html.pattern, "\\1", b.content))

b.info
##  [1] "Book Title"                                                                     
##  [2] "Author(s)"                                                                      
##  [3] "Topic"                                                                          
##  [4] "Year Publisher"                                                                 
##  [5] "ISBN"                                                                           
##  [6] "Think Big: Unleashing Your Potential for Excellence"                            
##  [7] "Ben Carson M.D,Cecil Murphey (Contributor)"                                     
##  [8] "Personal Formula for success,Self Motivation,Hope for Good things and be honest"
##  [9] "2015"                                                                           
## [10] "978-0310343363"                                                                 
## [11] "You Can't Teach a Kid to Ride a Bike at a Seminar"                              
## [12] "David H.Sandler,John Hayes"                                                     
## [13] "Self-Confidence,Higher achivement,buyer-seller dance"                           
## [14] "1996"                                                                           
## [15] "978-0967179902"                                                                 
## [16] "R for Data Science"                                                             
## [17] "Garrett Grolemund, Hadley Wickham"                                              
## [18] "Clean data,Transforming data,Visualise"                                         
## [19] "2017"                                                                           
## [20] "978-1491910399"

Convert to a data frame

# Reshape the flat vector of cell texts into one row per table row. The cells
# are in row order with five cells per row, so the matrix is filled by row.
n_fields <- 5L
stopifnot(length(b.info) > 0, length(b.info) %% n_fields == 0)

b.infoDF <- as.data.frame(
  matrix(b.info, ncol = n_fields, byrow = TRUE),
  stringsAsFactors = FALSE
)
names(b.infoDF) <- c("Title", "Author", "Topic", "Publication_Year", "ISBN")

# The first row holds the table's own header cells rather than a book, so it
# is dropped. Row names are not reset, so the remaining rows print as 2, 3, 4.
b.infoDF <- b.infoDF[-1, ]

# HTML Contents as Dataframe
b.infoDF
##                                                 Title
## 2 Think Big: Unleashing Your Potential for Excellence
## 3   You Can't Teach a Kid to Ride a Bike at a Seminar
## 4                                  R for Data Science
##                                       Author
## 2 Ben Carson M.D,Cecil Murphey (Contributor)
## 3                 David H.Sandler,John Hayes
## 4          Garrett Grolemund, Hadley Wickham
##                                                                             Topic
## 2 Personal Formula for success,Self Motivation,Hope for Good things and be honest
## 3                            Self-Confidence,Higher achivement,buyer-seller dance
## 4                                          Clean data,Transforming data,Visualise
##   Publication_Year           ISBN
## 2             2015 978-0310343363
## 3             1996 978-0967179902
## 4             2017 978-1491910399
# Show the structure of b.infoDF: its dimensions plus each column's name and type.
str(b.infoDF)
## 'data.frame':    3 obs. of  5 variables:
##  $ Title           : chr  "Think Big: Unleashing Your Potential for Excellence" "You Can't Teach a Kid to Ride a Bike at a Seminar" "R for Data Science"
##  $ Author          : chr  "Ben Carson M.D,Cecil Murphey (Contributor)" "David H.Sandler,John Hayes" "Garrett Grolemund, Hadley Wickham"
##  $ Topic           : chr  "Personal Formula for success,Self Motivation,Hope for Good things and be honest" "Self-Confidence,Higher achivement,buyer-seller dance" "Clean data,Transforming data,Visualise"
##  $ Publication_Year: chr  "2015" "1996" "2017"
##  $ ISBN            : chr  "978-0310343363" "978-0967179902" "978-1491910399"

Reading an XML file

# Save a local copy of the XML version of the book list in the working
# directory, then work from that copy.
xml_source <- "https://raw.githubusercontent.com/doradu8030/Data607/master/books.XML"
xml_local <- "book_XML.xml"
download.file(url = xml_source, destfile = xml_local)

# Parse the saved document and flatten its records into a data frame.
book_XML.xml <- xmlParse(xml_local)
books.XMLDF <- xmlToDataFrame(book_XML.xml)
books.XMLDF
##                                                 title
## 1 Think Big: Unleashing Your Potential for Excellence
## 2   You Can't Teach a Kid to Ride a Bike at a Seminar
## 3                                  R for Data Science
##                                       author
## 1 Ben Carson M.D,Cecil Murphey (Contributor)
## 2                 David H.Sandler,John Hayes
## 3          Garrett Grolemund, Hadley Wickham
##                                                                             topic
## 1 Personal Formula for success,Self Motivation,Hope for Good things and be honest
## 2                            Self-Confidence,Higher achivement,buyer-seller dance
## 3                                          Clean data,Transforming data,Visualise
##   published_year           ISBN
## 1           2015 978-0310343363
## 2           1996 978-0967179902
## 3           2017 978-1491910399
#str(books.XMLDF)

Reading a JSON file

url.json <- "https://raw.githubusercontent.com/doradu8030/Data607/master/books3.json"

# rjson, jsonlite and RJSONIO are all attached and each exports fromJSON(), so
# an unqualified call silently depends on the order of the library() calls.
# RJSONIO is attached last and is therefore the version that was being used;
# naming it explicitly keeps the result stable if the attach order changes.
# Despite its name, json.Df is a nested list (one element per top-level JSON
# entry), not a data frame.
json.Df <- RJSONIO::fromJSON(url.json)
json.Df
## [[1]]
## [[1]]$title
## [1] "Think Big: Unleashing Your Potential for Excellence"
## 
## [[1]]$authors
## [[1]]$authors[[1]]
## named list()
## 
## [[1]]$authors[[2]]
##                        author 
## "Cecil Murphey (Contributor)" 
## 
## 
## [[1]]$topic
## [1] "Self Motivation"
## 
## [[1]]$published_year
## [1] "2015"
## 
## [[1]]$ISBN
## [1] "978-0310343363"
## 
## 
## [[2]]
## [[2]]$title
## [1] "You Can't Teach a Kid to Ride a Bike at a Seminar"
## 
## [[2]]$authors
## [[2]]$authors[[1]]
##           author: 
## "David H Sandler" 
## 
## [[2]]$authors[[2]]
##       author 
## "John Hayes" 
## 
## 
## [[2]]$topic
## [1] "Self-Confidence"
## 
## [[2]]$published_year
## [1] "1996"
## 
## [[2]]$ISBN
## [1] "978-0967179902"
## 
## 
## [[3]]
## NULL
# Commented-out experiments with turning the parsed JSON into a data frame.
#DF.json <- as.data.frame(json.Df)
#DF.json
#names(b.infoDF) <- c("Title", "Author", "Topic", "Publication_Year", "ISBN")
#Parsing JSON content
#head(json.Df[[1]], n=3)
# Render the parsed JSON as a table.
# NOTE(review): json.Df prints above as a nested list (with an empty author
# entry and a NULL third element), not a data frame, so kable() is handed a
# list here; confirm the rendered result is what is intended.
knitr::kable(json.Df)
Think Big: Unleashing Your Potential for Excellence
author Cecil Murphey (Contributor)
Self Motivation
2015
978-0310343363
You Can’t Teach a Kid to Ride a Bike at a Seminar
author: David H Sandler
author John Hayes
Self-Confidence
1996
978-0967179902
#names(bs) = c("Title", "Author", "Topic", "Published Year","ISBN-13")

```