Working with HTML, XML and JSON in R

#Location

These are the files used in this assignment.

Link to HTML file

Link to XML file

Link to JSON file

#Loading libraries

# Install XML and jsonlite only when they are missing. requireNamespace()
# checks availability without attaching the package, so loading is left to
# the library() calls below, which stop with an error if a package is still
# unavailable after the install attempt.
if (!requireNamespace("XML", quietly = TRUE)) {
  install.packages("XML")
}
if (!requireNamespace("jsonlite", quietly = TRUE)) {
  install.packages("jsonlite")
}
library(XML)
library(jsonlite)

#1:HTML

#Reading HTML file

# Download the HTML source from GitHub as a character vector, one element
# per line of the file
con <- "https://raw.githubusercontent.com/bsvmelo/CUNY/master/Sci-Fi_bookshelf1.html"
bookshelf_raw <- readLines(con, warn = FALSE)
# Preview the first lines of the raw text
head(bookshelf_raw)
## [1] "<h1>Sci-fi Bookshelf</h1>"                            
## [2] "<p>This is a sample of my sci-fi book collection.</p>"
## [3] "<p>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp;</p>"      
## [4] "<table>"                                              
## [5] "<thead>"                                              
## [6] "<tr>"

#Parsing HTML

# Parse the downloaded lines into an HTML document and print the result
parsed_bookshelf <- htmlParse(bookshelf_raw)
print(parsed_bookshelf)
## <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
## <html><body>
## <h1>Sci-fi Bookshelf</h1>
## <p>This is a sample of my sci-fi book collection.</p>
## <p>          </p>
## <table>
## <thead><tr>
## <td>Title</td>
## <td>Author</td>
## <td>Publisher</td>
## <td>Publication Date</td>
## <td>Pages</td>
## <td>ISBN</td>
## </tr></thead>
## <tbody>
## <tr>
## <td>Aurora</td>
## <td>Kim Stanley Robinson</td>
## <td>Orbit</td>
## <td>July 2015</td>
## <td>512</td>
## <td>0316526991</td>
## </tr>
## <tr>
## <td>Use of Weapons</td>
## <td>Iain M. Banks</td>
## <td>MacDonald &amp; Co</td>
## <td>1990</td>
## <td>510</td>
## <td>0316030570</td>
## </tr>
## <tr>
## <td>Good Omens</td>
## <td>Terry Pratchett &amp; Neil Gaiman</td>
## <td>William Morrow Paperbacks</td>
## <td>March 2019</td>
## <td>400</td>
## <td>0062697250</td>
## </tr>
## </tbody>
## </table>
## </body></html>
## 

#Converting to R data.frame

# Extract the tables of the parsed page, keeping text as character columns
df_bookshelf <- readHTMLTable(parsed_bookshelf, stringsAsFactors = FALSE)

# The book table is the first element of the returned list
df_bookshelf[[1]]
##            Title                        Author                 Publisher
## 1         Aurora          Kim Stanley Robinson                     Orbit
## 2 Use of Weapons                 Iain M. Banks            MacDonald & Co
## 3     Good Omens Terry Pratchett & Neil Gaiman William Morrow Paperbacks
##   Publication Date Pages       ISBN
## 1        July 2015   512 0316526991
## 2             1990   510 0316030570
## 3       March 2019   400 0062697250

#Displaying

# Column names of the book table
names(df_bookshelf[[1]])
## [1] "Title"            "Author"           "Publisher"        "Publication Date"
## [5] "Pages"            "ISBN"
# Titles only
df_bookshelf[[1]][["Title"]]
## [1] "Aurora"         "Use of Weapons" "Good Omens"

#2:XML

#Reading XML file

# Download the XML source from GitHub as a character vector, one element
# per line of the file
con <- "https://raw.githubusercontent.com/bsvmelo/CUNY/master/Sci-Fi_bookshelf1.xml"
bookshelf_raw_xml <- readLines(con, warn = FALSE)
# Preview the first lines of the raw text
head(bookshelf_raw_xml)
## [1] "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"      
## [2] "<Sci-Fi_bookshelf>"                               
## [3] "        <book>"                                   
## [4] "            <Title>Aurora</Title>"                
## [5] "            <Author>Kim Stanley Robinson</Author>"
## [6] "            <Publisher>Orbit</Publisher>"

#Parsing XML

# Parse the downloaded lines into an XML document and print the result
parsed_bookshelf_xml <- xmlParse(bookshelf_raw_xml)
print(parsed_bookshelf_xml)
## <?xml version="1.0" encoding="UTF-8"?>
## <Sci-Fi_bookshelf>
##   <book>
##     <Title>Aurora</Title>
##     <Author>Kim Stanley Robinson</Author>
##     <Publisher>Orbit</Publisher>
##     <Publication-Date>July 2015</Publication-Date>
##     <Pages>512</Pages>
##     <ISBN>0316526991</ISBN>
##   </book>
##   <book>
##     <Title>Use of Weapons</Title>
##     <Author>Iain M. Banks</Author>
##     <Publisher>MacDonald &amp; Co</Publisher>
##     <Publication-Date>1990</Publication-Date>
##     <Pages>510</Pages>
##     <ISBN>0316030570</ISBN>
##   </book>
##   <book>
##     <Title>Good Omens</Title>
##     <Author>Terry Pratchett &amp; Neil Gaiman</Author>
##     <Publisher>William Morrow Paperbacks</Publisher>
##     <Publication-Date>March 2019</Publication-Date>
##     <Pages>400</Pages>
##     <ISBN>0062697250</ISBN>
##   </book>
## </Sci-Fi_bookshelf>
## 

#Converting to R data.frame

# Keep text columns as character vectors, matching the stringsAsFactors = FALSE
# used for the HTML table, so the columns are not converted to factors.
(df_bookshelf_xml <- xmlToDataFrame(parsed_bookshelf_xml,
                                    stringsAsFactors = FALSE))
##            Title                        Author                 Publisher
## 1         Aurora          Kim Stanley Robinson                     Orbit
## 2 Use of Weapons                 Iain M. Banks            MacDonald & Co
## 3     Good Omens Terry Pratchett & Neil Gaiman William Morrow Paperbacks
##   Publication-Date Pages       ISBN
## 1        July 2015   512 0316526991
## 2             1990   510 0316030570
## 3       March 2019   400 0062697250

#Displaying

df_bookshelf_xml$Title
## [1] "Aurora"         "Use of Weapons" "Good Omens"
# Backticks are needed because the column name contains a hyphen
df_bookshelf_xml$`Publication-Date`
## [1] "July 2015"  "1990"       "March 2019"
df_bookshelf_xml$Author
## [1] "Kim Stanley Robinson"          "Iain M. Banks"                
## [3] "Terry Pratchett & Neil Gaiman"

#3:JSON

#Reading JSON

# Download the JSON source from GitHub as a character vector, one element
# per line of the file
con <- "https://raw.githubusercontent.com/bsvmelo/CUNY/master/sci-fi_bookshelf2.json"
bookshelf_raw_js <- readLines(con, warn = FALSE)
# Preview the first lines of the raw text
head(bookshelf_raw_js)
## [1] "{"                                                    
## [2] "    \"Sci-Fi_bookshelf\":"                            
## [3] "        ["                                            
## [4] "            {"                                        
## [5] "                \"Title\": \"Aurora\","               
## [6] "                \"Author\": \"Kim Stanley Robinson\","

#Converting to R data.frame

# Parse the JSON text; the result is a list whose `Sci-Fi_bookshelf` element
# holds the book table
bookshelf_js <- fromJSON(bookshelf_raw_js)
bookshelf_js
## $`Sci-Fi_bookshelf`
##            Title                        Author                 Publisher
## 1         Aurora          Kim Stanley Robinson                     Orbit
## 2 Use of Weapons                 Iain M. Banks            MacDonald & Co
## 3     Good Omens Terry Pratchett & Neil Gaiman William Morrow Paperbacks
##   Publication-Date Pages       ISBN
## 1        July 2015   512 0316526991
## 2             1990   510 0316030570
## 3       March 2019   400 0062697250

#Displaying

# Titles from the JSON-derived table
bookshelf_js[["Sci-Fi_bookshelf"]][["Title"]]
## [1] "Aurora"         "Use of Weapons" "Good Omens"
# Publishers from the JSON-derived table
bookshelf_js[["Sci-Fi_bookshelf"]][["Publisher"]]
## [1] "Orbit"                     "MacDonald & Co"           
## [3] "William Morrow Paperbacks"

Conclusion

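All three formats yield the same three books with the same six fields. The resulting objects are not strictly identical, however: the HTML table names its fourth column `Publication Date`, while the XML and JSON versions use `Publication-Date`, and `readHTMLTable()` and `fromJSON()` each return a list that contains the data frame rather than the data frame itself.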