Assignment 09

Using DropCanvas, the source files are located as follows: the XML file is at the shared address “http://dropcanvas.com/l37e0/1”, the HTML file is at the shared address “http://dropcanvas.com/w55m0/1”, and the JSON file is at the shared address “http://dropcanvas.com/r79vh/1”.

#install.packages("XML", dependencies = TRUE)
#install.packages("RJSONIO", dependencies = TRUE)
#install.packages("jsonlite", dependencies = TRUE)
#install.packages("RCurl", dependencies = TRUE)
library(XML)

## Warning: package 'XML' was built under R version 3.1.3

library(RJSONIO)

## Warning: package 'RJSONIO' was built under R version 3.1.3

library(jsonlite)

## Warning: package 'jsonlite' was built under R version 3.1.3

## 
## Attaching package: 'jsonlite'
## 
## The following objects are masked from 'package:RJSONIO':
## 
##     fromJSON, toJSON
## 
## The following object is masked from 'package:utils':
## 
##     View

library (RCurl)

## Loading required package: bitops

Parsing XML content: The XML file is located in the shared address “http://dropcanvas.com/l37e0/1”

Instead of parsing content as an internal C-level structure, we can parse it into an R structure by specifying the parameter useInternalNodes = FALSE

parsing an xml document into an R structure

# Fetch the XML file and parse it into an R-level tree (useInternalNodes =
# FALSE) instead of a C-level internal document, then display the result.
XML_doc <- xmlParse(
  file = "http://dropcanvas.com/l37e0/1",
  useInternalNodes = FALSE
)
print(XML_doc)

## $doc
## $file
## [1] "http://dropcanvas.com/l37e0/1"
## 
## $version
## [1] "1.0"
## 
## $children
## $children$MyfavoriteBooks
## <MyfavoriteBooks>
##  <book1>
##   <Title>Introduction to Algorithms</Title>
##   <Author>Thomas H. Cormen; Charles E.Leiserson; 
## Ronald L. Rivest; 
## Clifford Stein</Author>
##   <DatePublished>July 31, 2009</DatePublished>
##   <ISBN>ISBN-13: 978-0262033848</ISBN>
##  </book1>
##  <book2>
##   <Title>Operating System Concepts</Title>
##   <Author>Abraham Silberschatz; Peter B. Galvin; Greg Gagne</Author>
##   <DatePublished>July 29, 2008</DatePublished>
##   <ISBN>ISBN-13: 978-0470128725</ISBN>
##  </book2>
##  <book3>
##   <Title>Fundamentals of Database Systems</Title>
##   <Author>Ramez Elmasri; Shamkant B. Navathe</Author>
##   <DatePublished>April 9, 2010</DatePublished>
##   <ISBN>ISBN-13: 978-0136086208</ISBN>
##  </book3>
## </MyfavoriteBooks>
## 
## 
## attr(,"class")
## [1] "XMLDocumentContent"
## 
## $dtd
## $external
## NULL
## 
## $internal
## NULL
## 
## attr(,"class")
## [1] "DTDList"
## 
## attr(,"class")
## [1] "XMLDocument"         "XMLAbstractDocument"

# Inspect the class vector of the parsed XML document
print(class(XML_doc))

## [1] "XMLDocument"         "XMLAbstractDocument"

# xmlTreeParse() is a convenient synonym for
# xmlParse(file, useInternalNodes = FALSE); parse the same URL with it
# and display the resulting document.
XML_doc <- xmlTreeParse(file = "http://dropcanvas.com/l37e0/1")
print(XML_doc)

## $doc
## $file
## [1] "http://dropcanvas.com/l37e0/1"
## 
## $version
## [1] "1.0"
## 
## $children
## $children$MyfavoriteBooks
## <MyfavoriteBooks>
##  <book1>
##   <Title>Introduction to Algorithms</Title>
##   <Author>Thomas H. Cormen; Charles E.Leiserson; 
## Ronald L. Rivest; 
## Clifford Stein</Author>
##   <DatePublished>July 31, 2009</DatePublished>
##   <ISBN>ISBN-13: 978-0262033848</ISBN>
##  </book1>
##  <book2>
##   <Title>Operating System Concepts</Title>
##   <Author>Abraham Silberschatz; Peter B. Galvin; Greg Gagne</Author>
##   <DatePublished>July 29, 2008</DatePublished>
##   <ISBN>ISBN-13: 978-0470128725</ISBN>
##  </book2>
##  <book3>
##   <Title>Fundamentals of Database Systems</Title>
##   <Author>Ramez Elmasri; Shamkant B. Navathe</Author>
##   <DatePublished>April 9, 2010</DatePublished>
##   <ISBN>ISBN-13: 978-0136086208</ISBN>
##  </book3>
## </MyfavoriteBooks>
## 
## 
## attr(,"class")
## [1] "XMLDocumentContent"
## 
## $dtd
## $external
## NULL
## 
## $internal
## NULL
## 
## attr(,"class")
## [1] "DTDList"
## 
## attr(,"class")
## [1] "XMLDocument"         "XMLAbstractDocument"

# Check the class of the document produced by xmlTreeParse()
print(class(XML_doc))

## [1] "XMLDocument"         "XMLAbstractDocument"

# Extract the top-level (root) node of the document and check its class
XML_root <- xmlRoot(x = XML_doc)
print(class(XML_root))

## [1] "XMLNode"          "RXMLAbstractNode" "XMLAbstractNode" 
## [4] "oldClass"

# Display the root node together with everything nested inside it
print(XML_root)

## <MyfavoriteBooks>
##  <book1>
##   <Title>Introduction to Algorithms</Title>
##   <Author>Thomas H. Cormen; Charles E.Leiserson; 
## Ronald L. Rivest; 
## Clifford Stein</Author>
##   <DatePublished>July 31, 2009</DatePublished>
##   <ISBN>ISBN-13: 978-0262033848</ISBN>
##  </book1>
##  <book2>
##   <Title>Operating System Concepts</Title>
##   <Author>Abraham Silberschatz; Peter B. Galvin; Greg Gagne</Author>
##   <DatePublished>July 29, 2008</DatePublished>
##   <ISBN>ISBN-13: 978-0470128725</ISBN>
##  </book2>
##  <book3>
##   <Title>Fundamentals of Database Systems</Title>
##   <Author>Ramez Elmasri; Shamkant B. Navathe</Author>
##   <DatePublished>April 9, 2010</DatePublished>
##   <ISBN>ISBN-13: 978-0136086208</ISBN>
##  </book3>
## </MyfavoriteBooks>

# Collect the direct children of the root node and check the container class
XML_childn <- xmlChildren(x = XML_root)
print(class(XML_childn))

## [1] "XMLNodeList"

# Display every child node of the root, one list entry per child
print(XML_childn)

## $book1
## <book1>
##  <Title>Introduction to Algorithms</Title>
##  <Author>Thomas H. Cormen; Charles E.Leiserson; 
## Ronald L. Rivest; 
## Clifford Stein</Author>
##  <DatePublished>July 31, 2009</DatePublished>
##  <ISBN>ISBN-13: 978-0262033848</ISBN>
## </book1>
## 
## $book2
## <book2>
##  <Title>Operating System Concepts</Title>
##  <Author>Abraham Silberschatz; Peter B. Galvin; Greg Gagne</Author>
##  <DatePublished>July 29, 2008</DatePublished>
##  <ISBN>ISBN-13: 978-0470128725</ISBN>
## </book2>
## 
## $book3
## <book3>
##  <Title>Fundamentals of Database Systems</Title>
##  <Author>Ramez Elmasri; Shamkant B. Navathe</Author>
##  <DatePublished>April 9, 2010</DatePublished>
##  <ISBN>ISBN-13: 978-0136086208</ISBN>
## </book3>
## 
## attr(,"class")
## [1] "XMLNodeList"

# Apply length() to each child node of the root (one result per node)
sapply(X = XML_childn, FUN = length)

## book1 book2 book3 
##     4     4     4

# xmlSize() reports the number of sub-nodes held by each child node
sapply(X = XML_childn, FUN = xmlSize)

## book1 book2 book3 
##     4     4     4

# Element names found inside each child node, one column per child
sapply(X = XML_childn, FUN = names)

##               book1           book2           book3          
## Title         "Title"         "Title"         "Title"        
## Author        "Author"        "Author"        "Author"       
## DatePublished "DatePublished" "DatePublished" "DatePublished"
## ISBN          "ISBN"          "ISBN"          "ISBN"

# Text content of each child node; xmlValue() on a whole node returns the
# text of all nested elements run together into a single string
sapply(X = XML_childn, FUN = xmlValue)

##                                                                                                                                       book1 
## "Introduction to AlgorithmsThomas H. Cormen; Charles E.Leiserson; \nRonald L. Rivest; \nClifford SteinJuly 31, 2009ISBN-13: 978-0262033848" 
##                                                                                                                                       book2 
##                            "Operating System ConceptsAbraham Silberschatz; Peter B. Galvin; Greg GagneJuly 29, 2008ISBN-13: 978-0470128725" 
##                                                                                                                                       book3 
##                                    "Fundamentals of Database SystemsRamez Elmasri; Shamkant B. NavatheApril 9, 2010ISBN-13: 978-0136086208"

# Text value of each child element of the first book node
# (these are element values, not XML attributes)
xmlSApply(X = XML_root[[1]], FUN = xmlValue)

##                                                                         Title 
##                                                  "Introduction to Algorithms" 
##                                                                        Author 
## "Thomas H. Cormen; Charles E.Leiserson; \nRonald L. Rivest; \nClifford Stein" 
##                                                                 DatePublished 
##                                                               "July 31, 2009" 
##                                                                          ISBN 
##                                                     "ISBN-13: 978-0262033848"

# Text value of each child element of the second book node
# (these are element values, not XML attributes)
xmlSApply(X = XML_root[[2]], FUN = xmlValue)

##                                               Title 
##                         "Operating System Concepts" 
##                                              Author 
## "Abraham Silberschatz; Peter B. Galvin; Greg Gagne" 
##                                       DatePublished 
##                                     "July 29, 2008" 
##                                                ISBN 
##                           "ISBN-13: 978-0470128725"

# Text value of each child element of the third book node
# (these are element values, not XML attributes)
xmlSApply(X = XML_root[[3]], FUN = xmlValue)

##                                Title                               Author 
##   "Fundamentals of Database Systems" "Ramez Elmasri; Shamkant B. Navathe" 
##                        DatePublished                                 ISBN 
##                      "April 9, 2010"            "ISBN-13: 978-0136086208"

Parsing HTML content: The HTML file is located in the shared address “http://dropcanvas.com/w55m0/1”

# Parse the HTML page with the general XML parser switched to HTML mode
HTML_doc <- xmlParse(
  file = "http://dropcanvas.com/w55m0/1",
  isHTML = TRUE
)

# htmlParse() is the dedicated equivalent of xmlParse(file, isHTML = TRUE);
# re-parse the same page with it
HTML_doc <- htmlParse(file = "http://dropcanvas.com/w55m0/1")

# Inspect the class vector of the parsed HTML document
print(class(HTML_doc))

## [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" 
## [4] "XMLAbstractDocument"

# Display the parsed HTML document
print(HTML_doc)

## <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
## <html><body><table border="3" cellspacing="1" cellpadding="1">
## <caption>My Favorate Books</caption>
## 
## 
## 
## <tr>
## <th> Title</th>
## <th> Author </th>
## <th> Date Published</th>
## <th> ISBN </th>
## </tr>
## <tr>
## <td align="center"> Introduction to Algorithms </td>
## <td align="center"> Thomas H. Cormen;
## Charles E. Leiserson; 
## Ronald L. Rivest; 
## Clifford Stein   
##   </td>
## <td align="center"> July 31, 2009 </td>
## <td align="center"> ISBN-13: 978-0262033848 </td>
## </tr>
## <tr>
## <td align="center"> Operating System Concepts</td>
## <td align="center"> Abraham Silberschatz; Peter B. Galvin; Greg Gagne
##   </td>
## <td align="center"> July 29, 2008</td>
## <td align="center"> ISBN-13: 978-0470128725  </td>
## 
## </tr>
## <tr>
## <td align="center"> Fundamentals of Database Systems </td>
## <td align="center"> Ramez Elmasri; Shamkant B. Navathe  </td>
## <td align="center"> April 9, 2010 </td>
## <td align="center"> ISBN-13: 978-0136086208  </td>
## </tr>
## </table></body></html>
##

# Parse the HTML page again, this time into an R-level structure
# (useInternalNodes = FALSE), and check the class of the result
HTML_doc <- htmlParse(
  file = "http://dropcanvas.com/w55m0/1",
  useInternalNodes = FALSE
)
print(class(HTML_doc))

## [1] "XMLDocumentContent"

# Display the R-level HTML document
print(HTML_doc)

## $file
## [1] "http://dropcanvas.com/w55m0/1"
## 
## $version
## [1] ""
## 
## $children
## $children$html
## <html>
##  <body>
##   <table border="3" cellspacing="1" cellpadding="1">
##    <caption>My Favorate Books</caption>
##    <tr>
##     <th>Title</th>
##     <th>Author</th>
##     <th>Date Published</th>
##     <th>ISBN</th>
##    </tr>
##    <tr>
##     <td align="center">Introduction to Algorithms</td>
##     <td align="center">Thomas H. Cormen;
## Charles E. Leiserson; 
## Ronald L. Rivest; 
## Clifford Stein</td>
##     <td align="center">July 31, 2009</td>
##     <td align="center">ISBN-13: 978-0262033848</td>
##    </tr>
##    <tr>
##     <td align="center">Operating System Concepts</td>
##     <td align="center">Abraham Silberschatz; Peter B. Galvin; Greg Gagne</td>
##     <td align="center">July 29, 2008</td>
##     <td align="center">ISBN-13: 978-0470128725</td>
##    </tr>
##    <tr>
##     <td align="center">Fundamentals of Database Systems</td>
##     <td align="center">Ramez Elmasri; Shamkant B. Navathe</td>
##     <td align="center">April 9, 2010</td>
##     <td align="center">ISBN-13: 978-0136086208</td>
##    </tr>
##   </table>
##  </body>
## </html>
## 
## 
## attr(,"class")
## [1] "XMLDocumentContent"

To parse content into an R structure we can also use htmlTreeParse(), which is equivalent to htmlParse(file, useInternalNodes = FALSE).

# Parse the HTML page with htmlTreeParse() and check the class of the result
RHTML_doc <- htmlTreeParse(file = "http://dropcanvas.com/w55m0/1")
print(class(RHTML_doc))

## [1] "XMLDocumentContent"

# Display the document returned by htmlTreeParse()
print(RHTML_doc)

## $file
## [1] "http://dropcanvas.com/w55m0/1"
## 
## $version
## [1] ""
## 
## $children
## $children$html
## <html>
##  <body>
##   <table border="3" cellspacing="1" cellpadding="1">
##    <caption>My Favorate Books</caption>
##    <tr>
##     <th>Title</th>
##     <th>Author</th>
##     <th>Date Published</th>
##     <th>ISBN</th>
##    </tr>
##    <tr>
##     <td align="center">Introduction to Algorithms</td>
##     <td align="center">Thomas H. Cormen;
## Charles E. Leiserson; 
## Ronald L. Rivest; 
## Clifford Stein</td>
##     <td align="center">July 31, 2009</td>
##     <td align="center">ISBN-13: 978-0262033848</td>
##    </tr>
##    <tr>
##     <td align="center">Operating System Concepts</td>
##     <td align="center">Abraham Silberschatz; Peter B. Galvin; Greg Gagne</td>
##     <td align="center">July 29, 2008</td>
##     <td align="center">ISBN-13: 978-0470128725</td>
##    </tr>
##    <tr>
##     <td align="center">Fundamentals of Database Systems</td>
##     <td align="center">Ramez Elmasri; Shamkant B. Navathe</td>
##     <td align="center">April 9, 2010</td>
##     <td align="center">ISBN-13: 978-0136086208</td>
##    </tr>
##   </table>
##  </body>
## </html>
## 
## 
## attr(,"class")
## [1] "XMLDocumentContent"

Parsing JSON content: The JSON file is located in the shared address “http://dropcanvas.com/r79vh/1”. We convert the JSON text into an R list.

# Download the JSON file and convert it to an R list.
# Both RJSONIO and jsonlite export fromJSON(); because jsonlite is attached
# later it masks the RJSONIO version. Qualifying the call makes that choice
# explicit so the result does not depend on the order of library() calls.
JSON_doc <- jsonlite::fromJSON(txt = "http://dropcanvas.com/r79vh/1")
class(JSON_doc)

## [1] "list"

# Display the list built from the JSON file
print(JSON_doc)

## $Title
## [1] "Introduction to Algorithms"       "Operating System Concepts"       
## [3] "Fundamentals of Database Systems"
## 
## $Author
## [1] "Thomas H. Cormen; Charles E. Leiserson; Ronald L. Rivest; Clifford Stein "
## [2] "Abraham Silberschatz; Peter B. Galvin; Greg Gagne"                        
## [3] " Ramez Elmasri; Shamkant B. Navathe"                                      
## 
## $`Date Published`
## [1] "July 31, 2009" "July 29, 2008" "April 9, 2010"
## 
## $ISBN
## [1] "978-0262033848 " "978-0470128725"  "978-0136086208"

Assignment 09

Mohamed Elmoudni

Sunday, March 29, 2015