Using Dropcanvas, the source files are located as follows: The XML file is located at the shared address “http://dropcanvas.com/l37e0/1”. The HTML file is located at the shared address “http://dropcanvas.com/w55m0/1”. The JSON file is located at the shared address “http://dropcanvas.com/r79vh/1”.
#install.packages("XML", dependencies = TRUE)
#install.packages("RJSONIO", dependencies = TRUE)
#install.packages("jsonlite", dependencies = TRUE)
#install.packages("RCurl", dependencies = TRUE)
library(XML)
## Warning: package 'XML' was built under R version 3.1.3
library(RJSONIO)
## Warning: package 'RJSONIO' was built under R version 3.1.3
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 3.1.3
##
## Attaching package: 'jsonlite'
##
## The following objects are masked from 'package:RJSONIO':
##
## fromJSON, toJSON
##
## The following object is masked from 'package:utils':
##
## View
library (RCurl)
## Loading required package: bitops
Parsing XML content: The XML file is located in the shared address “http://dropcanvas.com/l37e0/1”
Instead of parsing the content into an internal C-level structure, we can parse it into an R structure by specifying the parameter useInternalNodes = FALSE.
Parsing an XML document into an R structure:
# Parse the remote XML file into an R-level tree (useInternalNodes = FALSE)
# and print the resulting document object.
XML_doc <- xmlParse(
  file = "http://dropcanvas.com/l37e0/1",
  useInternalNodes = FALSE
)
print(XML_doc)
## $doc
## $file
## [1] "http://dropcanvas.com/l37e0/1"
##
## $version
## [1] "1.0"
##
## $children
## $children$MyfavoriteBooks
## <MyfavoriteBooks>
## <book1>
## <Title>Introduction to Algorithms</Title>
## <Author>Thomas H. Cormen; Charles E.Leiserson;
## Ronald L. Rivest;
## Clifford Stein</Author>
## <DatePublished>July 31, 2009</DatePublished>
## <ISBN>ISBN-13: 978-0262033848</ISBN>
## </book1>
## <book2>
## <Title>Operating System Concepts</Title>
## <Author>Abraham Silberschatz; Peter B. Galvin; Greg Gagne</Author>
## <DatePublished>July 29, 2008</DatePublished>
## <ISBN>ISBN-13: 978-0470128725</ISBN>
## </book2>
## <book3>
## <Title>Fundamentals of Database Systems</Title>
## <Author>Ramez Elmasri; Shamkant B. Navathe</Author>
## <DatePublished>April 9, 2010</DatePublished>
## <ISBN>ISBN-13: 978-0136086208</ISBN>
## </book3>
## </MyfavoriteBooks>
##
##
## attr(,"class")
## [1] "XMLDocumentContent"
##
## $dtd
## $external
## NULL
##
## $internal
## NULL
##
## attr(,"class")
## [1] "DTDList"
##
## attr(,"class")
## [1] "XMLDocument" "XMLAbstractDocument"
#Class
class(XML_doc)
## [1] "XMLDocument" "XMLAbstractDocument"
# xmlTreeParse() is a convenient synonym for
# xmlParse(file, useInternalNodes = FALSE), so it yields the same R structure.
XML_doc <- xmlTreeParse(file = "http://dropcanvas.com/l37e0/1")
print(XML_doc)
## $doc
## $file
## [1] "http://dropcanvas.com/l37e0/1"
##
## $version
## [1] "1.0"
##
## $children
## $children$MyfavoriteBooks
## <MyfavoriteBooks>
## <book1>
## <Title>Introduction to Algorithms</Title>
## <Author>Thomas H. Cormen; Charles E.Leiserson;
## Ronald L. Rivest;
## Clifford Stein</Author>
## <DatePublished>July 31, 2009</DatePublished>
## <ISBN>ISBN-13: 978-0262033848</ISBN>
## </book1>
## <book2>
## <Title>Operating System Concepts</Title>
## <Author>Abraham Silberschatz; Peter B. Galvin; Greg Gagne</Author>
## <DatePublished>July 29, 2008</DatePublished>
## <ISBN>ISBN-13: 978-0470128725</ISBN>
## </book2>
## <book3>
## <Title>Fundamentals of Database Systems</Title>
## <Author>Ramez Elmasri; Shamkant B. Navathe</Author>
## <DatePublished>April 9, 2010</DatePublished>
## <ISBN>ISBN-13: 978-0136086208</ISBN>
## </book3>
## </MyfavoriteBooks>
##
##
## attr(,"class")
## [1] "XMLDocumentContent"
##
## $dtd
## $external
## NULL
##
## $internal
## NULL
##
## attr(,"class")
## [1] "DTDList"
##
## attr(,"class")
## [1] "XMLDocument" "XMLAbstractDocument"
#class
class(XML_doc)
## [1] "XMLDocument" "XMLAbstractDocument"
# Extract the top-level node of the parsed document and inspect its class
XML_root <- xmlRoot(XML_doc)
print(class(XML_root))
## [1] "XMLNode" "RXMLAbstractNode" "XMLAbstractNode"
## [4] "oldClass"
XML_root
## <MyfavoriteBooks>
## <book1>
## <Title>Introduction to Algorithms</Title>
## <Author>Thomas H. Cormen; Charles E.Leiserson;
## Ronald L. Rivest;
## Clifford Stein</Author>
## <DatePublished>July 31, 2009</DatePublished>
## <ISBN>ISBN-13: 978-0262033848</ISBN>
## </book1>
## <book2>
## <Title>Operating System Concepts</Title>
## <Author>Abraham Silberschatz; Peter B. Galvin; Greg Gagne</Author>
## <DatePublished>July 29, 2008</DatePublished>
## <ISBN>ISBN-13: 978-0470128725</ISBN>
## </book2>
## <book3>
## <Title>Fundamentals of Database Systems</Title>
## <Author>Ramez Elmasri; Shamkant B. Navathe</Author>
## <DatePublished>April 9, 2010</DatePublished>
## <ISBN>ISBN-13: 978-0136086208</ISBN>
## </book3>
## </MyfavoriteBooks>
# Collect the child nodes of the root (one per book) and inspect their class
XML_childn <- xmlChildren(XML_root)
print(class(XML_childn))
## [1] "XMLNodeList"
XML_childn
## $book1
## <book1>
## <Title>Introduction to Algorithms</Title>
## <Author>Thomas H. Cormen; Charles E.Leiserson;
## Ronald L. Rivest;
## Clifford Stein</Author>
## <DatePublished>July 31, 2009</DatePublished>
## <ISBN>ISBN-13: 978-0262033848</ISBN>
## </book1>
##
## $book2
## <book2>
## <Title>Operating System Concepts</Title>
## <Author>Abraham Silberschatz; Peter B. Galvin; Greg Gagne</Author>
## <DatePublished>July 29, 2008</DatePublished>
## <ISBN>ISBN-13: 978-0470128725</ISBN>
## </book2>
##
## $book3
## <book3>
## <Title>Fundamentals of Database Systems</Title>
## <Author>Ramez Elmasri; Shamkant B. Navathe</Author>
## <DatePublished>April 9, 2010</DatePublished>
## <ISBN>ISBN-13: 978-0136086208</ISBN>
## </book3>
##
## attr(,"class")
## [1] "XMLNodeList"
# length
sapply(XML_childn, length)
## book1 book2 book3
## 4 4 4
sapply(XML_childn, xmlSize)
## book1 book2 book3
## 4 4 4
# Names in child nodes
sapply(XML_childn, names)
## book1 book2 book3
## Title "Title" "Title" "Title"
## Author "Author" "Author" "Author"
## DatePublished "DatePublished" "DatePublished" "DatePublished"
## ISBN "ISBN" "ISBN" "ISBN"
# Values in child nodes (text of all descendant elements, concatenated per book)
sapply(XML_childn, xmlValue)
## book1
## "Introduction to AlgorithmsThomas H. Cormen; Charles E.Leiserson; \nRonald L. Rivest; \nClifford SteinJuly 31, 2009ISBN-13: 978-0262033848"
## book2
## "Operating System ConceptsAbraham Silberschatz; Peter B. Galvin; Greg GagneJuly 29, 2008ISBN-13: 978-0470128725"
## book3
## "Fundamentals of Database SystemsRamez Elmasri; Shamkant B. NavatheApril 9, 2010ISBN-13: 978-0136086208"
# text values of the child elements of book 1
xmlSApply(XML_root[[1]], xmlValue)
## Title
## "Introduction to Algorithms"
## Author
## "Thomas H. Cormen; Charles E.Leiserson; \nRonald L. Rivest; \nClifford Stein"
## DatePublished
## "July 31, 2009"
## ISBN
## "ISBN-13: 978-0262033848"
# text values of the child elements of book 2
xmlSApply(XML_root[[2]], xmlValue)
## Title
## "Operating System Concepts"
## Author
## "Abraham Silberschatz; Peter B. Galvin; Greg Gagne"
## DatePublished
## "July 29, 2008"
## ISBN
## "ISBN-13: 978-0470128725"
# text values of the child elements of book 3
xmlSApply(XML_root[[3]], xmlValue)
## Title Author
## "Fundamentals of Database Systems" "Ramez Elmasri; Shamkant B. Navathe"
## DatePublished ISBN
## "April 9, 2010" "ISBN-13: 978-0136086208"
Parsing HTML content: The HTML file is located at the shared address “http://dropcanvas.com/w55m0/1”
# Parse the remote HTML document with xmlParse(), flagging the input as HTML
HTML_doc <- xmlParse(file = "http://dropcanvas.com/w55m0/1", isHTML = TRUE)
# htmlParse() is the equivalent shortcut for xmlParse(file, isHTML = TRUE);
# the result below replaces the one assigned above.
HTML_doc <- htmlParse(file = "http://dropcanvas.com/w55m0/1")
# Class
class(HTML_doc)
## [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument"
## [4] "XMLAbstractDocument"
HTML_doc
## <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
## <html><body><table border="3" cellspacing="1" cellpadding="1">
## <caption>My Favorate Books</caption>
##
##
##
## <tr>
## <th> Title</th>
## <th> Author </th>
## <th> Date Published</th>
## <th> ISBN </th>
## </tr>
## <tr>
## <td align="center"> Introduction to Algorithms </td>
## <td align="center"> Thomas H. Cormen;
## Charles E. Leiserson;
## Ronald L. Rivest;
## Clifford Stein
## </td>
## <td align="center"> July 31, 2009 </td>
## <td align="center"> ISBN-13: 978-0262033848 </td>
## </tr>
## <tr>
## <td align="center"> Operating System Concepts</td>
## <td align="center"> Abraham Silberschatz; Peter B. Galvin; Greg Gagne
## </td>
## <td align="center"> July 29, 2008</td>
## <td align="center"> ISBN-13: 978-0470128725 </td>
##
## </tr>
## <tr>
## <td align="center"> Fundamentals of Database Systems </td>
## <td align="center"> Ramez Elmasri; Shamkant B. Navathe </td>
## <td align="center"> April 9, 2010 </td>
## <td align="center"> ISBN-13: 978-0136086208 </td>
## </tr>
## </table></body></html>
##
# Re-parse the HTML into an R-level structure rather than internal nodes
HTML_doc <- htmlParse(
  file = "http://dropcanvas.com/w55m0/1",
  useInternalNodes = FALSE
)
print(class(HTML_doc))
## [1] "XMLDocumentContent"
HTML_doc
## $file
## [1] "http://dropcanvas.com/w55m0/1"
##
## $version
## [1] ""
##
## $children
## $children$html
## <html>
## <body>
## <table border="3" cellspacing="1" cellpadding="1">
## <caption>My Favorate Books</caption>
## <tr>
## <th>Title</th>
## <th>Author</th>
## <th>Date Published</th>
## <th>ISBN</th>
## </tr>
## <tr>
## <td align="center">Introduction to Algorithms</td>
## <td align="center">Thomas H. Cormen;
## Charles E. Leiserson;
## Ronald L. Rivest;
## Clifford Stein</td>
## <td align="center">July 31, 2009</td>
## <td align="center">ISBN-13: 978-0262033848</td>
## </tr>
## <tr>
## <td align="center">Operating System Concepts</td>
## <td align="center">Abraham Silberschatz; Peter B. Galvin; Greg Gagne</td>
## <td align="center">July 29, 2008</td>
## <td align="center">ISBN-13: 978-0470128725</td>
## </tr>
## <tr>
## <td align="center">Fundamentals of Database Systems</td>
## <td align="center">Ramez Elmasri; Shamkant B. Navathe</td>
## <td align="center">April 9, 2010</td>
## <td align="center">ISBN-13: 978-0136086208</td>
## </tr>
## </table>
## </body>
## </html>
##
##
## attr(,"class")
## [1] "XMLDocumentContent"
To parse the content into an R structure we can also use htmlTreeParse(), which is equivalent to htmlParse(file, useInternalNodes = FALSE):
# htmlTreeParse() returns the R-level tree directly; inspect its class
RHTML_doc <- htmlTreeParse(file = "http://dropcanvas.com/w55m0/1")
print(class(RHTML_doc))
## [1] "XMLDocumentContent"
RHTML_doc
## $file
## [1] "http://dropcanvas.com/w55m0/1"
##
## $version
## [1] ""
##
## $children
## $children$html
## <html>
## <body>
## <table border="3" cellspacing="1" cellpadding="1">
## <caption>My Favorate Books</caption>
## <tr>
## <th>Title</th>
## <th>Author</th>
## <th>Date Published</th>
## <th>ISBN</th>
## </tr>
## <tr>
## <td align="center">Introduction to Algorithms</td>
## <td align="center">Thomas H. Cormen;
## Charles E. Leiserson;
## Ronald L. Rivest;
## Clifford Stein</td>
## <td align="center">July 31, 2009</td>
## <td align="center">ISBN-13: 978-0262033848</td>
## </tr>
## <tr>
## <td align="center">Operating System Concepts</td>
## <td align="center">Abraham Silberschatz; Peter B. Galvin; Greg Gagne</td>
## <td align="center">July 29, 2008</td>
## <td align="center">ISBN-13: 978-0470128725</td>
## </tr>
## <tr>
## <td align="center">Fundamentals of Database Systems</td>
## <td align="center">Ramez Elmasri; Shamkant B. Navathe</td>
## <td align="center">April 9, 2010</td>
## <td align="center">ISBN-13: 978-0136086208</td>
## </tr>
## </table>
## </body>
## </html>
##
##
## attr(,"class")
## [1] "XMLDocumentContent"
Parsing JSON content: The JSON file is located at the shared address “http://dropcanvas.com/r79vh/1”. Convert the JSON string to an R list:
# Both RJSONIO and jsonlite are attached and each exports fromJSON(); qualify
# the call so the result does not depend on the order of the library() calls.
JSON_doc <- jsonlite::fromJSON("http://dropcanvas.com/r79vh/1")
class(JSON_doc)
## [1] "list"
JSON_doc
## $Title
## [1] "Introduction to Algorithms" "Operating System Concepts"
## [3] "Fundamentals of Database Systems"
##
## $Author
## [1] "Thomas H. Cormen; Charles E. Leiserson; Ronald L. Rivest; Clifford Stein "
## [2] "Abraham Silberschatz; Peter B. Galvin; Greg Gagne"
## [3] " Ramez Elmasri; Shamkant B. Navathe"
##
## $`Date Published`
## [1] "July 31, 2009" "July 29, 2008" "April 9, 2010"
##
## $ISBN
## [1] "978-0262033848 " "978-0470128725" "978-0136086208"