Parsing

library(httr)
# Download the example page; `fortune` holds the httr response object.
url <- "http://www.r-datacollection.com/materials/ch-4-xpath/fortunes/fortunes.html"
fortune <- httr::GET(url)
# Fail here on an HTTP error status instead of parsing an error page later.
# stop_for_status() returns the response invisibly, so nothing extra prints.
httr::stop_for_status(fortune)
fortune
## Response [http://www.r-datacollection.com/materials/ch-4-xpath/fortunes/fortunes.html]
##   Date: 2021-10-10 14:36
##   Status: 200
##   Content-Type: text/html; charset=UTF-8
##   Size: 776 B
## <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
## <html> <head>
## <title>Collected R wisdoms</title>
## </head>
## 
## <body>
## <div id="R Inventor" lang="english" date="June/2003">
##   <h1>Robert Gentleman</h1>
##   <p><i>'What we have is nice, but we need something very different'</i></p>
##   <p><b>Source: </b>Statistical Computing 2003, Reisensburg</p>
## ...
# Check the class of the object returned by httr::GET().
class(fortune)
## [1] "response"
library(XML)
# Parse the downloaded page into a document tree that XPath queries can run
# against. The encoding is given explicitly; it matches the charset reported
# in the response's Content-Type header.
parsed_fortune <- htmlParse(fortune, encoding = "UTF-8")
class(parsed_fortune)
## [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" 
## [4] "XMLAbstractDocument"
# Print the parsed document.
parsed_fortune
## <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
## <html>
## <head><title>Collected R wisdoms</title></head>
## <body>
## <div id="R Inventor" lang="english" date="June/2003">
##   <h1>Robert Gentleman</h1>
##   <p><i>'What we have is nice, but we need something very different'</i></p>
##   <p><b>Source: </b>Statistical Computing 2003, Reisensburg</p>
## </div>
## 
## <div lang="english" date="October/2011">
##   <h1>Rolf Turner</h1>
##   <p><i>'R is wonderful, but it cannot work magic'</i> <br><emph>answering a request for automatic generation of 'data from a known mean and 95% CI'</emph></p>
##   <p><b>Source: </b><a href="https://stat.ethz.ch/mailman/listinfo/r-help">R-help</a></p>
## </div>
## 
## <address>
## <a href="http://www.r-datacollectionbook.com"><i>The book homepage</i></a><a></a>
## </address>
## 
## </body>
## </html>
## 

Node relations

  • The family tree analogy

node1/relation::node2

# ancestor:: axis: every <div> that is an ancestor of an <a> node.
xpathSApply(parsed_fortune, "//a/ancestor::div")
## [[1]]
## <div lang="english" date="October/2011">
##   <h1>Rolf Turner</h1>
##   <p><i>'R is wonderful, but it cannot work magic'</i> <br/><emph>answering a request for automatic generation of 'data from a known mean and 95% CI'</emph></p>
##   <p><b>Source: </b><a href="https://stat.ethz.ch/mailman/listinfo/r-help">R-help</a></p>
## </div>
# Every <i> node inside a <div> that is an ancestor of an <a> node.
xpathSApply(parsed_fortune, "//a/ancestor::div//i")
## [[1]]
## <i>'R is wonderful, but it cannot work magic'</i>
# preceding-sibling:: axis: <h1> nodes that precede a <p> under the same parent.
xpathSApply(parsed_fortune, "//p/preceding-sibling::h1")
## [[1]]
## <h1>Robert Gentleman</h1> 
## 
## [[2]]
## <h1>Rolf Turner</h1>
# parent:: axis with the * wildcard: the parent of <title>, whatever its name.
xpathSApply(parsed_fortune, "//title/parent::*")
## [[1]]
## <head>
##   <title>Collected R wisdoms</title>
## </head>
Visualizing node relations

Visualizing node relations

XPath predicates

Predicates are simple functions that are applied to a node’s name, value, or attribute, and which evaluate whether a condition is true or false.

After a node (or node set), we specify the predicate in square brackets: node1[predicate]. This selects all nodes in the document that satisfy the condition formulated by the predicate.

  1. Numerical predicates
# position()=1: the first <p> child of each <div>.
xpathSApply(parsed_fortune, "//div/p[position()=1]")
## [[1]]
## <p>
##   <i>'What we have is nice, but we need something very different'</i>
## </p> 
## 
## [[2]]
## <p><i>'R is wonderful, but it cannot work magic'</i> <br/><emph>answering a request for automatic generation of 'data from a known mean and 95% CI'</emph></p>
# A bare number in a predicate is shorthand for position()=<number>.
xpathSApply(parsed_fortune, "//div/p[1]")
## [[1]]
## <p>
##   <i>'What we have is nice, but we need something very different'</i>
## </p> 
## 
## [[2]]
## <p><i>'R is wonderful, but it cannot work magic'</i> <br/><emph>answering a request for automatic generation of 'data from a known mean and 95% CI'</emph></p>
# last(): the last <p> child of each <div>.
xpathSApply(parsed_fortune, "//div/p[last()]")
## [[1]]
## <p><b>Source: </b>Statistical Computing 2003, Reisensburg</p> 
## 
## [[2]]
## <p>
##   <b>Source: </b>
##   <a href="https://stat.ethz.ch/mailman/listinfo/r-help">R-help</a>
## </p>
# last()-1: the second-to-last <p> child of each <div>.
xpathSApply(parsed_fortune, "//div/p[last()-1]")
## [[1]]
## <p>
##   <i>'What we have is nice, but we need something very different'</i>
## </p> 
## 
## [[2]]
## <p><i>'R is wonderful, but it cannot work magic'</i> <br/><emph>answering a request for automatic generation of 'data from a known mean and 95% CI'</emph></p>
# count(): <div> nodes that contain at least one <a> descendant.
xpathSApply(parsed_fortune, "//div[count(.//a)>0]")
## [[1]]
## <div lang="english" date="October/2011">
##   <h1>Rolf Turner</h1>
##   <p><i>'R is wonderful, but it cannot work magic'</i> <br/><emph>answering a request for automatic generation of 'data from a known mean and 95% CI'</emph></p>
##   <p><b>Source: </b><a href="https://stat.ethz.ch/mailman/listinfo/r-help">R-help</a></p>
## </div>

The @ symbol selects attributes of a node: @name picks the attribute called name, while the ./@* expression returns all attributes of the currently selected nodes, regardless of their names.

# <div> nodes that carry more than two attributes.
xpathSApply(parsed_fortune, "//div[count(./@*)>2]")
## [[1]]
## <div id="R Inventor" lang="english" date="June/2003">
##   <h1>Robert Gentleman</h1>
##   <p><i>'What we have is nice, but we need something very different'</i></p>
##   <p><b>Source: </b>Statistical Computing 2003, Reisensburg</p>
## </div>
# not() negates the predicate: <div> nodes with at most two attributes.
xpathSApply(parsed_fortune, "//div[not(count(./@*)>2)]")
## [[1]]
## <div lang="english" date="October/2011">
##   <h1>Rolf Turner</h1>
##   <p><i>'R is wonderful, but it cannot work magic'</i> <br/><emph>answering a request for automatic generation of 'data from a known mean and 95% CI'</emph></p>
##   <p><b>Source: </b><a href="https://stat.ethz.ch/mailman/listinfo/r-help">R-help</a></p>
## </div>
  2. Textual predicates

Textual properties of the document are useful predicates for node selection.

# <div> nodes whose date attribute is exactly 'October/2011'.
xpathSApply(parsed_fortune, "//div[@date='October/2011']")
## [[1]]
## <div lang="english" date="October/2011">
##   <h1>Rolf Turner</h1>
##   <p><i>'R is wonderful, but it cannot work magic'</i> <br/><emph>answering a request for automatic generation of 'data from a known mean and 95% CI'</emph></p>
##   <p><b>Source: </b><a href="https://stat.ethz.ch/mailman/listinfo/r-help">R-help</a></p>
## </div>

string_method(text1, 'text2')

# contains(): nodes of any name with text containing the substring 'magic'.
xpathSApply(parsed_fortune, "//*[contains(text(), 'magic')]")
## [[1]]
## <i>'R is wonderful, but it cannot work magic'</i>
# starts-with(): <div> nodes whose id attribute begins with 'R'.
xpathSApply(parsed_fortune, "//div[starts-with(./@id, 'R')]")
## [[1]]
## <div id="R Inventor" lang="english" date="June/2003">
##   <h1>Robert Gentleman</h1>
##   <p><i>'What we have is nice, but we need something very different'</i></p>
##   <p><b>Source: </b>Statistical Computing 2003, Reisensburg</p>
## </div>
# substring-after(): <i> nodes inside <div>s whose date attribute reads '2003'
# after the '/'.
xpathSApply(parsed_fortune, "//div[substring-after(./@date, '/')='2003']//i")
## [[1]]
## <i>'What we have is nice, but we need something very different'</i>

Extracting node elements

# fun = xmlValue: return the value (text) of each matched node, not the node.
xpathSApply(parsed_fortune, "//title", fun = xmlValue)
## [1] "Collected R wisdoms"
# xmlAttrs: return all attributes of each <div> as a named character vector.
xpathSApply(parsed_fortune, "//div", xmlAttrs)
## [[1]]
##           id         lang         date 
## "R Inventor"    "english"  "June/2003" 
## 
## [[2]]
##           lang           date 
##      "english" "October/2011"
# Extra arguments are handed on to fun: fetch only the lang attribute of each
# <div>.
xpathSApply(parsed_fortune, "//div", xmlGetAttr, "lang")
## [1] "english" "english"

Extending the fun argument

# Lower-case the value of a node.
#
# x: an XML node, as handed to the `fun` argument by xpathSApply().
# Returns the result of xmlValue(x) converted to lower case.
lowerCaseFun <- function(x) {
  node_text <- xmlValue(x)
  tolower(node_text)
}

# Apply the custom lowerCaseFun to every <i> node inside a <div>.
xpathSApply(parsed_fortune, "//div//i", fun = lowerCaseFun)
## [1] "'what we have is nice, but we need something very different'"
## [2] "'r is wonderful, but it cannot work magic'"
# Attach stringr once at script level. library() stops with an error when the
# package is missing, whereas require() inside the function only returned
# FALSE and re-ran on every node.
library(stringr)

# Extract the four-digit year from a node's "date" attribute.
#
# x: an XML node, as handed to the `fun` argument by xpathSApply().
# Returns the first run of four digits found in the attribute value, as a
# character string.
dateFun <- function(x) {
  date <- xmlGetAttr(node = x, name = "date")
  str_extract(date, "[0-9]{4}")
}

# Apply dateFun to every <div>.
xpathSApply(parsed_fortune, "//div", dateFun)
## [1] "2003" "2011"
# Return a node's "id" attribute, or "not specified" when the node has none.
#
# x: an XML node, as handed to the `fun` argument by xpathSApply().
idFun <- function(x) {
  id <- xmlGetAttr(x, "id")
  # The NULL check is a single scalar test, so a plain if/else is used;
  # ifelse() is meant for element-wise work on vectors.
  if (is.null(id)) "not specified" else id
}

# Apply idFun to every <div>; nodes without an id yield "not specified".
xpathSApply(parsed_fortune, "//div", idFun)
## [1] "R Inventor"    "not specified"

XPath helper tool