# Parse XML file
# Fetch the XML source as one string and parse it into an internal-node
# document that the XPath helpers further down can query.
xmlFileUrl <- "https://raw.githubusercontent.com/binishkurian/DATA-607/master/week-05/movie.xml"
xData <- getURL(xmlFileUrl)
# asText = TRUE: xData holds the markup itself, not a path. Without it the
# parser has to guess, and an unexpected response body may be treated as a
# file name instead of being reported as malformed XML.
# The surrounding parentheses echo the parsed document.
(doc <- xmlParse(xData, asText = TRUE, useInternalNodes = TRUE))
## <?xml version="1.0"?>
## <books>
## <book>
## <title>R for Data Science: Import, Tidy, Transform, Visualize, and Model Data</title>
## <year>2017</year>
## <price>18.17</price>
## <isbn>1491910399</isbn>
## <authors>
## <author>Hadley Wickham</author>
## <author>Garrett Grolemund</author>
## </authors>
## </book>
## <book>
## <title>An Introduction to Statistical Learning: with Applications in R</title>
## <year>2013</year>
## <price>50.74</price>
## <isbn>1461471370</isbn>
## <authors>
## <author>Gareth James</author>
## <author>Daniel Witten</author>
## <author>Trevor Hastie</author>
## <author>Robert Tibshirani</author>
## </authors>
## </book>
## <book>
## <title>Advanced R</title>
## <year>2014</year>
## <price>43.78</price>
## <isbn>1466586966</isbn>
## <authors>
## <author>Hadley Wickham</author>
## </authors>
## </book>
## </books>
##
# Flatten the parsed XML into one data-frame row per (book, author) pair.
# Each <book> node yields a small data frame whose scalar fields are recycled
# across that book's authors; do.call(rbind, ...) stacks the pieces.
do.call(rbind, xpathApply(doc, "/books/book", function(node) {
  title <- xmlValue(node[["title"]])
  year <- xmlValue(node[["year"]])
  price <- as.numeric(xmlValue(node[["price"]]))
  # An ISBN is an identifier, not a quantity: keep it as text so leading
  # zeros and a trailing "X" check digit are not lost or turned into NA.
  isbn <- xmlValue(node[["isbn"]])
  author <- xpathSApply(node, "./authors/author", xmlValue)
  # A book without any <author> gives a zero-length result, which would make
  # data.frame() fail on differing row counts; keep the book with NA instead.
  if (length(author) == 0L) {
    author <- NA_character_
  }
  data.frame(title, author, year, price, isbn, stringsAsFactors = FALSE)
}))
## title
## 1 R for Data Science: Import, Tidy, Transform, Visualize, and Model Data
## 2 R for Data Science: Import, Tidy, Transform, Visualize, and Model Data
## 3 An Introduction to Statistical Learning: with Applications in R
## 4 An Introduction to Statistical Learning: with Applications in R
## 5 An Introduction to Statistical Learning: with Applications in R
## 6 An Introduction to Statistical Learning: with Applications in R
## 7 Advanced R
## author year price isbn
## 1 Hadley Wickham 2017 18.17 1491910399
## 2 Garrett Grolemund 2017 18.17 1491910399
## 3 Gareth James 2013 50.74 1461471370
## 4 Daniel Witten 2013 50.74 1461471370
## 5 Trevor Hastie 2013 50.74 1461471370
## 6 Robert Tibshirani 2013 50.74 1461471370
## 7 Hadley Wickham 2014 43.78 1466586966
# Parse HTML file
# Fetch the HTML source as one string and parse it into an internal-node
# document that the XPath helpers further down can query.
htmlFileUrl <- "https://raw.githubusercontent.com/binishkurian/DATA-607/master/week-05/movie.html"
xData <- getURL(htmlFileUrl)
# asText = TRUE: xData holds the markup itself, not a path. Without it the
# parser has to guess, and an unexpected response body may be treated as a
# file name instead of being parsed as HTML.
# The surrounding parentheses echo the parsed document.
(html_doc <- htmlParse(xData, asText = TRUE, useInternalNodes = TRUE))
## <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
## <html><body>
## <div>
## <h1>R for Data Science: Import, Tidy, Transform, Visualize, and Model Data</h1>
## <b>2017</b>
## <strong>18.17</strong>
## <i>1491910399</i>
## <p>
## <b>Hadley Wickham</b>
## <b>Garrett Grolemund</b>
## </p>
## </div>
## <div>
## <h1>An Introduction to Statistical Learning: with Applications in R</h1>
## <b>2013</b>
## <strong>50.74</strong>
## <i>1461471370</i>
## <p>
## <b>Gareth James</b>
## <b>Daniel Witten</b>
## <b>Trevor Hastie</b>
## <b>Robert Tibshirani</b>
## </p>
## </div>
## <div>
## <h1>Advanced R</h1>
## <b>2014</b>
## <strong>43.78</strong>
## <i>1466586966</i>
## <p>
## <b>Hadley Wickham</b>
## </p>
## </div>
## </body></html>
##
# Flatten the parsed HTML into one data-frame row per (book, author) pair.
# Each <div> under <body> is read as one book; its scalar fields are recycled
# across that book's authors and do.call(rbind, ...) stacks the pieces.
do.call(rbind, xpathApply(html_doc, "/html/body/div", function(node) {
  title <- xmlValue(node[["h1"]])
  # [["b"]] picks the first <b> child of the <div>, which holds the year;
  # the author <b> elements are nested inside <p> and are read separately.
  year <- xmlValue(node[["b"]])
  price <- as.numeric(xmlValue(node[["strong"]]))
  # An ISBN is an identifier, not a quantity: keep it as text so leading
  # zeros and a trailing "X" check digit are not lost or turned into NA.
  isbn <- xmlValue(node[["i"]])
  author <- xpathSApply(node, "./p/b", xmlValue)
  # A book without any author gives a zero-length result, which would make
  # data.frame() fail on differing row counts; keep the book with NA instead.
  if (length(author) == 0L) {
    author <- NA_character_
  }
  data.frame(title, year, price, isbn, author, stringsAsFactors = FALSE)
}))
## title
## 1 R for Data Science: Import, Tidy, Transform, Visualize, and Model Data
## 2 R for Data Science: Import, Tidy, Transform, Visualize, and Model Data
## 3 An Introduction to Statistical Learning: with Applications in R
## 4 An Introduction to Statistical Learning: with Applications in R
## 5 An Introduction to Statistical Learning: with Applications in R
## 6 An Introduction to Statistical Learning: with Applications in R
## 7 Advanced R
## year price isbn author
## 1 2017 18.17 1491910399 Hadley Wickham
## 2 2017 18.17 1491910399 Garrett Grolemund
## 3 2013 50.74 1461471370 Gareth James
## 4 2013 50.74 1461471370 Daniel Witten
## 5 2013 50.74 1461471370 Trevor Hastie
## 6 2013 50.74 1461471370 Robert Tibshirani
## 7 2014 43.78 1466586966 Hadley Wickham
# Parse JSON file
# Fetch the JSON source as one string, decode it into an R object, and echo
# the decoded result.
jsonFileUrl <- "https://raw.githubusercontent.com/binishkurian/DATA-607/master/week-05/movie.json"
xData <- getURL(jsonFileUrl)
doc <- fromJSON(xData)
# Top-level reference so the decoded object is auto-printed.
doc
## title
## 1 R for Data Science: Import, Tidy, Transform, Visualize, and Model Data
## 2 An Introduction to Statistical Learning: with Applications in R
## 3 Advanced R
## year price isbn
## 1 2017 18.17 1491910399
## 2 2013 50.74 1461471370
## 3 2014 43.78 1466586966
## authors
## 1 Hadley Wickham, Garrett Grolemund
## 2 Gareth James, Daniel Witten, Trevor Hastie, Robert Tibshirani
## 3 Hadley Wickham