Working with HTML, XML, JSON

Data acquisition and management

CUNY MSDS DATA 607

Rose Koh

2018/03/12

Load required libraries

library(XML)
library(RCurl)
library(rjson)
library(rvest)
library(rlist)
library(plyr)
library(jsonlite)
library(knitr)

Read .html file

html.url <- "https://raw.githubusercontent.com/silverrainb/dat607/master/week7/booklist_2.html"

# rvest: parse the page, pick out the <table> nodes, and convert the first
# one to a data frame.
file <- read_html(html.url)
tables <- html_nodes(file, "table")
# tables[1] keeps the node-set wrapper, so html_table() returns a list
# holding one table rather than a bare data frame.
list.table <- html_table(tables[1], fill = TRUE)
# The argument is spelled `stringsAsFactors`; the previous misspelling was
# silently ignored, so text columns could still be converted to factors on
# R versions whose default is TRUE.
list.to.df <- as.data.frame(list.table, stringsAsFactors = FALSE)
kable(list.to.df)
Title Authors Price Length Reviews Published
Declutter Your Mind: How to Stop Worrying, Relieve Anxiety, and Eliminate Negative Thinking S.J.Scott; Barrie Davenport 1.31; 11.44 157 347 2016-08-21
The Success Principles(TM) - 10th Anniversary Edition: How to Get from Where You Are to Where You Want to Be Jack Canfield; Janet Switzer 13.19; 13.59 542 1360 2015-01-27
Algorithms to Live By: The Computer Science of Human Decisions Brian Christian; Tom Griffiths 4.39; 5.98 369 236 2016-04-19
Competing Against Luck: The Story of Innovation and Customer Choice Clayton M. Christensen; Karen Dillon; Taddy Hall; David S. Duncan 14.01; 16.41 293 139 2016-10-04
Unlocking Potential: 7 Coaching Skills That Transform Individuals, Teams, and Organizations Michael K. Simpson; Dr. Marshall Goldsmith 2.19; 10.6 152 140 2014-08-12
# Confirm the rvest result carries the data.frame class.
inherits(list.to.df, "data.frame")
## [1] TRUE
# XML::readHTMLTable applied to the page text downloaded with RCurl::getURL.
# readHTMLTable() yields one entry per table found; keep the first.
read.html.tbl <- as.data.frame(
  readHTMLTable(getURL(html.url))[[1]]
)
kable(read.html.tbl)
Title Authors Price Length Reviews Published
Declutter Your Mind: How to Stop Worrying, Relieve Anxiety, and Eliminate Negative Thinking S.J.Scott; Barrie Davenport 1.31; 11.44 157 347 2016-08-21
The Success Principles(TM) - 10th Anniversary Edition: How to Get from Where You Are to Where You Want to Be Jack Canfield; Janet Switzer 13.19; 13.59 542 1360 2015-01-27
Algorithms to Live By: The Computer Science of Human Decisions Brian Christian; Tom Griffiths 4.39; 5.98 369 236 2016-04-19
Competing Against Luck: The Story of Innovation and Customer Choice Clayton M. Christensen; Karen Dillon; Taddy Hall; David S. Duncan 14.01; 16.41 293 139 2016-10-04
Unlocking Potential: 7 Coaching Skills That Transform Individuals, Teams, and Organizations Michael K. Simpson; Dr. Marshall Goldsmith 2.19; 10.6 152 140 2014-08-12
# Confirm the readHTMLTable result carries the data.frame class.
inherits(read.html.tbl, "data.frame")
## [1] TRUE
# RCurl with explicit curl options, then XML::readHTMLTable
# (no rlist function is involved here).
# Certificate verification is left enabled: turning ssl.verifypeer off would
# accept any certificate and expose the download to tampering in transit.
url.ssl <- getURL(html.url, .opts = list(ssl.verifypeer = TRUE))
# readHTMLTable() yields one entry per table in the page; keep the first.
html.table <- readHTMLTable(url.ssl)[[1]] %>% as.data.frame()
kable(html.table)
Title Authors Price Length Reviews Published
Declutter Your Mind: How to Stop Worrying, Relieve Anxiety, and Eliminate Negative Thinking S.J.Scott; Barrie Davenport 1.31; 11.44 157 347 2016-08-21
The Success Principles(TM) - 10th Anniversary Edition: How to Get from Where You Are to Where You Want to Be Jack Canfield; Janet Switzer 13.19; 13.59 542 1360 2015-01-27
Algorithms to Live By: The Computer Science of Human Decisions Brian Christian; Tom Griffiths 4.39; 5.98 369 236 2016-04-19
Competing Against Luck: The Story of Innovation and Customer Choice Clayton M. Christensen; Karen Dillon; Taddy Hall; David S. Duncan 14.01; 16.41 293 139 2016-10-04
Unlocking Potential: 7 Coaching Skills That Transform Individuals, Teams, and Organizations Michael K. Simpson; Dr. Marshall Goldsmith 2.19; 10.6 152 140 2014-08-12
# Confirm the parsed HTML table carries the data.frame class.
inherits(html.table, "data.frame")
## [1] TRUE

Read .xml file

xml.url <- "https://raw.githubusercontent.com/silverrainb/dat607/master/week7/booklist.xml"

# XML + plyr: download the document, parse it, and stack one data frame
# per top-level list element.
xml.data <- getURL(xml.url)
xml.doc <- xmlParse(xml.data)
xml.df <- xmlToList(xml.doc) %>%
  ldply(.fun = data.frame)
# Drop the first column of the stacked result
# (presumably the `.id` label column that ldply adds -- confirm).
xml.df <- xml.df[-1]
kable(xml.df)
Title Authors Price Length Reviews Published
Declutter Your Mind: How to Stop Worrying, Relieve Anxiety, and Eliminate Negative Thinking S.J.Scott; Barrie Davenport 1.31; 11.44 157 347 2016-08-21
The Success Principles(TM) - 10th Anniversary Edition: How to Get from Where You Are to Where You Want to Be Jack Canfield; Janet Switzer 13.19; 13.59 542 1360 2015-01-27
Algorithms to Live By: The Computer Science of Human Decisions Brian Christian; Tom Griffiths 4.39; 5.98 369 236 2016-04-19
Competing Against Luck: The Story of Innovation and Customer Choice Clayton M. Christensen; Karen Dillon; Taddy Hall; David S. Duncan 14.01; 16.41 293 139 2016-10-04
Unlocking Potential: 7 Coaching Skills That Transform Individuals, Teams, and Organizations Michael K. Simpson; Dr. Marshall Goldsmith 2.19; 10.6 152 140 2014-08-12
# Confirm the XML result carries the data.frame class.
inherits(xml.df, "data.frame")
## [1] TRUE

Read .json file

json.url <- "https://raw.githubusercontent.com/silverrainb/dat607/master/week7/booklist.json"

# Both rjson and jsonlite are attached and both export fromJSON(), so the
# unqualified name resolves to whichever package was attached last.
# Qualify the call so the jsonlite reader is used regardless of attach order.
# jsonlite::fromJSON(json.url) # list
json.df <- jsonlite::fromJSON(json.url)[[1]] %>% as.data.frame() # data frame
kable(json.df)
book.Title book.Authors book.Price book.Length book.Reviews book.Published
Declutter Your Mind: How to Stop Worrying, Relieve Anxiety, and Eliminate Negative Thinking S.J.Scott; Barrie Davenport 1.31; 11.44 157 347 2016-08-21
The Success Principles(TM) - 10th Anniversary Edition: How to Get from Where You Are to Where You Want to Be Jack Canfield; Janet Switzer 13.19; 13.59 542 1360 2015-01-27
Algorithms to Live By: The Computer Science of Human Decisions Brian Christian; Tom Griffiths 4.39; 5.98 369 236 2016-04-19
Competing Against Luck: The Story of Innovation and Customer Choice Clayton M. Christensen; Karen Dillon; Taddy Hall; David S. Duncan 14.01; 16.41 293 139 2016-10-04
Unlocking Potential: 7 Coaching Skills That Transform Individuals, Teams, and Organizations Michael K. Simpson; Dr. Marshall Goldsmith 2.19; 10.6 152 140 2014-08-12
# Confirm the JSON result carries the data.frame class.
inherits(json.df, "data.frame")
## [1] TRUE