RPubs link:

http://rpubs.com/ssufian/537135

Load all the necessary libraries

# Load necessary libraries

library(dplyr)
library(XML)
library(RCurl)
library(xml2)
library(knitr)
library(kableExtra)

HTML file

# Location of the raw HTML file that holds the book table
url <- "https://raw.githubusercontent.com/ssufian/Data_607/master/Data_607_Books.html"

# Download the page source as text
urldata <- getURL(url)

# Parse the HTML tables found in the page, keeping text columns as character
bookdata_html <- readHTMLTable(urldata, stringsAsFactors = FALSE)

# The parsed result is a collection of tables rather than a single data frame
# (inspect with class(bookdata_html)); stack its elements row-wise and convert
# the combined result to one data frame.
stacked_tables <- Reduce(rbind, bookdata_html)
df1 <- data.frame(stacked_tables)

kable(df1)
Title Author Publisher Publisher_Date
Hands-On Programming with R: Garrett Grolemund O’Reilly Media, Inc. - Jun 13, 2014
Data Science for Business Foster Provost, Tom Fawcett O’Reilly Media, Inc.  August 2013
R for Data Science Garrett Grolemund, Hadley Wickham O’Reilly Media, Inc.  Dec 12, 2016
R Markdown: The Definitive Guide Yihui Xie, J.J. Allaire, Garrett Grolemund CRC Press Jul 27, 2018

XML File (a): using xmlSApply

# Location of the raw XML file that holds the book list
url <- "https://raw.githubusercontent.com/ssufian/Data_607/master/books1.xml"

# Download the XML source as text and parse it into an internal-node document.
# The argument name is spelled out in full instead of relying on partial
# matching of `useInternal`.
r <- getURL(url)
doc <- xmlTreeParse(r, useInternalNodes = TRUE)
root <- xmlRoot(doc)

# For every child of the root, collect the text value of each of its fields.
# xmlValue() returns the concatenated text of a node, so a field with several
# nested entries (e.g. multiple authors) comes back as one run-together string.
commons <- xmlSApply(root, function(x) xmlSApply(x, xmlValue))

# `commons` has one column per book; transpose so that each book becomes a row.
# NOTE: row.names = NULL does not strip the row labels here -- the table still
# shows book, book.1, ...; use rownames(books1_df) <- NULL if they are unwanted.
books1_df <- as.data.frame(t(commons), row.names = NULL)

kable(books1_df)
book_title authors category year publisher
book Hands-On Programing with R Garret Grolemund Data Science 2014 O’Reilly Media, Inc. 
book.1 Data Science for Business Foster ProvostTom Fawcett Data Science 2013 O’Reilly Media, Inc. 
book.2 R for Data Science Garrett GrolemundHadley Wickham Data Science 2016 O’Reilly Media, Inc. 
book.3 R Markdown: The Definitive Guide Yihui XieJ.J. AllaireGarret Grolemund Data Science 2018 CRC Press

XML File (b): A dplyr alternative (pipe with xmlToDataFrame)


# Fetch the raw XML text for the book list
xml_hw <- getURLContent("https://raw.githubusercontent.com/ssufian/Data_607/master/books1.xml")

# Parse the text into an XML document, then flatten it straight into a
# data frame (one row per book record, one column per field)
xml_df <- xmlToDataFrame(xmlParse(xml_hw))

kable(xml_df)
book_title authors category year publisher
Hands-On Programing with R Garret Grolemund Data Science 2014 O’Reilly Media, Inc. 
Data Science for Business Foster ProvostTom Fawcett Data Science 2013 O’Reilly Media, Inc. 
R for Data Science Garrett GrolemundHadley Wickham Data Science 2016 O’Reilly Media, Inc. 
R Markdown: The Definitive Guide Yihui XieJ.J. AllaireGarret Grolemund Data Science 2018 CRC Press

JSON file

library(rjson)

# Fetch the raw JSON text for the book list
json_hw <- getURLContent("https://raw.githubusercontent.com/ssufian/Data_607/master/books2.json")

# Parse the JSON text into a nested R list
json_df <- fromJSON(json_hw)

# Each entry under "favorite books" is one record; turn every record into a
# one-row data frame and stack them. `[[ ]]` is used for exact name matching
# (`$` allows partial matching on lists) and FALSE is spelled out because `F`
# is an ordinary variable that can be reassigned.
# data.frame() makes the field names syntactic, which is why the title column
# is shown as Book.Title in the table.
json_df <- do.call(
  rbind,
  lapply(json_df[["favorite books"]], data.frame, stringsAsFactors = FALSE)
)

kable(json_df)
Book.Title Authors category year publisher
Hands-On Programing with R Garret Grolemund Data Science 2014 O’Reilly Media, Inc. 
Data Science for Business Foster Provost Data Science 2013 O’Reilly Media, Inc. 
Data Science for Business Tom Fawcett Data Science 2013 O’Reilly Media, Inc. 
R for Data Science Garret Grolemund Data Science 2013 O’Reilly Media, Inc. 
R for Data Science Hadley Wickham Data Science 2013 O’Reilly Media, Inc. 
R Markdown: The Definitive Guide Yihui Xie Data Science 2018 CRC Press
R Markdown: The Definitive Guide J.J. Allaire Data Science 2018 CRC Press
R Markdown: The Definitive Guide Garret Grolemund Data Science 2018 CRC Press

Summary

  • The HTML and XML files are alike in that each stores a book's multiple authors on a single line (one row per book). The JSON file, however, stores each author on a separate line (one row per book-author pair).