I used the following libraries and loaded data from the following github location:
https://github.com/dapolloxp/data607/tree/master/assignment6
Loading the various files into a dataframe does not generate equivalent dataframes for XML, HTML, and JSON.
JSON is the easiest format to work with, as jsonlite already provides a function (fromJSON) that imports the object directly into a dataframe.
library(flexdashboard)
library(jsonlite)
library(xml2)
library(XML)
library(rvest)
library(stringr)
For XML, I used xml2 to parse the nodes. I did some clean-up to make the result look similar to the JSON dataframe.
# Parse Books.xml into `xml.df`: one row per <books> node, one column per
# child element of the first book (isbn, title, author, published, publisher,
# pages, price).
x <- read_xml("https://raw.githubusercontent.com/dapolloxp/data607/master/assignment6/Books.xml")

# Each book is stored in its own <books> node.
book.nodes <- xml_find_all(x, ".//books")
if (length(book.nodes) == 0) {
  stop("No <books> nodes found in Books.xml", call. = FALSE)
}

# The children of the first book define the column names.
header.names <- book.nodes[1] %>% xml_children() %>% xml_name()

# Build every column in one vectorised pass over the node set instead of
# growing the frame with rbind() inside a loop. xml_find_first() returns one
# result per book (a missing node, hence NA text, when a field is absent), so
# all columns have the same length and no column count is hard-coded.
xml.df <- header.names %>%
  lapply(function(field) {
    book.nodes %>% xml_find_first(paste0(".//", field)) %>% xml_text()
  }) %>%
  setNames(header.names) %>%
  data.frame(stringsAsFactors = FALSE, check.names = FALSE)

# Turn `author` into a list column so multi-author books hold one name per
# element. Splitting every row (rather than only row 2) keeps this correct if
# the book order changes; ",\\s*" also drops the space after each comma.
# NOTE(review): assumes a comma only ever separates authors - confirm if names
# such as "Last, First" can occur.
xml.df$author <- str_split(xml.df$author, ",\\s*")

xml.df$price <- as.numeric(xml.df$price)
xml.df$pages <- as.integer(xml.df$pages)

# Keep `published` as a validated four-digit year string. Anchoring the parse
# to 1 January avoids as.Date(x, format = "%Y") borrowing today's month and
# day, which yields NA for non-leap years when the script runs on 29 February.
xml.df$published <- xml.df$published %>%
  paste0("-01-01") %>%
  as.Date(format = "%Y-%m-%d") %>%
  format("%Y")
xml.df isbn title
1 9780345472328 Mindset: The New Psychology of Success
2 9781942788290 The Phoenix Project
3 9781449365035 Outliers: The Story of Success
author published publisher
1 Carol S. Dweck 2007 Ballantine Books
2 Gene Kim, Kevin Behr, George Spafford 2013 IT Revolution Press
3 Malcolm Gladwell 2011 Back Bay Books
pages price
1 320 10.29
2 432 16.32
3 336 9.89
# Read Books.json straight from GitHub; fromJSON() simplifies the parsed
# document, and the book table is then available as json$books.
json <- jsonlite::fromJSON(
  txt = "https://raw.githubusercontent.com/dapolloxp/data607/master/assignment6/Books.json"
)
json$books isbn title
1 9780345472328 Mindset: The New Psychology of Success
2 9781942788290 Outliers: The Story of Success
3 9781449365035 Outliers: The Story of Success
author published publisher
1 Carol S. Dweck 2007 Ballantine Books
2 Gene Kim, Kevin Behr, George Spafford 2011 IT Revolution Press
3 Malcolm Gladwell 2011 Back Bay Books
pages price
1 320 10.29
2 432 16.32
3 336 9.89
For HTML, I used xml2 to load the HTML file and parse its table rows.
# Parse the HTML book table into `html.df`: the first <tr> supplies the column
# names and every following <tr> becomes one row.
# NOTE(review): this reads a machine-specific absolute path, unlike the XML and
# JSON sections which read from GitHub - confirm the intended location.
html <- xml2::read_html('/Users/davidapolinar/Dropbox/CUNYProjects/Srping2019/Data607/project2/project3/books.html')

# One string per table row; the cells inside a row are separated by newlines.
html.text <- html %>% xml_find_all("//tr") %>% html_text()
html.text.ws <- html.text %>% trimws()
if (length(html.text.ws) == 0) {
  stop("No <tr> rows found in books.html", call. = FALSE)
}

# Header row -> column names.
# NOTE(review): `names` shadows base::names(); the variable name is kept in
# case code outside this section reads it.
names <- html.text.ws[1] %>% str_split("\n") %>% unlist() %>% trimws()

# Split every data row into trimmed cells in one pass. Dropping the header
# with [-1] is safe when the table has no data rows, whereas 2:length(x)
# would count backwards over c(2, 1).
html.df <- html.text.ws[-1] %>%
  str_split("\n") %>%
  lapply(trimws) %>%
  (function(cells) {
    # Every row must supply exactly one cell per header column.
    stopifnot(lengths(cells) == length(names))
    matrix(as.character(unlist(cells)), ncol = length(names), byrow = TRUE)
  }) %>%
  data.frame(stringsAsFactors = FALSE)
colnames(html.df) <- names

html.df$pages <- as.integer(html.df$pages)
html.df$price <- as.numeric(html.df$price)

# Keep `published` as a validated four-digit year string. Anchoring the parse
# to 1 January avoids as.Date(x, format = "%Y") borrowing today's month and
# day, which yields NA for non-leap years when the script runs on 29 February.
html.df$published <- html.df$published %>%
  paste0("-01-01") %>%
  as.Date(format = "%Y-%m-%d") %>%
  format("%Y")

# Turn `author` into a list column with one name per element. Splitting every
# row (rather than only row 2) keeps this correct if the book order changes.
# NOTE(review): assumes a comma only ever separates authors - confirm if names
# such as "Last, First" can occur.
html.df$author <- str_split(html.df$author, ",\\s*")
html.df isbn title
1 9780345472328 Mindset: The New Psychology of Success
2 9781942788290 The Phoenix Project
3 9781449365035 Outliers: The Story of Success
author published
1 Carol S. Dweck 2007
2 Gene Kim, Kevin Behr, George Spafford 2013
3 Malcolm Gladwell 2011
publisher pages price
1 Ballantine Books 320 10.29
2 IT Revolution Press 432 16.32
3 Back Bay Books 336 9.89