Assignment6

Row

Overview

I used the following libraries and loaded data from the following github location:

https://github.com/dapolloxp/data607/tree/master/assignment6

Loading the various files into a dataframe does not generate equivalent dataframes for XML, HTML, and JSON.

JSON is the easiest format to work with, as R (via the jsonlite package) already has a function that imports the object directly into a dataframe.

library(flexdashboard)
library(jsonlite)
library(xml2)
library(XML)
library(rvest)
library(stringr)

XML

For XML, I used the xml2 package to parse the nodes. I did some clean-up to make the result look similar to the JSON dataframe.

# Parse the remote Books.xml document and flatten every <books> element into
# one row of `xml.df`, then coerce the columns to sensible types.
x <- read_xml("https://raw.githubusercontent.com/dapolloxp/data607/master/assignment6/Books.xml")

# Each <books> element is one book record.
book.nodes <- xml_find_all(x, ".//books")
if (length(book.nodes) == 0) {
  stop("No <books> elements found in Books.xml", call. = FALSE)
}

# Column names are taken from the child elements of the first record.
header.names <- book.nodes[1] %>% xml_children() %>% xml_name()

# Build one single-row data frame per book and bind them once at the end,
# instead of growing the data frame with rbind() on every iteration.
book.rows <- lapply(seq_along(book.nodes), function(i) {
  fields <- lapply(header.names, function(field) {
    value <- book.nodes[i] %>%
      xml_find_all(paste0(".//", field)) %>%
      xml_text()
    # A record that lacks this element gets NA rather than aborting the load.
    if (length(value) == 0) NA_character_ else value
  })
  names(fields) <- header.names
  data.frame(fields, stringsAsFactors = FALSE, check.names = FALSE)
})
xml.df <- do.call(rbind, book.rows)

# Turn `author` into a list column: comma-separated author strings become a
# character vector per book; single-author entries stay length one.
xml.df$author <- str_split(xml.df$author, ",")
xml.df$price <- as.numeric(xml.df$price)
xml.df$pages <- as.integer(xml.df$pages)
# Round-trip through Date so that only a valid four-digit year is kept.
xml.df$published <- as.Date(xml.df$published, format = "%Y") %>% format("%Y")
xml.df

           isbn                                  title
1 9780345472328 Mindset: The New Psychology of Success
2 9781942788290                    The Phoenix Project
3 9781449365035         Outliers: The Story of Success
                                  author published           publisher
1                         Carol S. Dweck      2007    Ballantine Books
2 Gene Kim,  Kevin Behr, George Spafford      2013 IT Revolution Press
3                       Malcolm Gladwell      2011      Back Bay Books
  pages price
1   320 10.29
2   432 16.32
3   336  9.89

JSON

JSON is the easiest format to work with, as R (via the jsonlite package) already has a function that imports the object directly into a dataframe.

# Download Books.json and let fromJSON() simplify it; the book records are
# read back from the "books" element of the parsed object and printed.
json <- fromJSON(
  txt = "https://raw.githubusercontent.com/dapolloxp/data607/master/assignment6/Books.json"
)
json[["books"]]

           isbn                                  title
1 9780345472328 Mindset: The New Psychology of Success
2 9781942788290         Outliers: The Story of Success
3 9781449365035         Outliers: The Story of Success
                                 author published           publisher
1                        Carol S. Dweck      2007    Ballantine Books
2 Gene Kim, Kevin Behr, George Spafford      2011 IT Revolution Press
3                      Malcolm Gladwell      2011      Back Bay Books
  pages price
1   320 10.29
2   432 16.32
3   336  9.89

HTML

For HTML, I used the xml2 package to load and parse the HTML file.

# Read the books table from the HTML file and rebuild it as `html.df`.
# NOTE(review): this is an absolute path on the author's machine, so the chunk
# is not reproducible elsewhere; consider hosting the file with the XML/JSON.
html <- xml2::read_html('/Users/davidapolinar/Dropbox/CUNYProjects/Srping2019/Data607/project2/project3/books.html')

# One string per <tr>; the cells inside a row are separated by newlines.
html.text <- html %>% xml_find_all("//tr") %>% html_text()
html.text.ws <- html.text %>% trimws()
if (length(html.text.ws) == 0) {
  stop("No <tr> rows found in books.html", call. = FALSE)
}

# The first row holds the column headers.
# (`names` shadows base::names as a variable; the name is kept unchanged in
# case later chunks refer to it.)
names <- c(unlist(str_split(html.text.ws[1], "\n")))
names <- names %>% trimws()

# Start from an empty frame so a header-only table still yields named columns.
html.df <- data.frame(matrix(vector(), 0, length(names)), stringsAsFactors = FALSE)
colnames(html.df) <- names

# Every remaining row is a book: split it into cells, build a one-row data
# frame, and bind all rows in a single call rather than inside a loop.
html.rows <- lapply(html.text.ws[-1], function(row.text) {
  cells <- unlist(str_split(row.text, "\n"))
  row.df <- data.frame(matrix(cells, ncol = length(names)), stringsAsFactors = FALSE)
  colnames(row.df) <- names
  row.df
})
if (length(html.rows) > 0) {
  html.df <- do.call(rbind, html.rows)
}

html.df$pages <- as.integer(html.df$pages)
html.df$price <- as.numeric(html.df$price)
# Round-trip through Date so that only a valid four-digit year is kept.
html.df$published <- as.Date(html.df$published, format = "%Y") %>% format("%Y")
# Turn `author` into a list column: comma-separated author strings become a
# character vector per book; single-author entries stay length one.
html.df$author <- str_split(trimws(html.df$author), ",")
html.df

           isbn                                        title
1 9780345472328       Mindset: The New Psychology of Success
2 9781942788290                          The Phoenix Project
3 9781449365035               Outliers: The Story of Success
                                   author published
1                          Carol S. Dweck      2007
2 Gene Kim,  Kevin Behr,  George Spafford      2013
3                        Malcolm Gladwell      2011
                  publisher pages price
1          Ballantine Books   320 10.29
2       IT Revolution Press   432 16.32
3            Back Bay Books   336  9.89