We load the data below

# Open a read connection to books.csv under the base location `url`
# (`url` is defined outside this snippet).
books.url <- file(paste0(url, "books.csv"), open = "r")
# The file is comma-separated, so read.csv() (dec = ".") is the matching
# reader; read.csv2() with sep = "," would still treat "," as the decimal mark.
# A connection that is already open is not closed by the reader, so close it
# here whether or not parsing succeeds.
books.data <- tryCatch(
  read.csv(books.url, sep = ",", header = TRUE, stringsAsFactors = FALSE,
           encoding = "UTF-8"),
  finally = close(books.url)
)

This is what books.data, as loaded from the CSV file, looks like:

# Auto-print the loaded data frame to inspect its contents.
books.data
##                                                                                 Title
## 1                                                      Discovering Statistics Using R
## 2                                                                         R Cookbook 
## 3 R for Everyone: Advanced Analytics and Graphics (Addison-Wesley Data and Analytics)
##       Author_1     Author_2  Author_3 Published_Date     Weight      Type
## 1   Andy Field Jeremy Miles Zoe Field           2012   5 pounds paperback
## 2  Paul Teetor                                  2011 1.6 pounds paperback
## 3 Jared Lander                                  2013    1 pound paperback

The following code takes our books.data data frame and converts it to XML format.

library(XML)
#' Append one <book> element per data-frame row to an XML output DOM.
#'
#' Each row becomes a <book> node whose children are named after the columns
#' and hold that row's cell values.
#'
#' @param dataframe A data frame; its column names are used as child tag names.
#' @param dom Object providing `addTag()` and `closeTag()`. Defaults to the
#'   `xml` variable visible from the function's enclosing environment, which
#'   is the object the function has always written to.
#' @return `NULL`, invisibly; called for its side effect on `dom`.
create_xml_books <- function(dataframe, dom = xml) {
  # seq_len(nrow()) is empty for a zero-row data frame, whereas
  # seq(1:length(...)) iterated over c(1, 2) and emitted bogus <book> nodes.
  for (i in seq_len(nrow(dataframe))) {
    # Leave <book> open so the column tags nest inside it.
    dom$addTag("book", close = FALSE)
    for (j in colnames(dataframe)) {
      dom$addTag(j, dataframe[i, j])
    }
    dom$closeTag()
  }
  invisible(NULL)
}

# Create the output DOM; create_xml_books() writes into this `xml` object.
# (The printed result shows a <doc> root element wrapping <books>.)
xml <- xmlOutputDOM()
# Open the <books> container and leave it open so the <book> nodes nest in it.
xml$addTag("books", close=FALSE)
create_xml_books(books.data)
# Close the <books> container opened above.
xml$closeTag()

# Serialize the document to a string (printed later with cat()).
books.xml <- saveXML(xml$value()) 
# Serialize the same document again, this time writing it to a file on disk.
xml.output <- saveXML(xml$value(), file = "chunt_books.xml")

Now let's look at the structure of the XML file.

# Print the serialized XML text.
cat(books.xml)
## <?xml version="1.0"?>
## <doc>
##  <books>
##   <book>
##    <Title>Discovering Statistics Using R</Title>
##    <Author_1>Andy Field</Author_1>
##    <Author_2>Jeremy Miles</Author_2>
##    <Author_3>Zoe Field</Author_3>
##    <Published_Date>2012</Published_Date>
##    <Weight>5 pounds</Weight>
##    <Type>paperback</Type>
##   </book>
##   <book>
##    <Title>R Cookbook </Title>
##    <Author_1>Paul Teetor</Author_1>
##    <Author_2> </Author_2>
##    <Author_3> </Author_3>
##    <Published_Date>2011</Published_Date>
##    <Weight>1.6 pounds</Weight>
##    <Type>paperback</Type>
##   </book>
##   <book>
##    <Title>R for Everyone: Advanced Analytics and Graphics (Addison-Wesley Data and Analytics)</Title>
##    <Author_1>Jared Lander</Author_1>
##    <Author_2> </Author_2>
##    <Author_3> </Author_3>
##    <Published_Date>2013</Published_Date>
##    <Weight>1 pound</Weight>
##    <Type>paperback</Type>
##   </book>
##  </books>
## </doc>

Now that we have the data in XML, we can convert it back to a data frame.

# Parse the XML string into a nested R list.
books.xml.list <- xmlToList(books.xml)
# Bind the list into a data frame. As the rendered table shows, the result has
# one column per book and one row per field (transposed relative to books.data).
books.xml.df <- as.data.frame(rbindlist(books.xml.list, fill = TRUE))
# Label the rows with the field names; `$book` picks the first <book> entry.
row.names(books.xml.df) <- names(books.xml.list$books$book)
kable(books.xml.df)
book book book
Title Discovering Statistics Using R R Cookbook R for Everyone: Advanced Analytics and Graphics (Addison-Wesley Data and Analytics)
Author_1 Andy Field Paul Teetor Jared Lander
Author_2 Jeremy Miles
Author_3 Zoe Field
Published_Date 2012 2011 2013
Weight 5 pounds 1.6 pounds 1 pound
Type paperback paperback paperback

Let's convert the data frame to JSON format using the df2json package.

# Serialize the books data frame to JSON text with df2json.
books.json <- df2json::df2json(books.data)

Now we have a json format for the books data frame

# Print the JSON text.
cat(books.json)

[{"Title":"Discovering Statistics Using R","Author_1":"Andy Field","Author_2":"Jeremy Miles","Author_3":"Zoe Field","Published_Date":2012,"Weight":"5 pounds","Type":"paperback"}, {"Title":"R Cookbook","Author_1":"Paul Teetor","Author_2":" ","Author_3":" ","Published_Date":2011,"Weight":"1.6 pounds","Type":"paperback"}, {"Title":"R for Everyone: Advanced Analytics and Graphics (Addison-Wesley Data and Analytics)","Author_1":"Jared Lander","Author_2":" ","Author_3":" ","Published_Date":2013,"Weight":"1 pound","Type":"paperback"}]

# Save the JSON text to chunt_books.json so it can be read back from disk.
write(books.json, "chunt_books.json")

Let's now convert the JSON back to a data frame using the RJSONIO package.

# Read the JSON file written earlier back into an R list.
books.json.list <- RJSONIO::fromJSON("chunt_books.json")
# Stack the list entries into a data frame (one row per book in the rendered
# table); fill = TRUE pads any fields missing from an entry.
books.json.df <- as.data.frame(rbindlist(books.json.list, fill = TRUE))
# Reset the row names to the default sequence.
rownames(books.json.df) <- NULL
kable(books.json.df)
Title Author_1 Author_2 Author_3 Published_Date Weight Type
Discovering Statistics Using R Andy Field Jeremy Miles Zoe Field 2012 5 pounds paperback
R Cookbook Paul Teetor 2011 1.6 pounds paperback
R for Everyone: Advanced Analytics and Graphics (Addison-Wesley Data and Analytics) Jared Lander 2013 1 pound paperback
# Write books.data to an HTML file as a table, then read it back.
# NOTE(review): R2HTML::HTML() appears to append to an existing file by
# default, so re-running may grow chunt_books.html; confirm.
R2HTML::HTML(books.data, file = "chunt_books.html") #saves html file
# Parse the tables found in the HTML file; the result is indexed like a list.
books.html.df <-   XML::readHTMLTable("chunt_books.html")
# Keep only the first table and turn it into a data frame.
books.html.df <- as.data.frame(books.html.df[1])
# Drop the first column (presumably the row names written by HTML(); confirm).
books.html.df <- books.html.df[,-c(1)]
# Convert every column to character; the result is indexed as a matrix below.
books.html.df <- sapply(books.html.df, as.character)
# The first row holds the column headers: promote it to the column names,
# then remove it from the data.
colnames(books.html.df) <-  books.html.df[1,]
books.html.df <- books.html.df[-1,]
kable(books.html.df)
Title Author_1 Author_2 Author_3 Published_Date Weight Type
Discovering Statistics Using R Andy Field Jeremy Miles Zoe Field 2012 5 pounds paperback
R Cookbook Paul Teetor 2011 1.6 pounds paperback
R for Everyone: Advanced Analytics and Graphics (Addison-Wesley Data and Analytics) Jared Lander 2013 1 pound paperback

Overall, I noticed that many of the results were returned in a list format. It seems that when parsing these various data formats, the results come back as a list that must then be reshaped into a data frame. The packages are extremely useful in producing these results. I would say that XML required the most effort to code and understand, whereas JSON felt the easiest to understand. I had not previously worked with data where UTF-8 encoding was so critical to being able to parse the data; I now have a new understanding of this encoding.