IS607 Week 7 Assignment

Load Libraries

library(XML)
library(RJSONIO)
library(dplyr)
library(jsonlite)

HTML Parse

# Fetch the HTML version of the book list and parse it into a document tree.
# library() is used instead of require() so that a missing RCurl package
# stops the script here rather than failing later at the getURL() call.
library(RCurl)
# Keep text columns as character vectors rather than factors.
options(stringsAsFactors = FALSE)
phtml <- htmlParse(
  getURL("https://raw.githubusercontent.com/apag101/CUNYSPS/master/IS607/Week7/book.html")
)
# Show the parsed document.
phtml
## <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
## <html>
## <head><title>Books</title></head>
## <body>
## <table>
## <tr>
## <th>Title</th> <th>Authors</th> <th>Type</th> <th>Copyright</th>
## </tr>
## <tr>
## <td>The Black Swan</td> <td>Nassim Nicholas Taleb</td> <td>Non-Fiction</td> <td>2010</td>
## </tr>
## <tr>
## <td>Thinking Fast and Slow</td> <td>Daniel Kahneman</td> <td>Non-Fiction</td> <td>2011</td>
## </tr>
## <tr>
## <td>Sapiens A Brief History of Humankind</td> <td>Yuval Noah Harari</td> <td>Non-Fiction</td> <td>2015</td>
## </tr>
## <tr>
## <td>How to Create a Mind</td> <td>Ray Kurzweil, Terry Grossman</td> <td>Non-Fiction</td> <td>2012</td>
## </tr>
## </table>
## </body>
## </html>
## 
# Pull every <table> out of the parsed HTML; the result is a list with one
# data frame per table found in the page.
t.phtml <- readHTMLTable(phtml)
# Compact overview of the list structure and column types.
dplyr::glimpse(t.phtml)
## List of 1
##  $ NULL:'data.frame':    4 obs. of  4 variables:
##   ..$ Title    : chr [1:4] "The Black Swan" "Thinking Fast and Slow" "Sapiens A Brief History of Humankind" "How to Create a Mind"
##   ..$ Authors  : chr [1:4] "Nassim Nicholas Taleb" "Daniel Kahneman" "Yuval Noah Harari" "Ray Kurzweil, Terry Grossman"
##   ..$ Type     : chr [1:4] "Non-Fiction" "Non-Fiction" "Non-Fiction" "Non-Fiction"
##   ..$ Copyright: chr [1:4] "2010" "2011" "2015" "2012"
# Print the full result: a list whose single element is the books data frame.
t.phtml
## $`NULL`
##                                  Title                      Authors
## 1                       The Black Swan        Nassim Nicholas Taleb
## 2               Thinking Fast and Slow              Daniel Kahneman
## 3 Sapiens A Brief History of Humankind            Yuval Noah Harari
## 4                 How to Create a Mind Ray Kurzweil, Terry Grossman
##          Type Copyright
## 1 Non-Fiction      2010
## 2 Non-Fiction      2011
## 3 Non-Fiction      2015
## 4 Non-Fiction      2012

XML Parse

# Download the XML version of the book list and parse it into a document.
pxml <- xmlParse(
  getURL("https://raw.githubusercontent.com/apag101/CUNYSPS/master/IS607/Week7/book.xml")
)
# Show the parsed document.
pxml
## <?xml version="1.0" encoding="ISO-8859-1"?>
## <books>
##   <book id="1">
##     <Title>The Black Swan</Title>
##     <Authors>Nassim Nicholas Taleb</Authors>
##     <Type>Non-Fiction</Type>
##     <Copyright>2010</Copyright>
##   </book>
##   <book id="2">
##     <Title>Thinking Fast and Slow</Title>
##     <Authors>Daniel Kahneman</Authors>
##     <Type>Non-Fiction</Type>
##     <Copyright>2011</Copyright>
##   </book>
##   <book id="3">
##     <Title>Sapiens A Brief History of Humankind</Title>
##     <Authors>Yuval Noah Harari</Authors>
##     <Type>Non-Fiction</Type>
##     <Copyright>2015</Copyright>
##   </book>
##   <book id="1">
##     <Title>How to Create a Mind</Title>
##     <Authors>"Ray Kurzweil" "Terry Grossman"</Authors>
##     <Type>Non-Fiction</Type>
##     <Copyright>2012</Copyright>
##   </book>
## </books>
## 
# Take the top-level <books> node so its <book> children can be tabulated.
rpxml <- xmlRoot(pxml)
# Show the root node and everything beneath it.
rpxml
## <books>
##   <book id="1">
##     <Title>The Black Swan</Title>
##     <Authors>Nassim Nicholas Taleb</Authors>
##     <Type>Non-Fiction</Type>
##     <Copyright>2010</Copyright>
##   </book>
##   <book id="2">
##     <Title>Thinking Fast and Slow</Title>
##     <Authors>Daniel Kahneman</Authors>
##     <Type>Non-Fiction</Type>
##     <Copyright>2011</Copyright>
##   </book>
##   <book id="3">
##     <Title>Sapiens A Brief History of Humankind</Title>
##     <Authors>Yuval Noah Harari</Authors>
##     <Type>Non-Fiction</Type>
##     <Copyright>2015</Copyright>
##   </book>
##   <book id="1">
##     <Title>How to Create a Mind</Title>
##     <Authors>"Ray Kurzweil" "Terry Grossman"</Authors>
##     <Type>Non-Fiction</Type>
##     <Copyright>2012</Copyright>
##   </book>
## </books>
# Flatten each <book> node into one row, keeping text as character columns.
df.pxml <- xmlToDataFrame(rpxml, stringsAsFactors = FALSE)
# Compact overview of column types and leading values.
dplyr::glimpse(df.pxml)
## Observations: 4
## Variables: 4
## $ Title     <chr> "The Black Swan", "Thinking Fast and Slow", "Sapiens...
## $ Authors   <chr> "Nassim Nicholas Taleb", "Daniel Kahneman", "Yuval N...
## $ Type      <chr> "Non-Fiction", "Non-Fiction", "Non-Fiction", "Non-Fi...
## $ Copyright <chr> "2010", "2011", "2015", "2012"
# Print the complete data frame built from the XML document.
df.pxml
##                                  Title                         Authors
## 1                       The Black Swan           Nassim Nicholas Taleb
## 2               Thinking Fast and Slow                 Daniel Kahneman
## 3 Sapiens A Brief History of Humankind               Yuval Noah Harari
## 4                 How to Create a Mind "Ray Kurzweil" "Terry Grossman"
##          Type Copyright
## 1 Non-Fiction      2010
## 2 Non-Fiction      2011
## 3 Non-Fiction      2015
## 4 Non-Fiction      2012
#xmlName(rpxml)
#xmlSApply(rpxml, xmlValue)

JSON Parse

# Fetch the JSON version of the book list and convert it to R objects.
# Keep text columns as character vectors rather than factors.
options(stringsAsFactors = FALSE)
# Both RJSONIO and jsonlite export fromJSON(); name the jsonlite version
# explicitly so the result does not depend on the order the packages were
# attached. Despite its name, df.pjson is a list whose `books` element is
# the data frame.
df.pjson <- jsonlite::fromJSON(
  getURL("https://raw.githubusercontent.com/apag101/CUNYSPS/master/IS607/Week7/book.json")
)
# Compact overview of the list structure and column types.
glimpse(df.pjson)
## List of 1
##  $ books:'data.frame':   4 obs. of  4 variables:
##   ..$ Title    : chr [1:4] "The Black Swan" "Thinking Fast and Slow" "Sapiens A Brief History of Humankind" "How to Create a Mind"
##   ..$ Authors  :List of 4
##   .. ..$ : chr "Nassim Nicholas Taleb"
##   .. ..$ : chr "Daniel Kahneman"
##   .. ..$ : chr "Yuval Noah Harari"
##   .. ..$ : chr [1:2] "Ray Kurzweil" "Terry Grossman"
##   ..$ Type     : chr [1:4] "Non-Fiction" "Non-Fiction" "Non-Fiction" "Non-Fiction"
##   ..$ Copyright: int [1:4] 2010 2011 2015 2012
# Print the full result: a list whose `books` element is the data frame.
df.pjson
## $books
##                                  Title                      Authors
## 1                       The Black Swan        Nassim Nicholas Taleb
## 2               Thinking Fast and Slow              Daniel Kahneman
## 3 Sapiens A Brief History of Humankind            Yuval Noah Harari
## 4                 How to Create a Mind Ray Kurzweil, Terry Grossman
##          Type Copyright
## 1 Non-Fiction      2010
## 2 Non-Fiction      2011
## 3 Non-Fiction      2015
## 4 Non-Fiction      2012

Are the 3 Files similar?

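All three are similar, except that the XML result shows quotes around the names for the book with two authors. Those quotes are literal text in the XML file's Authors element (an element, not an attribute), so the parser is preserving them rather than adding them. The structures also differ slightly: the HTML and JSON results are lists containing one data frame, while the XML result is a data frame itself; and the JSON result stores Authors as a list column and Copyright as integers, whereas the HTML and XML results store every column as character.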