IS607 Week 7 Assignment
Load Libraries
library(XML)
library(RJSONIO)
library(dplyr)
library(jsonlite)
HTML Parse
# Attach RCurl for getURL(). library() stops with an error when the package
# is missing, whereas require() would only return FALSE and let the script
# fail later at the first getURL() call.
library(RCurl)
# Set the session-wide default so strings are not converted to factors.
options(stringsAsFactors = FALSE)
# Download the HTML source as a single string, then parse that string.
# asText = TRUE states explicitly that the argument is document content,
# not a file path.
book_html_url <- "https://raw.githubusercontent.com/apag101/CUNYSPS/master/IS607/Week7/book.html"
phtml <- htmlParse(getURL(book_html_url), asText = TRUE)
# Show the parsed document.
phtml
## <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
## <html>
## <head><title>Books</title></head>
## <body>
## <table>
## <tr>
## <th>Title</th> <th>Authors</th> <th>Type</th> <th>Copyright</th>
## </tr>
## <tr>
## <td>The Black Swan</td> <td>Nassim Nicholas Taleb</td> <td>Non-Fiction</td> <td>2010</td>
## </tr>
## <tr>
## <td>Thinking Fast and Slow</td> <td>Daniel Kahneman</td> <td>Non-Fiction</td> <td>2011</td>
## </tr>
## <tr>
## <td>Sapiens A Brief History of Humankind</td> <td>Yuval Noah Harari</td> <td>Non-Fiction</td> <td>2015</td>
## </tr>
## <tr>
## <td>How to Create a Mind</td> <td>Ray Kurzweil, Terry Grossman</td> <td>Non-Fiction</td> <td>2012</td>
## </tr>
## </table>
## </body>
## </html>
##
# Extract every <table> in the parsed HTML document; the result is a list
# of data frames (one per table).
t.phtml <- readHTMLTable(doc = phtml)
# Compact overview of the structure and column types.
glimpse(t.phtml)
## List of 1
## $ NULL:'data.frame': 4 obs. of 4 variables:
## ..$ Title : chr [1:4] "The Black Swan" "Thinking Fast and Slow" "Sapiens A Brief History of Humankind" "How to Create a Mind"
## ..$ Authors : chr [1:4] "Nassim Nicholas Taleb" "Daniel Kahneman" "Yuval Noah Harari" "Ray Kurzweil, Terry Grossman"
## ..$ Type : chr [1:4] "Non-Fiction" "Non-Fiction" "Non-Fiction" "Non-Fiction"
## ..$ Copyright: chr [1:4] "2010" "2011" "2015" "2012"
# Display the full contents of the table list parsed from the HTML file.
print(t.phtml)
## $`NULL`
## Title Authors
## 1 The Black Swan Nassim Nicholas Taleb
## 2 Thinking Fast and Slow Daniel Kahneman
## 3 Sapiens A Brief History of Humankind Yuval Noah Harari
## 4 How to Create a Mind Ray Kurzweil, Terry Grossman
## Type Copyright
## 1 Non-Fiction 2010
## 2 Non-Fiction 2011
## 3 Non-Fiction 2015
## 4 Non-Fiction 2012
XML Parse
# Download the XML source as a single string, then parse that string.
# asText = TRUE states explicitly that the argument is document content,
# not a file path.
book_xml_url <- "https://raw.githubusercontent.com/apag101/CUNYSPS/master/IS607/Week7/book.xml"
pxml <- xmlParse(getURL(book_xml_url), asText = TRUE)
# Show the parsed document.
pxml
## <?xml version="1.0" encoding="ISO-8859-1"?>
## <books>
## <book id="1">
## <Title>The Black Swan</Title>
## <Authors>Nassim Nicholas Taleb</Authors>
## <Type>Non-Fiction</Type>
## <Copyright>2010</Copyright>
## </book>
## <book id="2">
## <Title>Thinking Fast and Slow</Title>
## <Authors>Daniel Kahneman</Authors>
## <Type>Non-Fiction</Type>
## <Copyright>2011</Copyright>
## </book>
## <book id="3">
## <Title>Sapiens A Brief History of Humankind</Title>
## <Authors>Yuval Noah Harari</Authors>
## <Type>Non-Fiction</Type>
## <Copyright>2015</Copyright>
## </book>
## <book id="1">
## <Title>How to Create a Mind</Title>
## <Authors>"Ray Kurzweil" "Terry Grossman"</Authors>
## <Type>Non-Fiction</Type>
## <Copyright>2012</Copyright>
## </book>
## </books>
##
# Take the top-level node of the parsed XML document.
rpxml <- xmlRoot(pxml)
# Show the root node and its children.
print(rpxml)
## <books>
## <book id="1">
## <Title>The Black Swan</Title>
## <Authors>Nassim Nicholas Taleb</Authors>
## <Type>Non-Fiction</Type>
## <Copyright>2010</Copyright>
## </book>
## <book id="2">
## <Title>Thinking Fast and Slow</Title>
## <Authors>Daniel Kahneman</Authors>
## <Type>Non-Fiction</Type>
## <Copyright>2011</Copyright>
## </book>
## <book id="3">
## <Title>Sapiens A Brief History of Humankind</Title>
## <Authors>Yuval Noah Harari</Authors>
## <Type>Non-Fiction</Type>
## <Copyright>2015</Copyright>
## </book>
## <book id="1">
## <Title>How to Create a Mind</Title>
## <Authors>"Ray Kurzweil" "Terry Grossman"</Authors>
## <Type>Non-Fiction</Type>
## <Copyright>2012</Copyright>
## </book>
## </books>
# Flatten the child nodes of the root into a data frame, keeping text
# values as character rather than factor.
df.pxml <- xmlToDataFrame(doc = rpxml, stringsAsFactors = FALSE)
# Compact overview of the structure and column types.
glimpse(df.pxml)
## Observations: 4
## Variables: 4
## $ Title <chr> "The Black Swan", "Thinking Fast and Slow", "Sapiens...
## $ Authors <chr> "Nassim Nicholas Taleb", "Daniel Kahneman", "Yuval N...
## $ Type <chr> "Non-Fiction", "Non-Fiction", "Non-Fiction", "Non-Fi...
## $ Copyright <chr> "2010", "2011", "2015", "2012"
# Display the full data frame built from the XML file.
print(df.pxml)
## Title Authors
## 1 The Black Swan Nassim Nicholas Taleb
## 2 Thinking Fast and Slow Daniel Kahneman
## 3 Sapiens A Brief History of Humankind Yuval Noah Harari
## 4 How to Create a Mind "Ray Kurzweil" "Terry Grossman"
## Type Copyright
## 1 Non-Fiction 2010
## 2 Non-Fiction 2011
## 3 Non-Fiction 2015
## 4 Non-Fiction 2012
#xmlName(rpxml)
#xmlSApply(rpxml, xmlValue)
JSON Parse
# Set the session-wide default so strings are not converted to factors.
options(stringsAsFactors = FALSE)
# Download the JSON source and parse it.
# Both RJSONIO and jsonlite are attached and both export fromJSON(), so the
# call is namespace-qualified to avoid depending on attach order.
# No empty data.frame() is pre-created: it would be overwritten immediately.
book_json_url <- "https://raw.githubusercontent.com/apag101/CUNYSPS/master/IS607/Week7/book.json"
df.pjson <- jsonlite::fromJSON(getURL(book_json_url))
# Compact overview of the structure and column types.
glimpse(df.pjson)
## List of 1
## $ books:'data.frame': 4 obs. of 4 variables:
## ..$ Title : chr [1:4] "The Black Swan" "Thinking Fast and Slow" "Sapiens A Brief History of Humankind" "How to Create a Mind"
## ..$ Authors :List of 4
## .. ..$ : chr "Nassim Nicholas Taleb"
## .. ..$ : chr "Daniel Kahneman"
## .. ..$ : chr "Yuval Noah Harari"
## .. ..$ : chr [1:2] "Ray Kurzweil" "Terry Grossman"
## ..$ Type : chr [1:4] "Non-Fiction" "Non-Fiction" "Non-Fiction" "Non-Fiction"
## ..$ Copyright: int [1:4] 2010 2011 2015 2012
# Display the full parsed JSON result.
print(df.pjson)
## $books
## Title Authors
## 1 The Black Swan Nassim Nicholas Taleb
## 2 Thinking Fast and Slow Daniel Kahneman
## 3 Sapiens A Brief History of Humankind Yuval Noah Harari
## 4 How to Create a Mind Ray Kurzweil, Terry Grossman
## Type Copyright
## 1 Non-Fiction 2010
## 2 Non-Fiction 2011
## 3 Non-Fiction 2015
## 4 Non-Fiction 2012
Are the 3 Files similar?
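All three files yield the same four books with the same four fields, with a few differences. In the XML file, the two authors of the fourth book are stored as literal quoted strings inside a single Authors element, so the quotes come from the file's text rather than being added by the parser. The JSON result keeps Authors as a list (the fourth entry holds two separate names) and Copyright as integers, whereas the HTML and XML results read every column, including Copyright, as character.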