Overview

Assignment 3 consists of developing some toy data on book publishers in JSON, XML and HTML. These data sets are then converted to R data frames to show a simple case of how these sources of data might be handled. Below we proceed in the following order:
1.JSON
2.XML
3.HTML

We first created the above datasets in the JSON, XML and HTML formats “by hand”. The respective files were then imported into R using appropriate packages.

# Directory that holds all three hand-made data sets (JSON, XML and HTML).
# NOTE(review): this is a machine-specific absolute path; point it at your
# own copy of the data files before running the rest of the script.
datadir <- file.path("C://Users//talha//Documents//Training//CUNY Classes//IS607//Week7//")

JSON data

#load packages in JSONLite
library(jsonlite)
# Build the full path to the JSON file inside the data directory.
filename <- file.path(datadir, "bookdata.js")
# Read the file as a character vector, one element per line of text.
bdata <- readLines(filename)
## Warning in readLines(filename): incomplete final line found on 'C://Users//
## talha//Documents//Training//CUNY Classes//IS607//Week7//bookdata.js'
bdata
## [1] "{"                                                                                                         
## [2] "\"ID\":[\"1\",\"2\",\"3\",\"4\"],"                                                                         
## [3] "\"Author\":[\"Greg Mortenson\",\"David Oliver Relin\",\"Jonathan Frenzen\",\"Michael Chabon\"],"           
## [4] "\"Title\":[\"Three Cups of Tea\",\"Three Cups of Tea\",\"The Corrections\",\"Yiddish Policeman's Union\"],"
## [5] "\"Type\":[\"Non-Fiction\",\"Non-Fiction\",\"Fiction\",\"Fiction\"],"                                       
## [6] "\"Publisher\":[\"Penguin\",\"Penguin\",\"Picador\",\"Harper Perenial\"],"                                  
## [7] "\"Language\":[\"English\",\"English\",\"English\",\"English\"]"                                            
## [8] "}"
# Join the lines read from the file into ONE JSON string. `collapse` is
# required here: without it paste() returns one string per input line rather
# than a single document.
bdata_str <- paste(bdata, collapse = "\n")
# Parse the JSON text into an R object.
jsondata <- fromJSON(bdata_str)
#class and attributes of jsondata
class(jsondata)
## [1] "list"
length(jsondata)
## [1] 6
names(jsondata)
## [1] "ID"        "Author"    "Title"     "Type"      "Publisher" "Language"
lapply(jsondata,class)
## $ID
## [1] "character"
## 
## $Author
## [1] "character"
## 
## $Title
## [1] "character"
## 
## $Type
## [1] "character"
## 
## $Publisher
## [1] "character"
## 
## $Language
## [1] "character"
lapply(jsondata,length)
## $ID
## [1] 4
## 
## $Author
## [1] 4
## 
## $Title
## [1] 4
## 
## $Type
## [1] 4
## 
## $Publisher
## [1] 4
## 
## $Language
## [1] 4
lapply(jsondata,dim)
## $ID
## NULL
## 
## $Author
## NULL
## 
## $Title
## NULL
## 
## $Type
## NULL
## 
## $Publisher
## NULL
## 
## $Language
## NULL
#view different data elements
head(jsondata[[1]],n=4)
## [1] "1" "2" "3" "4"
head(jsondata[[2]],n=4)
## [1] "Greg Mortenson"     "David Oliver Relin" "Jonathan Frenzen"  
## [4] "Michael Chabon"
# Create a data frame from the parsed JSON: each named element of the list
# becomes one column. stringsAsFactors = FALSE keeps the text columns as
# character vectors, the same choice made when the HTML table is read.
jsondf <- as.data.frame(jsondata, stringsAsFactors = FALSE)
# Print the data frame.
jsondf
##   ID             Author                     Title        Type
## 1  1     Greg Mortenson         Three Cups of Tea Non-Fiction
## 2  2 David Oliver Relin         Three Cups of Tea Non-Fiction
## 3  3   Jonathan Frenzen           The Corrections     Fiction
## 4  4     Michael Chabon Yiddish Policeman's Union     Fiction
##         Publisher Language
## 1         Penguin  English
## 2         Penguin  English
## 3         Picador  English
## 4 Harper Perenial  English
dim(jsondf)
## [1] 4 6

XML data

#load the XML library
library(XML)
## Warning: package 'XML' was built under R version 3.3.1
# Build the full path to the XML file inside the data directory.
filename <- file.path(datadir, "bookdata.xml")
# Parse the XML file into a document object.
xml_str <- xmlTreeParse(filename)
class(xml_str)
## [1] "XMLDocument"         "XMLAbstractDocument"
# Walk down the parsed document one level at a time.
# Top-level (root) node of the document.
root <- xmlRoot(xml_str)
# Direct children of the root node.
child <- xmlChildren(root)
# Children of the FIRST child only; used further down as a template for the
# field names.
schild <- xmlChildren(child[[1]])

# extract nodes using XPATH notation
getNodeSet(root,"//booksnum[@ID]")
## [[1]]
## <booksnum ID="1">
##  <author>Greg Mortenson</author>
##  <title>Three Cups of Tea</title>
##  <type>Nonfiction</type>
##  <publisher>Penguin</publisher>
##  <language>English</language>
## </booksnum>
## 
## [[2]]
## <booksnum ID="2">
##  <author>David Oliver Relin</author>
##  <title>Three Cups of Tea</title>
##  <type>Nonfiction</type>
##  <publisher>Penguin</publisher>
##  <language>English</language>
## </booksnum>
## 
## [[3]]
## <booksnum ID="3">
##  <author>Jonathan Frenzen</author>
##  <title>The Corrections</title>
##  <type>Fiction</type>
##  <publisher>Picador</publisher>
##  <language>English</language>
## </booksnum>
## 
## [[4]]
## <booksnum ID="4">
##  <author>Michael Chabon</author>
##  <title>The Yiddish Policemans Union</title>
##  <type>Fiction</type>
##  <publisher>Harper Perenial</publisher>
##  <language>English</language>
## </booksnum>
getNodeSet(root,"//author")
## [[1]]
## <author>Greg Mortenson</author>
## 
## [[2]]
## <author>David Oliver Relin</author>
## 
## [[3]]
## <author>Jonathan Frenzen</author>
## 
## [[4]]
## <author>Michael Chabon</author>
getNodeSet(root,"//title")
## [[1]]
## <title>Three Cups of Tea</title>
## 
## [[2]]
## <title>Three Cups of Tea</title>
## 
## [[3]]
## <title>The Corrections</title>
## 
## [[4]]
## <title>The Yiddish Policemans Union</title>
getNodeSet(root,"//type")
## [[1]]
## <type>Nonfiction</type>
## 
## [[2]]
## <type>Nonfiction</type>
## 
## [[3]]
## <type>Fiction</type>
## 
## [[4]]
## <type>Fiction</type>
getNodeSet(root,"//publisher")
## [[1]]
## <publisher>Penguin</publisher>
## 
## [[2]]
## <publisher>Penguin</publisher>
## 
## [[3]]
## <publisher>Picador</publisher>
## 
## [[4]]
## <publisher>Harper Perenial</publisher>
getNodeSet(root,"//language")
## [[1]]
## <language>English</language>
## 
## [[2]]
## <language>English</language>
## 
## [[3]]
## <language>English</language>
## 
## [[4]]
## <language>English</language>
sapply(schild,xmlName)
##      author       title        type   publisher    language 
##    "author"     "title"      "type" "publisher"  "language"
sapply(child,xmlValue)
##                                                                  booksnum 
##                 "Greg MortensonThree Cups of TeaNonfictionPenguinEnglish" 
##                                                                  booksnum 
##             "David Oliver RelinThree Cups of TeaNonfictionPenguinEnglish" 
##                                                                  booksnum 
##                    "Jonathan FrenzenThe CorrectionsFictionPicadorEnglish" 
##                                                                  booksnum 
## "Michael ChabonThe Yiddish Policemans UnionFictionHarper PerenialEnglish"
sapply(child,xmlAttrs)
## booksnum.ID booksnum.ID booksnum.ID booksnum.ID 
##         "1"         "2"         "3"         "4"
sapply(schild,xmlValue)
##              author               title                type 
##    "Greg Mortenson" "Three Cups of Tea"        "Nonfiction" 
##           publisher            language 
##           "Penguin"           "English"
# develop data frame from XML data

# Return the text content of every node matched by an XPath query, as a
# character vector with one element per matched node.
node_values <- function(xpath) {
  vapply(getNodeSet(root, xpath), xmlValue, character(1))
}

# ID attribute of every <booksnum> element that carries one.
book_ids <- vapply(
  getNodeSet(root, "//booksnum[@ID]"),
  function(node) xmlAttrs(node)[["ID"]],
  character(1)
)

# Field names are taken from the first record's child elements; they serve
# both as the XPath queries ("//author", "//title", ...) and as column names.
field_names <- unname(vapply(schild, xmlName, character(1)))

# One column per field, with the ID column first.
columns <- c(list(book_ids), lapply(paste0("//", field_names), node_values))
names(columns) <- c("ID", field_names)

# stringsAsFactors = FALSE keeps text as character, matching the HTML import.
author_df <- as.data.frame(columns, stringsAsFactors = FALSE)
#dataframe from XML data
author_df
##   ID             author                        title       type
## 1  1     Greg Mortenson            Three Cups of Tea Nonfiction
## 2  2 David Oliver Relin            Three Cups of Tea Nonfiction
## 3  3   Jonathan Frenzen              The Corrections    Fiction
## 4  4     Michael Chabon The Yiddish Policemans Union    Fiction
##         publisher language
## 1         Penguin  English
## 2         Penguin  English
## 3         Picador  English
## 4 Harper Perenial  English

HTML data

# Build the full path to the HTML file inside the data directory.
filename <- file.path(datadir, "bookdata.html")
# Parse the HTML file into a document object.
html_str <- htmlParse(filename)
# Read the tables found in the document, keeping text columns as character
# rather than converting them to factors.
tables <- readHTMLTable(html_str, stringsAsFactors = FALSE)
# The book data is taken from the first table.
html_df <- as.data.frame(tables[[1]])
# Print the data frame.
html_df
##   ID             Author                         Title        Type
## 1  1     Greg Mortenson             Three Cups of Tea Non-Fiction
## 2  2 David Oliver Relin             Three Cups of Tea Non-Fiction
## 3  3   Jonathan Franzen               The Corrections     Fiction
## 4  4     Michael Chabon The Yiddish Policeman's Union     Fiction
##         Publisher Language
## 1         Penguin  English
## 2         Penguin  English
## 3         Picador  English
## 4 Harper Perenial  English