Assignment 3 consists of developing some toy data on book publishers in JSON, XML and HTML. These data sets are then converted to R data frames to show a simple case of how these sources of data might be handled. Below we proceed in the following order:
1. JSON
2. XML
3. HTML
We first created the above datasets in the JSON, XML and HTML formats “by hand”. The respective formats were then imported into R using appropriate packages.
# Directory that holds all three hand-written data sets (JSON, XML, HTML)
datadir <- file.path("C://Users//talha//Documents//Training//CUNY Classes//IS607//Week7//")
# attach jsonlite, which supplies fromJSON()
library(jsonlite)
# full path of the JSON data set
filename <- file.path(datadir, "bookdata.js")
# read the raw file contents, one character string per line
bdata <- readLines(filename)
## Warning in readLines(filename): incomplete final line found on 'C://Users//
## talha//Documents//Training//CUNY Classes//IS607//Week7//bookdata.js'
# show the lines exactly as they were read
bdata
## [1] "{"
## [2] "\"ID\":[\"1\",\"2\",\"3\",\"4\"],"
## [3] "\"Author\":[\"Greg Mortenson\",\"David Oliver Relin\",\"Jonathan Frenzen\",\"Michael Chabon\"],"
## [4] "\"Title\":[\"Three Cups of Tea\",\"Three Cups of Tea\",\"The Corrections\",\"Yiddish Policeman's Union\"],"
## [5] "\"Type\":[\"Non-Fiction\",\"Non-Fiction\",\"Fiction\",\"Fiction\"],"
## [6] "\"Publisher\":[\"Penguin\",\"Penguin\",\"Picador\",\"Harper Perenial\"],"
## [7] "\"Language\":[\"English\",\"English\",\"English\",\"English\"]"
## [8] "}"
# Join the lines read from disk into ONE JSON string before parsing.
# paste(bdata, "") does not do this: it only appends a space to every
# element and leaves a vector with one string per line. `collapse` is the
# argument that concatenates the elements into a single string.
bdata_str <- paste(bdata, collapse = "\n")
# parse the JSON text into an R object
jsondata <- fromJSON(bdata_str)
# Inspect the parsed JSON object: its class, size and element names
class(jsondata)
## [1] "list"
length(jsondata)
## [1] 6
names(jsondata)
## [1] "ID" "Author" "Title" "Type" "Publisher" "Language"
# class of every element
lapply(jsondata, class)
## $ID
## [1] "character"
##
## $Author
## [1] "character"
##
## $Title
## [1] "character"
##
## $Type
## [1] "character"
##
## $Publisher
## [1] "character"
##
## $Language
## [1] "character"
# number of values held by every element
lapply(jsondata, length)
## $ID
## [1] 4
##
## $Author
## [1] 4
##
## $Title
## [1] 4
##
## $Type
## [1] 4
##
## $Publisher
## [1] 4
##
## $Language
## [1] 4
# dim() is NULL for every element, i.e. each one is a plain vector
lapply(jsondata, dim)
## $ID
## NULL
##
## $Author
## NULL
##
## $Title
## NULL
##
## $Type
## NULL
##
## $Publisher
## NULL
##
## $Language
## NULL
# peek at the first two elements (up to four values each)
head(jsondata[[1]], n = 4)
## [1] "1" "2" "3" "4"
head(jsondata[[2]], n = 4)
## [1] "Greg Mortenson" "David Oliver Relin" "Jonathan Frenzen"
## [4] "Michael Chabon"
# Convert the parsed JSON list into a data frame, one column per element.
# stringsAsFactors = FALSE is spelled out because on R < 4.0 the default
# turns every text column into a factor; the HTML import in this script
# already keeps its text columns as character, so this keeps them consistent.
jsondf <- as.data.frame(jsondata, stringsAsFactors = FALSE)
# the data frame
jsondf
## ID Author Title Type
## 1 1 Greg Mortenson Three Cups of Tea Non-Fiction
## 2 2 David Oliver Relin Three Cups of Tea Non-Fiction
## 3 3 Jonathan Frenzen The Corrections Fiction
## 4 4 Michael Chabon Yiddish Policeman's Union Fiction
## Publisher Language
## 1 Penguin English
## 2 Penguin English
## 3 Picador English
## 4 Harper Perenial English
# rows x columns
dim(jsondf)
## [1] 4 6
# attach the XML package (parsing and XPath helpers)
library(XML)
## Warning: package 'XML' was built under R version 3.3.1
# full path of the XML data set
filename <- file.path(datadir, "bookdata.xml")
# parse the XML file into a document object
xml_str <- xmlTreeParse(filename)
class(xml_str)
## [1] "XMLDocument" "XMLAbstractDocument"
# top-level (root) node of the document
root <- xmlRoot(xml_str)
# child nodes of the root
child <- xmlChildren(root)
# child nodes of the first child
schild <- xmlChildren(child[[1]])
# Query the tree with XPath: first every <booksnum> node carrying an ID
# attribute, then each kind of field node in turn
getNodeSet(root, "//booksnum[@ID]")
## [[1]]
## <booksnum ID="1">
## <author>Greg Mortenson</author>
## <title>Three Cups of Tea</title>
## <type>Nonfiction</type>
## <publisher>Penguin</publisher>
## <language>English</language>
## </booksnum>
##
## [[2]]
## <booksnum ID="2">
## <author>David Oliver Relin</author>
## <title>Three Cups of Tea</title>
## <type>Nonfiction</type>
## <publisher>Penguin</publisher>
## <language>English</language>
## </booksnum>
##
## [[3]]
## <booksnum ID="3">
## <author>Jonathan Frenzen</author>
## <title>The Corrections</title>
## <type>Fiction</type>
## <publisher>Picador</publisher>
## <language>English</language>
## </booksnum>
##
## [[4]]
## <booksnum ID="4">
## <author>Michael Chabon</author>
## <title>The Yiddish Policemans Union</title>
## <type>Fiction</type>
## <publisher>Harper Perenial</publisher>
## <language>English</language>
## </booksnum>
# all <author> nodes
getNodeSet(root, "//author")
## [[1]]
## <author>Greg Mortenson</author>
##
## [[2]]
## <author>David Oliver Relin</author>
##
## [[3]]
## <author>Jonathan Frenzen</author>
##
## [[4]]
## <author>Michael Chabon</author>
# all <title> nodes
getNodeSet(root, "//title")
## [[1]]
## <title>Three Cups of Tea</title>
##
## [[2]]
## <title>Three Cups of Tea</title>
##
## [[3]]
## <title>The Corrections</title>
##
## [[4]]
## <title>The Yiddish Policemans Union</title>
# all <type> nodes
getNodeSet(root, "//type")
## [[1]]
## <type>Nonfiction</type>
##
## [[2]]
## <type>Nonfiction</type>
##
## [[3]]
## <type>Fiction</type>
##
## [[4]]
## <type>Fiction</type>
# all <publisher> nodes
getNodeSet(root, "//publisher")
## [[1]]
## <publisher>Penguin</publisher>
##
## [[2]]
## <publisher>Penguin</publisher>
##
## [[3]]
## <publisher>Picador</publisher>
##
## [[4]]
## <publisher>Harper Perenial</publisher>
# all <language> nodes
getNodeSet(root, "//language")
## [[1]]
## <language>English</language>
##
## [[2]]
## <language>English</language>
##
## [[3]]
## <language>English</language>
##
## [[4]]
## <language>English</language>
# tag names of the first book's fields
sapply(schild, xmlName)
## author title type publisher language
## "author" "title" "type" "publisher" "language"
# text content of each book node (all of its fields run together)
sapply(child, xmlValue)
## booksnum
## "Greg MortensonThree Cups of TeaNonfictionPenguinEnglish"
## booksnum
## "David Oliver RelinThree Cups of TeaNonfictionPenguinEnglish"
## booksnum
## "Jonathan FrenzenThe CorrectionsFictionPicadorEnglish"
## booksnum
## "Michael ChabonThe Yiddish Policemans UnionFictionHarper PerenialEnglish"
# attributes of each book node
sapply(child, xmlAttrs)
## booksnum.ID booksnum.ID booksnum.ID booksnum.ID
## "1" "2" "3" "4"
# text content of each field of the first book
sapply(schild, xmlValue)
## author title type
## "Greg Mortenson" "Three Cups of Tea" "Nonfiction"
## publisher language
## "Penguin" "English"
# Build one data frame from the XML tree: one row per <booksnum> node, with
# its ID attribute followed by one column per field.
#
# The field names come from the first book's children (schild) and are used
# both to build the XPath query and to name the resulting column, so a column
# name can never drift away from the values stored under it.
field_names <- unname(vapply(schild, xmlName, character(1)))
# ID attribute of every <booksnum> node
book_ids <- vapply(
  getNodeSet(root, "//booksnum[@ID]"),
  function(node) xmlAttrs(node)[["ID"]],
  character(1)
)
# text content of every field, as a named list of character vectors;
# vapply() guarantees each query yields plain character values
field_values <- lapply(field_names, function(field) {
  vapply(getNodeSet(root, paste0("//", field)), xmlValue, character(1))
})
names(field_values) <- field_names
# stringsAsFactors = FALSE keeps the text columns as character on R < 4.0
author_df <- data.frame(ID = book_ids, field_values, stringsAsFactors = FALSE)
# data frame built from the XML data
author_df
## ID author title type
## 1 1 Greg Mortenson Three Cups of Tea Nonfiction
## 2 2 David Oliver Relin Three Cups of Tea Nonfiction
## 3 3 Jonathan Frenzen The Corrections Fiction
## 4 4 Michael Chabon The Yiddish Policemans Union Fiction
## publisher language
## 1 Penguin English
## 2 Penguin English
## 3 Picador English
## 4 Harper Perenial English
# full path of the HTML data set
filename <- file.path(datadir, "bookdata.html")
# parse the HTML file into a document object
html_str <- htmlParse(filename)
# read the tables found in the document, keeping text as character
tables <- readHTMLTable(html_str, stringsAsFactors = FALSE)
# take the first table as the data frame
html_df <- as.data.frame(tables[[1]])
html_df
## ID Author Title Type
## 1 1 Greg Mortenson Three Cups of Tea Non-Fiction
## 2 2 David Oliver Relin Three Cups of Tea Non-Fiction
## 3 3 Jonathan Franzen The Corrections Fiction
## 4 4 Michael Chabon The Yiddish Policeman's Union Fiction
## Publisher Language
## 1 Penguin English
## 2 Penguin English
## 3 Picador English
## 4 Harper Perenial English