For this assignment, we create and parse an XML file, a JSON file, and an HTML file that all contain the same data.
library(XML);library(jsonlite);library(plyr);library(tidyr);
library(dplyr)
# Parse books.xml and flatten it into a single data frame (df.xml).
my.file <- "~/Documents/CUNY/data_class/week7-xml_json/books.xml"
# Parse the file once with xmlTreeParse(): it yields R-level nodes that can be
# indexed, whereas xmlParse() does not allow for indexing.
parse.file <- xmlTreeParse(file = my.file)
root <- xmlRoot(parse.file)
child.1 <- xmlChildren(root)
# Retrieve the attributes of every grandchild node, evaluated for each child
# of the root. sapply() is kept deliberately: t() below needs the simplified
# (matrix-like) result.
y <- sapply(child.1, function(x) unlist(sapply(xmlChildren(x), xmlAttrs)))
# Retrieve the text values of the same nodes.
k <- sapply(child.1, function(x) unlist(sapply(xmlChildren(x), xmlValue)))
k.1 <- data.frame(k)
# Transpose so that each child of the root becomes one row.
k.2 <- k.1 %>%
  t() %>%
  data.frame()
# check.names = FALSE keeps the attribute-derived column names untouched.
j <- data.frame(t(y), check.names = FALSE)
## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 2,3 --> row.names NOT used
# Combine the value columns and the attribute columns side by side.
df.xml <- cbind(k.2, j)
# Read the JSON version of the book data and print the parsed result.
file <- "~/Documents/CUNY/data_class/week7-xml_json/books_formatted.json"
# isValidJSON(file)
rfile <- fromJSON(file)
rfile
## $books
## book id
## 1 1
## 2 2
## 3 3
## title
## 1 Introduction to Fixed Income Analytics
## 2 Calculus
## 3 Data Analysis Using Regression and Multilevel/Hierarchical Models
## authors.first authors.second length academic
## 1 Frank Fabozzi Steven Mann 321 no
## 2 Larson Edwards 1141 yes
## 3 Andrew Gelman Jennifer Hill 592 yes
# The first element of the parsed JSON holds the book records.
df.json <- rfile[[1]]
# Keep the scalar columns and append the two columns of the nested `authors`
# data frame, in that order.
df.json.1 <- cbind(
  select(df.json, title, length, academic),
  df.json$authors[, 1],
  df.json$authors[, 2]
)
# Give the two appended author columns readable names.
names(df.json.1)[4:5] <- c("authors.1", "authors.2")
# Read the HTML version of the book data.
my.file <- "~/Documents/CUNY/data_class/week7-xml_json/books.html"
htm <- xmlParse(my.file, isHTML = TRUE)
# Second parse of the same file as an R-level tree, so its nodes can be indexed.
htm.1 <- htmlTreeParse(my.file)
root <- xmlRoot(htm.1)
# Extract every table in the parsed document; the first one is kept below.
blah <- readHTMLTable(htm)
html.df <- blah[[1]]
html.df
## Title
## 1 Calculus
## 2 Data Analysis Using Regression and Multilevel/Hierarchical Models
## 3 Introduction to Fixed Income Analytics
## Author Author.2 Length Academic
## 1 Larson Edwards 1141 yes
## 2 Jennifer Hill Andrew Gelman 592 yes
## 3 Frank Fabozzi Steven Mann 321 no
The three data frames contain the same data and share the same structure, although their column names and column order differ slightly. We check this by confirming that each object is a data frame and by comparing the column names and dimensions of each.
# Confirm that each of the three results is a data frame.
lapply(X = list(html.df, df.json.1, df.xml), FUN = is.data.frame)
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] TRUE
##
## [[3]]
## [1] TRUE
# List the column names of each data frame.
lapply(X = list(html.df, df.json.1, df.xml), FUN = names)
## [[1]]
## [1] "Title" "Author" "Author.2" "Length" "Academic"
##
## [[2]]
## [1] "title" "length" "academic" "authors.1" "authors.2"
##
## [[3]]
## [1] "title" "length" "academic" "authors.first"
## [5] "authors.second"
# Report the dimensions (rows, columns) of each data frame.
lapply(X = list(html.df, df.json.1, df.xml), FUN = dim)
## [[1]]
## [1] 3 5
##
## [[2]]
## [1] 3 5
##
## [[3]]
## [1] 3 5