scrapingAssignment

For this assignment, we create an XML file, a JSON file and an HTML file that all contain the same data, and parse each of them into a data frame.

library(XML);library(jsonlite);library(plyr);library(tidyr);
library(dplyr)

XML File Parsing:

# Parse the books XML file and assemble `df.xml`, a data frame built from the
# text values and the attributes of the nodes nested under each child of the
# document root.
my.file <- '~/Documents/CUNY/data_class/week7-xml_json/books.xml'
# The file is parsed once, with xmlTreeParse(). An earlier xmlParse() call on
# the same file was dropped: its result was overwritten before being used
# (the original author's note on it was "doesn't allow for indexing").
parse.file <- xmlTreeParse(file = my.file)
root <- xmlRoot(parse.file) #allows for indexing of nodes

# Direct children of the root node.
child.1 <- xmlChildren(root)

#retrieve attributes:
# For every child of the root, collect the attributes of its own children.
y <- sapply(child.1, function(x) unlist(sapply(xmlChildren(x), xmlAttrs)))

# Same traversal, collecting the text value of each nested node instead.
k <- sapply(child.1, function(x) unlist(sapply(xmlChildren(x), xmlValue)))
# NOTE(review): `k` goes through data.frame() before being transposed, while
# `y` below is transposed directly and raises the duplicated row.names
# warning shown underneath. Presumably the extra data.frame() step makes the
# repeated node names unique -- confirm before simplifying this pipeline.
k.1 <- data.frame(k)
k.2 <- k.1 %>%
  t() %>%
  data.frame()

# Transpose the attribute values; check.names = FALSE keeps the names as-is.
j <- data.frame(t(y), check.names = FALSE)

## Warning in data.row.names(row.names, rowsi, i): some row.names duplicated:
## 2,3 --> row.names NOT used

# Bind the value columns and the attribute columns side by side.
df.xml <- cbind(k.2, j)

JSON Parsing:

# Load the books JSON file and flatten it into the data frame `df.json.1`.
file <- '~/Documents/CUNY/data_class/week7-xml_json/books_formatted.json'
#isValidJSON(file)
rfile <- fromJSON(file)
# Display the parsed structure (console output reproduced below).
rfile

## $books
##   book id
## 1       1
## 2       2
## 3       3
##                                                               title
## 1                            Introduction to Fixed Income Analytics
## 2                                                          Calculus
## 3 Data Analysis Using Regression and Multilevel/Hierarchical Models
##   authors.first authors.second length academic
## 1 Frank Fabozzi    Steven Mann    321       no
## 2        Larson        Edwards   1141      yes
## 3 Andrew Gelman  Jennifer Hill    592      yes

# The first element of the parsed list is the table of books.
df.json <- rfile[[1]]
# Keep the title/length/academic columns and append the two columns of the
# nested `authors` data frame in one cbind() call.
df.json.1 <- cbind(
  select(df.json, title, length, academic),
  df.json$authors[, 1],
  df.json$authors[, 2]
)

# Give the two appended author columns readable names.
names(df.json.1)[4:5] <- c('authors.1', 'authors.2')

HTML Parsing:

# Parse the books HTML file and read its first table into `html.df`.
my.file <- '~/Documents/CUNY/data_class/week7-xml_json/books.html'
htm <- xmlParse(my.file, isHTML = TRUE)

# The document is parsed only once. A second htmlTreeParse()/xmlRoot() pass
# over the same file was removed because its results were never used.
# readHTMLTable() yields a list of tables; the book data is the first entry.
blah <- readHTMLTable(htm)
html.df <- blah[[1]]
html.df

##                                                               Title
## 1                                                          Calculus
## 2 Data Analysis Using Regression and Multilevel/Hierarchical Models
## 3                            Introduction to Fixed Income Analytics
##          Author      Author.2 Length Academic
## 1        Larson       Edwards   1141      yes
## 2 Jennifer Hill Andrew Gelman    592      yes
## 3 Frank Fabozzi   Steven Mann    321       no

Dataframe Comparison

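The three data frames hold the same data in the same overall structure: each one is a data frame with 3 rows and 5 columns. We check this by confirming that each object is a data frame and by comparing the column names and dimensions of each. Note that the column names, the column order and the row order differ slightly between the three sources, so the objects are equivalent in content rather than strictly identical.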

# Confirm that each of the three parsed results is a data frame.
lapply(
  X = list(html.df, df.json.1, df.xml),
  FUN = is.data.frame
)

## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE

# List the column names of each of the three data frames.
lapply(
  X = list(html.df, df.json.1, df.xml),
  FUN = names
)

## [[1]]
## [1] "Title"    "Author"   "Author.2" "Length"   "Academic"
## 
## [[2]]
## [1] "title"     "length"    "academic"  "authors.1" "authors.2"
## 
## [[3]]
## [1] "title"          "length"         "academic"       "authors.first" 
## [5] "authors.second"

# Report the dimensions (rows, columns) of each of the three data frames.
lapply(
  X = list(html.df, df.json.1, df.xml),
  FUN = dim
)

## [[1]]
## [1] 3 5
## 
## [[2]]
## [1] 3 5
## 
## [[3]]
## [1] 3 5

scrapingAssignment

Luis Calleja

October 16, 2016

XML File Parsing:

JSON Parsing:

HTML Parsing:

Dataframe Comparison