IS607-Week9 Assignment

This is a warm-up exercise to help you become more familiar with the HTML, XML, and JSON file formats, and with using packages to read these formats into R data frames for downstream use.

File URL locations:

# File locations ----
#
# Reading the files straight from GitHub raised an error with the URLs first
# tried here. NOTE(review): those URLs combined the raw host with the "/blob/"
# segment of the browser page path, which is presumably why they failed; the
# raw-content form is expected to look like the lines below (not verified from
# this script, and the XML readers may need the content downloaded first).
#html_url <- "https://raw.githubusercontent.com/shcuny/Cuny/master/IS607/Books.html"
#json_url <- "https://raw.githubusercontent.com/shcuny/Cuny/master/IS607/Books.json"
#xml_url <- "https://raw.githubusercontent.com/shcuny/Cuny/master/IS607/Books.xml"

# Browser pages for the same files on GitHub:
#https://github.com/shcuny/Cuny/blob/master/IS607/Books.html
#https://github.com/shcuny/Cuny/blob/master/IS607/Books.json
#https://github.com/shcuny/Cuny/blob/master/IS607/Books.xml

# The files are read from local copies instead. `books_dir` is the only
# machine-specific value: point it at the folder holding the downloaded files.
books_dir <- "C:/Users/Tempest/Desktop"
html_url <- file.path(books_dir, "Books.html")
json_url <- file.path(books_dir, "Books.json")
xml_url <- file.path(books_dir, "Books.xml")

library(plyr)
library(knitr)
library(RCurl)
## Loading required package: bitops
library(XML)
library(rjson)

Read in Books.html

# Read every table found in Books.html and display the one with the most rows.
#library(XML)
tables <- readHTMLTable(html_url)
if (length(tables) == 0) {
  stop("No HTML tables found in ", html_url, call. = FALSE)
}
# Count rows per table. NROW() gives 0 for a NULL entry, so `n.rows` always has
# one value per element of `tables` and the which.max() index below stays
# aligned with the list (unlist() over dim() results would drop NULL entries).
n.rows <- vapply(tables, NROW, integer(1))
tables[[which.max(n.rows)]]
##        Subject                                            Title
## 1 Data Science    Mathematical Statistics with Resampling and R
## 2 Data Science XML and Web Technologies for Data Science with R
## 3 Data Science                              The Little SAS Book
##                                Authors Year Language
## 1     Laura Chihara and Tim Hesterberg 2011  English
## 2 Deborah Nolan and Duncan Temple Lang 2014  English
## 3   Lora D.Delwiche,Susan J. Slaughter 2008  English

Read in Books.json

# Read Books.json and reshape it into a single data frame, `json_df`.
#library(rjson)

# Approach adapted from:
#http://stackoverflow.com/questions/20925492/how-to-import-json-into-r-and-convert-it-to-table
# fromJSON() accepts the file name directly through `file=`.
my.JSON <- fromJSON(file = json_url)
json_df <- lapply(my.JSON, function(play) { # one top-level group per call
  # Flatten the group and lay the values out row by row.
  # This assumes every record in the group has exactly 5 fields.
  data.frame(matrix(unlist(play), ncol = 5, byrow = TRUE))
})

# Stack the per-group data frames into one data frame.
json_df <- do.call(rbind, json_df)
# Label the columns with the field names of the first record and drop the row
# names. Both assignments must target `json_df`, the data frame built above.
# NOTE(review): assumes my.JSON[[1]][[1]] is a record with 5 named fields --
# confirm against Books.json.
colnames(json_df) <- names(my.JSON[[1]][[1]])
rownames(json_df) <- NULL
json_df
##             X1                                               X2
## 1 Data Science    Mathematical Statistics with Resampling and R
## 2 Data Science XML and Web Technologies for Data Science with R
## 3 Data Science                              The Little SAS Book
##                                     X3   X4      X5
## 1     Laura Chihara and Tim Hesterberg 2011 English
## 2 Deborah Nolan and Duncan Temple Lang 2014 English
## 3 Lora D. Delwiche, Susan J. Slaughter 2008 English

Read in Books.xml

# Load Books.xml into a data frame called `data` and preview its first rows.
# NOTE(review): in the printed result the Authors values appear run together
# (e.g. "Laura ChiharaTim Hesterberg"); presumably nested author nodes are
# concatenated during conversion -- confirm against Books.xml.
library(XML)
data <- xmlToDataFrame(doc = xml_url)
head(data, n = 6L)
##        Subject                                            Title
## 1 Data Science    Mathematical Statistics with Resampling and R
## 2 Data Science XML and Web Technologies for Data Science with R
## 3 Data Science                               Duncan Temple Lang
##                              Authors Year Language
## 1        Laura ChiharaTim Hesterberg 2011  English
## 2    Deborah NolanDuncan Temple Lang 2014  English
## 3 Lora D. DelwicheSusan J. Slaughter 2011  English