This is a warm-up exercise to help you become more familiar with the HTML, XML, and JSON file formats, and with using packages to read these formats into R data frames for downstream use.
File URL locations:
# Reading the GitHub URLs below directly produced an error, so they are commented out.
#html_url <- "https://raw.github.com/shcuny/Cuny/blob/master/IS607/Books.html"
#json_url <- "https://raw.github.com/shcuny/Cuny/blob/master/IS607/Books.json"
#xml_url <- "https://raw.github.com/shcuny/Cuny/blob/master/IS607/Books.xml"
# The files are read from the desktop instead; copies are also hosted on GitHub at:
#https://github.com/shcuny/Cuny/blob/master/IS607/Books.html
#https://github.com/shcuny/Cuny/blob/master/IS607/Books.json
#https://github.com/shcuny/Cuny/blob/master/IS607/Books.xml
# Local (desktop) locations of the three book files. Despite the *_url names,
# these hold file-system paths, not web URLs.
# NOTE(review): the paths are specific to one Windows user profile; anyone else
# running this needs to point them at their own copies of the files.
html_url <- "C:/Users/Tempest/Desktop/Books.html"
json_url <- "C:/Users/Tempest/Desktop/Books.json"
xml_url <- "C:/Users/Tempest/Desktop/Books.xml"
library(plyr)
library(knitr)
library(RCurl)
## Loading required package: bitops
library(XML)
library(rjson)
Read in Books.html
#read html url
#library(XML)
# Parse the <table> elements of the HTML file into a list, one entry per table.
tables <- readHTMLTable(html_url)
# Row count of every entry. An entry without dimensions (readHTMLTable can
# yield NULL for a table it cannot convert) is counted as 0 rows so that the
# positions in n.rows stay aligned with the positions in tables. The earlier
# unlist(lapply(...)) form silently dropped such entries, which shifted the
# index returned by which.max() and could select the wrong table.
n.rows <- vapply(
  tables,
  function(tbl) if (is.null(dim(tbl))) 0L else nrow(tbl),
  integer(1)
)
# Display the table with the most rows.
tables[[which.max(n.rows)]]
## Subject Title
## 1 Data Science Mathematical Statistics with Resampling and R
## 2 Data Science XML and Web Technologies for Data Science with R
## 3 Data Science The Little SAS Book
## Authors Year Language
## 1 Laura Chihara and Tim Hesterberg 2011 English
## 2 Deborah Nolan and Duncan Temple Lang 2014 English
## 3 Lora D.Delwiche,Susan J. Slaughter 2008 English
Read in Books.json
#read json url
#library(rjson)
#http://stackoverflow.com/questions/20925492/how-to-import-json-into-r-and-convert-it-to-table
# You can pass directly the filename
# Parse the JSON file (the file name can be passed directly) into nested lists.
my.JSON <- fromJSON(file = json_url)
# Turn each top-level group into a data frame with one row per record.
# This assumes every record flattens to exactly 5 values.
json_df <- lapply(my.JSON, function(play) {
  data.frame(matrix(unlist(play), ncol = 5, byrow = TRUE))
})
# Stack the per-group data frames into one single data frame.
json_df <- do.call(rbind, json_df)
# Use the field names of the first record as column names and reset the row
# names. These two assignments previously targeted `df` instead of `json_df`,
# so the result kept the default X1..X5 column names.
# NOTE(review): assumes the first record has one name per column (5 names).
colnames(json_df) <- names(my.JSON[[1]][[1]])
rownames(json_df) <- NULL
json_df
## X1 X2
## 1 Data Science Mathematical Statistics with Resampling and R
## 2 Data Science XML and Web Technologies for Data Science with R
## 3 Data Science The Little SAS Book
## X3 X4 X5
## 1 Laura Chihara and Tim Hesterberg 2011 English
## 2 Deborah Nolan and Duncan Temple Lang 2014 English
## 3 Lora D. Delwiche, Susan J. Slaughter 2008 English
Read in Books.xml
library(XML)
# Flatten the XML document into a data frame, then preview its leading rows.
# NOTE(review): in the recorded output the Authors values appear run together
# without a separator (e.g. "Laura ChiharaTim Hesterberg"); presumably the
# authors are nested child elements in the XML -- confirm against the file.
data <- xmlToDataFrame(doc = xml_url)
head(x = data)
## Subject Title
## 1 Data Science Mathematical Statistics with Resampling and R
## 2 Data Science XML and Web Technologies for Data Science with R
## 3 Data Science Duncan Temple Lang
## Authors Year Language
## 1 Laura ChiharaTim Hesterberg 2011 English
## 2 Deborah NolanDuncan Temple Lang 2014 English
## 3 Lora D. DelwicheSusan J. Slaughter 2011 English