# This is the code chunk for reading in the HTML file into R as a data frame
library(XML)
library(RCurl)
## Loading required package: bitops
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Alternative local source for books.html (disabled):
# fileurl <- "C:\\Jagdish\\Masters Programs\\CUNY\\Data 607 Data Acquisition and Management\\Week7\\books.html"
# Download the page from GitHub. Despite its name, `fileurl` ends up holding
# the text returned by getURL(), not the URL itself.
fileurl <- getURL(
  "https://raw.githubusercontent.com/Jagdish16/jagdish_r_repo/master/DATA607/Week7/books.html"
)
# Earlier experiments with XML::readHTMLTable(), XML::htmlParse() and
# html_nodes(books2, "table") are left out in favour of the pipeline below.
# Parse the downloaded text, extract its table content with html_table(),
# and coerce the result to a data frame.
booksdf <- read_html(fileurl) %>%
  html_table(fill = TRUE) %>%
  as.data.frame()
# Show the resulting data frame.
booksdf
# This is the code chunk for reading in the XML file into R as a data frame
library(XML)
library(RCurl)
library(httr)
## Warning: package 'httr' was built under R version 3.5.3
# Alternative local source for books1.xml (disabled):
# fileurl <- "C:\\Jagdish\\Masters Programs\\CUNY\\Data 607 Data Acquisition and Management\\Week7\\books1.xml"
# Download the document from GitHub. Despite its name, `fileurl` ends up
# holding the text returned by getURL(), not the URL itself.
fileurl <- getURL(
  "https://raw.githubusercontent.com/Jagdish16/jagdish_r_repo/master/DATA607/Week7/books1.xml"
)
# `fileurl` is XML text rather than a path, so state that explicitly with
# asText = TRUE instead of relying on xmlParse() to guess from the content
# of its first argument.
books <- xmlParse(file = fileurl, asText = TRUE)
# Flatten the parsed document into a data frame, keeping text columns as
# character vectors rather than factors.
books_df <- xmlToDataFrame(books, stringsAsFactors = FALSE)
# Show the resulting data frame.
books_df
# This is the code chunk for reading in the JSON file into R as a data frame
library(jsonlite)
library(dplyr)
library(RCurl)
# Alternative local source for books5.json (disabled):
# fileurl <- "C:\\Jagdish\\Masters Programs\\CUNY\\Data 607 Data Acquisition and Management\\Week7\\books5.json"
# Download the document from GitHub. Despite its name, `fileurl` ends up
# holding the text returned by getURL(), not the URL itself.
fileurl <- getURL(
  "https://raw.githubusercontent.com/Jagdish16/jagdish_r_repo/master/DATA607/Week7/books5.json"
)
# Parse the JSON text and coerce the parsed structure to a data frame.
books.df <- as.data.frame(fromJSON(fileurl))
# Show the resulting data frame.
books.df
The three data frames are not completely identical. There are minor differences: 1) the “id” column is missing from the data frame created from the HTML file; 2) the capitalization of column names differs between the data frames created from the HTML and XML files, while the column names of the data frame created from the JSON file are prefixed with the name of the first object in the JSON file. Presumably these differences arise from how the different parsing functions work under the hood.