# Global knitr chunk options for this document: hide warnings and messages,
# leave echoed source unformatted, and draw figures with the png device
# (passing type = "cairo" to it).
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  tidy = FALSE, # spelled out: `F` is an ordinary variable that can be reassigned
  # comment = "",
  dev = "png",
  dev.args = list(type = "cairo")
)
library(XML)
library(rjson)
library(RCurl)
## Loading required package: bitops
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 3.4.2
##
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:rjson':
##
## fromJSON, toJSON
library(RJSONIO)
##
## Attaching package: 'RJSONIO'
## The following objects are masked from 'package:jsonlite':
##
## fromJSON, toJSON
## The following objects are masked from 'package:rjson':
##
## fromJSON, toJSON
suppressMessages(library(XML))
suppressWarnings(library(XML))
suppressMessages(library(rjson))
suppressMessages(library(jsonlite))
suppressWarnings(library(jsonlite))
suppressPackageStartupMessages(library(jsonlite))
suppressMessages(library(RJSONIO))
suppressWarnings(library(RJSONIO))
# Read the HTML version of the books table, one element per line of the file.
books.html <- readLines("https://raw.githubusercontent.com/doradu8030/Data607/master/books.html")

# Pattern for a table cell "<td>...</td>"; the capture group holds the cell text.
# NOTE(review): the "-" between "\\)" and "\\'" sits in the middle of the
# character class, so it may be parsed as a range rather than a literal
# hyphen -- confirm before editing the class.
html.pattern <- "<td>([0-9A-Za-z\\., \\(\\)-\\']*)</td>"

# Tidy the file: keep only the lines that contain a matching cell ...
b.content <- grep(html.pattern, books.html, value = TRUE)
# ... then replace the matched cell markup with its captured text and trim
# surrounding whitespace. The pattern is reused so the two steps cannot drift.
b.info <- trimws(sub(html.pattern, "\\1", b.content))
Convert to a data frame
# Column names, assigned positionally to the values of each record.
book.fields <- c("Title", "Author", "Topic", "Publication_Year", "ISBN")
n.fields <- length(book.fields)

# Every record must supply exactly one value per field; otherwise the
# round-robin split below would silently misalign the columns.
stopifnot(length(b.info) %% n.fields == 0)

# Deal the flat vector of cell values into one group per field
# (value 1 -> field 1, value 2 -> field 2, ...) and bind the groups as columns.
b.infoDF <- cbind.data.frame(
  split(b.info, rep(seq_len(n.fields), times = length(b.info) / n.fields)),
  stringsAsFactors = FALSE
)
names(b.infoDF) <- book.fields

# Drop the first record.
# NOTE(review): presumably this is the table's header row -- confirm against
# books.html.
b.infoDF <- b.infoDF[-1, ]

# HTML contents as a data frame
b.infoDF
## Title
## 2 Think Big: Unleashing Your Potential for Excellence
## 3 You Can't Teach a Kid to Ride a Bike at a Seminar
## 4 R for Data Science
## Author Topic
## 2 Ben Carson MD,Cecil Murphey (Contributor) Self Motivation
## 3 David H.Sandler,John Hayes Self-Confidence
## 4 Garrett Grolemund, Hadley Wickham Programming
## Publication_Year ISBN
## 2 2015 978-0310343363
## 3 1996 978-0967179902
## 4 2017 978-1491910399
# Fetch the XML document as text, then let xmlToDataFrame() convert it.
# Despite its name, url.xml holds the value returned by getURL(), not the URL.
url.xml <- getURL(
  url = "https://raw.githubusercontent.com/doradu8030/Data607/master/books2.XML"
)
xml_df <- xmlToDataFrame(doc = url.xml)

# An earlier attempt is left disabled; it passed xml_df, which is already the
# result of xmlToDataFrame(), to xmlParse():
# book.xml <- xmlParse(xml_df)

# NOTE(review): the rendered output shows a single "row" column with all
# fields concatenated; presumably the records in books2.XML sit one level
# deeper than xmlToDataFrame() expects -- confirm against the file.
xml_df
## row
## 1 Think Big: Unleashing Your Potential for ExcellenceBen Carson MDSelf Motivation2015978-0310343363
## NA
## 1 You Can't Teach a Kid to Ride a Bike at a SeminarDavid H Sandler and John HayesSelf Motivation1996978-0967179902
## NA
## 1 R for Data ScienceGarrett Grolemund and Hadley WickhamProgramming2017978-1491910399
# Save a local copy of the XML file, parse that copy into a document object,
# and convert the document to a data frame.
download.file(
  url = "https://raw.githubusercontent.com/doradu8030/Data607/master/books.XML",
  destfile = "book_XML.xml"
)
book_XML.xml <- xmlParse(file = "book_XML.xml")
books.XMLDF <- xmlToDataFrame(doc = book_XML.xml)

# XML contents as a data frame
books.XMLDF
## title
## 1 Think Big: Unleashing Your Potential for Excellence
## 2 You Can't Teach a Kid to Ride a Bike at a Seminar
## 3 R for Data Science
## author topic publishedyear
## 1 Ben Carson MD,Cecil,Murphey (Contributor) Self Motivation 2015
## 2 David H.Sandler,John Hayes Self-Confidence 1996
## 3 Garrett Grolemund, Hadley Wickham Programming 2017
## ISBN
## 1 978-0310343363
## 2 978-0967179902
## 3 978-1491910399
# URL of the JSON version of the books data.
url.json <- "https://raw.githubusercontent.com/doradu8030/Data607/master/books3.json"

# rjson, jsonlite and RJSONIO are all attached and each exports fromJSON(), so
# a bare fromJSON() call resolves to whichever package was attached last.
# Name the package explicitly so the result does not depend on load order.

# jsonlite::fromJSON() accepts a URL; its result is coerced to a data frame.
json.Df <- as.data.frame(jsonlite::fromJSON(url.json))

# The same document parsed with rjson, read from the same URL.
jsonData <- rjson::fromJSON(file = url.json)
Conclusion
After many attempts, I found that the HTML and XML files are identical, and that they are also less difficult to manipulate than JSON files. Even though I was not able to show the comparison between them, I could see in R, when running the chunks individually, that the structure of the JSON file is different from that of the XML and HTML files. I left in some of the commands for loading and parsing a JSON file because at first those commands gave the expected result, but I could not figure out why, after a few attempts, they started producing many errors. I certainly learned how to get data from HTML, XML and JSON files, and the differences between their structures.
Also, I noticed that the columns' data type in the JSON data is 'chr', while in the XML and HTML data it is 'factor'.