library(XML)
library(rjson)
library(RCurl)
## Loading required package: bitops
library(stringr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.2
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 3.4.2
##
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:rjson':
##
## fromJSON, toJSON
library(RJSONIO)
##
## Attaching package: 'RJSONIO'
## The following objects are masked from 'package:jsonlite':
##
## fromJSON, toJSON
## The following objects are masked from 'package:rjson':
##
## fromJSON, toJSON
library(prettydoc)
## Warning: package 'prettydoc' was built under R version 3.4.2
library(jsonlite)
# Fetch the HTML version of the book list from GitHub as one string, then echo
# it so the raw markup is visible before parsing.
url_html <- getURLContent(
  url = "https://raw.githubusercontent.com/doradu8030/Data607/master/books.html"
)
writeLines(text = url_html)
## <U+FEFF><html>
## <head>
## <title>Favorite Books</title>
## </head>
## <body>
## <table border = "1">
## <tr>
## <td>title</td>
## <td>author</td>
## <td>topic</td>
## <td>publishedyear</td>
## <td>ISBN</td>
## </tr>
##
## <tr>
## <td>Think Big: Unleashing Your Potential for Excellence</td>
## <td>Ben Carson MD,Cecil Murphey (Contributor)</td>
## <td>Self Motivation</td>
## <td>2015</td>
## <td>978-0310343363</td>
##
## <tr>
## <td>You Can't Teach a Kid to Ride a Bike at a Seminar</td>
## <td>David H.Sandler,John Hayes</td>
## <td>Self-Confidence</td>
## <td>1996</td>
## <td>978-0967179902</td>
##
## <tr>
## <td>R for Data Science</td>
## <td>Garrett Grolemund, Hadley Wickham</td>
## <td>Programming</td>
## <td>2017</td>
## <td>978-1491910399</td>
## </tr>
## </table>
## </body>
## </html>
# Let the XML package parse the markup and return only the first <table> it
# finds as a data frame, then display it.
html_df <- readHTMLTable(doc = url_html, which = 1)
html_df
## title
## 1 Think Big: Unleashing Your Potential for Excellence
## 2 You Can't Teach a Kid to Ride a Bike at a Seminar
## 3 R for Data Science
## author topic publishedyear
## 1 Ben Carson MD,Cecil Murphey (Contributor) Self Motivation 2015
## 2 David H.Sandler,John Hayes Self-Confidence 1996
## 3 Garrett Grolemund, Hadley Wickham Programming 2017
## ISBN
## 1 978-0310343363
## 2 978-0967179902
## 3 978-1491910399
# Read the same HTML file again, this time as a character vector with one
# element per line, so the cells can be extracted by hand with a regex.
books.html <- readLines(
  con = "https://raw.githubusercontent.com/doradu8030/Data607/master/books.html"
)
## Warning in readLines("https://raw.githubusercontent.com/doradu8030/
## Data607/master/books.html"): incomplete final line found on 'https://
## raw.githubusercontent.com/doradu8030/Data607/master/books.html'
# Pattern for one table cell: everything between <td> and </td> that does not
# open another tag. The earlier hand-written character class had an unescaped
# `-` between `)` and `\`, which acted as a character range; that accidental
# range is apparently the only reason titles containing ':' matched. Matching
# "anything but <" states the intent directly.
td.pattern <- "<td>([^<]*)</td>"
# Keep only the lines of the file that hold a table cell.
b.content <- grep(td.pattern, books.html, value = TRUE)
# Replace the cell markup with the captured text (reusing the same pattern so
# the two steps cannot drift apart) and trim the indentation left on each line.
b.info <- trimws(sub(td.pattern, "\\1", b.content))
b.info
## [1] "title"
## [2] "author"
## [3] "topic"
## [4] "publishedyear"
## [5] "ISBN"
## [6] "Think Big: Unleashing Your Potential for Excellence"
## [7] "Ben Carson MD,Cecil Murphey (Contributor)"
## [8] "Self Motivation"
## [9] "2015"
## [10] "978-0310343363"
## [11] "You Can't Teach a Kid to Ride a Bike at a Seminar"
## [12] "David H.Sandler,John Hayes"
## [13] "Self-Confidence"
## [14] "1996"
## [15] "978-0967179902"
## [16] "R for Data Science"
## [17] "Garrett Grolemund, Hadley Wickham"
## [18] "Programming"
## [19] "2017"
## [20] "978-1491910399"
# Inspect the raw lines that were read from the HTML file.
str(books.html)
## chr [1:38] "<html>" " \t<head>" ...
# b.info lists the cells row by row, five cells per table row. Tagging the
# cells 1..5 cyclically and splitting on that tag gathers every first cell
# into column 1, every second cell into column 2, and so on.
# Fail loudly if the cell count is not a whole number of rows; otherwise the
# split would silently misalign the columns.
stopifnot(length(b.info) %% 5 == 0)
b.infoDF <- cbind.data.frame(
  split(b.info, rep(1:5, times = length(b.info) / 5)),
  stringsAsFactors = FALSE
)
names(b.infoDF) <- c("Title", "Author", "Theme", "Publication_Year", "ISBN")
# The first row holds the header cells of the HTML table, not a book.
b.infoDF <- b.infoDF[-1, ]
# HTML Contents as Dataframe
b.infoDF
## Title
## 2 Think Big: Unleashing Your Potential for Excellence
## 3 You Can't Teach a Kid to Ride a Bike at a Seminar
## 4 R for Data Science
## Author Theme
## 2 Ben Carson MD,Cecil Murphey (Contributor) Self Motivation
## 3 David H.Sandler,John Hayes Self-Confidence
## 4 Garrett Grolemund, Hadley Wickham Programming
## Publication_Year ISBN
## 2 2015 978-0310343363
## 3 1996 978-0967179902
## 4 2017 978-1491910399
str(b.infoDF)
## 'data.frame': 3 obs. of 5 variables:
## $ Title : chr "Think Big: Unleashing Your Potential for Excellence" "You Can't Teach a Kid to Ride a Bike at a Seminar" "R for Data Science"
## $ Author : chr "Ben Carson MD,Cecil Murphey (Contributor)" "David H.Sandler,John Hayes" "Garrett Grolemund, Hadley Wickham"
## $ Theme : chr "Self Motivation" "Self-Confidence" "Programming"
## $ Publication_Year: chr "2015" "1996" "2017"
## $ ISBN : chr "978-0310343363" "978-0967179902" "978-1491910399"
#books.json = JSON.parse()
# Save the XML version of the book list next to the script, parse the saved
# file into an XML document, and flatten its records into a data frame.
download.file(
  url = "https://raw.githubusercontent.com/doradu8030/Data607/master/books.XML",
  destfile = "book_XML.xml"
)
book_XML.xml <- xmlParse(file = "book_XML.xml")
books.XMLDF <- xmlToDataFrame(doc = book_XML.xml)
# Show the column types of the result.
str(object = books.XMLDF)
## 'data.frame': 3 obs. of 5 variables:
## $ title : Factor w/ 3 levels "R for Data Science",..: 2 3 1
## $ author : Factor w/ 3 levels "Ben Carson MD,Cecil,Murphey (Contributor)",..: 1 2 3
## $ topic : Factor w/ 3 levels "Programming",..: 3 2 1
## $ publishedyear: Factor w/ 3 levels "1996","2015",..: 2 1 3
## $ ISBN : Factor w/ 3 levels "978-0310343363",..: 1 2 3
# Fetch the JSON version of the book list from GitHub as one string, then echo
# it so the raw text is visible before parsing.
books.json <- getURLContent(
  url = "https://raw.githubusercontent.com/doradu8030/Data607/master/books1.json"
)
writeLines(text = books.json)
## <U+FEFF>{"favorite-books":[
## {
## "title":"Think Big: Unleashing Your Potential for Excellence",
## "author":"Ben Carson M.D,Cecil Murphey-Contributor",
## "topic":"Personal Formula for success,Self Motivation,Hope for Good things and be honest",
## "published_year":"2015",
## "ISBN":978-0310343363
## },
## {
## "title":"You Can't Teach a Kid to Ride a Bike at a Seminar",
## "author":"David H.Sandler,John Hayes",
## "topic":"Self-Confidence,Higher achivement,buyer-seller dance",
## "published_year":"1996",
## "ISBN":978-0967179902
## },
## {
## "title":"R for Data Science",
## "author":"Garrett Grolemund, Hadley Wickham",
## "topic":"Clean data,Transforming data,Visualise",
## "published_year":"2017",
## "ISBN":978-1491910399
## }]
## }
##
Conclusion
After many attempts, I found that the HTML and XML files are identical and that they are less difficult to manipulate than JSON files. Even though I was not able to show the comparison between them, I was able to see in R, when running the chunks individually, that the structure of the JSON file is different from that of the XML and HTML files. I left in some of the commands to load and parse a JSON file because at first those commands were giving the expected result, but I could not figure out why, after a few attempts, they started giving many errors. For sure, I learned how to get data from HTML, XML and JSON files and the differences between their structures.
Also, I noticed that the columns' data type in JSON is 'chr', while in XML and HTML it is 'factor'.
# Turn the downloaded JSON text into a data frame.
# The text as downloaded is not parseable JSON, for two reasons visible in the
# printed file:
#   * it begins with a byte-order mark (printed as <U+FEFF>), and
#   * the ISBN values are bare tokens such as 978-0310343363, which is not a
#     valid JSON number.
# Strip the mark and wrap each ISBN in quotes before parsing.
json.txt <- sub("^\ufeff", "", books.json)
json.txt <- gsub(
  "(\"ISBN\"[[:space:]]*:[[:space:]]*)([0-9][0-9-]*)",
  "\\1\"\\2\"",
  json.txt
)
# rjson, jsonlite and RJSONIO are all attached and each exports fromJSON(), so
# which one a bare call reaches depends on attach order. Name the package so
# the parser used here is unambiguous.
json.Df <- as.data.frame(jsonlite::fromJSON(json.txt))