Load Packages
# Global knitr chunk options for the whole document: hide warnings and
# messages, leave the source code untidied, and draw figures as PNG using
# the "cairo" device type.
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  # Spelled out as FALSE: the short alias F is an ordinary variable that can
  # be reassigned.
  tidy = FALSE,
  # comment = "",
  dev = "png",
  dev.args = list(type = "cairo")
)
library(XML)
library(rjson)
library(RCurl)
## Loading required package: bitops
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 3.4.2
##
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:rjson':
##
## fromJSON, toJSON
library(RJSONIO)
##
## Attaching package: 'RJSONIO'
## The following objects are masked from 'package:jsonlite':
##
## fromJSON, toJSON
## The following objects are masked from 'package:rjson':
##
## fromJSON, toJSON
suppressMessages(library(XML))
suppressWarnings(library(XML))
suppressMessages(library(rjson))
suppressMessages(library(jsonlite))
suppressWarnings(library(jsonlite))
suppressPackageStartupMessages(library(jsonlite))
suppressMessages(library(RJSONIO))
suppressWarnings(library(RJSONIO))
Reading an HTML file
# Read the raw HTML page as a character vector, one element per line.
books.html <- readLines("https://raw.githubusercontent.com/doradu8030/Data607/master/books.html")
# Pattern for a table cell written on a single line: "<td>...</td>".
# NOTE(review): inside the brackets, `\\)-\\'` looks like it is read as a
# character range rather than as three separate literals, which would explain
# why cells containing ":" still match. Confirm before tightening this class.
html.pattern <- "<td>([0-9A-Za-z\\., \\(\\)-\\']*)</td>"
# Tidying the file: keep only the lines that contain a cell, then strip the
# tags and the surrounding whitespace. The pattern is defined once and reused
# so the filter and the extraction cannot drift apart.
b.content <- grep(html.pattern, books.html, value = TRUE)
b.info <- trimws(sub(html.pattern, "\\1", b.content))
b.info
## [1] "Book Title"
## [2] "Author(s)"
## [3] "Topic"
## [4] "Year Publisher"
## [5] "ISBN"
## [6] "Think Big: Unleashing Your Potential for Excellence"
## [7] "Ben Carson M.D,Cecil Murphey (Contributor)"
## [8] "Personal Formula for success,Self Motivation,Hope for Good things and be honest"
## [9] "2015"
## [10] "978-0310343363"
## [11] "You Can't Teach a Kid to Ride a Bike at a Seminar"
## [12] "David H.Sandler,John Hayes"
## [13] "Self-Confidence,Higher achivement,buyer-seller dance"
## [14] "1996"
## [15] "978-0967179902"
## [16] "R for Data Science"
## [17] "Garrett Grolemund, Hadley Wickham"
## [18] "Clean data,Transforming data,Visualise"
## [19] "2017"
## [20] "978-1491910399"
Convert to a data frame
# Reshape the flat vector of cell values into one column per field.
# Cells arrive row by row, so every n_fields-th value belongs to the same
# column.
n_fields <- 5L
# Fail early if a cell is missing or extra: otherwise every later value would
# silently land in the wrong column.
stopifnot(length(b.info) > 0, length(b.info) %% n_fields == 0)
field_index <- rep(seq_len(n_fields), times = length(b.info) / n_fields)
b.infoDF <- cbind.data.frame(split(b.info, field_index), stringsAsFactors = FALSE)
names(b.infoDF) <- c("Title", "Author", "Topic", "Publication_Year", "ISBN")
# The first row holds the table's own header cells, not a book.
b.infoDF <- b.infoDF[-1, ]
# HTML contents as a data frame
b.infoDF
## Title
## 2 Think Big: Unleashing Your Potential for Excellence
## 3 You Can't Teach a Kid to Ride a Bike at a Seminar
## 4 R for Data Science
## Author
## 2 Ben Carson M.D,Cecil Murphey (Contributor)
## 3 David H.Sandler,John Hayes
## 4 Garrett Grolemund, Hadley Wickham
## Topic
## 2 Personal Formula for success,Self Motivation,Hope for Good things and be honest
## 3 Self-Confidence,Higher achivement,buyer-seller dance
## 4 Clean data,Transforming data,Visualise
## Publication_Year ISBN
## 2 2015 978-0310343363
## 3 1996 978-0967179902
## 4 2017 978-1491910399
# Inspect the structure of the result; every column, including the
# publication year, is stored as character.
str(b.infoDF)
## 'data.frame': 3 obs. of 5 variables:
## $ Title : chr "Think Big: Unleashing Your Potential for Excellence" "You Can't Teach a Kid to Ride a Bike at a Seminar" "R for Data Science"
## $ Author : chr "Ben Carson M.D,Cecil Murphey (Contributor)" "David H.Sandler,John Hayes" "Garrett Grolemund, Hadley Wickham"
## $ Topic : chr "Personal Formula for success,Self Motivation,Hope for Good things and be honest" "Self-Confidence,Higher achivement,buyer-seller dance" "Clean data,Transforming data,Visualise"
## $ Publication_Year: chr "2015" "1996" "2017"
## $ ISBN : chr "978-0310343363" "978-0967179902" "978-1491910399"
Reading an XML file
# Save the XML copy of the book list to a local file, parse that file, and
# convert the parsed document into a data frame.
xml_url <- "https://raw.githubusercontent.com/doradu8030/Data607/master/books.XML"
xml_path <- "book_XML.xml"
download.file(url = xml_url, destfile = xml_path)
book_XML.xml <- xmlParse(file = xml_path)
books.XMLDF <- xmlToDataFrame(doc = book_XML.xml)
# XML contents as a data frame
books.XMLDF
## title
## 1 Think Big: Unleashing Your Potential for Excellence
## 2 You Can't Teach a Kid to Ride a Bike at a Seminar
## 3 R for Data Science
## author
## 1 Ben Carson M.D,Cecil Murphey (Contributor)
## 2 David H.Sandler,John Hayes
## 3 Garrett Grolemund, Hadley Wickham
## topic
## 1 Personal Formula for success,Self Motivation,Hope for Good things and be honest
## 2 Self-Confidence,Higher achivement,buyer-seller dance
## 3 Clean data,Transforming data,Visualise
## published_year ISBN
## 1 2015 978-0310343363
## 2 1996 978-0967179902
## 3 2017 978-1491910399
#str(books.XMLDF)
Reading a JSON file
# Parse the JSON copy of the book list straight from its URL.
# rjson, jsonlite and RJSONIO are all attached and each one exports
# fromJSON(); RJSONIO is attached last, so it is the one an unqualified call
# resolves to. Qualifying the call keeps that behaviour without depending on
# the order in which the packages happen to be loaded.
url.json <- "https://raw.githubusercontent.com/doradu8030/Data607/master/books3.json"
json.Df <- RJSONIO::fromJSON(url.json)
# Despite its name, json.Df is a nested list, not a data frame.
json.Df
## [[1]]
## [[1]]$title
## [1] "Think Big: Unleashing Your Potential for Excellence"
##
## [[1]]$authors
## [[1]]$authors[[1]]
## named list()
##
## [[1]]$authors[[2]]
## author
## "Cecil Murphey (Contributor)"
##
##
## [[1]]$topic
## [1] "Self Motivation"
##
## [[1]]$published_year
## [1] "2015"
##
## [[1]]$ISBN
## [1] "978-0310343363"
##
##
## [[2]]
## [[2]]$title
## [1] "You Can't Teach a Kid to Ride a Bike at a Seminar"
##
## [[2]]$authors
## [[2]]$authors[[1]]
## author:
## "David H Sandler"
##
## [[2]]$authors[[2]]
## author
## "John Hayes"
##
##
## [[2]]$topic
## [1] "Self-Confidence"
##
## [[2]]$published_year
## [1] "1996"
##
## [[2]]$ISBN
## [1] "978-0967179902"
##
##
## [[3]]
## NULL
# kable() renders tabular input such as a data frame or matrix; json.Df is a
# nested list, so it is flattened to one row per book before rendering.

# Collapse a possibly nested or multi-valued field into a single string;
# returns NA when the field is absent or empty.
flatten_field <- function(value) {
  parts <- unlist(value, use.names = FALSE)
  if (length(parts) == 0) {
    return(NA_character_)
  }
  paste(parts, collapse = ", ")
}

# Look a field up by name, yielding NULL instead of an error when a book
# does not carry it.
book_field <- function(book, field) {
  if (field %in% names(book)) book[[field]] else NULL
}

# Skip empty (NULL) entries in the parsed list.
json_books <- Filter(Negate(is.null), json.Df)

# Output column name -> key used in the JSON objects. The column names match
# those of the HTML-derived data frame.
json_fields <- c(
  Title = "title",
  Author = "authors",
  Topic = "topic",
  Publication_Year = "published_year",
  ISBN = "ISBN"
)

json_table <- as.data.frame(
  lapply(json_fields, function(field) {
    vapply(
      json_books,
      function(book) flatten_field(book_field(book, field)),
      character(1)
    )
  }),
  stringsAsFactors = FALSE
)
knitr::kable(json_table)
| Think Big: Unleashing Your Potential for Excellence |
|
|
|
| author |
Cecil Murphey (Contributor) |
|
|
|
|
|
|
| You Can’t Teach a Kid to Ride a Bike at a Seminar |
|
|
|
|
|
|
|
#names(bs) = c("Title", "Author", "Topic", "Published Year","ISBN-13")
```