I am going to create a book data set that contains three books: two of the books have two authors and one has a single author. The `Line` column distinguishes the multiple records of the same book. For example, `Line` 1 indicates the main author and `Line` 2 the second author.
library(XML)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(RCurl)
##
## Attaching package: 'RCurl'
##
## The following object is masked from 'package:tidyr':
##
## complete
library(XML)
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(DT)
library(xml2)
library(methods)
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(htmlTable)
library(sjPlot)
## #refugeeswelcome
# Build the book-store data set: one row per (book, author) pair.
# `Line` numbers the authors of a book; 1 means main author.
book <- data.frame(
  Book_name = c(
    "Data Science for Business", "Data Science for Business",
    "The Miseducation of the Negro", "The Miseducation of the Negro",
    "Twilight"
  ),
  Author = c(
    "Tom Fawcett", "Foster Provost",
    "Carter Godwin Woodson", "H. Khalif Khalifah",
    "Stephenie Meyer"
  ),
  Line = c(1, 2, 1, 2, 1),
  ISB_Number = c(
    "9781449374280", "9781449374280",
    "9781564110411", "9781564110411",
    "9780316007443"
  ),
  Page_Counts = c(414, 414, 215, 215, 544),
  Published = c(
    "08/27/2023", "08/27/2023",
    "01/01/1992", "01/01/1992",
    "08/18/2007"
  )
)
book
## Book_name Author Line ISB_Number
## 1 Data Science for Business Tom Fawcett 1 9781449374280
## 2 Data Science for Business Foster Provost 2 9781449374280
## 3 The Miseducation of the Negro Carter Godwin Woodson 1 9781564110411
## 4 The Miseducation of the Negro H. Khalif Khalifah 2 9781564110411
## 5 Twilight Stephenie Meyer 1 9780316007443
## Page_Counts Published
## 1 414 08/27/2023
## 2 414 08/27/2023
## 3 215 01/01/1992
## 4 215 01/01/1992
## 5 544 08/18/2007
There are some steps required to build the XML file. First, I need to create an empty XML document and then iterate over the rows and variables of the book data set.
# CREATE XML FILE
# Start from an empty document whose root element is <book_store>.
book_doc <- newXMLDoc()
root <- newXMLNode("book_store", doc = book_doc)
# WRITE XML NODES AND DATA
# One <books> element per data-frame row. seq_len() keeps the loop from
# running when `book` has zero rows (1:nrow(book) would iterate over 1, 0).
for (i in seq_len(nrow(book))) {
  prodNode <- newXMLNode("books", parent = root)
  # APPEND TO PRODUCT NODE
  # One child element per column, named after the column.
  for (field in names(book)) {
    newXMLNode(field, book[[field]][i], parent = prodNode)
  }
}
# Trailing empty <views/> element placed directly under the root.
vwNode <- newXMLNode("views", parent = root)
print(book_doc)
## <?xml version="1.0"?>
## <book_store>
## <books>
## <Book_name>Data Science for Business</Book_name>
## <Author>Tom Fawcett</Author>
## <Line>1</Line>
## <ISB_Number>9781449374280</ISB_Number>
## <Page_Counts>414</Page_Counts>
## <Published>08/27/2023</Published>
## </books>
## <books>
## <Book_name>Data Science for Business</Book_name>
## <Author>Foster Provost</Author>
## <Line>2</Line>
## <ISB_Number>9781449374280</ISB_Number>
## <Page_Counts>414</Page_Counts>
## <Published>08/27/2023</Published>
## </books>
## <books>
## <Book_name>The Miseducation of the Negro</Book_name>
## <Author>Carter Godwin Woodson</Author>
## <Line>1</Line>
## <ISB_Number>9781564110411</ISB_Number>
## <Page_Counts>215</Page_Counts>
## <Published>01/01/1992</Published>
## </books>
## <books>
## <Book_name>The Miseducation of the Negro</Book_name>
## <Author>H. Khalif Khalifah</Author>
## <Line>2</Line>
## <ISB_Number>9781564110411</ISB_Number>
## <Page_Counts>215</Page_Counts>
## <Published>01/01/1992</Published>
## </books>
## <books>
## <Book_name>Twilight</Book_name>
## <Author>Stephenie Meyer</Author>
## <Line>1</Line>
## <ISB_Number>9780316007443</ISB_Number>
## <Page_Counts>544</Page_Counts>
## <Published>08/18/2007</Published>
## </books>
## <views/>
## </book_store>
##
# Write the XML document out to a file
saveXML(doc = book_doc, file = "Book_store_library.xml")
## [1] "Book_store_library.xml"
Now we are going to upload the file to GitHub, read it back, and transform it into a data frame.
# Fetch the XML file published on GitHub. Despite its name, `url` is what
# gets handed to the parser below, i.e. the result of the download.
url <- getURL(
  url = "https://raw.githubusercontent.com/joewarner89/CUNY-607/main/homeworks/Assignement%208/Book_store_library.xml"
)
# Parse the downloaded data into an XML document and display it
book_data <- xmlParse(file = url)
print(book_data)
## <?xml version="1.0"?>
## <book_store>
## <books>
## <Book_name>Data Science for Business</Book_name>
## <Author>Tom Fawcett</Author>
## <Line>1</Line>
## <ISB_Number>9781449374280</ISB_Number>
## <Page_Counts>414</Page_Counts>
## <Published>08/27/2023</Published>
## </books>
## <books>
## <Book_name>Data Science for Business</Book_name>
## <Author>Foster Provost</Author>
## <Line>2</Line>
## <ISB_Number>9781449374280</ISB_Number>
## <Page_Counts>414</Page_Counts>
## <Published>08/27/2023</Published>
## </books>
## <books>
## <Book_name>The Miseducation of the Negro</Book_name>
## <Author>Carter Godwin Woodson</Author>
## <Line>1</Line>
## <ISB_Number>9781564110411</ISB_Number>
## <Page_Counts>215</Page_Counts>
## <Published>01/01/1992</Published>
## </books>
## <books>
## <Book_name>The Miseducation of the Negro</Book_name>
## <Author>H. Khalif Khalifah</Author>
## <Line>2</Line>
## <ISB_Number>9781564110411</ISB_Number>
## <Page_Counts>215</Page_Counts>
## <Published>01/01/1992</Published>
## </books>
## <books>
## <Book_name>Twilights</Book_name>
## <Author>Stephenie Meyer</Author>
## <Line>1</Line>
## <ISB_Number>9780316007443</ISB_Number>
## <Page_Counts>544</Page_Counts>
## <Published>08/18/2007</Published>
## </books>
## <views/>
## </book_store>
##
rootnode <- xmlRoot(book_data)
# check the root size
rootsize <- xmlSize(rootnode)
# Convert only the <books> records. Passing the whole document also turns the
# empty <views/> element into a record, which shows up as an all-NA row.
book_frm <- xmlToDataFrame(nodes = getNodeSet(book_data, "//books"))
head(book_frm)
## Book_name Author Line ISB_Number
## 1 Data Science for Business Tom Fawcett 1 9781449374280
## 2 Data Science for Business Foster Provost 2 9781449374280
## 3 The Miseducation of the Negro Carter Godwin Woodson 1 9781564110411
## 4 The Miseducation of the Negro H. Khalif Khalifah 2 9781564110411
## 5 Twilights Stephenie Meyer 1 9780316007443
## 6 <NA> <NA> <NA> <NA>
## Page_Counts Published
## 1 414 08/27/2023
## 2 414 08/27/2023
## 3 215 01/01/1992
## 4 215 01/01/1992
## 5 544 08/18/2007
## 6 <NA> <NA>
The same table will be used to create the HTML table. The book data frame was designed to be reused for every type of file conversion.
# Preview the original data frame (at most the first six rows)
head(book, n = 6L)
## Book_name Author Line ISB_Number
## 1 Data Science for Business Tom Fawcett 1 9781449374280
## 2 Data Science for Business Foster Provost 2 9781449374280
## 3 The Miseducation of the Negro Carter Godwin Woodson 1 9781564110411
## 4 The Miseducation of the Negro H. Khalif Khalifah 2 9781564110411
## 5 Twilight Stephenie Meyer 1 9780316007443
## Page_Counts Published
## 1 414 08/27/2023
## 2 414 08/27/2023
## 3 215 01/01/1992
## 4 215 01/01/1992
## 5 544 08/18/2007
# Render the book data frame as an HTML table
# (sjPlot alternative, kept for reference)
#html_table <- tab_df(book,show.rownames = T,title = "My Favorite Books")
html_table <- htmlTable(book)
# Export the HTML markup to a file; sep = "" means no separator is appended
writeLines(text = html_table, con = "books_html.html", sep = "")
html_table
Book_name | Author | Line | ISB_Number | Page_Counts | Published | |
---|---|---|---|---|---|---|
1 | Data Science for Business | Tom Fawcett | 1 | 9781449374280 | 414 | 08/27/2023 |
2 | Data Science for Business | Foster Provost | 2 | 9781449374280 | 414 | 08/27/2023 |
3 | The Miseducation of the Negro | Carter Godwin Woodson | 1 | 9781564110411 | 215 | 01/01/1992 |
4 | The Miseducation of the Negro | H. Khalif Khalifah | 2 | 9781564110411 | 215 | 01/01/1992 |
5 | Twilight | Stephenie Meyer | 1 | 9780316007443 | 544 | 08/18/2007 |
Let's read the table from GitHub.
# Download the exported HTML table from GitHub.
url_html <- getURL('https://raw.githubusercontent.com/joewarner89/CUNY-607/main/homeworks/Assignement%208/books_html.html')
# NOTE(review): book_ht is not used anywhere in the visible code; confirm
# whether it is still needed.
book_ht <- htmlTable(url_html)
# Parse the HTML and keep the first table found in it. No `skip` argument is
# passed to read_html(): it is not one of that function's parameters.
# NOTE(review): the parsed ISB_Number column comes back numeric rather than
# text; confirm that this is acceptable for identifiers.
book_final_html <- url_html %>%
  read_html(encoding = "UTF-8") %>%
  html_table(header = TRUE, trim = TRUE) %>%
  .[[1]]
book_final_html
## # A tibble: 5 × 7
## `` Book_name Author Line ISB_Number Page_Counts Published
## <int> <chr> <chr> <int> <dbl> <int> <chr>
## 1 1 Data Science for Business Tom F… 1 9.78e12 414 08/27/20…
## 2 2 Data Science for Business Foste… 2 9.78e12 414 08/27/20…
## 3 3 The Miseducation of the N… Carte… 1 9.78e12 215 01/01/19…
## 4 4 The Miseducation of the N… H. Kh… 2 9.78e12 215 01/01/19…
## 5 5 Twilight Steph… 1 9.78e12 544 08/18/20…
I will use the same data set I created at the beginning of the project. I will transform the book data set into JSON format.
library(jsonlite)
# Serialize the data frame as an array of row arrays (values only, no names).
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
jsonlite::toJSON(
  x = book,
  dataframe = "values",
  pretty = TRUE
)
## [
## ["Data Science for Business", "Tom Fawcett", 1, "9781449374280", 414, "08/27/2023"],
## ["Data Science for Business", "Foster Provost", 2, "9781449374280", 414, "08/27/2023"],
## ["The Miseducation of the Negro", "Carter Godwin Woodson", 1, "9781564110411", 215, "01/01/1992"],
## ["The Miseducation of the Negro", "H. Khalif Khalifah", 2, "9781564110411", 215, "01/01/1992"],
## ["Twilight", "Stephenie Meyer", 1, "9780316007443", 544, "08/18/2007"]
## ]
# Serialize as a single object holding one array per column.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
jsonlite::toJSON(
  x = book,
  dataframe = "columns",
  pretty = TRUE
)
## {
## "Book_name": ["Data Science for Business", "Data Science for Business", "The Miseducation of the Negro", "The Miseducation of the Negro", "Twilight"],
## "Author": ["Tom Fawcett", "Foster Provost", "Carter Godwin Woodson", "H. Khalif Khalifah", "Stephenie Meyer"],
## "Line": [1, 2, 1, 2, 1],
## "ISB_Number": ["9781449374280", "9781449374280", "9781564110411", "9781564110411", "9780316007443"],
## "Page_Counts": [414, 414, 215, 215, 544],
## "Published": ["08/27/2023", "08/27/2023", "01/01/1992", "01/01/1992", "08/18/2007"]
## }
# Serialize as an array of objects, one named object per row, and keep the
# result in `book_json`.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
book_json <- jsonlite::toJSON(
  x = book,
  dataframe = "rows",
  pretty = TRUE
)
book_json
## [
## {
## "Book_name": "Data Science for Business",
## "Author": "Tom Fawcett",
## "Line": 1,
## "ISB_Number": "9781449374280",
## "Page_Counts": 414,
## "Published": "08/27/2023"
## },
## {
## "Book_name": "Data Science for Business",
## "Author": "Foster Provost",
## "Line": 2,
## "ISB_Number": "9781449374280",
## "Page_Counts": 414,
## "Published": "08/27/2023"
## },
## {
## "Book_name": "The Miseducation of the Negro",
## "Author": "Carter Godwin Woodson",
## "Line": 1,
## "ISB_Number": "9781564110411",
## "Page_Counts": 215,
## "Published": "01/01/1992"
## },
## {
## "Book_name": "The Miseducation of the Negro",
## "Author": "H. Khalif Khalifah",
## "Line": 2,
## "ISB_Number": "9781564110411",
## "Page_Counts": 215,
## "Published": "01/01/1992"
## },
## {
## "Book_name": "Twilight",
## "Author": "Stephenie Meyer",
## "Line": 1,
## "ISB_Number": "9780316007443",
## "Page_Counts": 544,
## "Published": "08/18/2007"
## }
## ]
# Write the JSON text to a local file; that file is then uploaded to GitHub
write_lines(x = book_json, file = "newBook.json")
I am going to read the JSON file from GitHub and turn it into a data frame.
# Read the JSON file straight from GitHub. The result is a nested list with
# one element per record. `auto_unbox` is not passed here: it is a toJSON()
# option and has no effect when reading.
sample_data <- jsonlite::read_json(
  "https://raw.githubusercontent.com/joewarner89/CUNY-607/main/homeworks/Assignement%208/newBook.json"
)
sample_data
## [[1]]
## [[1]]$Book_name
## [1] "Data Science for Business"
##
## [[1]]$Author
## [1] "Tom Fawcett"
##
## [[1]]$Line
## [1] 1
##
## [[1]]$ISB_Number
## [1] "9781449374280"
##
## [[1]]$Page_Counts
## [1] 414
##
## [[1]]$Published
## [1] "08/27/2023"
##
##
## [[2]]
## [[2]]$Book_name
## [1] "Data Science for Business"
##
## [[2]]$Author
## [1] "Foster Provost"
##
## [[2]]$Line
## [1] 2
##
## [[2]]$ISB_Number
## [1] "9781449374280"
##
## [[2]]$Page_Counts
## [1] 414
##
## [[2]]$Published
## [1] "08/27/2023"
##
##
## [[3]]
## [[3]]$Book_name
## [1] "The Miseducation of the Negro"
##
## [[3]]$Author
## [1] "Carter Godwin Woodson"
##
## [[3]]$Line
## [1] 1
##
## [[3]]$ISB_Number
## [1] "9781564110411"
##
## [[3]]$Page_Counts
## [1] 215
##
## [[3]]$Published
## [1] "01/01/1992"
##
##
## [[4]]
## [[4]]$Book_name
## [1] "The Miseducation of the Negro"
##
## [[4]]$Author
## [1] "H. Khalif Khalifah"
##
## [[4]]$Line
## [1] 2
##
## [[4]]$ISB_Number
## [1] "9781564110411"
##
## [[4]]$Page_Counts
## [1] 215
##
## [[4]]$Published
## [1] "01/01/1992"
##
##
## [[5]]
## [[5]]$Book_name
## [1] "Twilights"
##
## [[5]]$Author
## [1] "Stephenie Meyer"
##
## [[5]]$Line
## [1] 1
##
## [[5]]$ISB_Number
## [1] "9780316007443"
##
## [[5]]$Page_Counts
## [1] 544
##
## [[5]]$Published
## [1] "08/18/2007"
# Transform the JSON published on GitHub into a data frame. Reading the remote
# copy (rather than the local newBook.json) keeps this result consistent with
# `sample_data`, which is read from the same URL.
book_sample <- as.data.frame(
  jsonlite::fromJSON(
    "https://raw.githubusercontent.com/joewarner89/CUNY-607/main/homeworks/Assignement%208/newBook.json"
  )
)
book_sample
## Book_name Author Line ISB_Number
## 1 Data Science for Business Tom Fawcett 1 9781449374280
## 2 Data Science for Business Foster Provost 2 9781449374280
## 3 The Miseducation of the Negro Carter Godwin Woodson 1 9781564110411
## 4 The Miseducation of the Negro H. Khalif Khalifah 2 9781564110411
## 5 Twilight Stephenie Meyer 1 9780316007443
## Page_Counts Published
## 1 414 08/27/2023
## 2 414 08/27/2023
## 3 215 01/01/1992
## 4 215 01/01/1992
## 5 544 08/18/2007
This assignment required a series of steps to create the data set and transform it into three different formats: XML, HTML, and JSON. We used different packages to work with the data set. All files are available on GitHub.