Week 7 assignment data 607

Introduction:

A matrix of books was created and then saved in three file formats: XML, JSON, and HTML. The files were uploaded to GitHub, then read back in and converted into data frames.

library(htmlTable)

## Warning: package 'htmlTable' was built under R version 4.3.3

library(magrittr)
library(jsonlite)
library(tableHTML)

## Warning: package 'tableHTML' was built under R version 4.3.3

library(rjson)

## 
## Attaching package: 'rjson'

## The following objects are masked from 'package:jsonlite':
## 
##     fromJSON, toJSON

library(XML)
library(xml2)

## Warning: package 'xml2' was built under R version 4.3.3

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ purrr::flatten()   masks jsonlite::flatten()
## ✖ rjson::fromJSON()  masks jsonlite::fromJSON()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ rjson::toJSON()    masks jsonlite::toJSON()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(RCurl)

## 
## Attaching package: 'RCurl'
## 
## The following object is masked from 'package:tidyr':
## 
##     complete

library(RJSONIO)

## 
## Attaching package: 'RJSONIO'
## 
## The following objects are masked from 'package:rjson':
## 
##     fromJSON, toJSON
## 
## The following objects are masked from 'package:jsonlite':
## 
##     fromJSON, toJSON

library(knitr)

## Warning: package 'knitr' was built under R version 4.3.3

# Build a 3 x 5 character matrix describing three physics textbooks.
# matrix() fills column by column, so the values are listed one column at a
# time: all titles, then all authors, editions, ISBN-10s and publishers.
bookmatrix <- matrix(
  c(
    # Title
    "Modern Physics for Scientists and Engineers",
    "Computational Physics",
    "Modern Quantum Mechanics",
    # Authors
    "Stephen T. Thornton, Andrew Rex, Carol E. Hood",
    "Nicholas Giordano, Hisao Nakanishi",
    "J. J. Sakurai, Jim Napolitano",
    # Edition
    "5th", "2nd", "3rd",
    # ISBN10 -- stored as strings so that leading zeros are preserved. The
    # second value was previously "131469908" (9 characters, leading zero
    # dropped); "0131469908" satisfies the ISBN-10 check digit.
    # NOTE: files generated from the earlier matrix still carry the 9-digit
    # value until they are regenerated.
    "1337919454", "0131469908", "1108473229",
    # Publisher
    "Cengage Learning", "Pearson", "Cambridge University Press"
  ),
  ncol = 5,
  dimnames = list(
    c("Book 1", "Book 2", "Book 3"),
    c("Title", "Authors", "Edition", "ISBN10", "Publisher")
  )
)

# Render the matrix as an HTML table and write it to books.html in the
# working directory.
write_tableHTML(tableHTML(bookmatrix), file = "books.html")

Data creation and HTML file:

A matrix was constructed with information about three books. The data consist of the title, authors, edition, ISBN10, and publisher of each book.

# Convert the book matrix to a data frame before encoding it as JSON.
bookdf <- data.frame(bookmatrix)

# jsonlite, rjson and RJSONIO all export toJSON(); which one a bare call
# reaches depends on the order of the library() calls. RJSONIO is attached
# last and is therefore the one in effect, so it is named explicitly here.
exportJson <- RJSONIO::toJSON(bookdf)
# view(exportJson) can be run interactively to check the encoded string.

# Write the JSON string to books.json in the working directory.
write(exportJson, "books.json")
# Note: save(exportJson, file = "books.json") is NOT an alternative -- save()
# writes R's own serialized object format, not JSON text.

JSON file:

The data were stored in a data frame before being converted to JSON. The JSON text was then saved to a file.

# Start an empty XML document (not an HTML one) with <table> as its root node.
doc_xml <- newXMLDoc(isHTML = FALSE)
table_node <- newXMLNode("table", doc = doc_xml)

# Build one <row> node per book. Inside the callback each book arrives as a
# named character vector, so every field name becomes a child element whose
# content is that field's value.
row_data <- apply(bookdf, 1, function(book) {
  field_nodes <- lapply(names(book), function(field) {
    newXMLNode(field, book[field])
  })
  addChildren(newXMLNode("row"), field_nodes)
})

# Attach the row nodes underneath the <table> node.
xmlParent(row_data) <- table_node

# Serialize the document to books.xml in the working directory.
saveXML(doc_xml, file = "books.xml")

## [1] "books.xml"

XML file:

Creating an XML file in R is significantly more complex than creating the other formats. A node has to be created for each row, and the row nodes then have to be attached to the table node of the declared XML document.

# Download the three published files from GitHub. Despite the *url names,
# each variable holds the downloaded file CONTENT as a string, not a URL.
# TLS certificate verification is left enabled for all three requests; the
# XML request previously passed ssl.verifypeer = FALSE, which disabled it
# for that one download only.
jsonurl <- getURL("https://raw.githubusercontent.com/division-zero/Data607/main/Week%20%207%20assignment/books.JSON")
htmlurl <- getURL("https://raw.githubusercontent.com/division-zero/Data607/main/Week%20%207%20assignment/books.html")
# The XML file is fetched from a specific commit rather than from main.
xmlurl <- getURL("https://raw.githubusercontent.com/division-zero/Data607/ce284f604b2abaa58a05c21ab2ec079626c8c677/Week%20%207%20assignment/books.xml")

# HTML: readHTMLTable() returns a list of tables; wrapping that list in
# data.frame() prefixes every column name with the list element's name,
# which is why the columns come out as "NULL.Title", "NULL.Authors", ...
htmldf <- data.frame(readHTMLTable(htmlurl, as.data.frame = TRUE))

# JSON: jsonlite, rjson and RJSONIO all export fromJSON(); RJSONIO is
# attached last and is the one in effect, so it is named explicitly.
json_data <- RJSONIO::fromJSON(jsonurl)

# Flatten the parsed list and refill it column by column, one column per
# top-level JSON element. unlist() + matrix() discards the element names, so
# the resulting columns are called X1, X2, ...
json_dataframe <- data.frame(
  matrix(unlist(json_data), ncol = length(json_data), byrow = FALSE),
  stringsAsFactors = FALSE
)

# XML: parse the downloaded text and turn each <row> into a data frame row.
xmldoc <- xmlParse(xmlurl)
xmldf <- xmlToDataFrame(doc = xmldoc)

# Show the first rows of the data frame parsed from the XML file.
head(xmldf)

##                                         Title
## 1 Modern Physics for Scientists and Engineers
## 2                       Computational Physics
## 3                    Modern Quantum Mechanics
##                                          Authors Edition     ISBN10
## 1 Stephen T. Thornton, Andrew Rex, Carol E. Hood     5th 1337919454
## 2             Nicholas Giordano, Hisao Nakanishi     2nd  131469908
## 3                  J. J. Sakurai, Jim Napolitano     3rd 1108473229
##                    Publisher
## 1           Cengage Learning
## 2                    Pearson
## 3 Cambridge University Press

# Show each column's type and leading values for the XML-derived data frame.
glimpse(xmldf)

## Rows: 3
## Columns: 5
## $ Title     <chr> "Modern Physics for Scientists and Engineers", "Computationa…
## $ Authors   <chr> "Stephen T. Thornton, Andrew Rex, Carol E. Hood", "Nicholas …
## $ Edition   <chr> "5th", "2nd", "3rd"
## $ ISBN10    <chr> "1337919454", "131469908", "1108473229"
## $ Publisher <chr> "Cengage Learning", "Pearson", "Cambridge University Press"

# Show the first rows of the data frame rebuilt from the JSON file.
head(json_dataframe)

##                                            X1
## 1 Modern Physics for Scientists and Engineers
## 2                       Computational Physics
## 3                    Modern Quantum Mechanics
##                                               X2  X3         X4
## 1 Stephen T. Thornton, Andrew Rex, Carol E. Hood 5th 1337919454
## 2             Nicholas Giordano, Hisao Nakanishi 2nd  131469908
## 3                  J. J. Sakurai, Jim Napolitano 3rd 1108473229
##                           X5
## 1           Cengage Learning
## 2                    Pearson
## 3 Cambridge University Press

# Show each column's type and leading values for the JSON-derived data frame.
glimpse(json_dataframe)

## Rows: 3
## Columns: 5
## $ X1 <chr> "Modern Physics for Scientists and Engineers", "Computational Physi…
## $ X2 <chr> "Stephen T. Thornton, Andrew Rex, Carol E. Hood", "Nicholas Giordan…
## $ X3 <chr> "5th", "2nd", "3rd"
## $ X4 <chr> "1337919454", "131469908", "1108473229"
## $ X5 <chr> "Cengage Learning", "Pearson", "Cambridge University Press"

# Show the first rows of the data frame read from the HTML table.
head(htmldf)

##   NULL..                                  NULL.Title
## 1 Book 1 Modern Physics for Scientists and Engineers
## 2 Book 2                       Computational Physics
## 3 Book 3                    Modern Quantum Mechanics
##                                     NULL.Authors NULL.Edition NULL.ISBN10
## 1 Stephen T. Thornton, Andrew Rex, Carol E. Hood          5th  1337919454
## 2             Nicholas Giordano, Hisao Nakanishi          2nd   131469908
## 3                  J. J. Sakurai, Jim Napolitano          3rd  1108473229
##               NULL.Publisher
## 1           Cengage Learning
## 2                    Pearson
## 3 Cambridge University Press

# Show each column's type and leading values for the HTML-derived data frame.
glimpse(htmldf)

## Rows: 3
## Columns: 6
## $ NULL..         <chr> "Book 1", "Book 2", "Book 3"
## $ NULL.Title     <chr> "Modern Physics for Scientists and Engineers", "Computa…
## $ NULL.Authors   <chr> "Stephen T. Thornton, Andrew Rex, Carol E. Hood", "Nich…
## $ NULL.Edition   <chr> "5th", "2nd", "3rd"
## $ NULL.ISBN10    <chr> "1337919454", "131469908", "1108473229"
## $ NULL.Publisher <chr> "Cengage Learning", "Pearson", "Cambridge University Pr…

Conclusion/data frames:

The data files were uploaded to GitHub and then each was read in using getURL(). The HTML data frame has an extra column holding the row names (“Book 1”, “Book 2”, “Book 3”). The JSON data frame did not keep the column names, and the JSON data required some extra work to arrange into the correct columns and rows. The XML-derived data frame is the closest to the original data frame that was built from the matrix of book data. Because of the way each file was created and read in, the three resulting data frames differ slightly but contain the same information.

Week 7 assignment data 607

Keith DeNivo

2024-03-10