library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
I chose to tidy up the data before creating the data files, thinking it would make them easier to read.
# Read the CSV file and reshape it into tidy (long) form
raw <- "https://raw.githubusercontent.com/amily52131/DATA607/refs/heads/main/Assignment_7/data.csv"
products <- read.csv(raw)
# Replace "." in column names with "_"
names(products) <- gsub("\\.", "_", names(products))
# Each CSV row carries two "type: detail" variations: one in
# Variation_Details and one in the extra column X. Build two tables with
# identical column names, one per variation.
products_1 <- products %>%
  select(-X)
products_2 <- products %>%
  select(-Variation_Details) %>%
  rename(Variation_Details = X)
# Stack both tables and split "type: detail" on the colon.
# - extra = "merge" keeps any text after a second colon in the detail column
#   instead of dropping it with a warning.
# - str_trim() removes the whitespace around the split pieces; without it
#   values such as " Storage" keep a leading space.
products <- bind_rows(products_1, products_2) %>%
  separate(
    Variation_Details,
    into = c("Variation_Type", "Variation_Detail"),
    sep = ":",
    extra = "merge"
  ) %>%
  mutate(across(c(Variation_Type, Variation_Detail), str_trim)) %>%
  arrange(Variation_ID)
head(products)
## Category Item_Name Item_ID Brand Price Variation_ID Variation_Type
## 1 Electronics Smartphone 101 TechBrand 699.99 101-A Color
## 2 Electronics Smartphone 101 TechBrand 699.99 101-A Storage
## 3 Electronics Smartphone 101 TechBrand 699.99 101-B Color
## 4 Electronics Smartphone 101 TechBrand 699.99 101-B Storage
## 5 Electronics Laptop 102 CompuBrand 1099.99 102-A Color
## 6 Electronics Laptop 102 CompuBrand 1099.99 102-A Storage
## Variation_Detail
## 1 Black
## 2 64GB
## 3 White
## 4 128GB
## 5 Silver
## 6 256GB
# Create HTML file with the content as a table
library(htmltools)
# Create HTML content: one table with a header row and one body row per row
# of `products`.
# Header label shown in the table -> column of `products` that fills it.
html_columns <- c(
  "Category" = "Category",
  "Item Name" = "Item_Name",
  "Item ID" = "Item_ID",
  "Brand" = "Brand",
  "Price" = "Price",
  "Variation ID" = "Variation_ID",
  "Variation Type" = "Variation_Type",
  "Variation Details" = "Variation_Detail"
)
html_content <- tags$html(
  tags$body(
    tags$table(
      tags$thead(
        tags$tr(lapply(names(html_columns), tags$th))
      ),
      tags$tbody(
        # seq_len() produces no rows for an empty data frame, whereas
        # 1:nrow() would iterate over c(1, 0).
        lapply(seq_len(nrow(products)), function(i) {
          tags$tr(
            # unname() so the cells are passed as children, never as
            # named tag attributes.
            lapply(unname(html_columns), function(column) {
              tags$td(products[[column]][i])
            })
          )
        })
      )
    )
  )
)
# Save the HTML file
save_html(html_content, "product_data_report.html")
# Create JSON file with the content from the data frame created from csv
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
# Serialize the data frame as pretty-printed JSON text
json_data <- products %>%
  toJSON(pretty = TRUE)
# Write the JSON text to disk
json_data %>%
  write(file = "product_data.json")
# Create XML file with the content from the data frame created from csv
library(XML)
# Create the XML document with a single <Products> root node
xml_doc <- newXMLDoc()
root <- newXMLNode("Products", doc = xml_doc)
# XML element name -> column of `products` that supplies its text.
xml_fields <- c(
  Category = "Category",
  Item_Name = "Item_Name",
  Item_ID = "Item_ID",
  Brand = "Brand",
  Price = "Price",
  Variation_ID = "Variation_ID",
  Variation_Type = "Variation_Type",
  Variation_Details = "Variation_Detail"
)
# Add one <Product> entry per data frame row. seq_len() skips the loop for an
# empty data frame, whereas 1:nrow() would iterate over c(1, 0).
for (i in seq_len(nrow(products))) {
  product_node <- newXMLNode("Product", parent = root)
  for (tag in names(xml_fields)) {
    newXMLNode(tag, products[[xml_fields[[tag]]]][i], parent = product_node)
  }
}
# Save the XML file
saveXML(xml_doc, file = "product_data.xml")
## [1] "product_data.xml"
# Create Parquet file with the content from the data frame created from csv
library(arrow)
##
## Attaching package: 'arrow'
## The following object is masked from 'package:lubridate':
##
## duration
## The following object is masked from 'package:utils':
##
## timestamp
# Persist the tidy data frame as a Parquet file
write_parquet(products, sink = "product_data.parquet")
# Reading HTML file
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
# Download and parse the HTML report
raw <- "https://raw.githubusercontent.com/amily52131/DATA607/refs/heads/main/Assignment_7/product_data_report.html"
html_file <- read_html(raw)
# Locate the first <table> element and convert it to a data frame.
# html_element() is the current rvest name for the superseded html_node().
product_html <- html_file %>%
  html_element("table") %>%
  html_table()
print(product_html)
## # A tibble: 40 × 8
## Category `Item Name` `Item ID` Brand Price `Variation ID` `Variation Type`
## <chr> <chr> <int> <chr> <dbl> <chr> <chr>
## 1 Electronics Smartphone 101 Tech… 700. 101-A Color
## 2 Electronics Smartphone 101 Tech… 700. 101-A Storage
## 3 Electronics Smartphone 101 Tech… 700. 101-B Color
## 4 Electronics Smartphone 101 Tech… 700. 101-B Storage
## 5 Electronics Laptop 102 Comp… 1100. 102-A Color
## 6 Electronics Laptop 102 Comp… 1100. 102-A Storage
## 7 Electronics Laptop 102 Comp… 1100. 102-B Color
## 8 Electronics Laptop 102 Comp… 1100. 102-B Storage
## 9 Home Appli… Refrigerat… 201 Home… 900. 201-A Color
## 10 Home Appli… Refrigerat… 201 Home… 900. 201-A Capacity
## # ℹ 30 more rows
## # ℹ 1 more variable: `Variation Details` <chr>
# Reading JSON file
library(jsonlite)
# Fetch the JSON file and parse it into a data frame
raw <- "https://raw.githubusercontent.com/amily52131/DATA607/refs/heads/main/Assignment_7/product_data.json"
products_json <- raw %>%
  fromJSON()
print(products_json)
## Category Item_Name Item_ID Brand Price Variation_ID
## 1 Electronics Smartphone 101 TechBrand 699.99 101-A
## 2 Electronics Smartphone 101 TechBrand 699.99 101-A
## 3 Electronics Smartphone 101 TechBrand 699.99 101-B
## 4 Electronics Smartphone 101 TechBrand 699.99 101-B
## 5 Electronics Laptop 102 CompuBrand 1099.99 102-A
## 6 Electronics Laptop 102 CompuBrand 1099.99 102-A
## 7 Electronics Laptop 102 CompuBrand 1099.99 102-B
## 8 Electronics Laptop 102 CompuBrand 1099.99 102-B
## 9 Home Appliances Refrigerator 201 HomeCool 899.99 201-A
## 10 Home Appliances Refrigerator 201 HomeCool 899.99 201-A
## 11 Home Appliances Refrigerator 201 HomeCool 899.99 201-B
## 12 Home Appliances Refrigerator 201 HomeCool 899.99 201-B
## 13 Home Appliances Washing Machine 202 CleanTech 499.99 202-A
## 14 Home Appliances Washing Machine 202 CleanTech 499.99 202-A
## 15 Home Appliances Washing Machine 202 CleanTech 499.99 202-B
## 16 Home Appliances Washing Machine 202 CleanTech 499.99 202-B
## 17 Clothing T-Shirt 301 FashionCo 19.99 301-A
## 18 Clothing T-Shirt 301 FashionCo 19.99 301-A
## 19 Clothing T-Shirt 301 FashionCo 19.99 301-B
## 20 Clothing T-Shirt 301 FashionCo 19.99 301-B
## 21 Clothing T-Shirt 301 FashionCo 19.99 301-C
## 22 Clothing T-Shirt 301 FashionCo 19.99 301-C
## 23 Clothing Jeans 302 DenimWorks 49.99 302-A
## 24 Clothing Jeans 302 DenimWorks 49.99 302-A
## 25 Clothing Jeans 302 DenimWorks 49.99 302-B
## 26 Clothing Jeans 302 DenimWorks 49.99 302-B
## 27 Books Fiction Novel 401 - 14.99 401-A
## 28 Books Fiction Novel 401 - 14.99 401-A
## 29 Books Fiction Novel 401 - 14.99 401-B
## 30 Books Fiction Novel 401 - 14.99 401-B
## 31 Books Non-Fiction Guide 402 - 24.99 402-A
## 32 Books Non-Fiction Guide 402 - 24.99 402-A
## 33 Books Non-Fiction Guide 402 - 24.99 402-B
## 34 Books Non-Fiction Guide 402 - 24.99 402-B
## 35 Sports Equipment Basketball 501 SportsGear 29.99 501-A
## 36 Sports Equipment Basketball 501 SportsGear 29.99 501-A
## 37 Sports Equipment Tennis Racket 502 RacketPro 89.99 502-A
## 38 Sports Equipment Tennis Racket 502 RacketPro 89.99 502-A
## 39 Sports Equipment Tennis Racket 502 RacketPro 89.99 502-B
## 40 Sports Equipment Tennis Racket 502 RacketPro 89.99 502-B
## Variation_Type Variation_Detail
## 1 Color Black
## 2 Storage 64GB
## 3 Color White
## 4 Storage 128GB
## 5 Color Silver
## 6 Storage 256GB
## 7 Color Space Gray
## 8 Storage 512GB
## 9 Color Stainless Steel
## 10 Capacity 20 cu ft
## 11 Color White
## 12 Capacity 18 cu ft
## 13 Type Front Load
## 14 Capacity 4.5 cu ft
## 15 Type Top Load
## 16 Capacity 5.0 cu ft
## 17 Color Blue
## 18 Size S
## 19 Color Red
## 20 Size M
## 21 Color Green
## 22 Size L
## 23 Color Dark Blue
## 24 Size 32
## 25 Color Light Blue
## 26 Size 34
## 27 Format Hardcover
## 28 Language English
## 29 Format Paperback
## 30 Language Spanish
## 31 Format eBook
## 32 Language English
## 33 Format Paperback
## 34 Language French
## 35 Size Size 7
## 36 Color Orange
## 37 Material Graphite
## 38 Color Black
## 39 Material Aluminum
## 40 Color Silver
# reading XML file
library(xml2)
# Download and parse the XML file
raw <- "https://raw.githubusercontent.com/amily52131/DATA607/refs/heads/main/Assignment_7/product_data.xml"
xml_file <- read_xml(raw)
# Extract every <Product> element. The node set gets its own name so that the
# `products` data frame is not overwritten.
product_nodes <- xml_find_all(xml_file, ".//Product")
# Build the data frame one column per child element. xml_find_first() returns
# exactly one result per product (NA text when the child is absent), so all
# columns have the same length and stay aligned row by row.
product_xml <- data.frame(
  Category = xml_text(xml_find_first(product_nodes, "Category")),
  Item_Name = xml_text(xml_find_first(product_nodes, "Item_Name")),
  Item_ID = xml_text(xml_find_first(product_nodes, "Item_ID")),
  Brand = xml_text(xml_find_first(product_nodes, "Brand")),
  # XML text is always character; Price is converted back to numeric
  Price = as.numeric(xml_text(xml_find_first(product_nodes, "Price"))),
  Variation_ID = xml_text(xml_find_first(product_nodes, "Variation_ID")),
  Variation_Type = xml_text(xml_find_first(product_nodes, "Variation_Type")),
  Variation_Details = xml_text(
    xml_find_first(product_nodes, "Variation_Details")
  ),
  stringsAsFactors = FALSE
)
# View the data frame
print(product_xml)
## Category Item_Name Item_ID Brand Price Variation_ID
## 1 Electronics Smartphone 101 TechBrand 699.99 101-A
## 2 Electronics Smartphone 101 TechBrand 699.99 101-A
## 3 Electronics Smartphone 101 TechBrand 699.99 101-B
## 4 Electronics Smartphone 101 TechBrand 699.99 101-B
## 5 Electronics Laptop 102 CompuBrand 1099.99 102-A
## 6 Electronics Laptop 102 CompuBrand 1099.99 102-A
## 7 Electronics Laptop 102 CompuBrand 1099.99 102-B
## 8 Electronics Laptop 102 CompuBrand 1099.99 102-B
## 9 Home Appliances Refrigerator 201 HomeCool 899.99 201-A
## 10 Home Appliances Refrigerator 201 HomeCool 899.99 201-A
## 11 Home Appliances Refrigerator 201 HomeCool 899.99 201-B
## 12 Home Appliances Refrigerator 201 HomeCool 899.99 201-B
## 13 Home Appliances Washing Machine 202 CleanTech 499.99 202-A
## 14 Home Appliances Washing Machine 202 CleanTech 499.99 202-A
## 15 Home Appliances Washing Machine 202 CleanTech 499.99 202-B
## 16 Home Appliances Washing Machine 202 CleanTech 499.99 202-B
## 17 Clothing T-Shirt 301 FashionCo 19.99 301-A
## 18 Clothing T-Shirt 301 FashionCo 19.99 301-A
## 19 Clothing T-Shirt 301 FashionCo 19.99 301-B
## 20 Clothing T-Shirt 301 FashionCo 19.99 301-B
## 21 Clothing T-Shirt 301 FashionCo 19.99 301-C
## 22 Clothing T-Shirt 301 FashionCo 19.99 301-C
## 23 Clothing Jeans 302 DenimWorks 49.99 302-A
## 24 Clothing Jeans 302 DenimWorks 49.99 302-A
## 25 Clothing Jeans 302 DenimWorks 49.99 302-B
## 26 Clothing Jeans 302 DenimWorks 49.99 302-B
## 27 Books Fiction Novel 401 - 14.99 401-A
## 28 Books Fiction Novel 401 - 14.99 401-A
## 29 Books Fiction Novel 401 - 14.99 401-B
## 30 Books Fiction Novel 401 - 14.99 401-B
## 31 Books Non-Fiction Guide 402 - 24.99 402-A
## 32 Books Non-Fiction Guide 402 - 24.99 402-A
## 33 Books Non-Fiction Guide 402 - 24.99 402-B
## 34 Books Non-Fiction Guide 402 - 24.99 402-B
## 35 Sports Equipment Basketball 501 SportsGear 29.99 501-A
## 36 Sports Equipment Basketball 501 SportsGear 29.99 501-A
## 37 Sports Equipment Tennis Racket 502 RacketPro 89.99 502-A
## 38 Sports Equipment Tennis Racket 502 RacketPro 89.99 502-A
## 39 Sports Equipment Tennis Racket 502 RacketPro 89.99 502-B
## 40 Sports Equipment Tennis Racket 502 RacketPro 89.99 502-B
## Variation_Type Variation_Details
## 1 Color Black
## 2 Storage 64GB
## 3 Color White
## 4 Storage 128GB
## 5 Color Silver
## 6 Storage 256GB
## 7 Color Space Gray
## 8 Storage 512GB
## 9 Color Stainless Steel
## 10 Capacity 20 cu ft
## 11 Color White
## 12 Capacity 18 cu ft
## 13 Type Front Load
## 14 Capacity 4.5 cu ft
## 15 Type Top Load
## 16 Capacity 5.0 cu ft
## 17 Color Blue
## 18 Size S
## 19 Color Red
## 20 Size M
## 21 Color Green
## 22 Size L
## 23 Color Dark Blue
## 24 Size 32
## 25 Color Light Blue
## 26 Size 34
## 27 Format Hardcover
## 28 Language English
## 29 Format Paperback
## 30 Language Spanish
## 31 Format eBook
## 32 Language English
## 33 Format Paperback
## 34 Language French
## 35 Size Size 7
## 36 Color Orange
## 37 Material Graphite
## 38 Color Black
## 39 Material Aluminum
## 40 Color Silver
# Reading parquet file
library(arrow)
# A raw GitHub URL could not be generated for the parquet file, so the local
# copy is read instead
parquet_file <- "./product_data.parquet"
product_parquet <- parquet_file %>%
  read_parquet()
print(product_parquet)
## # A tibble: 40 × 8
## Category Item_Name Item_ID Brand Price Variation_ID Variation_Type
## <chr> <chr> <int> <chr> <dbl> <chr> <chr>
## 1 Electronics Smartphone 101 TechB… 700. 101-A "Color"
## 2 Electronics Smartphone 101 TechB… 700. 101-A " Storage"
## 3 Electronics Smartphone 101 TechB… 700. 101-B "Color"
## 4 Electronics Smartphone 101 TechB… 700. 101-B " Storage"
## 5 Electronics Laptop 102 Compu… 1100. 102-A "Color"
## 6 Electronics Laptop 102 Compu… 1100. 102-A " Storage"
## 7 Electronics Laptop 102 Compu… 1100. 102-B "Color"
## 8 Electronics Laptop 102 Compu… 1100. 102-B " Storage"
## 9 Home Appliances Refrigerator 201 HomeC… 900. 201-A "Color"
## 10 Home Appliances Refrigerator 201 HomeC… 900. 201-A " Capacity"
## # ℹ 30 more rows
## # ℹ 1 more variable: Variation_Detail <chr>
HTML uses predefined tags to structure a web page. Many of its tags describe page layout, so it is not well suited to data exchange. In general, it is harder to extract data from an HTML page because the page contains other information that does not pertain to the data.
JSON files are commonly used by APIs. Their file size is smaller than XML but larger than CSV. A JSON file is loaded completely into memory, which makes it slower to read. However, it is easy for humans to read and works well for unstructured data.
XML is a markup language designed for data representation and storage. It uses user-defined tags, which makes it versatile across applications. XML can be used to represent complex data structures. However, an XML file is generally larger than a JSON file, since it can contain a lot of additional information.
Parquet is self-describing: it includes metadata that describes the schema and structure of the file. It is one of the fastest file types to read, faster than JSON, and its file size is small. However, unlike JSON, it is not human-readable.