library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

The information was uploaded to GitHub as a CSV file and is then read into R as a data frame.

I chose to tidy up the data before creating the data files, thinking it would make them easier to read.

# Read the csv file from GitHub and reshape it into tidy form:
# one row per (Variation_ID, Variation_Type) pair.
raw <- "https://raw.githubusercontent.com/amily52131/DATA607/refs/heads/main/Assignment_7/data.csv"
products <- read.csv(raw)

# Replace . in column names with _
names(products) <- gsub("\\.", "_", names(products))

# Variation_Details and the extra column X each hold a "Type:Detail" string.
# Build two tables with the same column layout (one per source column) so
# that they can be stacked.
products_1 <- products %>%
  select(-X)
products_2 <- products %>%
  select(-Variation_Details) %>%
  rename(Variation_Details = X)

# Stack the two tables and split Variation_Details at ":".
# The split keeps any whitespace that surrounded the delimiter (values such
# as " Storage" show up when the exported data is read back), so both new
# columns are trimmed before the data is written to other formats.
products <- rbind(products_1, products_2) %>%
  separate(Variation_Details, into = c("Variation_Type", "Variation_Detail"), sep = ":") %>%
  mutate(across(c(Variation_Type, Variation_Detail), trimws)) %>%
  arrange(Variation_ID)
head(products)
##      Category  Item_Name Item_ID      Brand   Price Variation_ID Variation_Type
## 1 Electronics Smartphone     101  TechBrand  699.99        101-A          Color
## 2 Electronics Smartphone     101  TechBrand  699.99        101-A        Storage
## 3 Electronics Smartphone     101  TechBrand  699.99        101-B          Color
## 4 Electronics Smartphone     101  TechBrand  699.99        101-B        Storage
## 5 Electronics     Laptop     102 CompuBrand 1099.99        102-A          Color
## 6 Electronics     Laptop     102 CompuBrand 1099.99        102-A        Storage
##   Variation_Detail
## 1            Black
## 2             64GB
## 3            White
## 4            128GB
## 5           Silver
## 6            256GB

Create JSON, HTML, XML and Parquet with R

Creating HTML file with the csv file as a table

# Create an HTML file that holds the tidy product data as a single <table>

library(htmltools)
# Build the document: a header row with human-readable column labels,
# followed by one <tr> per row of `products`.
html_content <- tags$html(
  tags$body(
    tags$table(
      tags$thead(
        tags$tr(
          tags$th("Category"),
          tags$th("Item Name"),
          tags$th("Item ID"),
          tags$th("Brand"),
          tags$th("Price"),
          tags$th("Variation ID"),
          tags$th("Variation Type"),
          tags$th("Variation Details")
        )
      ),
      tags$tbody(
        # seq_len() yields an empty sequence for a zero-row data frame,
        # whereas 1:nrow() would iterate over c(1, 0) and emit bogus rows.
        lapply(seq_len(nrow(products)), function(i) {
          tags$tr(
            tags$td(products$Category[i]),
            tags$td(products$Item_Name[i]),
            tags$td(products$Item_ID[i]),
            tags$td(products$Brand[i]),
            tags$td(products$Price[i]),
            tags$td(products$Variation_ID[i]),
            tags$td(products$Variation_Type[i]),
            tags$td(products$Variation_Detail[i])
          )
        })
      )
    )
  )
)

# Save the HTML file in the working directory
save_html(html_content, "product_data_report.html")

Creating JSON file with the csv file

# Export the tidy product data frame as a JSON file
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
# Serialise the data frame as pretty-printed JSON text
json_data <- products %>%
  toJSON(pretty = TRUE)

# Write the JSON text to disk in the working directory
write(json_data, file = "product_data.json")

Creating XML file with the csv file

# Create an XML file from the tidy product data frame:
# a <Products> root with one <Product> child per data-frame row.
library(XML)
# Create the XML document
xml_doc <- newXMLDoc()

# Create root node
root <- newXMLNode("Products", doc = xml_doc)

# Add one <Product> entry per row, with one child element per column.
# seq_len() yields an empty sequence for a zero-row data frame, whereas
# 1:nrow() would iterate over c(1, 0) and add bogus entries.
for (i in seq_len(nrow(products))) {
  product_node <- newXMLNode("Product", parent = root)
  newXMLNode("Category", products$Category[i], parent = product_node)
  newXMLNode("Item_Name", products$Item_Name[i], parent = product_node)
  newXMLNode("Item_ID", products$Item_ID[i], parent = product_node)
  newXMLNode("Brand", products$Brand[i], parent = product_node)
  newXMLNode("Price", products$Price[i], parent = product_node)
  newXMLNode("Variation_ID", products$Variation_ID[i], parent = product_node)
  newXMLNode("Variation_Type", products$Variation_Type[i], parent = product_node)
  # The element is named Variation_Details although the column is
  # Variation_Detail; the name is kept so existing readers of the file work.
  newXMLNode("Variation_Details", products$Variation_Detail[i], parent = product_node)
}
# Save the XML file in the working directory
saveXML(xml_doc, file = "product_data.xml")
## [1] "product_data.xml"

Creating Parquet file with the csv file

# Export the tidy product data frame as a Parquet file
library(arrow)
## 
## Attaching package: 'arrow'
## The following object is masked from 'package:lubridate':
## 
##     duration
## The following object is masked from 'package:utils':
## 
##     timestamp
# Write the data frame to a Parquet file in the working directory
write_parquet(x = products, sink = "product_data.parquet")

Reading JSON, HTML, XML, parquet files

Reading HTML file

# Load the HTML report from GitHub and recover the product table
library(rvest)
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
raw <- "https://raw.githubusercontent.com/amily52131/DATA607/refs/heads/main/Assignment_7/product_data_report.html"
html_file <- read_html(raw)

# Pick the table element out of the page and parse it into a data frame
product_html <- html_table(html_node(html_file, "table"))

print(product_html)
## # A tibble: 40 × 8
##    Category    `Item Name` `Item ID` Brand Price `Variation ID` `Variation Type`
##    <chr>       <chr>           <int> <chr> <dbl> <chr>          <chr>           
##  1 Electronics Smartphone        101 Tech…  700. 101-A          Color           
##  2 Electronics Smartphone        101 Tech…  700. 101-A          Storage         
##  3 Electronics Smartphone        101 Tech…  700. 101-B          Color           
##  4 Electronics Smartphone        101 Tech…  700. 101-B          Storage         
##  5 Electronics Laptop            102 Comp… 1100. 102-A          Color           
##  6 Electronics Laptop            102 Comp… 1100. 102-A          Storage         
##  7 Electronics Laptop            102 Comp… 1100. 102-B          Color           
##  8 Electronics Laptop            102 Comp… 1100. 102-B          Storage         
##  9 Home Appli… Refrigerat…       201 Home…  900. 201-A          Color           
## 10 Home Appli… Refrigerat…       201 Home…  900. 201-A          Capacity        
## # ℹ 30 more rows
## # ℹ 1 more variable: `Variation Details` <chr>

Reading JSON file

# Load the JSON export from GitHub and parse it back into R
library(jsonlite)
raw <- "https://raw.githubusercontent.com/amily52131/DATA607/refs/heads/main/Assignment_7/product_data.json"

# fromJSON() accepts the URL directly as its input
products_json <- fromJSON(txt = raw)
print(products_json)
##            Category         Item_Name Item_ID      Brand   Price Variation_ID
## 1       Electronics        Smartphone     101  TechBrand  699.99        101-A
## 2       Electronics        Smartphone     101  TechBrand  699.99        101-A
## 3       Electronics        Smartphone     101  TechBrand  699.99        101-B
## 4       Electronics        Smartphone     101  TechBrand  699.99        101-B
## 5       Electronics            Laptop     102 CompuBrand 1099.99        102-A
## 6       Electronics            Laptop     102 CompuBrand 1099.99        102-A
## 7       Electronics            Laptop     102 CompuBrand 1099.99        102-B
## 8       Electronics            Laptop     102 CompuBrand 1099.99        102-B
## 9   Home Appliances      Refrigerator     201   HomeCool  899.99        201-A
## 10  Home Appliances      Refrigerator     201   HomeCool  899.99        201-A
## 11  Home Appliances      Refrigerator     201   HomeCool  899.99        201-B
## 12  Home Appliances      Refrigerator     201   HomeCool  899.99        201-B
## 13  Home Appliances   Washing Machine     202  CleanTech  499.99        202-A
## 14  Home Appliances   Washing Machine     202  CleanTech  499.99        202-A
## 15  Home Appliances   Washing Machine     202  CleanTech  499.99        202-B
## 16  Home Appliances   Washing Machine     202  CleanTech  499.99        202-B
## 17         Clothing           T-Shirt     301  FashionCo   19.99        301-A
## 18         Clothing           T-Shirt     301  FashionCo   19.99        301-A
## 19         Clothing           T-Shirt     301  FashionCo   19.99        301-B
## 20         Clothing           T-Shirt     301  FashionCo   19.99        301-B
## 21         Clothing           T-Shirt     301  FashionCo   19.99        301-C
## 22         Clothing           T-Shirt     301  FashionCo   19.99        301-C
## 23         Clothing             Jeans     302 DenimWorks   49.99        302-A
## 24         Clothing             Jeans     302 DenimWorks   49.99        302-A
## 25         Clothing             Jeans     302 DenimWorks   49.99        302-B
## 26         Clothing             Jeans     302 DenimWorks   49.99        302-B
## 27            Books     Fiction Novel     401          -   14.99        401-A
## 28            Books     Fiction Novel     401          -   14.99        401-A
## 29            Books     Fiction Novel     401          -   14.99        401-B
## 30            Books     Fiction Novel     401          -   14.99        401-B
## 31            Books Non-Fiction Guide     402          -   24.99        402-A
## 32            Books Non-Fiction Guide     402          -   24.99        402-A
## 33            Books Non-Fiction Guide     402          -   24.99        402-B
## 34            Books Non-Fiction Guide     402          -   24.99        402-B
## 35 Sports Equipment        Basketball     501 SportsGear   29.99        501-A
## 36 Sports Equipment        Basketball     501 SportsGear   29.99        501-A
## 37 Sports Equipment     Tennis Racket     502  RacketPro   89.99        502-A
## 38 Sports Equipment     Tennis Racket     502  RacketPro   89.99        502-A
## 39 Sports Equipment     Tennis Racket     502  RacketPro   89.99        502-B
## 40 Sports Equipment     Tennis Racket     502  RacketPro   89.99        502-B
##    Variation_Type Variation_Detail
## 1           Color            Black
## 2         Storage             64GB
## 3           Color            White
## 4         Storage            128GB
## 5           Color           Silver
## 6         Storage            256GB
## 7           Color       Space Gray
## 8         Storage            512GB
## 9           Color  Stainless Steel
## 10       Capacity         20 cu ft
## 11          Color            White
## 12       Capacity         18 cu ft
## 13           Type       Front Load
## 14       Capacity        4.5 cu ft
## 15           Type         Top Load
## 16       Capacity        5.0 cu ft
## 17          Color             Blue
## 18           Size                S
## 19          Color              Red
## 20           Size                M
## 21          Color            Green
## 22           Size                L
## 23          Color        Dark Blue
## 24           Size               32
## 25          Color       Light Blue
## 26           Size               34
## 27         Format        Hardcover
## 28       Language          English
## 29         Format        Paperback
## 30       Language          Spanish
## 31         Format            eBook
## 32       Language          English
## 33         Format        Paperback
## 34       Language           French
## 35           Size           Size 7
## 36          Color           Orange
## 37       Material         Graphite
## 38          Color            Black
## 39       Material         Aluminum
## 40          Color           Silver

Reading XML file

# Read the XML export from GitHub and rebuild the product data frame
library(xml2)

raw <- "https://raw.githubusercontent.com/amily52131/DATA607/refs/heads/main/Assignment_7/product_data.xml"
xml_file <- read_xml(raw)

# Select every <Product> element.
# NOTE: this reuses the name `products`, so from here on it refers to an
# XML node set rather than a data frame.
products <- xml_find_all(xml_file, ".//Product")

# Build one row per <Product>. xml_find_first() returns exactly one result
# per product (a missing node, read as NA, when the child element is absent),
# so all columns have the same length and stay aligned row by row.
# xml_find_all() would instead drop missing children and shift later values.
# Item_ID and Price are converted from text back to numbers so the column
# types agree with the HTML, JSON and Parquet read-backs.
product_xml <- data.frame(
  Category = xml_text(xml_find_first(products, "Category")),
  Item_Name = xml_text(xml_find_first(products, "Item_Name")),
  Item_ID = as.integer(xml_text(xml_find_first(products, "Item_ID"))),
  Brand = xml_text(xml_find_first(products, "Brand")),
  Price = as.numeric(xml_text(xml_find_first(products, "Price"))),
  Variation_ID = xml_text(xml_find_first(products, "Variation_ID")),
  Variation_Type = xml_text(xml_find_first(products, "Variation_Type")),
  Variation_Details = xml_text(xml_find_first(products, "Variation_Details")),
  stringsAsFactors = FALSE
)

# View the data frame
print(product_xml)
##            Category         Item_Name Item_ID      Brand   Price Variation_ID
## 1       Electronics        Smartphone     101  TechBrand  699.99        101-A
## 2       Electronics        Smartphone     101  TechBrand  699.99        101-A
## 3       Electronics        Smartphone     101  TechBrand  699.99        101-B
## 4       Electronics        Smartphone     101  TechBrand  699.99        101-B
## 5       Electronics            Laptop     102 CompuBrand 1099.99        102-A
## 6       Electronics            Laptop     102 CompuBrand 1099.99        102-A
## 7       Electronics            Laptop     102 CompuBrand 1099.99        102-B
## 8       Electronics            Laptop     102 CompuBrand 1099.99        102-B
## 9   Home Appliances      Refrigerator     201   HomeCool  899.99        201-A
## 10  Home Appliances      Refrigerator     201   HomeCool  899.99        201-A
## 11  Home Appliances      Refrigerator     201   HomeCool  899.99        201-B
## 12  Home Appliances      Refrigerator     201   HomeCool  899.99        201-B
## 13  Home Appliances   Washing Machine     202  CleanTech  499.99        202-A
## 14  Home Appliances   Washing Machine     202  CleanTech  499.99        202-A
## 15  Home Appliances   Washing Machine     202  CleanTech  499.99        202-B
## 16  Home Appliances   Washing Machine     202  CleanTech  499.99        202-B
## 17         Clothing           T-Shirt     301  FashionCo   19.99        301-A
## 18         Clothing           T-Shirt     301  FashionCo   19.99        301-A
## 19         Clothing           T-Shirt     301  FashionCo   19.99        301-B
## 20         Clothing           T-Shirt     301  FashionCo   19.99        301-B
## 21         Clothing           T-Shirt     301  FashionCo   19.99        301-C
## 22         Clothing           T-Shirt     301  FashionCo   19.99        301-C
## 23         Clothing             Jeans     302 DenimWorks   49.99        302-A
## 24         Clothing             Jeans     302 DenimWorks   49.99        302-A
## 25         Clothing             Jeans     302 DenimWorks   49.99        302-B
## 26         Clothing             Jeans     302 DenimWorks   49.99        302-B
## 27            Books     Fiction Novel     401          -   14.99        401-A
## 28            Books     Fiction Novel     401          -   14.99        401-A
## 29            Books     Fiction Novel     401          -   14.99        401-B
## 30            Books     Fiction Novel     401          -   14.99        401-B
## 31            Books Non-Fiction Guide     402          -   24.99        402-A
## 32            Books Non-Fiction Guide     402          -   24.99        402-A
## 33            Books Non-Fiction Guide     402          -   24.99        402-B
## 34            Books Non-Fiction Guide     402          -   24.99        402-B
## 35 Sports Equipment        Basketball     501 SportsGear   29.99        501-A
## 36 Sports Equipment        Basketball     501 SportsGear   29.99        501-A
## 37 Sports Equipment     Tennis Racket     502  RacketPro   89.99        502-A
## 38 Sports Equipment     Tennis Racket     502  RacketPro   89.99        502-A
## 39 Sports Equipment     Tennis Racket     502  RacketPro   89.99        502-B
## 40 Sports Equipment     Tennis Racket     502  RacketPro   89.99        502-B
##    Variation_Type Variation_Details
## 1           Color             Black
## 2         Storage              64GB
## 3           Color             White
## 4         Storage             128GB
## 5           Color            Silver
## 6         Storage             256GB
## 7           Color        Space Gray
## 8         Storage             512GB
## 9           Color   Stainless Steel
## 10       Capacity          20 cu ft
## 11          Color             White
## 12       Capacity          18 cu ft
## 13           Type        Front Load
## 14       Capacity         4.5 cu ft
## 15           Type          Top Load
## 16       Capacity         5.0 cu ft
## 17          Color              Blue
## 18           Size                 S
## 19          Color               Red
## 20           Size                 M
## 21          Color             Green
## 22           Size                 L
## 23          Color         Dark Blue
## 24           Size                32
## 25          Color        Light Blue
## 26           Size                34
## 27         Format         Hardcover
## 28       Language           English
## 29         Format         Paperback
## 30       Language           Spanish
## 31         Format             eBook
## 32       Language           English
## 33         Format         Paperback
## 34       Language            French
## 35           Size            Size 7
## 36          Color            Orange
## 37       Material          Graphite
## 38          Color             Black
## 39       Material          Aluminum
## 40          Color            Silver

Reading parquet file

# Load the Parquet export back into R
library(arrow)

# The file is read from the working directory because the raw file
# could not be generated from GitHub
parquet_file <- "./product_data.parquet"
product_parquet <- read_parquet(file = parquet_file)

print(product_parquet)
## # A tibble: 40 × 8
##    Category        Item_Name    Item_ID Brand  Price Variation_ID Variation_Type
##    <chr>           <chr>          <int> <chr>  <dbl> <chr>        <chr>         
##  1 Electronics     Smartphone       101 TechB…  700. 101-A        "Color"       
##  2 Electronics     Smartphone       101 TechB…  700. 101-A        " Storage"    
##  3 Electronics     Smartphone       101 TechB…  700. 101-B        "Color"       
##  4 Electronics     Smartphone       101 TechB…  700. 101-B        " Storage"    
##  5 Electronics     Laptop           102 Compu… 1100. 102-A        "Color"       
##  6 Electronics     Laptop           102 Compu… 1100. 102-A        " Storage"    
##  7 Electronics     Laptop           102 Compu… 1100. 102-B        "Color"       
##  8 Electronics     Laptop           102 Compu… 1100. 102-B        " Storage"    
##  9 Home Appliances Refrigerator     201 HomeC…  900. 201-A        "Color"       
## 10 Home Appliances Refrigerator     201 HomeC…  900. 201-A        " Capacity"   
## # ℹ 30 more rows
## # ℹ 1 more variable: Variation_Detail <chr>

Conclusion: JSON, HTML, XML, and Parquet files

HTML

HTML uses predefined tags to structure a web page. Many of its tags describe page layout rather than data, so it is not very suitable for data exchange. In general, it is harder to extract data from an HTML page because the page contains other information that does not pertain to the data.

JSON

JSON files are commonly used by APIs. Their file size is smaller than XML but larger than CSV. A JSON file is loaded completely into memory, which makes it slower to read. However, it is easy for humans to read and great for unstructured data.

XML

XML is a markup language designed for data representation and storage. Its user-defined tags make it versatile across applications, and it can be used to represent complex data structures. However, an XML file is generally larger than a JSON file since it can contain a lot of information.

Parquet

Parquet is self-describing: it includes metadata that records the schema and structure of the file. It is one of the fastest file types to read, faster than JSON, and the file size is small. However, it is not as easy for humans to read as JSON files.