# Install any required packages that are not already present in the library
packages <- c("jsonlite", "htmlTable", "XML", "arrow")
already_installed <- installed.packages()[, "Package"]
new_packages <- packages[!(packages %in% already_installed)]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}
# Load the packages
library(jsonlite)
library(htmlTable)
library(XML)
library(arrow)
##
## Attaching package: 'arrow'
## The following object is masked from 'package:utils':
##
## timestamp
# Create the original dataset as a data frame.
# Item-level attributes (name, ID, brand, price) are identical for every
# variation of an item, so each is written once and repeated by the number of
# variations that item has. Variation-level columns are listed row by row.
variations_per_item <- c(2, 2, 2, 2, 3, 1, 2, 2, 1, 3)

data <- data.frame(
  Category = rep(
    c("Electronics", "Home Appliances", "Clothing", "Books",
      "Sports Equipment"),
    each = 4
  ),
  Item_Name = rep(
    c("Smartphone", "Laptop", "Refrigerator", "Washing Machine",
      "T-Shirt", "Jeans", "Fiction Novel", "Non-Fiction Guide",
      "Basketball", "Tennis Racket"),
    times = variations_per_item
  ),
  Item_ID = rep(
    c(101, 102, 201, 202, 301, 302, 401, 402, 501, 502),
    times = variations_per_item
  ),
  Brand = rep(
    c("TechBrand", "CompuBrand", "HomeCool", "CleanTech",
      "FashionCo", "DenimWorks", "-", "-",
      "SportsGear", "RacketPro"),
    times = variations_per_item
  ),
  Price = rep(
    c(699.99, 1099.99, 899.99, 499.99, 19.99, 49.99, 14.99, 24.99,
      29.99, 89.99),
    times = variations_per_item
  ),
  Variation_ID = c(
    "101-A", "101-B", "102-A", "102-B",
    "201-A", "201-B", "202-A", "202-B",
    "301-A", "301-B", "301-C", "302-A",
    "401-A", "401-B", "402-A", "402-B",
    "501-A", "502-A", "502-B", "502-B"
  ),
  Variation_Details = c(
    "Color: Black, Storage: 64GB",
    "Color: White, Storage: 128GB",
    "Color: Silver, Storage: 256GB",
    "Color: Space Gray, Storage: 512GB",
    "Color: Stainless Steel, Capacity: 20 cu ft",
    "Color: White, Capacity: 18 cu ft",
    "Type: Front Load, Capacity: 4.5 cu ft",
    "Type: Top Load, Capacity: 5.0 cu ft",
    "Color: Blue, Size: S",
    "Color: Red, Size: M",
    "Color: Green, Size: L",
    "Color: Dark Blue, Size: 32",
    "Format: Hardcover, Language: English",
    "Format: Paperback, Language: Spanish",
    "Format: eBook, Language: English",
    "Format: Paperback, Language: French",
    "Size: Size 7, Color: Orange",
    "Material: Graphite, Color: Black",
    "Material: Aluminum, Color: Silver",
    "Material: Aluminum, Color: Silver"
  )
)
# 1. Convert to JSON and Import Back
# Serialise the data frame to pretty-printed JSON text, save it to disk, then
# parse the saved file back into R.
json_path <- "CUNYMart_data.json"
json_data <- toJSON(data, pretty = TRUE)
writeLines(json_data, json_path)
imported_json_data <- fromJSON(json_path)
# Print JSON Imported Data
print("JSON Data Imported:")
## [1] "JSON Data Imported:"
print(head(imported_json_data))
## Category Item_Name Item_ID Brand Price Variation_ID
## 1 Electronics Smartphone 101 TechBrand 699.99 101-A
## 2 Electronics Smartphone 101 TechBrand 699.99 101-B
## 3 Electronics Laptop 102 CompuBrand 1099.99 102-A
## 4 Electronics Laptop 102 CompuBrand 1099.99 102-B
## 5 Home Appliances Refrigerator 201 HomeCool 899.99 201-A
## 6 Home Appliances Refrigerator 201 HomeCool 899.99 201-B
## Variation_Details
## 1 Color: Black, Storage: 64GB
## 2 Color: White, Storage: 128GB
## 3 Color: Silver, Storage: 256GB
## 4 Color: Space Gray, Storage: 512GB
## 5 Color: Stainless Steel, Capacity: 20 cu ft
## 6 Color: White, Capacity: 18 cu ft
JSON Structure: Each row of the dataset is represented as a JSON object, with keys as column names and values as cell data. The entire dataset is an array of these objects. The structure allows easy data interchange, especially in web applications. For example:
[
{
"Category": "Electronics",
"Item_Name": "Smartphone",
"Item_ID": 101,
"Brand": "TechBrand",
"Price": 699.99,
"Variation_ID": "101-A",
"Variation_Details": "Color: Black, Storage: 64GB"
},
...
]
Pros and Cons of JSON:
Pros: Easy to read and write, widely used in web APIs, supports hierarchical data structures. Cons: Can be less efficient for very large datasets, has no built-in schema validation (an external standard such as JSON Schema is required).
# 2. Convert to HTML and Import Back
# Render the data frame as an HTML table, save the markup to disk, then read
# the file back line by line.
html_path <- "CUNYMart_data.html"
html_data <- htmlTable(data)
writeLines(html_data, html_path)
html_raw <- readLines(html_path) # Importing as plain text (not parsed into a table)
# Print HTML Imported Data (raw text)
print("HTML Data Imported (raw text):")
## [1] "HTML Data Imported (raw text):"
print(head(html_raw))
## [1] "<table class='gmisc_table' style='border-collapse: collapse; margin-top: 1em; margin-bottom: 1em;' >"
## [2] "<thead>"
## [3] "<tr><th style='border-bottom: 1px solid grey; border-top: 2px solid grey;'></th>"
## [4] "<th style='font-weight: 900; border-bottom: 1px solid grey; border-top: 2px solid grey; text-align: center;'>Category</th>"
## [5] "<th style='font-weight: 900; border-bottom: 1px solid grey; border-top: 2px solid grey; text-align: center;'>Item_Name</th>"
## [6] "<th style='font-weight: 900; border-bottom: 1px solid grey; border-top: 2px solid grey; text-align: center;'>Item_ID</th>"
HTML Structure: The dataset is represented in a tabular format, with <table> tags containing rows (<tr>), header cells (<th>), and data cells (<td>). This format is suitable for displaying data in web browsers. The HTML table structure looks like:
<table border="1">
<tr>
<th>Category</th><th>Item_Name</th><th>Item_ID</th><th>Brand</th><th>Price</th><th>Variation_ID</th><th>Variation_Details</th>
</tr>
<tr>
<td>Electronics</td><td>Smartphone</td><td>101</td><td>TechBrand</td><td>699.99</td><td>101-A</td><td>Color: Black, Storage: 64GB</td>
</tr>
...
</table>
Pros and Cons of HTML: Pros: Suitable for displaying data in web browsers, human-readable, supports styling. Cons: Not ideal for complex data analysis, bulky file size.
# 3. Convert to XML and Import Back
# Build an <Inventory> root node holding one <Item> child per row of `data`.
# Inside each <Item>, every column becomes a child element named after the
# column, with the cell value as its content.
xml_data <- newXMLNode("Inventory")
# seq_len() gives an empty sequence for a zero-row data frame, whereas
# 1:nrow(data) would iterate over c(1, 0) and emit bogus <Item> nodes.
for (i in seq_len(nrow(data))) {
  item_node <- newXMLNode("Item", parent = xml_data)
  for (col in names(data)) {
    newXMLNode(col, data[i, col], parent = item_node)
  }
}
# Write the tree to disk; at top level the returned file name is auto-printed.
saveXML(xml_data, file = "CUNYMart_data.xml")
## [1] "CUNYMart_data.xml"
# Read the file back, one data frame row per <Item> element.
# NOTE(review): values are parsed from XML text, so column types may differ
# from the original data frame (e.g. Item_ID, Price) -- confirm before reuse.
imported_xml_data <- xmlToDataFrame("CUNYMart_data.xml")
# Print XML Imported Data
print("XML Data Imported:")
## [1] "XML Data Imported:"
print(head(imported_xml_data))
## Category Item_Name Item_ID Brand Price Variation_ID
## 1 Electronics Smartphone 101 TechBrand 699.99 101-A
## 2 Electronics Smartphone 101 TechBrand 699.99 101-B
## 3 Electronics Laptop 102 CompuBrand 1099.99 102-A
## 4 Electronics Laptop 102 CompuBrand 1099.99 102-B
## 5 Home Appliances Refrigerator 201 HomeCool 899.99 201-A
## 6 Home Appliances Refrigerator 201 HomeCool 899.99 201-B
## Variation_Details
## 1 Color: Black, Storage: 64GB
## 2 Color: White, Storage: 128GB
## 3 Color: Silver, Storage: 256GB
## 4 Color: Space Gray, Storage: 512GB
## 5 Color: Stainless Steel, Capacity: 20 cu ft
## 6 Color: White, Capacity: 18 cu ft
XML Structure: Each row is represented by an <Item> element, and each column is represented by a child node within that <Item>. This nested structure is well-suited for hierarchical data but is more verbose. The XML structure looks like:
<Inventory>
<Item>
<Category>Electronics</Category>
<Item_Name>Smartphone</Item_Name>
<Item_ID>101</Item_ID>
<Brand>TechBrand</Brand>
<Price>699.99</Price>
<Variation_ID>101-A</Variation_ID>
<Variation_Details>Color: Black, Storage: 64GB</Variation_Details>
</Item>
...
</Inventory>
Pros and Cons of XML:
Pros: Supports complex hierarchical structures, provides validation (DTD/XSD). Cons: Verbose and large files, slower processing compared to JSON.
# 4. Convert to Parquet and Import Back
# Write the data frame to a Parquet file, then load that file back into R.
parquet_path <- "CUNYMart_data.parquet"
write_parquet(data, parquet_path)
imported_parquet_data <- read_parquet(parquet_path)
# Print Parquet Imported Data
print("Parquet Data Imported:")
## [1] "Parquet Data Imported:"
print(head(imported_parquet_data))
## Category Item_Name Item_ID Brand Price Variation_ID
## 1 Electronics Smartphone 101 TechBrand 699.99 101-A
## 2 Electronics Smartphone 101 TechBrand 699.99 101-B
## 3 Electronics Laptop 102 CompuBrand 1099.99 102-A
## 4 Electronics Laptop 102 CompuBrand 1099.99 102-B
## 5 Home Appliances Refrigerator 201 HomeCool 899.99 201-A
## 6 Home Appliances Refrigerator 201 HomeCool 899.99 201-B
## Variation_Details
## 1 Color: Black, Storage: 64GB
## 2 Color: White, Storage: 128GB
## 3 Color: Silver, Storage: 256GB
## 4 Color: Space Gray, Storage: 512GB
## 5 Color: Stainless Steel, Capacity: 20 cu ft
## 6 Color: White, Capacity: 18 cu ft
Pros and Cons of Parquet:
Pros: Efficient for large datasets, optimized for analytical queries, compact storage format. Cons: Less human-readable, not as commonly used as JSON/XML for web data exchange.