Assignment 5

# Install necessary packages
# Only packages that cannot be loaded are installed, so re-running the script
# does not reinstall anything that is already available.
packages <- c("jsonlite", "htmlTable", "XML", "arrow")
# requireNamespace() is checked per package: the documentation for
# installed.packages() advises against using it to test whether a named
# package is installed, because it scans the whole library and can be slow.
new_packages <- packages[
  !vapply(packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Load the packages
library(jsonlite)
library(htmlTable)
library(XML)
library(arrow)

## 
## Attaching package: 'arrow'

## The following object is masked from 'package:utils':
## 
##     timestamp

# Create the original dataset as a data frame
# One row per product variation (20 rows, 7 columns). Values that repeat across
# consecutive rows are expanded with rep() instead of being typed out in full.
# NOTE(review): the last two rows are identical in every column (both are
# variation "502-B" with the same details) -- confirm against the source data
# whether the final row was meant to be a different variation.
data <- local({
  # Number of variation rows for each of the ten items, in row order.
  rows_per_item <- c(2, 2, 2, 2, 3, 1, 2, 2, 1, 3)

  data.frame(
    Category = rep(
      c("Electronics", "Home Appliances", "Clothing", "Books",
        "Sports Equipment"),
      each = 4
    ),
    Item_Name = rep(
      c("Smartphone", "Laptop", "Refrigerator", "Washing Machine",
        "T-Shirt", "Jeans", "Fiction Novel", "Non-Fiction Guide",
        "Basketball", "Tennis Racket"),
      times = rows_per_item
    ),
    Item_ID = rep(
      c(101, 102, 201, 202, 301, 302, 401, 402, 501, 502),
      times = rows_per_item
    ),
    # "-" is the placeholder used for the four book rows, which have no brand.
    Brand = rep(
      c("TechBrand", "CompuBrand", "HomeCool", "CleanTech", "FashionCo",
        "DenimWorks", "-", "SportsGear", "RacketPro"),
      times = c(2, 2, 2, 2, 3, 1, 4, 1, 3)
    ),
    Price = rep(
      c(699.99, 1099.99, 899.99, 499.99, 19.99, 49.99, 14.99, 24.99,
        29.99, 89.99),
      times = rows_per_item
    ),
    Variation_ID = c(
      "101-A", "101-B",
      "102-A", "102-B",
      "201-A", "201-B",
      "202-A", "202-B",
      "301-A", "301-B", "301-C",
      "302-A",
      "401-A", "401-B",
      "402-A", "402-B",
      "501-A",
      "502-A", "502-B", "502-B"
    ),
    Variation_Details = c(
      "Color: Black, Storage: 64GB",
      "Color: White, Storage: 128GB",
      "Color: Silver, Storage: 256GB",
      "Color: Space Gray, Storage: 512GB",
      "Color: Stainless Steel, Capacity: 20 cu ft",
      "Color: White, Capacity: 18 cu ft",
      "Type: Front Load, Capacity: 4.5 cu ft",
      "Type: Top Load, Capacity: 5.0 cu ft",
      "Color: Blue, Size: S",
      "Color: Red, Size: M",
      "Color: Green, Size: L",
      "Color: Dark Blue, Size: 32",
      "Format: Hardcover, Language: English",
      "Format: Paperback, Language: Spanish",
      "Format: eBook, Language: English",
      "Format: Paperback, Language: French",
      "Size: Size 7, Color: Orange",
      "Material: Graphite, Color: Black",
      "Material: Aluminum, Color: Silver",
      "Material: Aluminum, Color: Silver"
    )
  )
})

Convert to JSON and Import Back

# 1. Convert to JSON and Import Back
# Serialize the data frame to pretty-printed JSON text, save that text to
# disk, then parse the saved file back into an R object.
json_data <- toJSON(x = data, pretty = TRUE)
write(x = json_data, file = "CUNYMart_data.json")
imported_json_data <- fromJSON(txt = "CUNYMart_data.json")

# Print JSON Imported Data
print("JSON Data Imported:")

## [1] "JSON Data Imported:"

# Show the first six rows of the re-imported data.
print(head(imported_json_data, n = 6L))

##          Category    Item_Name Item_ID      Brand   Price Variation_ID
## 1     Electronics   Smartphone     101  TechBrand  699.99        101-A
## 2     Electronics   Smartphone     101  TechBrand  699.99        101-B
## 3     Electronics       Laptop     102 CompuBrand 1099.99        102-A
## 4     Electronics       Laptop     102 CompuBrand 1099.99        102-B
## 5 Home Appliances Refrigerator     201   HomeCool  899.99        201-A
## 6 Home Appliances Refrigerator     201   HomeCool  899.99        201-B
##                            Variation_Details
## 1                Color: Black, Storage: 64GB
## 2               Color: White, Storage: 128GB
## 3              Color: Silver, Storage: 256GB
## 4          Color: Space Gray, Storage: 512GB
## 5 Color: Stainless Steel, Capacity: 20 cu ft
## 6           Color: White, Capacity: 18 cu ft

JSON Structure: Each row of the dataset is represented as a JSON object, with keys as column names and values as cell data. The entire dataset is an array of these objects. The structure allows easy data interchange, especially in web applications. For example:
```
[
  {
    "Category": "Electronics",
    "Item_Name": "Smartphone",
    "Item_ID": 101,
    "Brand": "TechBrand",
    "Price": 699.99,
    "Variation_ID": "101-A",
    "Variation_Details": "Color: Black, Storage: 64GB"
  },
  ...
]
```

Pros and Cons of JSON:

Pros: Easy to read and write, widely used in web APIs, supports hierarchical data structures. Cons: Can be less efficient for very large datasets, and has no built-in schema validation (a separate JSON Schema is needed to validate structure).

Convert to HTML and Import Back

# 2. Convert to HTML and Import Back
# Render the data frame as an HTML table and save the markup to disk.
html_data <- htmlTable(data)
write(x = html_data, file = "CUNYMart_data.html")
# NOTE(review): unlike the other sections, the file is read back as raw text
# lines rather than rebuilt into a data frame -- confirm this is intended.
html_raw <- readLines(con = "CUNYMart_data.html")  # Importing as plain text

# Print HTML Imported Data (raw text)
print("HTML Data Imported (raw text):")

## [1] "HTML Data Imported (raw text):"

# Show the first six lines of the saved markup.
print(head(html_raw, n = 6L))

## [1] "<table class='gmisc_table' style='border-collapse: collapse; margin-top: 1em; margin-bottom: 1em;' >"                       
## [2] "<thead>"                                                                                                                    
## [3] "<tr><th style='border-bottom: 1px solid grey; border-top: 2px solid grey;'></th>"                                           
## [4] "<th style='font-weight: 900; border-bottom: 1px solid grey; border-top: 2px solid grey; text-align: center;'>Category</th>" 
## [5] "<th style='font-weight: 900; border-bottom: 1px solid grey; border-top: 2px solid grey; text-align: center;'>Item_Name</th>"
## [6] "<th style='font-weight: 900; border-bottom: 1px solid grey; border-top: 2px solid grey; text-align: center;'>Item_ID</th>"

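HTML Structure: The dataset is represented in a tabular format, with <table> tags containing rows (<tr>), header cells (<th>), and data cells (<td>). This format is suitable for displaying data in web browsers. A simplified version of the table structure (omitting the inline styles and the leading row-number column that htmlTable adds) looks like: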
```
<table border="1">
  <tr>
    <th>Category</th><th>Item_Name</th><th>Item_ID</th><th>Brand</th><th>Price</th><th>Variation_ID</th><th>Variation_Details</th>
  </tr>
  <tr>
    <td>Electronics</td><td>Smartphone</td><td>101</td><td>TechBrand</td><td>699.99</td><td>101-A</td><td>Color: Black, Storage: 64GB</td>
  </tr>
  ...
</table>
```
Pros and Cons of HTML: Pros: Suitable for displaying data in web browsers, human-readable, supports styling. Cons: Not ideal for complex data analysis, bulky file size.

Convert to XML and Import Back

# 3. Convert to XML and Import Back
# Build an <Inventory> root node with one <Item> child per data frame row.
# Each column becomes a child element of its <Item>, named after the column
# and holding that row's value.
xml_data <- newXMLNode("Inventory")
# seq_len() gives an empty sequence for a zero-row data frame, whereas
# 1:nrow(data) would iterate over c(1, 0) and index rows that do not exist.
for (i in seq_len(nrow(data))) {
  item_node <- newXMLNode("Item", parent = xml_data)
  for (col in names(data)) {
    newXMLNode(col, data[i, col], parent = item_node)
  }
}
# At top level the return value of saveXML() is auto-printed (the file name
# echoed below).
saveXML(xml_data, file = "CUNYMart_data.xml")

## [1] "CUNYMart_data.xml"

# Read the saved file back into a data frame, one row per <Item>.
# NOTE(review): XML stores every value as text, so Item_ID and Price presumably
# come back as character columns -- confirm with str() before doing arithmetic.
imported_xml_data <- xmlToDataFrame("CUNYMart_data.xml")

# Print XML Imported Data
print("XML Data Imported:")

## [1] "XML Data Imported:"

print(head(imported_xml_data))

##          Category    Item_Name Item_ID      Brand   Price Variation_ID
## 1     Electronics   Smartphone     101  TechBrand  699.99        101-A
## 2     Electronics   Smartphone     101  TechBrand  699.99        101-B
## 3     Electronics       Laptop     102 CompuBrand 1099.99        102-A
## 4     Electronics       Laptop     102 CompuBrand 1099.99        102-B
## 5 Home Appliances Refrigerator     201   HomeCool  899.99        201-A
## 6 Home Appliances Refrigerator     201   HomeCool  899.99        201-B
##                            Variation_Details
## 1                Color: Black, Storage: 64GB
## 2               Color: White, Storage: 128GB
## 3              Color: Silver, Storage: 256GB
## 4          Color: Space Gray, Storage: 512GB
## 5 Color: Stainless Steel, Capacity: 20 cu ft
## 6           Color: White, Capacity: 18 cu ft

XML Structure: Each row is represented by an <Item> element, and each column is represented by child nodes within the <Item>. This nested structure is well-suited for hierarchical data but is more verbose. The XML structure looks like:

```
<Inventory>
  <Item>
    <Category>Electronics</Category>
    <Item_Name>Smartphone</Item_Name>
    <Item_ID>101</Item_ID>
    <Brand>TechBrand</Brand>
    <Price>699.99</Price>
    <Variation_ID>101-A</Variation_ID>
    <Variation_Details>Color: Black, Storage: 64GB</Variation_Details>
  </Item>
  ...
</Inventory>
```

Pros and Cons of XML:

Pros: Supports complex hierarchical structures, provides validation (DTD/XSD). Cons: Verbose and large files, slower processing compared to JSON.

Convert to Parquet and Import Back

# 4. Convert to Parquet and Import Back
# Write the data frame to a Parquet file, then read that file back into R.
write_parquet(x = data, sink = "CUNYMart_data.parquet")
imported_parquet_data <- read_parquet(file = "CUNYMart_data.parquet")

# Print Parquet Imported Data
print("Parquet Data Imported:")

## [1] "Parquet Data Imported:"

# Show the first six rows of the re-imported data.
print(head(imported_parquet_data, n = 6L))

##          Category    Item_Name Item_ID      Brand   Price Variation_ID
## 1     Electronics   Smartphone     101  TechBrand  699.99        101-A
## 2     Electronics   Smartphone     101  TechBrand  699.99        101-B
## 3     Electronics       Laptop     102 CompuBrand 1099.99        102-A
## 4     Electronics       Laptop     102 CompuBrand 1099.99        102-B
## 5 Home Appliances Refrigerator     201   HomeCool  899.99        201-A
## 6 Home Appliances Refrigerator     201   HomeCool  899.99        201-B
##                            Variation_Details
## 1                Color: Black, Storage: 64GB
## 2               Color: White, Storage: 128GB
## 3              Color: Silver, Storage: 256GB
## 4          Color: Space Gray, Storage: 512GB
## 5 Color: Stainless Steel, Capacity: 20 cu ft
## 6           Color: White, Capacity: 18 cu ft

Parquet Structure: Parquet uses a columnar storage format, optimizing for storage and retrieval efficiency. It is particularly suitable for large datasets used in analytical contexts. Instead of storing data row by row, a Parquet file stores the values of each column together, along with a schema and metadata describing each column. This lets a query read only the columns it needs, which makes it highly efficient in big data environments.

Pros and Cons of Parquet:

Pros: Efficient for large datasets, optimized for analytical queries, compact storage format. Cons: Less human-readable, not as commonly used as JSON/XML for web data exchange.

Assignment 5

2024-10-19

Convert to JSON and Import Back

Convert to HTML and Import Back

Convert to XML and Import Back

Convert to Parquet and Import Back