Github Repository

Introduction

In this report, unstructured data from copied from a PDF file will be cleaned, and converted into JSON, HTML, XML, and Parquet.


The code below reads in the text copied from the PDF file:

txt <- read_csv(
    "Category,Item Name,Item ID,Brand,Price,Variation ID,Variation Details
Electronics,Smartphone,101,TechBrand,699.99,101-A,Color: Black, Storage: 64GB
Electronics,Smartphone,101,TechBrand,699.99,101-B,Color: White, Storage: 128GB
Electronics,Laptop,102,CompuBrand,1099.99,102-A,Color: Silver, Storage: 256GB
Electronics,Laptop,102,CompuBrand,1099.99,102-B,Color: Space Gray, Storage: 512GB
Home Appliances,Refrigerator,201,HomeCool,899.99,201-A,Color: Stainless Steel, Capacity: 20 cu ft,
Home Appliances,Refrigerator,201,HomeCool,899.99,201-B,Color: White, Capacity: 18 cu ft
Home Appliances,Washing Machine,202,CleanTech,499.99,202-A,Type: Front Load, Capacity: 4.5 cu ft,
Home Appliances,Washing Machine,202,CleanTech,499.99,202-B,Type: Top Load, Capacity: 5.0 cu ft,
Clothing,T-Shirt,301,FashionCo,19.99,301-A,Color: Blue, Size: S
Clothing,T-Shirt,301,FashionCo,19.99,301-B,Color: Red, Size: M
Clothing,T-Shirt,301,FashionCo,19.99,301-C,Color: Green, Size: L
Clothing,Jeans,302,DenimWorks,49.99,302-A,Color: Dark Blue, Size: 32
Clothing,Jeans,302,DenimWorks,49.99,302-B,Color: Light Blue, Size: 34
Books,Fiction Novel,401,-,14.99,401-A,Format: Hardcover, Language: English
Books,Fiction Novel,401,-,14.99,401-B,Format: Paperback, Language: Spanish
Books,Non-Fiction Guide,402,-,24.99,402-A,Format: eBook, Language: English
Books,Non-Fiction Guide,402,-,24.99,402-B,Format: Paperback, Language: French
Sports Equipment,Basketball,501,SportsGear,29.99,501-A,Size: Size 7, Color: Orange
Sports Equipment,Tennis Racket,502,RacketPro,89.99,502-A,Material: Graphite, Color: Black
Sports Equipment,Tennis Racket,502,RacketPro,89.99,502-B,Material: Aluminum, Color: Silver"
) |> 
  # change names to be in snake case
  janitor::clean_names()


In the code below, the data is cleaned so that there are no longer multiple values in the variation_details column. This will produce more null values, but will make it simpler to analyze the variation details for each item.

df <- txt |> 
  # split the terms by the comma
  separate_wider_delim(variation_details, delim = ", ",names_sep = "_") |>
  # Split again by colon and space
  separate_wider_delim(c("variation_details_1","variation_details_2"), delim =": ", names_sep = "_") |>  
  # pivot longer so that the variation categories are in each row
  pivot_longer(
     cols = matches("details"),
     names_to = c("variable", ".value"),
     names_pattern = "([0-9]{1})_([0-9]{1})"
   ) |>
  select(-variable) |>
  # pivot wider to make each category a column
  pivot_wider(
    names_from = `1`,
    values_from = `2`
  )


Convert Data to Various Data Types

Once the data is cleaned, it can be converted into different formats, such as JSON, HTML, XML, and Parquet.

JSON

JSON is largely used web applications and APIs due to its lightweight, human-readable format that integrates well with modern data pipelines.

# convert df to json
json_data <- toJSON(df, pretty = TRUE)

# create json file
write(json_data, "CUNYMart.json")

print(json_data)

[ { “category”: “Electronics”, “item_name”: “Smartphone”, “item_id”: 101, “brand”: “TechBrand”, “price”: 699.99, “variation_id”: “101-A”, “Color”: “Black”, “Storage”: “64GB” }, { “category”: “Electronics”, “item_name”: “Smartphone”, “item_id”: 101, “brand”: “TechBrand”, “price”: 699.99, “variation_id”: “101-B”, “Color”: “White”, “Storage”: “128GB” }, { “category”: “Electronics”, “item_name”: “Laptop”, “item_id”: 102, “brand”: “CompuBrand”, “price”: 1099.99, “variation_id”: “102-A”, “Color”: “Silver”, “Storage”: “256GB” }, { “category”: “Electronics”, “item_name”: “Laptop”, “item_id”: 102, “brand”: “CompuBrand”, “price”: 1099.99, “variation_id”: “102-B”, “Color”: “Space Gray”, “Storage”: “512GB” }, { “category”: “Home Appliances”, “item_name”: “Refrigerator”, “item_id”: 201, “brand”: “HomeCool”, “price”: 899.99, “variation_id”: “201-A”, “Color”: “Stainless Steel”, “Capacity”: “20 cu ft,” }, { “category”: “Home Appliances”, “item_name”: “Refrigerator”, “item_id”: 201, “brand”: “HomeCool”, “price”: 899.99, “variation_id”: “201-B”, “Color”: “White”, “Capacity”: “18 cu ft” }, { “category”: “Home Appliances”, “item_name”: “Washing Machine”, “item_id”: 202, “brand”: “CleanTech”, “price”: 499.99, “variation_id”: “202-A”, “Capacity”: “4.5 cu ft,”, “Type”: “Front Load” }, { “category”: “Home Appliances”, “item_name”: “Washing Machine”, “item_id”: 202, “brand”: “CleanTech”, “price”: 499.99, “variation_id”: “202-B”, “Capacity”: “5.0 cu ft,”, “Type”: “Top Load” }, { “category”: “Clothing”, “item_name”: “T-Shirt”, “item_id”: 301, “brand”: “FashionCo”, “price”: 19.99, “variation_id”: “301-A”, “Color”: “Blue”, “Size”: “S” }, { “category”: “Clothing”, “item_name”: “T-Shirt”, “item_id”: 301, “brand”: “FashionCo”, “price”: 19.99, “variation_id”: “301-B”, “Color”: “Red”, “Size”: “M” }, { “category”: “Clothing”, “item_name”: “T-Shirt”, “item_id”: 301, “brand”: “FashionCo”, “price”: 19.99, “variation_id”: “301-C”, “Color”: “Green”, “Size”: “L” }, { “category”: “Clothing”, “item_name”: “Jeans”, “item_id”: 302, “brand”: “DenimWorks”, “price”: 49.99, “variation_id”: “302-A”, “Color”: “Dark Blue”, “Size”: “32” }, { “category”: “Clothing”, “item_name”: “Jeans”, “item_id”: 302, “brand”: “DenimWorks”, “price”: 49.99, “variation_id”: “302-B”, “Color”: “Light Blue”, “Size”: “34” }, { “category”: “Books”, “item_name”: “Fiction Novel”, “item_id”: 401, “brand”: “-”, “price”: 14.99, “variation_id”: “401-A”, “Format”: “Hardcover”, “Language”: “English” }, { “category”: “Books”, “item_name”: “Fiction Novel”, “item_id”: 401, “brand”: “-”, “price”: 14.99, “variation_id”: “401-B”, “Format”: “Paperback”, “Language”: “Spanish” }, { “category”: “Books”, “item_name”: “Non-Fiction Guide”, “item_id”: 402, “brand”: “-”, “price”: 24.99, “variation_id”: “402-A”, “Format”: “eBook”, “Language”: “English” }, { “category”: “Books”, “item_name”: “Non-Fiction Guide”, “item_id”: 402, “brand”: “-”, “price”: 24.99, “variation_id”: “402-B”, “Format”: “Paperback”, “Language”: “French” }, { “category”: “Sports Equipment”, “item_name”: “Basketball”, “item_id”: 501, “brand”: “SportsGear”, “price”: 29.99, “variation_id”: “501-A”, “Color”: “Orange”, “Size”: “Size 7” }, { “category”: “Sports Equipment”, “item_name”: “Tennis Racket”, “item_id”: 502, “brand”: “RacketPro”, “price”: 89.99, “variation_id”: “502-A”, “Color”: “Black”, “Material”: “Graphite” }, { “category”: “Sports Equipment”, “item_name”: “Tennis Racket”, “item_id”: 502, “brand”: “RacketPro”, “price”: 89.99, “variation_id”: “502-B”, “Color”: “Silver”, “Material”: “Aluminum” }]

HTML

HTML formatting is best for displaying structured content on web pages. Examples are using HTML to generate reports, dashboards, and formatted text to host on a website.

# convert to HTML table
html_data <- xtable(df)

# Save HTML to a file
html_output <- print.xtable(html_data, type="html", print.results=FALSE)
write_lines(html_output, "CUNYMart.html")

# Read and display the HTML as plain text
html_text <- readLines("CUNYMart.html")
cat(paste(html_text, collapse="\n"))
## <!-- html table generated in R 4.4.3 by xtable 1.8-4 package -->
## <!-- Sun Mar 30 17:17:37 2025 -->
## <table border=1>
## <tr> <th>  </th> <th> category </th> <th> item_name </th> <th> item_id </th> <th> brand </th> <th> price </th> <th> variation_id </th> <th> Color </th> <th> Storage </th> <th> Capacity </th> <th> Type </th> <th> Size </th> <th> Format </th> <th> Language </th> <th> Material </th>  </tr>
##   <tr> <td align="right"> 1 </td> <td> Electronics </td> <td> Smartphone </td> <td align="right"> 101.00 </td> <td> TechBrand </td> <td align="right"> 699.99 </td> <td> 101-A </td> <td> Black </td> <td> 64GB </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 2 </td> <td> Electronics </td> <td> Smartphone </td> <td align="right"> 101.00 </td> <td> TechBrand </td> <td align="right"> 699.99 </td> <td> 101-B </td> <td> White </td> <td> 128GB </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 3 </td> <td> Electronics </td> <td> Laptop </td> <td align="right"> 102.00 </td> <td> CompuBrand </td> <td align="right"> 1099.99 </td> <td> 102-A </td> <td> Silver </td> <td> 256GB </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 4 </td> <td> Electronics </td> <td> Laptop </td> <td align="right"> 102.00 </td> <td> CompuBrand </td> <td align="right"> 1099.99 </td> <td> 102-B </td> <td> Space Gray </td> <td> 512GB </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 5 </td> <td> Home Appliances </td> <td> Refrigerator </td> <td align="right"> 201.00 </td> <td> HomeCool </td> <td align="right"> 899.99 </td> <td> 201-A </td> <td> Stainless Steel </td> <td>  </td> <td> 20 cu ft, </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 6 </td> <td> Home Appliances </td> <td> Refrigerator </td> <td align="right"> 201.00 </td> <td> HomeCool </td> <td align="right"> 899.99 </td> <td> 201-B </td> <td> White </td> <td>  </td> <td> 18 cu ft </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 7 </td> <td> Home Appliances </td> <td> Washing Machine </td> <td align="right"> 202.00 </td> <td> CleanTech </td> <td align="right"> 499.99 </td> <td> 202-A </td> <td>  </td> <td>  </td> <td> 4.5 cu ft, </td> <td> Front Load </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 8 </td> <td> Home Appliances </td> <td> Washing Machine </td> <td align="right"> 202.00 </td> <td> CleanTech </td> <td align="right"> 499.99 </td> <td> 202-B </td> <td>  </td> <td>  </td> <td> 5.0 cu ft, </td> <td> Top Load </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 9 </td> <td> Clothing </td> <td> T-Shirt </td> <td align="right"> 301.00 </td> <td> FashionCo </td> <td align="right"> 19.99 </td> <td> 301-A </td> <td> Blue </td> <td>  </td> <td>  </td> <td>  </td> <td> S </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 10 </td> <td> Clothing </td> <td> T-Shirt </td> <td align="right"> 301.00 </td> <td> FashionCo </td> <td align="right"> 19.99 </td> <td> 301-B </td> <td> Red </td> <td>  </td> <td>  </td> <td>  </td> <td> M </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 11 </td> <td> Clothing </td> <td> T-Shirt </td> <td align="right"> 301.00 </td> <td> FashionCo </td> <td align="right"> 19.99 </td> <td> 301-C </td> <td> Green </td> <td>  </td> <td>  </td> <td>  </td> <td> L </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 12 </td> <td> Clothing </td> <td> Jeans </td> <td align="right"> 302.00 </td> <td> DenimWorks </td> <td align="right"> 49.99 </td> <td> 302-A </td> <td> Dark Blue </td> <td>  </td> <td>  </td> <td>  </td> <td> 32 </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 13 </td> <td> Clothing </td> <td> Jeans </td> <td align="right"> 302.00 </td> <td> DenimWorks </td> <td align="right"> 49.99 </td> <td> 302-B </td> <td> Light Blue </td> <td>  </td> <td>  </td> <td>  </td> <td> 34 </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 14 </td> <td> Books </td> <td> Fiction Novel </td> <td align="right"> 401.00 </td> <td> - </td> <td align="right"> 14.99 </td> <td> 401-A </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td> Hardcover </td> <td> English </td> <td>  </td> </tr>
##   <tr> <td align="right"> 15 </td> <td> Books </td> <td> Fiction Novel </td> <td align="right"> 401.00 </td> <td> - </td> <td align="right"> 14.99 </td> <td> 401-B </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td> Paperback </td> <td> Spanish </td> <td>  </td> </tr>
##   <tr> <td align="right"> 16 </td> <td> Books </td> <td> Non-Fiction Guide </td> <td align="right"> 402.00 </td> <td> - </td> <td align="right"> 24.99 </td> <td> 402-A </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td> eBook </td> <td> English </td> <td>  </td> </tr>
##   <tr> <td align="right"> 17 </td> <td> Books </td> <td> Non-Fiction Guide </td> <td align="right"> 402.00 </td> <td> - </td> <td align="right"> 24.99 </td> <td> 402-B </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td> Paperback </td> <td> French </td> <td>  </td> </tr>
##   <tr> <td align="right"> 18 </td> <td> Sports Equipment </td> <td> Basketball </td> <td align="right"> 501.00 </td> <td> SportsGear </td> <td align="right"> 29.99 </td> <td> 501-A </td> <td> Orange </td> <td>  </td> <td>  </td> <td>  </td> <td> Size 7 </td> <td>  </td> <td>  </td> <td>  </td> </tr>
##   <tr> <td align="right"> 19 </td> <td> Sports Equipment </td> <td> Tennis Racket </td> <td align="right"> 502.00 </td> <td> RacketPro </td> <td align="right"> 89.99 </td> <td> 502-A </td> <td> Black </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td> Graphite </td> </tr>
##   <tr> <td align="right"> 20 </td> <td> Sports Equipment </td> <td> Tennis Racket </td> <td align="right"> 502.00 </td> <td> RacketPro </td> <td align="right"> 89.99 </td> <td> 502-B </td> <td> Silver </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td>  </td> <td> Aluminum </td> </tr>
##    </table>

XML

XML is used for data exchange in enterprise applications, web services, and configurations.

# create a new xml doc
doc_xml <- newXMLDoc(isHTML = FALSE)

# create an empty table node
table_node <- newXMLNode("table", doc = doc_xml)

# create a function to convert every row in the df into new node
row_data <- apply(df, 1, function(x) {
  z1 <- newXMLNode('row') # create a new node for each row
  addChildren(z1, lapply(names(x), function(y) newXMLNode(y, x[y])))
})

# add row data to the table node
xmlParent(row_data) <- table_node

# save as xml file
saveXML(doc_xml, file = "CUNYMart.xml")

[1] “CUNYMart.xml”

read_doc_xml <- read_xml("CUNYMart.xml")
print(read_doc_xml)
{xml_document}

[1] ElectronicsSmartphoneElectronicsSmartphoneElectronicsLaptopElectronicsLaptopHome AppliancesRefrigerator … [6] Home AppliancesRefrigerator … [7] Home AppliancesWashing Mach … [8] Home AppliancesWashing Mach … [9] ClothingT-Shirt … [10] ClothingT-Shirt … [11] ClothingT-Shirt … [12] ClothingJeans… [13] ClothingJeans… [14] BooksFiction NovelBooksFiction NovelBooksNon-Fiction GuideBooksNon-Fiction GuideSports EquipmentBasketball< … [19] Sports EquipmentTennis Rack … [20] Sports EquipmentTennis Rack …

Parquet

A good use case for Parquet is big data analytics and storage, because it provides efficient compression, making it ideal for inexpensive cloud storage.

# write df to parquet file
write_parquet(
  df,
  "CUNYMart.parquet"
)

Data Types (JSON, HTML, XML, and Parquet)


Format Pros Cons
JSON Lightweight, human-readable, widely used in web APIs, and supports nested data structures No specified data types, no error handling mechanism
HTML Standard for web pages, ideal for visualizing data in tables on websites Requires a lot of code to produce an output, can be difficult to understand due to the structure
XML Designed to store and transport data, optimal for simple and readable data Large files, can be expensive to store for larger datasets
Parquet Very lightweight, inexpensive option for storing big data Isn’t human-readable like CSVs or JSON