In this report, unstructured data from copied from a PDF file will be cleaned, and converted into JSON, HTML, XML, and Parquet.
The code below reads in the text copied from the PDF file:
txt <- read_csv(
"Category,Item Name,Item ID,Brand,Price,Variation ID,Variation Details
Electronics,Smartphone,101,TechBrand,699.99,101-A,Color: Black, Storage: 64GB
Electronics,Smartphone,101,TechBrand,699.99,101-B,Color: White, Storage: 128GB
Electronics,Laptop,102,CompuBrand,1099.99,102-A,Color: Silver, Storage: 256GB
Electronics,Laptop,102,CompuBrand,1099.99,102-B,Color: Space Gray, Storage: 512GB
Home Appliances,Refrigerator,201,HomeCool,899.99,201-A,Color: Stainless Steel, Capacity: 20 cu ft,
Home Appliances,Refrigerator,201,HomeCool,899.99,201-B,Color: White, Capacity: 18 cu ft
Home Appliances,Washing Machine,202,CleanTech,499.99,202-A,Type: Front Load, Capacity: 4.5 cu ft,
Home Appliances,Washing Machine,202,CleanTech,499.99,202-B,Type: Top Load, Capacity: 5.0 cu ft,
Clothing,T-Shirt,301,FashionCo,19.99,301-A,Color: Blue, Size: S
Clothing,T-Shirt,301,FashionCo,19.99,301-B,Color: Red, Size: M
Clothing,T-Shirt,301,FashionCo,19.99,301-C,Color: Green, Size: L
Clothing,Jeans,302,DenimWorks,49.99,302-A,Color: Dark Blue, Size: 32
Clothing,Jeans,302,DenimWorks,49.99,302-B,Color: Light Blue, Size: 34
Books,Fiction Novel,401,-,14.99,401-A,Format: Hardcover, Language: English
Books,Fiction Novel,401,-,14.99,401-B,Format: Paperback, Language: Spanish
Books,Non-Fiction Guide,402,-,24.99,402-A,Format: eBook, Language: English
Books,Non-Fiction Guide,402,-,24.99,402-B,Format: Paperback, Language: French
Sports Equipment,Basketball,501,SportsGear,29.99,501-A,Size: Size 7, Color: Orange
Sports Equipment,Tennis Racket,502,RacketPro,89.99,502-A,Material: Graphite, Color: Black
Sports Equipment,Tennis Racket,502,RacketPro,89.99,502-B,Material: Aluminum, Color: Silver"
) |>
# change names to be in snake case
janitor::clean_names()In the code below, the data is cleaned so that there are no longer
multiple values in the variation_details column. This will
produce more null values, but will make it simpler to analyze the
variation details for each item.
df <- txt |>
# split the terms by the comma
separate_wider_delim(variation_details, delim = ", ",names_sep = "_") |>
# Split again by colon and space
separate_wider_delim(c("variation_details_1","variation_details_2"), delim =": ", names_sep = "_") |>
# pivot longer so that the variation categories are in each row
pivot_longer(
cols = matches("details"),
names_to = c("variable", ".value"),
names_pattern = "([0-9]{1})_([0-9]{1})"
) |>
select(-variable) |>
# pivot wider to make each category a column
pivot_wider(
names_from = `1`,
values_from = `2`
)Once the data is cleaned, it can be converted into different formats, such as JSON, HTML, XML, and Parquet.
JSON is largely used web applications and APIs due to its lightweight, human-readable format that integrates well with modern data pipelines.
# convert df to json
json_data <- toJSON(df, pretty = TRUE)
# create json file
write(json_data, "CUNYMart.json")
print(json_data)[ { “category”: “Electronics”, “item_name”: “Smartphone”, “item_id”: 101, “brand”: “TechBrand”, “price”: 699.99, “variation_id”: “101-A”, “Color”: “Black”, “Storage”: “64GB” }, { “category”: “Electronics”, “item_name”: “Smartphone”, “item_id”: 101, “brand”: “TechBrand”, “price”: 699.99, “variation_id”: “101-B”, “Color”: “White”, “Storage”: “128GB” }, { “category”: “Electronics”, “item_name”: “Laptop”, “item_id”: 102, “brand”: “CompuBrand”, “price”: 1099.99, “variation_id”: “102-A”, “Color”: “Silver”, “Storage”: “256GB” }, { “category”: “Electronics”, “item_name”: “Laptop”, “item_id”: 102, “brand”: “CompuBrand”, “price”: 1099.99, “variation_id”: “102-B”, “Color”: “Space Gray”, “Storage”: “512GB” }, { “category”: “Home Appliances”, “item_name”: “Refrigerator”, “item_id”: 201, “brand”: “HomeCool”, “price”: 899.99, “variation_id”: “201-A”, “Color”: “Stainless Steel”, “Capacity”: “20 cu ft,” }, { “category”: “Home Appliances”, “item_name”: “Refrigerator”, “item_id”: 201, “brand”: “HomeCool”, “price”: 899.99, “variation_id”: “201-B”, “Color”: “White”, “Capacity”: “18 cu ft” }, { “category”: “Home Appliances”, “item_name”: “Washing Machine”, “item_id”: 202, “brand”: “CleanTech”, “price”: 499.99, “variation_id”: “202-A”, “Capacity”: “4.5 cu ft,”, “Type”: “Front Load” }, { “category”: “Home Appliances”, “item_name”: “Washing Machine”, “item_id”: 202, “brand”: “CleanTech”, “price”: 499.99, “variation_id”: “202-B”, “Capacity”: “5.0 cu ft,”, “Type”: “Top Load” }, { “category”: “Clothing”, “item_name”: “T-Shirt”, “item_id”: 301, “brand”: “FashionCo”, “price”: 19.99, “variation_id”: “301-A”, “Color”: “Blue”, “Size”: “S” }, { “category”: “Clothing”, “item_name”: “T-Shirt”, “item_id”: 301, “brand”: “FashionCo”, “price”: 19.99, “variation_id”: “301-B”, “Color”: “Red”, “Size”: “M” }, { “category”: “Clothing”, “item_name”: “T-Shirt”, “item_id”: 301, “brand”: “FashionCo”, “price”: 19.99, “variation_id”: “301-C”, “Color”: “Green”, “Size”: “L” }, { “category”: “Clothing”, “item_name”: “Jeans”, “item_id”: 302, “brand”: “DenimWorks”, “price”: 49.99, “variation_id”: “302-A”, “Color”: “Dark Blue”, “Size”: “32” }, { “category”: “Clothing”, “item_name”: “Jeans”, “item_id”: 302, “brand”: “DenimWorks”, “price”: 49.99, “variation_id”: “302-B”, “Color”: “Light Blue”, “Size”: “34” }, { “category”: “Books”, “item_name”: “Fiction Novel”, “item_id”: 401, “brand”: “-”, “price”: 14.99, “variation_id”: “401-A”, “Format”: “Hardcover”, “Language”: “English” }, { “category”: “Books”, “item_name”: “Fiction Novel”, “item_id”: 401, “brand”: “-”, “price”: 14.99, “variation_id”: “401-B”, “Format”: “Paperback”, “Language”: “Spanish” }, { “category”: “Books”, “item_name”: “Non-Fiction Guide”, “item_id”: 402, “brand”: “-”, “price”: 24.99, “variation_id”: “402-A”, “Format”: “eBook”, “Language”: “English” }, { “category”: “Books”, “item_name”: “Non-Fiction Guide”, “item_id”: 402, “brand”: “-”, “price”: 24.99, “variation_id”: “402-B”, “Format”: “Paperback”, “Language”: “French” }, { “category”: “Sports Equipment”, “item_name”: “Basketball”, “item_id”: 501, “brand”: “SportsGear”, “price”: 29.99, “variation_id”: “501-A”, “Color”: “Orange”, “Size”: “Size 7” }, { “category”: “Sports Equipment”, “item_name”: “Tennis Racket”, “item_id”: 502, “brand”: “RacketPro”, “price”: 89.99, “variation_id”: “502-A”, “Color”: “Black”, “Material”: “Graphite” }, { “category”: “Sports Equipment”, “item_name”: “Tennis Racket”, “item_id”: 502, “brand”: “RacketPro”, “price”: 89.99, “variation_id”: “502-B”, “Color”: “Silver”, “Material”: “Aluminum” }]
HTML formatting is best for displaying structured content on web pages. Examples are using HTML to generate reports, dashboards, and formatted text to host on a website.
# convert to HTML table
html_data <- xtable(df)
# Save HTML to a file
html_output <- print.xtable(html_data, type="html", print.results=FALSE)
write_lines(html_output, "CUNYMart.html")
# Read and display the HTML as plain text
html_text <- readLines("CUNYMart.html")
cat(paste(html_text, collapse="\n"))## <!-- html table generated in R 4.4.3 by xtable 1.8-4 package -->
## <!-- Sun Mar 30 17:17:37 2025 -->
## <table border=1>
## <tr> <th> </th> <th> category </th> <th> item_name </th> <th> item_id </th> <th> brand </th> <th> price </th> <th> variation_id </th> <th> Color </th> <th> Storage </th> <th> Capacity </th> <th> Type </th> <th> Size </th> <th> Format </th> <th> Language </th> <th> Material </th> </tr>
## <tr> <td align="right"> 1 </td> <td> Electronics </td> <td> Smartphone </td> <td align="right"> 101.00 </td> <td> TechBrand </td> <td align="right"> 699.99 </td> <td> 101-A </td> <td> Black </td> <td> 64GB </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 2 </td> <td> Electronics </td> <td> Smartphone </td> <td align="right"> 101.00 </td> <td> TechBrand </td> <td align="right"> 699.99 </td> <td> 101-B </td> <td> White </td> <td> 128GB </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 3 </td> <td> Electronics </td> <td> Laptop </td> <td align="right"> 102.00 </td> <td> CompuBrand </td> <td align="right"> 1099.99 </td> <td> 102-A </td> <td> Silver </td> <td> 256GB </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 4 </td> <td> Electronics </td> <td> Laptop </td> <td align="right"> 102.00 </td> <td> CompuBrand </td> <td align="right"> 1099.99 </td> <td> 102-B </td> <td> Space Gray </td> <td> 512GB </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 5 </td> <td> Home Appliances </td> <td> Refrigerator </td> <td align="right"> 201.00 </td> <td> HomeCool </td> <td align="right"> 899.99 </td> <td> 201-A </td> <td> Stainless Steel </td> <td> </td> <td> 20 cu ft, </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 6 </td> <td> Home Appliances </td> <td> Refrigerator </td> <td align="right"> 201.00 </td> <td> HomeCool </td> <td align="right"> 899.99 </td> <td> 201-B </td> <td> White </td> <td> </td> <td> 18 cu ft </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 7 </td> <td> Home Appliances </td> <td> Washing Machine </td> <td align="right"> 202.00 </td> <td> CleanTech </td> <td align="right"> 499.99 </td> <td> 202-A </td> <td> </td> <td> </td> <td> 4.5 cu ft, </td> <td> Front Load </td> <td> </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 8 </td> <td> Home Appliances </td> <td> Washing Machine </td> <td align="right"> 202.00 </td> <td> CleanTech </td> <td align="right"> 499.99 </td> <td> 202-B </td> <td> </td> <td> </td> <td> 5.0 cu ft, </td> <td> Top Load </td> <td> </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 9 </td> <td> Clothing </td> <td> T-Shirt </td> <td align="right"> 301.00 </td> <td> FashionCo </td> <td align="right"> 19.99 </td> <td> 301-A </td> <td> Blue </td> <td> </td> <td> </td> <td> </td> <td> S </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 10 </td> <td> Clothing </td> <td> T-Shirt </td> <td align="right"> 301.00 </td> <td> FashionCo </td> <td align="right"> 19.99 </td> <td> 301-B </td> <td> Red </td> <td> </td> <td> </td> <td> </td> <td> M </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 11 </td> <td> Clothing </td> <td> T-Shirt </td> <td align="right"> 301.00 </td> <td> FashionCo </td> <td align="right"> 19.99 </td> <td> 301-C </td> <td> Green </td> <td> </td> <td> </td> <td> </td> <td> L </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 12 </td> <td> Clothing </td> <td> Jeans </td> <td align="right"> 302.00 </td> <td> DenimWorks </td> <td align="right"> 49.99 </td> <td> 302-A </td> <td> Dark Blue </td> <td> </td> <td> </td> <td> </td> <td> 32 </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 13 </td> <td> Clothing </td> <td> Jeans </td> <td align="right"> 302.00 </td> <td> DenimWorks </td> <td align="right"> 49.99 </td> <td> 302-B </td> <td> Light Blue </td> <td> </td> <td> </td> <td> </td> <td> 34 </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 14 </td> <td> Books </td> <td> Fiction Novel </td> <td align="right"> 401.00 </td> <td> - </td> <td align="right"> 14.99 </td> <td> 401-A </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> Hardcover </td> <td> English </td> <td> </td> </tr>
## <tr> <td align="right"> 15 </td> <td> Books </td> <td> Fiction Novel </td> <td align="right"> 401.00 </td> <td> - </td> <td align="right"> 14.99 </td> <td> 401-B </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> Paperback </td> <td> Spanish </td> <td> </td> </tr>
## <tr> <td align="right"> 16 </td> <td> Books </td> <td> Non-Fiction Guide </td> <td align="right"> 402.00 </td> <td> - </td> <td align="right"> 24.99 </td> <td> 402-A </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> eBook </td> <td> English </td> <td> </td> </tr>
## <tr> <td align="right"> 17 </td> <td> Books </td> <td> Non-Fiction Guide </td> <td align="right"> 402.00 </td> <td> - </td> <td align="right"> 24.99 </td> <td> 402-B </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> Paperback </td> <td> French </td> <td> </td> </tr>
## <tr> <td align="right"> 18 </td> <td> Sports Equipment </td> <td> Basketball </td> <td align="right"> 501.00 </td> <td> SportsGear </td> <td align="right"> 29.99 </td> <td> 501-A </td> <td> Orange </td> <td> </td> <td> </td> <td> </td> <td> Size 7 </td> <td> </td> <td> </td> <td> </td> </tr>
## <tr> <td align="right"> 19 </td> <td> Sports Equipment </td> <td> Tennis Racket </td> <td align="right"> 502.00 </td> <td> RacketPro </td> <td align="right"> 89.99 </td> <td> 502-A </td> <td> Black </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> Graphite </td> </tr>
## <tr> <td align="right"> 20 </td> <td> Sports Equipment </td> <td> Tennis Racket </td> <td align="right"> 502.00 </td> <td> RacketPro </td> <td align="right"> 89.99 </td> <td> 502-B </td> <td> Silver </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> </td> <td> Aluminum </td> </tr>
## </table>
XML is used for data exchange in enterprise applications, web services, and configurations.
# create a new xml doc
doc_xml <- newXMLDoc(isHTML = FALSE)
# create an empty table node
table_node <- newXMLNode("table", doc = doc_xml)
# create a function to convert every row in the df into new node
row_data <- apply(df, 1, function(x) {
z1 <- newXMLNode('row') # create a new node for each row
addChildren(z1, lapply(names(x), function(y) newXMLNode(y, x[y])))
})
# add row data to the table node
xmlParent(row_data) <- table_node
# save as xml file
saveXML(doc_xml, file = "CUNYMart.xml")[1] “CUNYMart.xml”
{xml_document}| Format | Pros | Cons |
|---|---|---|
| JSON | Lightweight, human-readable, widely used in web APIs, and supports nested data structures | No specified data types, no error handling mechanism |
| HTML | Standard for web pages, ideal for visualizing data in tables on websites | Requires a lot of code to produce an output, can be difficult to understand due to the structure |
| XML | Designed to store and transport data, optimal for simple and readable data | Large files, can be expensive to store for larger datasets |
| Parquet | Very lightweight, inexpensive option for storing big data | Isn’t human-readable like CSVs or JSON |