library(tidyverse)
library(fpp3)
library(naniar)
# 2. Read the raw data
url <- "https://raw.githubusercontent.com/uzmabb182/Data_624_Predictive_Analytics/refs/heads/main/GroceryDataSet.csv"
grocery_df <- read_csv(url)
#view(grocery_df)
Transform Data to Long Format
we will convert your wide table (with many empty columns) into a Long Format list.
# Add a Transaction ID to each row
# Pivot the data into a single column
# Remove the empty (NA) cells
grocery_long <- grocery_df |>
mutate(tid = row_number()) |>
pivot_longer(cols = -tid, names_to = "item_index", values_to = "item") |>
filter(!is.na(item)) |>
select(tid, item)
# View the first few rows to confirm it worked
head(grocery_long)
## # A tibble: 6 × 2
## tid item
## <int> <chr>
## 1 1 tropical fruit
## 2 1 yogurt
## 3 1 coffee
## 4 2 whole milk
## 5 3 pip fruit
## 6 3 yogurt
Calculate Item Frequencies (Support)
we need to calculate the Support for each individual item. This helps you identify the most frequent items in the dataset
# Count the total number of unique transactions (receipts)
total_transactions <- n_distinct(grocery_long$tid)
# Calculate the count and support (percentage) for each item
item_support <- grocery_long |>
group_by(item) |>
summarize(count = n()) |>
mutate(support = count / total_transactions) |>
arrange(desc(count))
# View the top 10 most frequent items
head(item_support, 10)
## # A tibble: 10 × 3
## item count support
## <chr> <int> <dbl>
## 1 whole milk 2513 0.256
## 2 other vegetables 1903 0.194
## 3 rolls/buns 1809 0.184
## 4 soda 1715 0.174
## 5 yogurt 1372 0.140
## 6 bottled water 1087 0.111
## 7 root vegetables 1072 0.109
## 8 tropical fruit 1032 0.105
## 9 shopping bags 969 0.0985
## 10 sausage 924 0.0940
*Visualize Item Frequencies**
# 1. Select the top 10 items from the support table we created
top_items <- item_support |>
head(10)
# 2. Create a bar chart
library(ggplot2)
ggplot(top_items, aes(x = reorder(item, support), y = support)) +
geom_col(fill = "skyblue") +
coord_flip() + # This flips the axis so the item names are easy to read
labs(
title = "Top 10 Most Frequent Items (Support)",
x = "Grocery Item",
y = "Support (Percentage of Baskets)"
) +
theme_minimal()
Find Item Pairs and Calculate Metrics
next step is to find pairs of items that were bought together in the same receipt. This allows us to calculate Confidence and Lift.
We finds every combination of two items in a single basket and calculates the required metrics.
# Create pairs of items bought together in the same transaction (tid)
item_pairs <- grocery_long |>
inner_join(grocery_long, by = "tid", relationship = "many-to-many") |>
# Filter to keep only unique pairs (avoids "milk & milk" and double counting "milk & bread" / "bread & milk")
filter(item.x < item.y) |>
group_by(item.x, item.y) |>
summarize(pair_count = n(), .groups = 'drop')
# Join with individual item counts to calculate Support, Confidence, and Lift
# We join 'item_support' twice: once for the first item (LHS) and once for the second (RHS)
rules_manual <- item_pairs |>
inner_join(item_support, by = c("item.x" = "item")) |>
rename(support_lhs = support, count_lhs = count) |>
inner_join(item_support, by = c("item.y" = "item")) |>
rename(support_rhs = support, count_rhs = count) |>
mutate(
support = pair_count / total_transactions,
confidence = pair_count / count_lhs,
lift = support / (support_lhs * support_rhs)
)
# View the top 10 rules by Lift as required by your assignment
top_10_lift <- rules_manual |>
arrange(desc(lift)) |>
head(10)
print(top_10_lift)
## # A tibble: 10 × 10
## item.x item.y pair_count count_lhs support_lhs count_rhs support_rhs support
## <chr> <chr> <int> <int> <dbl> <int> <dbl> <dbl>
## 1 cocoa … prese… 1 22 0.00224 2 0.000203 1.02e-4
## 2 baby f… finis… 1 1 0.000102 64 0.00651 1.02e-4
## 3 baby f… soups 1 1 0.000102 67 0.00681 1.02e-4
## 4 abrasi… prese… 1 35 0.00356 2 0.000203 1.02e-4
## 5 baby c… cream 1 6 0.000610 13 0.00132 1.02e-4
## 6 frozen… sound… 1 83 0.00844 1 0.000102 1.02e-4
## 7 bags tidbi… 1 4 0.000407 23 0.00234 1.02e-4
## 8 preser… spices 1 2 0.000203 51 0.00519 1.02e-4
## 9 fish kitch… 1 29 0.00295 4 0.000407 1.02e-4
## 10 baby f… cake … 1 1 0.000102 130 0.0132 1.02e-4
## # ℹ 2 more variables: confidence <dbl>, lift <dbl>
Extra Credit - Simple Cluster Analysis
# Identify the top 20 most frequent items to keep the chart readable
top_20_items <- item_support |>
head(20) |>
pull(item)
# Create a binary matrix (Transactions vs. Items)
# This table marks 1 if an item was in a basket, and 0 if it wasn't
cluster_matrix <- grocery_long |>
filter(item %in% top_20_items) |>
mutate(present = 1) |>
pivot_wider(names_from = item, values_from = present, values_fill = 0) |>
select(-tid)
# Calculate the distance between items
# We use 'method = binary' because our data is 1s and 0s
# We transpose 't()' because we want to cluster the items, not the receipts
item_distance <- dist(t(cluster_matrix), method = "binary")
# Perform Hierarchical Clustering using Ward's method
hc <- hclust(item_distance, method = "ward.D2")
# Plot the Dendrogram (the cluster tree)
plot(hc, main = "Cluster Analysis: Top 20 Grocery Items",
xlab = "Items", sub = "Items grouped by similar purchasing patterns",
col = "#2E8B57") # Dark green color
We see some very strong relationships. For example, your rule for
{flour} -> {sugar} with a Lift of 8.46 is a classic “baking”
cluster—those items are bought together over 8 times more often than
random chance would suggest.