Assignment_10_Data

library(tidyverse)
library(fpp3)
library(naniar)

# 2. Read the raw data
url <- "https://raw.githubusercontent.com/uzmabb182/Data_624_Predictive_Analytics/refs/heads/main/GroceryDataSet.csv"
grocery_df <- read_csv(url)

#view(grocery_df)

Transform Data to Long Format

we will convert your wide table (with many empty columns) into a Long Format list.

# Add a Transaction ID to each row
# Pivot the data into a single column
# Remove the empty (NA) cells
grocery_long <- grocery_df |>
  mutate(tid = row_number()) |>
  pivot_longer(cols = -tid, names_to = "item_index", values_to = "item") |>
  filter(!is.na(item)) |>
  select(tid, item)

# View the first few rows to confirm it worked
head(grocery_long)

## # A tibble: 6 × 2
##     tid item          
##   <int> <chr>         
## 1     1 tropical fruit
## 2     1 yogurt        
## 3     1 coffee        
## 4     2 whole milk    
## 5     3 pip fruit     
## 6     3 yogurt

Calculate Item Frequencies (Support)

we need to calculate the Support for each individual item. This helps you identify the most frequent items in the dataset

# Count the total number of unique transactions (receipts)
total_transactions <- n_distinct(grocery_long$tid)

# Calculate the count and support (percentage) for each item
item_support <- grocery_long |>
  group_by(item) |>
  summarize(count = n()) |>
  mutate(support = count / total_transactions) |>
  arrange(desc(count))

# View the top 10 most frequent items
head(item_support, 10)

## # A tibble: 10 × 3
##    item             count support
##    <chr>            <int>   <dbl>
##  1 whole milk        2513  0.256 
##  2 other vegetables  1903  0.194 
##  3 rolls/buns        1809  0.184 
##  4 soda              1715  0.174 
##  5 yogurt            1372  0.140 
##  6 bottled water     1087  0.111 
##  7 root vegetables   1072  0.109 
##  8 tropical fruit    1032  0.105 
##  9 shopping bags      969  0.0985
## 10 sausage            924  0.0940

*Visualize Item Frequencies**

# 1. Select the top 10 items from the support table we created
top_items <- item_support |>
  head(10)

# 2. Create a bar chart
library(ggplot2)

ggplot(top_items, aes(x = reorder(item, support), y = support)) +
  geom_col(fill = "skyblue") +
  coord_flip() +  # This flips the axis so the item names are easy to read
  labs(
    title = "Top 10 Most Frequent Items (Support)",
    x = "Grocery Item",
    y = "Support (Percentage of Baskets)"
  ) +
  theme_minimal()

Find Item Pairs and Calculate Metrics

next step is to find pairs of items that were bought together in the same receipt. This allows us to calculate Confidence and Lift.

We finds every combination of two items in a single basket and calculates the required metrics.

# Create pairs of items bought together in the same transaction (tid)
item_pairs <- grocery_long |>
  inner_join(grocery_long, by = "tid", relationship = "many-to-many") |>
  # Filter to keep only unique pairs (avoids "milk & milk" and double counting "milk & bread" / "bread & milk")
  filter(item.x < item.y) |>
  group_by(item.x, item.y) |>
  summarize(pair_count = n(), .groups = 'drop')

# Join with individual item counts to calculate Support, Confidence, and Lift
# We join 'item_support' twice: once for the first item (LHS) and once for the second (RHS)
rules_manual <- item_pairs |>
  inner_join(item_support, by = c("item.x" = "item")) |>
  rename(support_lhs = support, count_lhs = count) |>
  inner_join(item_support, by = c("item.y" = "item")) |>
  rename(support_rhs = support, count_rhs = count) |>
  mutate(
    support = pair_count / total_transactions,
    confidence = pair_count / count_lhs,
    lift = support / (support_lhs * support_rhs)
  )

# View the top 10 rules by Lift as required by your assignment
top_10_lift <- rules_manual |>
  arrange(desc(lift)) |>
  head(10)

print(top_10_lift)

## # A tibble: 10 × 10
##    item.x  item.y pair_count count_lhs support_lhs count_rhs support_rhs support
##    <chr>   <chr>       <int>     <int>       <dbl>     <int>       <dbl>   <dbl>
##  1 cocoa … prese…          1        22    0.00224          2    0.000203 1.02e-4
##  2 baby f… finis…          1         1    0.000102        64    0.00651  1.02e-4
##  3 baby f… soups           1         1    0.000102        67    0.00681  1.02e-4
##  4 abrasi… prese…          1        35    0.00356          2    0.000203 1.02e-4
##  5 baby c… cream           1         6    0.000610        13    0.00132  1.02e-4
##  6 frozen… sound…          1        83    0.00844          1    0.000102 1.02e-4
##  7 bags    tidbi…          1         4    0.000407        23    0.00234  1.02e-4
##  8 preser… spices          1         2    0.000203        51    0.00519  1.02e-4
##  9 fish    kitch…          1        29    0.00295          4    0.000407 1.02e-4
## 10 baby f… cake …          1         1    0.000102       130    0.0132   1.02e-4
## # ℹ 2 more variables: confidence <dbl>, lift <dbl>

Extra Credit - Simple Cluster Analysis

# Identify the top 20 most frequent items to keep the chart readable
top_20_items <- item_support |>
  head(20) |>
  pull(item)

# Create a binary matrix (Transactions vs. Items)
# This table marks 1 if an item was in a basket, and 0 if it wasn't
cluster_matrix <- grocery_long |>
  filter(item %in% top_20_items) |>
  mutate(present = 1) |>
  pivot_wider(names_from = item, values_from = present, values_fill = 0) |>
  select(-tid)

# Calculate the distance between items
# We use 'method = binary' because our data is 1s and 0s
# We transpose 't()' because we want to cluster the items, not the receipts
item_distance <- dist(t(cluster_matrix), method = "binary")

# Perform Hierarchical Clustering using Ward's method
hc <- hclust(item_distance, method = "ward.D2")

# Plot the Dendrogram (the cluster tree)
plot(hc, main = "Cluster Analysis: Top 20 Grocery Items", 
     xlab = "Items", sub = "Items grouped by similar purchasing patterns",
     col = "#2E8B57") # Dark green color

We see some very strong relationships. For example, your rule for {flour} -> {sugar} with a Lift of 8.46 is a classic “baking” cluster—those items are bought together over 8 times more often than random chance would suggest.

Assignment_10_Data_624

Mubashira Qari

2026-04-18