Data 624_Exercise

Imagine 10000 receipts sitting on your table. Each receipt represents a transaction with items that were purchased. The receipt is a representation of stuff that went into a customer’s basket - and therefore ‘Market Basket Analysis’. That is exactly what the Groceries Data Set contains: a collection of receipts with each line representing 1 receipt and the items purchased. Each line is called a transaction and each column in a row represents an item. The data set is attached. Your assignment is to use R to mine the data for association rules. You should report support, confidence and lift and your top 10 rules by lift.

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(arules)

## Warning: package 'arules' was built under R version 4.4.3

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## 
## Attaching package: 'arules'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

library(arulesViz)

## Warning: package 'arulesViz' was built under R version 4.4.3

library(cluster)
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.4.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.4.2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(tidyverse)

## Warning: package 'readr' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ stringr   1.5.1
## ✔ purrr     1.0.2     ✔ tibble    3.2.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ Matrix::expand() masks tidyr::expand()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ Matrix::pack()   masks tidyr::pack()
## ✖ arules::recode() masks dplyr::recode()
## ✖ Matrix::unpack() masks tidyr::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(tidytext)

df <- read.csv("https://raw.githubusercontent.com/Jennyjjxxzz/Data-624_HW10/refs/heads/main/GroceryDataSet.csv", stringsAsFactors = FALSE)

# Convert each row to a transaction (remove NAs and blanks)
transactions_list <- apply(df, 1, function(x) na.omit(x[x != ""]))
transactions <- as(transactions_list, "transactions")

summary(transactions)

## transactions as itemMatrix in sparse format with
##  9834 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.0260917 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34051 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2159 1643 1299 1004  855  645  545  438  350  246  182  117   78   77   55   46 
##   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.00    3.00    4.41    6.00   32.00 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics

# Mine rules using the Apriori algorithm
rules <- apriori(transactions, parameter = list(supp = 0.001, conf = 0.5))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9834 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [5668 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Sort
top5_rules <- sort(rules, by = "lift", decreasing = TRUE)[1:5]

# Show top 5 rules
inspect(top5_rules)

##     lhs                                    rhs              support    
## [1] {Instant food products, soda}       => {hamburger meat} 0.001220256
## [2] {popcorn, soda}                     => {salty snack}    0.001220256
## [3] {baking powder, flour}              => {sugar}          0.001016880
## [4] {ham, processed cheese}             => {white bread}    0.001932072
## [5] {Instant food products, whole milk} => {hamburger meat} 0.001525320
##     confidence coverage    lift     count
## [1] 0.6315789  0.001932072 18.99372 12   
## [2] 0.6315789  0.001932072 16.69610 12   
## [3] 0.5555556  0.001830384 16.40641 10   
## [4] 0.6333333  0.003050641 15.04396 19   
## [5] 0.5000000  0.003050641 15.03670 15

Explore the Data for Top 20 items

itemFrequencyPlot(transactions, topN = 20, type = "absolute", horiz = TRUE)

plot(rules[1:20], method = "graph", engine = "htmlwidget")

Elbow Plot for Optimal Clusters

# Convert transactions to binary item matrix
item_matrix <- as(transactions, "matrix")
# Scale the martix
scaled <- scale(item_matrix)

# Compute total within-cluster sum of squares for k = 1 to 10
wss <- map_dbl(1:10, ~kmeans(scaled, centers = ., nstart = 10)$tot.withinss)

# Plot Elbow Plot
tibble(k = 1:10, wss = wss) %>%
  ggplot(aes(k, wss)) +
  geom_line() +
  geom_point() +
  geom_vline(xintercept = 3, linetype = "dashed", color = "red") +
  labs(title = "Elbow Plot for Optimal Clusters",
       x = "Number of Clusters",
       y = "Within-Cluster Sum of Squares") +
  theme_minimal()

#### K-Means Clustering on Market Baskets

set.seed(123)
km_res <- kmeans(scaled, centers = 3, nstart = 25)

fviz_cluster(km_res, data = scaled, geom = "point", ellipse.type = "convex") +
  labs(title = "K-Means Clustering on Market Baskets")

Top 10 Most Sold Products per Cluster

# Assign cluster labels
cluster_assignments <- km_res$cluster

# Convert transactions to a list and combine with cluster labels
transaction_list <- as(transactions, "list")
transaction_df <- tibble(
  transaction_id = seq_along(transaction_list),
  items = transaction_list,
  cluster = cluster_assignments
)

# Unnest the items into long format
exploded_items <- transaction_df %>%
  unnest(items)

# Count top 10 most frequent items in each cluster
top_items_by_cluster <- exploded_items %>%
  group_by(cluster, items) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(cluster, desc(count)) %>%
  group_by(cluster) %>%
  slice_head(n = 10)

# Reorder items 
top_items_by_cluster <- top_items_by_cluster %>%
  mutate(items = reorder_within(items, count, cluster))

# plot
ggplot(top_items_by_cluster, aes(x = items, y = count, fill = as.factor(cluster))) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(~ cluster, scales = "free_y") +
  coord_flip() +
  labs(
    title = "Top 10 Most Sold Products per Cluster",
    x = "Product",
    y = "Count"
  ) +
  theme_minimal()

Data 624_Exercise_HW10

Jiaxin Zheng

2025-05-17

Explore the Data for Top 20 items

Elbow Plot for Optimal Clusters

Top 10 Most Sold Products per Cluster