DATA624-HW10

Instructions Imagine 10000 receipts sitting on your table. Each receipt represents a transaction with items that were purchased. The receipt is a representation of stuff that went into a customer’s basket - and therefore ‘Market Basket Analysis’.

That is exactly what the Groceries Data Set contains: a collection of receipts with each line representing 1 receipt and the items purchased. Each line is called a transaction and each column in a row represents an item. The data set is attached.

Your assignment is to use R to mine the data for association rules. You should report support, confidence and lift and your top 10 rules by lift.

Extra credit: do a simple cluster analysis on the data as well. Use whichever packages you like.

# Load necessary libraries
library(arules)

## Warning: package 'arules' was built under R version 4.3.3

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

library(arulesViz)

# Load the dataset
grocery <- read.transactions("/Users/zigcah/Downloads/GroceryDataSet.csv", sep = ",")
summary(grocery)

## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46 
##   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics

# Frequency plot of top 20 items
itemFrequencyPlot(grocery, topN = 20, main = "Top 20 Frequent Items in Grocery Transactions")

# Load the dataset
grocery_data <- read.csv("/Users/zigcah/Downloads/GroceryDataSet.csv", header = FALSE, stringsAsFactors = FALSE)

head(grocery_data)

##                 V1                  V2             V3                       V4
## 1     citrus fruit semi-finished bread      margarine              ready soups
## 2   tropical fruit              yogurt         coffee                         
## 3       whole milk                                                            
## 4        pip fruit              yogurt  cream cheese              meat spreads
## 5 other vegetables          whole milk condensed milk long life bakery product
## 6       whole milk              butter         yogurt                     rice
##                 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
## 1                                                                             
## 2                                                                             
## 3                                                                             
## 4                                                                             
## 5                                                                             
## 6 abrasive cleaner                                                            
##   V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32
## 1                                            
## 2                                            
## 3                                            
## 4                                            
## 5                                            
## 6

# Clean the data
# Remove empty strings and convert rows to list of items
transactions_list <- apply(grocery_data, 1, function(row) {
  as.character(na.omit(row[row != ""]))
})

# Convert the list to transactions
transactions <- as(transactions_list, "transactions")

# Inspect the transactions to confirm
inspect(head(transactions))

##     items                      
## [1] {citrus fruit,             
##      margarine,                
##      ready soups,              
##      semi-finished bread}      
## [2] {coffee,                   
##      tropical fruit,           
##      yogurt}                   
## [3] {whole milk}               
## [4] {cream cheese ,            
##      meat spreads,             
##      pip fruit,                
##      yogurt}                   
## [5] {condensed milk,           
##      long life bakery product, 
##      other vegetables,         
##      whole milk}               
## [6] {abrasive cleaner,         
##      butter,                   
##      rice,                     
##      whole milk,               
##      yogurt}

# Mine association rules
rules <- apriori(transactions, parameter = list(support = 0.01, confidence = 0.5))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 98 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.01s].
## sorting and recoding items ... [88 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

# Sort and view the top 10 rules by lift
rules_sorted <- sort(rules, by = "lift")
inspect(head(rules_sorted, 10))

##      lhs                                  rhs                support   
## [1]  {citrus fruit, root vegetables}   => {other vegetables} 0.01037112
## [2]  {root vegetables, tropical fruit} => {other vegetables} 0.01230300
## [3]  {rolls/buns, root vegetables}     => {other vegetables} 0.01220132
## [4]  {root vegetables, yogurt}         => {other vegetables} 0.01291307
## [5]  {curd, yogurt}                    => {whole milk}       0.01006609
## [6]  {butter, other vegetables}        => {whole milk}       0.01148958
## [7]  {root vegetables, tropical fruit} => {whole milk}       0.01199797
## [8]  {root vegetables, yogurt}         => {whole milk}       0.01453991
## [9]  {domestic eggs, other vegetables} => {whole milk}       0.01230300
## [10] {whipped/sour cream, yogurt}      => {whole milk}       0.01087951
##      confidence coverage   lift     count
## [1]  0.5862069  0.01769192 3.029608 102  
## [2]  0.5845411  0.02104728 3.020999 121  
## [3]  0.5020921  0.02430097 2.594890 120  
## [4]  0.5000000  0.02582613 2.584078 127  
## [5]  0.5823529  0.01728521 2.279125  99  
## [6]  0.5736041  0.02003050 2.244885 113  
## [7]  0.5700483  0.02104728 2.230969 118  
## [8]  0.5629921  0.02582613 2.203354 143  
## [9]  0.5525114  0.02226741 2.162336 121  
## [10] 0.5245098  0.02074225 2.052747 107

# Plot top rules
plot(head(rules_sorted, 10), method = "graph", engine = "htmlwidget")

# Generate a co-occurrence matrix
itemMatrix <- crossTable(transactions)

# Perform clustering using hierarchical clustering
dist_matrix <- dist(itemMatrix, method = "euclidean")
clusters <- hclust(dist_matrix, method = "ward.D2")

# Plot dendrogram
plot(clusters)

DATA624-HW10

Biyag Dukuray

2024-11-24

DATA624-HW10