#install.packages('arules')
#install.packages('arulesViz')
#install.packages("tidyverse")
#install.packages("readxl")
#install.packages("knitr")
#install.packages("lubridate")
#install.packages("plyr")
library(arules)
## Warning: package 'arules' was built under R version 4.2.2
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.2.2
library(datasets)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.2
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'forcats' was built under R version 4.2.2
## Warning: package 'lubridate' was built under R version 4.2.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ dplyr::recode() masks arules::recode()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.2
library(knitr)
## Warning: package 'knitr' was built under R version 4.2.2
library(ggplot2)
library(lubridate)
library(plyr)
## Warning: package 'plyr' was built under R version 4.2.2
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:purrr':
## 
##     compact
library(dplyr)

Market basket analysis plays a crucial part in understanding customers' choices and managing the business accordingly. With it we can find all the frequent item sets and derive a list of association rules from those frequent item sets. Here we are going to work with a grocery dataset.

# Load the raw grocery baskets from CSV purely for inspection. The data frame
# gets its own name so that it is not silently overwritten when
# `data(Groceries)` later loads the arules transactions object of that name.
groceries_raw <- read.csv('E:\\1st Semester\\Unsupervised Learning\\UL Project\\groceries - groceries.csv')
# read.csv() keeps empty character fields as "" rather than NA, so this only
# drops rows with a genuinely missing value (e.g. a blank numeric field); it
# does not remove baskets that have unused item columns.
groceries_raw <- groceries_raw[complete.cases(groceries_raw), ]
glimpse(groceries_raw)
## Rows: 9,835
## Columns: 33
## $ Item.s. <int> 4, 3, 1, 4, 4, 5, 1, 5, 1, 2, 5, 9, 1, 3, 2, 4, 1, 1, 1, 1, 1,…
## $ Item.1  <chr> "citrus fruit", "tropical fruit", "whole milk", "pip fruit", "…
## $ Item.2  <chr> "semi-finished bread", "yogurt", "", "yogurt", "whole milk", "…
## $ Item.3  <chr> "margarine", "coffee", "", "cream cheese", "condensed milk", "…
## $ Item.4  <chr> "ready soups", "", "", "meat spreads", "long life bakery produ…
## $ Item.5  <chr> "", "", "", "", "", "abrasive cleaner", "", "liquor (appetizer…
## $ Item.6  <chr> "", "", "", "", "", "", "", "", "", "", "", "yogurt", "", "", …
## $ Item.7  <chr> "", "", "", "", "", "", "", "", "", "", "", "flour", "", "", "…
## $ Item.8  <chr> "", "", "", "", "", "", "", "", "", "", "", "bottled water", "…
## $ Item.9  <chr> "", "", "", "", "", "", "", "", "", "", "", "dishes", "", "", …
## $ Item.10 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.11 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.12 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.13 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.14 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.15 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.16 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.17 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.18 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.19 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.20 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.21 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.22 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.23 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.24 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.25 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.26 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.27 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.28 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.29 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.30 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.31 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Item.32 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…

Item frequency plot

# Load the `Groceries` transactions data set; naming the package makes it
# explicit which data set is meant.
data("Groceries", package = "arules")

# Install RColorBrewer only when it is missing, then attach it with library()
# so that a failed installation stops the script instead of being ignored.
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}
library(RColorBrewer)

# One palette shared by both plots; its 8 colours are recycled across the bars.
item_palette <- brewer.pal(8, "Pastel2")

# The 20 most frequent items, first as absolute counts ...
itemFrequencyPlot(
  Groceries,
  topN = 20,
  type = "absolute",
  col = item_palette,
  main = "Absolute Item Frequency Plot"
)

# ... then as a share of all transactions.
itemFrequencyPlot(
  Groceries,
  topN = 20,
  type = "relative",
  col = item_palette,
  main = "Relative Item Frequency Plot"
)

Generating rules: minimum support of 0.001 and minimum confidence of 0.8.

# Mine association rules of at most 10 items, requiring a minimum support of
# 0.001 and a minimum confidence of 0.8.
association.rules <- apriori(
  Groceries,
  parameter = list(support = 0.001, confidence = 0.8, maxlen = 10)
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [410 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Show the first 10 rules. head() returns fewer rules when fewer exist,
# whereas indexing with 1:10 would fail on a shorter rule set.
inspect(head(association.rules, n = 10))
##      lhs                         rhs                    support confidence    coverage      lift count
## [1]  {liquor,                                                                                         
##       red/blush wine}         => {bottled beer}     0.001931876  0.9047619 0.002135231 11.235269    19
## [2]  {curd,                                                                                           
##       cereals}                => {whole milk}       0.001016777  0.9090909 0.001118454  3.557863    10
## [3]  {yogurt,                                                                                         
##       cereals}                => {whole milk}       0.001728521  0.8095238 0.002135231  3.168192    17
## [4]  {butter,                                                                                         
##       jam}                    => {whole milk}       0.001016777  0.8333333 0.001220132  3.261374    10
## [5]  {soups,                                                                                          
##       bottled beer}           => {whole milk}       0.001118454  0.9166667 0.001220132  3.587512    11
## [6]  {napkins,                                                                                        
##       house keeping products} => {whole milk}       0.001321810  0.8125000 0.001626843  3.179840    13
## [7]  {whipped/sour cream,                                                                             
##       house keeping products} => {whole milk}       0.001220132  0.9230769 0.001321810  3.612599    12
## [8]  {pastry,                                                                                         
##       sweet spreads}          => {whole milk}       0.001016777  0.9090909 0.001118454  3.557863    10
## [9]  {turkey,                                                                                         
##       curd}                   => {other vegetables} 0.001220132  0.8000000 0.001525165  4.134524    12
## [10] {rice,                                                                                           
##       sugar}                  => {whole milk}       0.001220132  1.0000000 0.001220132  3.913649    12

Limiting Size

# Same support and confidence thresholds as before, but cap each rule at
# 3 items (LHS and RHS together) to obtain shorter rules.
shorter.rules <- apriori(
  Groceries,
  parameter = list(support = 0.001, confidence = 0.8, maxlen = 3)
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##       3  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3
## Warning in apriori(Groceries, parameter = list(supp = 0.001, conf = 0.8, :
## Mining stopped (maxlen reached). Only patterns up to a length of 3 returned!
##  done [0.00s].
## writing ... [29 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Show the first 10 short rules. head() returns fewer rules when fewer exist,
# whereas indexing with 1:10 would fail on a shorter rule set.
inspect(head(shorter.rules, n = 10))
##      lhs                         rhs                    support confidence    coverage      lift count
## [1]  {liquor,                                                                                         
##       red/blush wine}         => {bottled beer}     0.001931876  0.9047619 0.002135231 11.235269    19
## [2]  {curd,                                                                                           
##       cereals}                => {whole milk}       0.001016777  0.9090909 0.001118454  3.557863    10
## [3]  {yogurt,                                                                                         
##       cereals}                => {whole milk}       0.001728521  0.8095238 0.002135231  3.168192    17
## [4]  {butter,                                                                                         
##       jam}                    => {whole milk}       0.001016777  0.8333333 0.001220132  3.261374    10
## [5]  {soups,                                                                                          
##       bottled beer}           => {whole milk}       0.001118454  0.9166667 0.001220132  3.587512    11
## [6]  {napkins,                                                                                        
##       house keeping products} => {whole milk}       0.001321810  0.8125000 0.001626843  3.179840    13
## [7]  {whipped/sour cream,                                                                             
##       house keeping products} => {whole milk}       0.001220132  0.9230769 0.001321810  3.612599    12
## [8]  {pastry,                                                                                         
##       sweet spreads}          => {whole milk}       0.001016777  0.9090909 0.001118454  3.557863    10
## [9]  {turkey,                                                                                         
##       curd}                   => {other vegetables} 0.001220132  0.8000000 0.001525165  4.134524    12
## [10] {rice,                                                                                           
##       sugar}                  => {whole milk}       0.001220132  1.0000000 0.001220132  3.913649    12

Removing redundant rules

# Flag redundant rules. is.subset(x, x) yields a rule-by-rule matrix whose
# column sums count how many rules are subsets of each rule. Every rule is a
# subset of itself, so a count above 1 marks a rule that contains another rule.
subset.rules <- which(colSums(is.subset(association.rules, association.rules)) > 1)
length(subset.rules)
## [1] 91
length(association.rules)
## [1] 410
# Remove the flagged rules. The guard matters: with nothing flagged,
# `-subset.rules` is an empty index, which selects no rules at all and would
# throw away the entire rule set.
if (length(subset.rules) > 0) {
  association.rules <- association.rules[-subset.rules]
}
length(association.rules)
## [1] 319

Rules related to given items: If one wants to find out what influences the purchase of item X, we can use the appearance option of the apriori command. Appearance lets us constrain which items may appear on the LHS (IF part) and the RHS (THEN part) of a rule.

# Mine only rules that predict "whole milk": the appearance list pins
# "whole milk" to the RHS and lets every other item appear on the LHS.
milk.association.rules <- apriori(
  Groceries,
  parameter = list(support = 0.001, confidence = 0.8),
  appearance = list(default = "lhs", rhs = "whole milk")
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5   0.001      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 9 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [252 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Print the first six whole-milk rules (six is head()'s default count).
inspect(head(milk.association.rules, n = 6))
##     lhs                         rhs              support confidence    coverage     lift count
## [1] {curd,                                                                                    
##      cereals}                => {whole milk} 0.001016777  0.9090909 0.001118454 3.557863    10
## [2] {yogurt,                                                                                  
##      cereals}                => {whole milk} 0.001728521  0.8095238 0.002135231 3.168192    17
## [3] {butter,                                                                                  
##      jam}                    => {whole milk} 0.001016777  0.8333333 0.001220132 3.261374    10
## [4] {soups,                                                                                   
##      bottled beer}           => {whole milk} 0.001118454  0.9166667 0.001220132 3.587512    11
## [5] {napkins,                                                                                 
##      house keeping products} => {whole milk} 0.001321810  0.8125000 0.001626843 3.179840    13
## [6] {whipped/sour cream,                                                                      
##      house keeping products} => {whole milk} 0.001220132  0.9230769 0.001321810 3.612599    12

Visualizing Association Rules. Scatter plot: filter the rules with confidence greater than 0.4 (40%).

# Keep the rules whose confidence quality measure exceeds 0.4.
subRules <- association.rules[
  quality(association.rules)[["confidence"]] > 0.4
]

Plot SubRules

# Scatter plot of the filtered rules (the default arulesViz plot method).
plot(subRules, method = "scatterplot")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

The above plot shows that rules with high lift have low support.

# Two-key plot of the same filtered rules.
plot(
  subRules,
  method = "two-key plot"
)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

The two-key plot colors each rule by its order, i.e. the number of items in the rule.

Graph-Based Visualizations: Items are connected with item sets/rules using directed arrows. Arrows pointing from items to rule vertices indicate LHS items, and an arrow from a rule to an item indicates the RHS. The size and color of vertices often represent interest measures. Graph plots tend to become congested as the number of rules increases, so it is better to visualize a small number of rules.

# Pick the 10 rules with the highest confidence ...
top10subRules <- head(
  subRules,
  n = 10,
  by = "confidence"
)

# ... and draw them as an interactive (htmlwidget) graph.
plot(
  top10subRules,
  method = "graph",
  engine = "htmlwidget"
)

Individual Rule Representation: filter the top 20 rules with the highest lift.

# Take the 20 rules with the highest lift ...
subRules2 <- head(
  subRules,
  n = 20,
  by = "lift"
)

# ... and show them in a parallel coordinates plot.
plot(
  subRules2,
  method = "paracoord"
)

The topmost arrow shows that when people buy red/blush wine and liquor, they are likely to buy bottled beer as well.