# Aim: find associations between the items in the transactions below in order to optimize supermarket purchases.
# Loading the dataset for association
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Read the comma-separated transactions from the remote file.
path_trans <- "http://bit.ly/SupermarketDatasetII"
assoc_df <- read.transactions(file = path_trans, sep = ",")
## Warning in asMethod(object): removing duplicated items in transactions
# Show the dimensions of the resulting transactions object.
assoc_df
## transactions in sparse format with
## 7501 transactions (rows) and
## 119 items (columns)
# Summarise the transactions: most frequent items, transaction-length
# distribution and example item labels.
summary(object = assoc_df)
## transactions as itemMatrix in sparse format with
## 7501 rows (elements/itemsets/transactions) and
## 119 columns (items) and a density of 0.03288973
##
## most frequent items:
## mineral water eggs spaghetti french fries chocolate
## 1788 1348 1306 1282 1229
## (Other)
## 22405
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1754 1358 1044 816 667 493 391 324 259 139 102 67 40 22 17 4
## 18 19 20
## 1 2 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.914 5.000 20.000
##
## includes extended item information - examples:
## labels
## 1 almonds
## 2 antioxydant juice
## 3 asparagus
# Mineral water, eggs, spaghetti, french fries and chocolate are the most popular individual items.
# Most frequent items: mineral water was bought 1788 times, followed by eggs (1348), spaghetti (1306), french fries (1282) and chocolate (1229); all other items combined account for 22405 purchases.
# Plot item frequencies side by side: the 20 most frequent items (left) and
# every item whose support is at least 0.09 (right).
# par() returns the previous settings, so the 1 x 2 layout can be undone
# afterwards instead of leaking into later base-graphics plots.
old_par <- par(mfcol = c(1, 2))
itemFrequencyPlot(
  assoc_df,
  topN = 20,
  col = "blue",
  ylab = "Item frequency",
  main = " Item Frequency Plots"
)
# The axis label states the threshold that is actually passed as `support`.
itemFrequencyPlot(
  assoc_df,
  support = 0.09,
  col = "darkblue",
  ylab = "Frequency (support >= 0.09)"
)
par(old_par)
# Mineral water is still taking the lead even with the minimum support at 0.09.
# First rule set: minimum support 0.001, minimum confidence 0.8.
rule1 <- apriori(
  data = assoc_df,
  parameter = list(support = 0.001, confidence = 0.8)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 7
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 7501 transaction(s)] done [0.02s].
## sorting and recoding items ... [116 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 done [0.03s].
## writing ... [74 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Print the size of the mined rule set.
rule1
## set of 74 rules
# With a minimum support of 0.001 (freq(X) / total transactions) and a minimum confidence of 80%, the algorithm returned 74 rules. We seem to have lost important rules, as 74 is very few to work with.
# Visualizing this in an association plot
#install.packages("arulesViz")
library(arulesViz)
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
# Scatter plot of the rules (support vs. confidence). This is what the
# original call actually drew: the visualisation is selected through `method`,
# not `type`, so `type = "graph"` was rejected as an unknown control parameter
# and the default scatter plot was used instead.
plot(rule1, method = "scatterplot")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# Network view of the same rules, which is what `type = "graph"` was meant to
# request.
plot(rule1, method = "graph")
# As much as we can see a positive correlation between confidence and support there are a few datapoints which are not popular but have a large confidence thus meaning they are bought in conjunction to other items(They are less likely to be bought alone).
# Inspect the first 10 rules. head() copes with rule sets shorter than 10,
# whereas indexing with 1:10 would fail with an out-of-range subscript.
inspect(head(rule1, n = 10))
## lhs rhs support confidence
## [1] {frozen smoothie,spinach} => {mineral water} 0.001066524 0.8888889
## [2] {bacon,pancakes} => {spaghetti} 0.001733102 0.8125000
## [3] {nonfat milk,turkey} => {mineral water} 0.001199840 0.8181818
## [4] {ground beef,nonfat milk} => {mineral water} 0.001599787 0.8571429
## [5] {mushroom cream sauce,pasta} => {escalope} 0.002532996 0.9500000
## [6] {milk,pasta} => {shrimp} 0.001599787 0.8571429
## [7] {cooking oil,fromage blanc} => {mineral water} 0.001199840 0.8181818
## [8] {black tea,salmon} => {mineral water} 0.001066524 0.8000000
## [9] {black tea,frozen smoothie} => {milk} 0.001199840 0.8181818
## [10] {red wine,tomato sauce} => {chocolate} 0.001066524 0.8000000
## coverage lift count
## [1] 0.001199840 3.729058 8
## [2] 0.002133049 4.666587 13
## [3] 0.001466471 3.432428 9
## [4] 0.001866418 3.595877 12
## [5] 0.002666311 11.976387 19
## [6] 0.001866418 11.995203 12
## [7] 0.001466471 3.432428 9
## [8] 0.001333156 3.356152 8
## [9] 0.001466471 6.313973 9
## [10] 0.001333156 4.882669 8
# The rule {mushroom cream sauce, pasta} => {escalope} has a confidence of 0.95: 95% of the transactions containing mushroom cream sauce and pasta also contained escalope.
# Lower the confidence threshold a little (0.8 -> 0.75) to obtain more rules;
# the minimum support stays at 0.001.
rule2 <- apriori(
  data = assoc_df,
  parameter = list(support = 0.001, confidence = 0.75)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.75 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 7
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[119 item(s), 7501 transaction(s)] done [0.01s].
## sorting and recoding items ... [116 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 done [0.02s].
## writing ... [110 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
# Print the size of the second rule set.
rule2
## set of 110 rules
# View the first 20 rules of the second rule set. head() avoids an
# out-of-range subscript if fewer than 20 rules exist.
inspect(head(rule2, n = 20))
## lhs rhs support confidence
## [1] {frozen smoothie,spinach} => {mineral water} 0.001066524 0.8888889
## [2] {blueberries,eggs} => {mineral water} 0.001599787 0.7500000
## [3] {bacon,pancakes} => {spaghetti} 0.001733102 0.8125000
## [4] {nonfat milk,turkey} => {mineral water} 0.001199840 0.8181818
## [5] {ground beef,nonfat milk} => {mineral water} 0.001599787 0.8571429
## [6] {barbecue sauce,chocolate} => {mineral water} 0.001333156 0.7692308
## [7] {mushroom cream sauce,pasta} => {escalope} 0.002532996 0.9500000
## [8] {milk,pasta} => {shrimp} 0.001599787 0.8571429
## [9] {mineral water,pasta} => {shrimp} 0.001599787 0.7500000
## [10] {cooking oil,fromage blanc} => {mineral water} 0.001199840 0.8181818
## [11] {black tea,salmon} => {mineral water} 0.001066524 0.8000000
## [12] {black tea,frozen smoothie} => {milk} 0.001199840 0.8181818
## [13] {red wine,tomato sauce} => {chocolate} 0.001066524 0.8000000
## [14] {shrimp,tomato sauce} => {spaghetti} 0.001199840 0.7500000
## [15] {pancakes,tomato sauce} => {mineral water} 0.001066524 0.8000000
## [16] {chicken,protein bar} => {spaghetti} 0.001199840 0.8181818
## [17] {meatballs,whole wheat pasta} => {milk} 0.001333156 0.8333333
## [18] {red wine,soup} => {mineral water} 0.001866418 0.9333333
## [19] {turkey,whole wheat pasta} => {mineral water} 0.001466471 0.8461538
## [20] {milk,spaghetti,strong cheese} => {mineral water} 0.001066524 0.8000000
## coverage lift count
## [1] 0.001199840 3.729058 8
## [2] 0.002133049 3.146393 12
## [3] 0.002133049 4.666587 13
## [4] 0.001466471 3.432428 9
## [5] 0.001866418 3.595877 12
## [6] 0.001733102 3.227069 10
## [7] 0.002666311 11.976387 19
## [8] 0.001866418 11.995203 12
## [9] 0.002133049 10.495802 12
## [10] 0.001466471 3.432428 9
## [11] 0.001333156 3.356152 8
## [12] 0.001466471 6.313973 9
## [13] 0.001333156 4.882669 8
## [14] 0.001599787 4.307619 9
## [15] 0.001333156 3.356152 8
## [16] 0.001466471 4.699220 9
## [17] 0.001599787 6.430898 10
## [18] 0.001999733 3.915511 14
## [19] 0.001733102 3.549776 11
## [20] 0.001333156 3.356152 8
# Mineral water seems to stand out as a popular item. It is useful to know which items tend to be bought together with mineral water (such as ground beef, light cream and olive oil) so that management can target discounts on these products.
# Rank the rules by confidence (highest first) and inspect the top 10.
# head() is used instead of a fixed 1:10 index so that a shorter rule set
# does not raise an out-of-range error.
sorted <- sort(rule2, by = "confidence", decreasing = TRUE)
inspect(head(sorted, n = 10))
## lhs rhs support confidence coverage lift count
## [1] {french fries,
## mushroom cream sauce,
## pasta} => {escalope} 0.001066524 1.0000000 0.001066524 12.606723 8
## [2] {ground beef,
## light cream,
## olive oil} => {mineral water} 0.001199840 1.0000000 0.001199840 4.195190 9
## [3] {cake,
## meatballs,
## mineral water} => {milk} 0.001066524 1.0000000 0.001066524 7.717078 8
## [4] {cake,
## olive oil,
## shrimp} => {mineral water} 0.001199840 1.0000000 0.001199840 4.195190 9
## [5] {mushroom cream sauce,
## pasta} => {escalope} 0.002532996 0.9500000 0.002666311 11.976387 19
## [6] {red wine,
## soup} => {mineral water} 0.001866418 0.9333333 0.001999733 3.915511 14
## [7] {eggs,
## mineral water,
## pasta} => {shrimp} 0.001333156 0.9090909 0.001466471 12.722185 10
## [8] {herb & pepper,
## mineral water,
## rice} => {ground beef} 0.001333156 0.9090909 0.001466471 9.252498 10
## [9] {ground beef,
## pancakes,
## whole wheat rice} => {mineral water} 0.001333156 0.9090909 0.001466471 3.813809 10
## [10] {frozen vegetables,
## milk,
## spaghetti,
## turkey} => {mineral water} 0.001199840 0.9000000 0.001333156 3.775671 9
# Keep the rules whose consequent (rhs) is mineral water. %in% matches the
# item label exactly, whereas %pin% does partial matching and could pick up
# any label that merely contains the text.
mineral <- subset(rule2, subset = rhs %in% "mineral water")
# Rank those rules by confidence, highest first.
sorted_mineral <- sort(mineral, by = "confidence", decreasing = TRUE)
# View the top 10 rules; head() tolerates subsets with fewer than 10 rules.
inspect(head(sorted_mineral, n = 10))
## lhs rhs support confidence coverage lift count
## [1] {ground beef,
## light cream,
## olive oil} => {mineral water} 0.001199840 1.0000000 0.001199840 4.195190 9
## [2] {cake,
## olive oil,
## shrimp} => {mineral water} 0.001199840 1.0000000 0.001199840 4.195190 9
## [3] {red wine,
## soup} => {mineral water} 0.001866418 0.9333333 0.001999733 3.915511 14
## [4] {ground beef,
## pancakes,
## whole wheat rice} => {mineral water} 0.001333156 0.9090909 0.001466471 3.813809 10
## [5] {frozen vegetables,
## milk,
## spaghetti,
## turkey} => {mineral water} 0.001199840 0.9000000 0.001333156 3.775671 9
## [6] {chocolate,
## frozen vegetables,
## olive oil,
## shrimp} => {mineral water} 0.001199840 0.9000000 0.001333156 3.775671 9
## [7] {frozen smoothie,
## spinach} => {mineral water} 0.001066524 0.8888889 0.001199840 3.729058 8
## [8] {cake,
## meatballs,
## milk} => {mineral water} 0.001066524 0.8888889 0.001199840 3.729058 8
## [9] {cake,
## olive oil,
## whole wheat pasta} => {mineral water} 0.001066524 0.8888889 0.001199840 3.729058 8
## [10] {brownies,
## eggs,
## ground beef} => {mineral water} 0.001066524 0.8888889 0.001199840 3.729058 8
# Ground beef,olive oil and cake seem to be standing out
# Keep the rules whose antecedent (lhs) contains eggs. %in% matches the item
# label exactly, whereas %pin% does partial matching and could pick up any
# label that merely contains the text.
eggs <- subset(rule2, subset = lhs %in% "eggs")
# Rank those rules by confidence, highest first.
sorted_eggs <- sort(eggs, by = "confidence", decreasing = TRUE)
# View the top 10 rules; head() tolerates subsets with fewer than 10 rules.
inspect(head(sorted_eggs, n = 10))
## lhs rhs support confidence coverage lift count
## [1] {eggs,
## mineral water,
## pasta} => {shrimp} 0.001333156 0.9090909 0.001466471 12.722185 10
## [2] {brownies,
## eggs,
## ground beef} => {mineral water} 0.001066524 0.8888889 0.001199840 3.729058 8
## [3] {chocolate,
## eggs,
## frozen vegetables,
## ground beef} => {mineral water} 0.001466471 0.8461538 0.001733102 3.549776 11
## [4] {chocolate,
## eggs,
## olive oil,
## spaghetti} => {mineral water} 0.001199840 0.8181818 0.001466471 3.432428 9
## [5] {cooking oil,
## eggs,
## olive oil} => {mineral water} 0.001066524 0.8000000 0.001333156 3.356152 8
## [6] {cake,
## eggs,
## milk,
## turkey} => {mineral water} 0.001066524 0.8000000 0.001333156 3.356152 8
## [7] {chocolate,
## eggs,
## milk,
## olive oil} => {mineral water} 0.001066524 0.8000000 0.001333156 3.356152 8
## [8] {eggs,
## olive oil,
## soup} => {mineral water} 0.001466471 0.7857143 0.001866418 3.296221 11
## [9] {blueberries,
## eggs} => {mineral water} 0.001599787 0.7500000 0.002133049 3.146393 12
## [10] {eggs,
## frozen vegetables,
## milk,
## olive oil} => {mineral water} 0.001199840 0.7500000 0.001599787 3.146393 9
# Among the highest-confidence rules with eggs in the antecedent, mineral water is the most common consequent (9 of the top 10 rules).