Your assignment is to use R to mine the data for association rules. Report the support, confidence, and lift of the rules, along with your top 10 rules ranked by lift.
library(arules)
library(arulesViz)
# Read the grocery data in basket format, with items separated by commas.
tr <- read.transactions(
  file = "GroceryDataSet.csv",
  format = "basket",
  sep = ","
)
# Print the transactions object to show its dimensions.
tr
## transactions in sparse format with
## 9835 transactions (rows) and
## 169 items (columns)
# Summarise the transactions: matrix dimensions and density, the most
# frequent items, and the distribution of transaction sizes.
summary(tr)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46
## 17 18 19 20 21 22 23 24 26 27 28 29 32
## 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
# Plot the relative frequency of the 20 most frequent items.
itemFrequencyPlot(
  tr,
  topN = 20,
  type = "relative",
  main = "Top 20 Frequency Plot"
)
# Mine association rules with a minimum support of 0.001 and a minimum
# confidence of 0.6. maxlen limits each rule to at most 20 items.
# Parameter names are spelled out in full rather than abbreviated.
rules <- apriori(
  tr,
  parameter = list(support = 0.001, confidence = 0.6, maxlen = 20)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 20 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [2918 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Summarise the mined rules: the rule-length distribution and the ranges of
# support, confidence, lift, and count.
summary(rules)
## set of 2918 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6
## 3 490 1765 626 34
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 4.000 4.000 4.068 4.000 6.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.001017 Min. :0.6000 Min. : 2.348 Min. :10.00
## 1st Qu.:0.001118 1st Qu.:0.6316 1st Qu.: 2.668 1st Qu.:11.00
## Median :0.001220 Median :0.6818 Median : 3.168 Median :12.00
## Mean :0.001480 Mean :0.7028 Mean : 3.450 Mean :14.55
## 3rd Qu.:0.001525 3rd Qu.:0.7500 3rd Qu.: 3.692 3rd Qu.:15.00
## Max. :0.009354 Max. :1.0000 Max. :18.996 Max. :92.00
##
## mining info:
## data ntransactions support confidence
## tr 9835 0.001 0.6
# Default plot of the full rule set, with jitter = 1.
plot(rules, jitter = 1)
# The same rule set drawn with the "two-key plot" method.
plot(rules, method = "two-key plot", jitter = 1)
# Keep the 10 rules ranked highest by lift, then list them together with
# their quality measures.
top10rules <- head(
  rules,
  n = 10,
  by = "lift"
)
inspect(top10rules)
## lhs rhs support confidence lift count
## [1] {Instant food products,
## soda} => {hamburger meat} 0.001220132 0.6315789 18.995654 12
## [2] {popcorn,
## soda} => {salty snack} 0.001220132 0.6315789 16.697793 12
## [3] {ham,
## processed cheese} => {white bread} 0.001931876 0.6333333 15.045491 19
## [4] {other vegetables,
## tropical fruit,
## white bread,
## yogurt} => {butter} 0.001016777 0.6666667 12.030581 10
## [5] {hamburger meat,
## whipped/sour cream,
## yogurt} => {butter} 0.001016777 0.6250000 11.278670 10
## [6] {domestic eggs,
## other vegetables,
## tropical fruit,
## whole milk,
## yogurt} => {butter} 0.001016777 0.6250000 11.278670 10
## [7] {liquor,
## red/blush wine} => {bottled beer} 0.001931876 0.9047619 11.235269 19
## [8] {butter,
## other vegetables,
## sugar} => {whipped/sour cream} 0.001016777 0.7142857 9.964539 10
## [9] {butter,
## hard cheese,
## whole milk} => {whipped/sour cream} 0.001423488 0.6666667 9.300236 14
## [10] {butter,
## fruit/vegetable juice,
## other vegetables,
## tropical fruit} => {whipped/sour cream} 0.001016777 0.6666667 9.300236 10
# Draw the top-10 rules as a graph, rendered with the htmlwidget engine.
plot(top10rules, method = "graph", engine = "htmlwidget")